In [1]:
import pandas as pd

# District Dataset

In [10]:
# District-wise population of every Indian state and union territory (2001 and 2011).
# NOTE(review): this path is rooted at 'data_files/', while later cells read from
# '../data/raw/' -- confirm which layout is current.
dist_pop_01_11 = pd.read_csv(
    'data_files/raw_data/district wise population for year 2001 and 2011.csv'
)

In [88]:
# Shorten the two population columns to their census year ...
dist_pop_01_11.rename(
    columns={'Population in 2001': '2001', 'Population in 2011': '2011'},
    inplace=True,
)
# ... then lower-case every column label so later cells can use one naming style.
dist_pop_01_11.rename(columns=str.lower, inplace=True)

## Some conclusions:
- The `District` column has 5 duplicated values because some district names also occur in other states. **This is not a problem**, as long as a district is always identified together with its state.


## Additional Features:
- Calculate the percentage growth in population over the 10 years as `((2011 - 2001) / 2001) * 100`.
- Calculate the actual growth in population in 10 years as `2011 - 2001`.
- Per-year population growth: `(2011 - 2001) / 10`.

In [6]:
# Absolute population change over the 10 years between the two censuses (2011 minus 2001)
dist_pop_01_11['pop_growth'] = dist_pop_01_11['2011'] - dist_pop_01_11['2001']

In [4]:
# Population change from 2001 to 2011 as a percentage of the 2001 value,
# rounded and stored as an integer.
dist_pop_01_11['pct_growth'] = (
    (dist_pop_01_11['2011'] - dist_pop_01_11['2001'])
    / dist_pop_01_11['2001']
    * 100
).round().astype('int')

In [79]:
# Preview the first rows of the district population table.
# NOTE(review): the stored output below still shows the original column names
# ('Population in 2001' / 'Population in 2011'), so it appears to have been
# captured before the rename cell ran -- re-run the notebook top to bottom.
dist_pop_01_11.head()

Unnamed: 0,State,District,Population in 2001,Population in 2011
0,Andaman & Nicobar Islands,Nicobar,42068,36842
1,Andaman & Nicobar Islands,North & Middle Andaman,105613,105597
2,Andaman & Nicobar Islands,South Andaman,208471,238142
3,Andhra Pradesh,Anantapur,3640478,4081148
4,Andhra Pradesh,Chittoor,3745875,4174064


### Export the data to CSV format.

In [9]:
# Export is currently disabled; uncomment the line below to write the CSV.
# dist_pop_01_11.to_csv('data_files/District_pop_with_latlong_01_11.csv', index=False)

# District dataset merged with additional features

## Important columns:

After analysing the dataset, I found that the most important columns for this project are the following:

```python
use_cols = [
    'State name', 'District name', 'Population', 'Male', 'Female', 'Literate',
    # Male & Female
    'Male_Literate', 'Female_Literate', 
    # SC & ST Caste
    'SC', 'ST',
    # Religion
    'Hindus', 'Muslims', 'Christians', 'Sikhs', 'Buddhists', 'Jains', 'Others_Religions', 'Religion_Not_Stated', 
    # Rural and Urban household no.
    'Rural_Households', 'Urban_Households',
    # Education
    'Primary_Education', 'Middle_Education', 'Secondary_Education', 'Higher_Education', 'Graduate_Education',
    # Age groups
    'Age_Group_0_29', 'Age_Group_30_49', 'Age_Group_50', 'Age not stated',
    # Households
    'Household_size_1_person_Households', 'Household_size_2_persons_Households', 
    'Household_size_1_to_2_persons', 'Household_size_3_persons_Households', 
    'Household_size_3_to_5_persons_Households', 'Household_size_4_persons_Households',
]
```


In [2]:
# Columns to load from the 2011 district census file, grouped by theme.
# The order of the concatenation below is the order of the resulting list.
use_cols = (
    # Identifiers
    ['State name', 'District name']
    # Headline counts
    + ['Population', 'Male', 'Female', 'Literate']
    # Literacy split by sex
    + ['Male_Literate', 'Female_Literate']
    # Scheduled Castes / Scheduled Tribes
    + ['SC', 'ST']
    # Religion
    + [
        'Hindus', 'Muslims', 'Christians', 'Sikhs',
        'Buddhists', 'Jains', 'Others_Religions', 'Religion_Not_Stated',
    ]
    # Number of rural and urban households
    + ['Rural_Households', 'Urban_Households']
    # Education level
    + [
        'Primary_Education', 'Middle_Education', 'Secondary_Education',
        'Higher_Education', 'Graduate_Education',
    ]
    # Age groups
    + ['Age_Group_0_29', 'Age_Group_30_49', 'Age_Group_50', 'Age not stated']
    # Household sizes
    + [
        'Household_size_1_person_Households',
        'Household_size_2_persons_Households',
        'Household_size_1_to_2_persons',
        'Household_size_3_persons_Households',
        'Household_size_3_to_5_persons_Households',
        'Household_size_4_persons_Households',
    ]
)


In [3]:
# 2011 district census, restricted to the columns listed in `use_cols`
ind_dist_11 = pd.read_csv(
    '../data/raw/india-districts-census-2011.csv',
    usecols=use_cols,
)

In [4]:
# District centroid file; merged with the census table further down
dist_latlong = pd.read_csv(
    '../data/raw/district wise centroids.csv'
)

### Normalise district names so that rows are not lost in the merge.

In [5]:
# Normalise the district names in both tables before joining them:
#   1. cut everything from the first " (" onwards (parenthesised suffixes),
#   2. lower-case the remainder so the comparison is case-insensitive.
# The original notes recorded "+4" for step 1 and "+2" for step 2 -- presumably the
# number of extra districts matched after each step; TODO confirm.
dist_latlong['District'] = (
    dist_latlong['District']
    .str.replace(r' \(.*', '', regex=True)
    .str.lower()
)
ind_dist_11['District name'] = (
    ind_dist_11['District name']
    .str.replace(r' \(.*', '', regex=True)
    .str.lower()
)

In [6]:
# Attach the census figures to each district centroid; districts missing from either
# table are dropped (inner join).
# NOTE(review): the join key is the district name only. If the same district name exists
# in more than one state, every centroid row with that name is paired with every census
# row with that name, producing cross-state rows. Consider also joining on the state once
# the state-name spellings of the two files are confirmed to agree.
district = dist_latlong.merge(
    ind_dist_11,
    left_on='District',
    right_on='District name',
    how='inner',
)

In [7]:
# After the merge, 'District name' holds the same values as 'District' (it was the join
# key); 'State name' is presumably redundant with the centroid file's state column --
# TODO confirm. Both are removed.
district.drop(['District name', 'State name'], axis='columns', inplace=True)

# Bring the lower-cased district names back to Title Case for display.
# NOTE(review): title-casing may not reproduce the exact original spelling of every name.
district['District'] = district['District'].str.title()

In [8]:
# Derived features, each rounded and stored as an integer.

# Sex ratio as computed here: males per 100 females.
district['Sex Ratio'] = (district['Male'] * 100 / district['Female']).round().astype('int')

# Literacy rates: literate count as a percentage of the matching population count.
for rate_col, literate_col, base_col in (
    ('Literacy Rate', 'Literate', 'Population'),
    ('Male Literacy Rate', 'Male_Literate', 'Male'),
    ('Female Literacy Rate', 'Female_Literate', 'Female'),
):
    district[rate_col] = (district[literate_col] / district[base_col] * 100).round().astype('int')

In [9]:
# Preview the merged district table, including the derived ratio columns.
district.head()

Unnamed: 0,State,District,Latitude,Longitude,Population,Male,Female,Literate,Male_Literate,Female_Literate,...,Household_size_1_person_Households,Household_size_2_persons_Households,Household_size_1_to_2_persons,Household_size_3_persons_Households,Household_size_3_to_5_persons_Households,Household_size_4_persons_Households,Sex Ratio,Literacy Rate,Male Literacy Rate,Female Literacy Rate
0,Andhra Pradesh,Adilabad,19.284514,78.813212,2741239,1369597,1371642,1483347,856350,626997,...,27018,80780,107798,103066,417387,182282,100,54,63,46
1,Andhra Pradesh,Anantapur,14.312066,77.460158,4081148,2064495,2016653,2310960,1338474,972486,...,36633,106460,143093,156107,644263,291388,102,57,65,48
2,Andhra Pradesh,Chittoor,13.331093,78.927639,4174064,2090204,2083860,2667878,1484794,1183084,...,55151,132169,187320,175853,678054,311451,100,64,71,57
3,Andhra Pradesh,East Godavari,16.782718,82.243207,5154296,2569688,2584608,3288577,1716933,1571644,...,95856,239150,335006,275218,952492,468704,99,64,67,61
4,Andhra Pradesh,Guntur,15.884926,80.586576,4887813,2440521,2447292,2960441,1634726,1325715,...,83026,212589,295615,229537,842985,413160,100,61,67,54


### Export the final `df`.

In [12]:
# Write the district-level table to disk without the row index
district.to_csv(
    path_or_buf='../data/District_census_2011.csv',
    index=False,
)

# Build a state-level dataset for analysis

In [15]:
# Build a state-level table by summing the district counts within each state.
dist_df = pd.read_csv('../data/District_census_2011.csv')
# District names and per-district rates cannot be summed; the rates are recomputed
# below from the state totals.
dist_df = dist_df.drop(columns=['District', 'Sex Ratio', 'Literacy Rate',
                                'Male Literacy Rate', 'Female Literacy Rate'])
states_centroids = pd.read_csv('../data/raw/state wise centroids_2011.csv')

# Sum the counts per state first, then attach the state centroid exactly once.
# (Previously the centroids were merged in before the sum, summed once per district,
# dropped, and merged again.) The inner join keeps only states present in both tables.
state = (dist_df.drop(columns=['Latitude', 'Longitude'])   # district coordinates must not be summed
            .groupby('State', as_index=False)
            .sum()
            .merge(states_centroids, how='inner', on='State')
            .reset_index(drop=True))

# Derived features, each rounded and stored as an integer.
# Sex ratio as computed here: males per 100 females.
state['Sex Ratio'] = state['Male'].mul(100).div(state['Female']).round().astype('int')

# Literacy rates: literate count as a percentage of the matching population count.
state['Literacy Rate'] = state['Literate'].div(state['Population']).mul(100).round().astype('int')
state['Male Literacy Rate'] = state['Male_Literate'].div(state['Male']).mul(100).round().astype('int')
state['Female Literacy Rate'] = state['Female_Literate'].div(state['Female']).mul(100).round().astype('int')

# The raw counts behind the ratios are dropped in the state table only.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run, and the
# index reset is now part of the chain above instead of a discarded expression.
state = state.drop(columns=['Male', 'Female', 'Literate', 'Male_Literate', 'Female_Literate'])


In [16]:
# Preview the state-level table.
state.head()

Unnamed: 0,State,Population,SC,ST,Hindus,Muslims,Christians,Sikhs,Buddhists,Jains,...,Household_size_1_to_2_persons,Household_size_3_persons_Households,Household_size_3_to_5_persons_Households,Household_size_4_persons_Households,Longitude,Latitude,Sex Ratio,Literacy Rate,Male Literacy Rate,Female Literacy Rate
0,Andhra Pradesh,69147421,11764168,4718933,61229003,6636078,858657,28836,33457,44636,...,3250219,2923281,11268474,5274920,79.916203,16.554124,100,59,66,52
1,Arunachal Pradesh,1354556,0,929669,385647,26279,418238,3007,162045,757,...,35430,30237,115082,42586,94.545327,27.725765,106,55,62,49
2,Assam,23415900,1748550,2688192,14409242,8021766,858621,15059,45291,13765,...,500715,687090,2650366,1039693,92.65731,26.321341,104,62,67,57
3,Bihar,105473507,16596291,1421368,86533796,18124944,143441,28566,326516,49692,...,2247602,2057404,8610298,3068648,85.636774,25.771394,109,51,59,42
4,Chandigarh,1055450,199086,0,852574,51447,8720,138329,1160,1960,...,39616,36024,144460,64040,76.758725,30.7426,122,76,80,72


In [17]:
# Write the state-level table to disk.
# NOTE(review): index=True also writes the row index as an unnamed first column, whereas
# the district export uses index=False -- confirm whether readers of this file need it.
state.to_csv(
    path_or_buf='../data/State_census_2011.csv',
    index=True,
)