In [5]:
import pandas as pd

# District Dataset

In [10]:
# District-level population counts for the 2001 and 2011 censuses; per the
# original note the file covers all states and union territories of India.
dist_pop_01_11 = pd.read_csv(
    filepath_or_buffer='data_files/raw_data/district wise population for year 2001 and 2011.csv',
)

In [88]:
# Shorten the two population columns to their census year, then lower-case
# every column label.
# Built as one chained expression and reassigned instead of mutating with
# inplace=True / assigning to `.columns`, so the cell produces the same frame
# however many times it is run.
dist_pop_01_11 = (
    dist_pop_01_11
    .rename(columns={
        'Population in 2001': '2001',
        'Population in 2011': '2011',
    })
    # CSV headers are strings, so str.lower applies to every label.
    .rename(columns=str.lower)
)

## Some conclusions:
- The `District` column has 5 duplicated values because some district names occur in more than one state. **This is not a problem**, since each district is still uniquely identified together with its state.


## Additional Features:
- Calculate the percentage growth in population over the 10 years as `((2011 - 2001) / 2001) * 100`.
- Calculate the absolute growth in population over the 10 years as `2011 - 2001`.
- Per-year population growth as `(2011 - 2001) / 10`.

In [6]:
# Absolute population change over the decade: 2011 count minus 2001 count.
dist_pop_01_11['pop_growth'] = dist_pop_01_11['2011'] - dist_pop_01_11['2001']

In [4]:
# Population growth as a percentage of the 2001 count, rounded to a whole
# number and stored as an integer.
# NOTE(review): astype('int') raises on NaN/inf, so this presumably relies on
# every district having a non-zero, non-missing 2001 value - confirm.
pop_2001 = dist_pop_01_11['2001']
pop_2011 = dist_pop_01_11['2011']
relative_change = (pop_2011 - pop_2001).div(pop_2001)
dist_pop_01_11['pct_growth'] = relative_change.mul(100).round().astype('int')

In [79]:
# Preview the first rows of the frame (head() shows five by default).
dist_pop_01_11.head()

Unnamed: 0,State,District,Population in 2001,Population in 2011
0,Andaman & Nicobar Islands,Nicobar,42068,36842
1,Andaman & Nicobar Islands,North & Middle Andaman,105613,105597
2,Andaman & Nicobar Islands,South Andaman,208471,238142
3,Andhra Pradesh,Anantapur,3640478,4081148
4,Andhra Pradesh,Chittoor,3745875,4174064


### Export the data to CSV format.

In [9]:
# dist_pop_01_11.to_csv('data_files/District_pop_with_latlong_01_11.csv', index=False)

# --- --- --- --- --- --- ---

# District dataset merged with additional features

## Important columns:

After analysing the dataset, I found that the most important columns for this project are the following:

```python
use_cols = [
    'State name', 'District name', 'Population', 'Male', 'Female', 'Literate',
    # Male & Female
    'Male_Literate', 'Female_Literate', 
    # SC & ST Caste
    'SC', 'ST',
    # Religion
    'Hindus', 'Muslims', 'Christians', 'Sikhs', 'Buddhists', 'Jains', 'Others_Religions', 'Religion_Not_Stated', 
    # Rural and Urban household no.
    'Rural_Households', 'Urban_Households',
    # Education
    'Primary_Education', 'Middle_Education', 'Secondary_Education', 'Higher_Education', 'Graduate_Education',
    # Age groups
    'Age_Group_0_29', 'Age_Group_30_49', 'Age_Group_50', 'Age not stated',
    # Households
    'Household_size_1_person_Households', 'Household_size_2_persons_Households', 
    'Household_size_1_to_2_persons', 'Household_size_3_persons_Households', 
    'Household_size_3_to_5_persons_Households', 'Household_size_4_persons_Households',
]
```


In [5]:
# Columns to keep from the 2011 district census file, organised by theme.
# The flattened `use_cols` list preserves the order in which the groups and
# their members are written here.
column_groups = {
    'identity and totals': [
        'State name', 'District name', 'Population', 'Male', 'Female', 'Literate',
    ],
    'literacy by sex': [
        'Male_Literate', 'Female_Literate',
    ],
    'SC and ST caste': [
        'SC', 'ST',
    ],
    'religion': [
        'Hindus', 'Muslims', 'Christians', 'Sikhs', 'Buddhists', 'Jains',
        'Others_Religions', 'Religion_Not_Stated',
    ],
    'rural and urban households': [
        'Rural_Households', 'Urban_Households',
    ],
    'education': [
        'Primary_Education', 'Middle_Education', 'Secondary_Education',
        'Higher_Education', 'Graduate_Education',
    ],
    'age groups': [
        'Age_Group_0_29', 'Age_Group_30_49', 'Age_Group_50', 'Age not stated',
    ],
    'household size': [
        'Household_size_1_person_Households',
        'Household_size_2_persons_Households',
        'Household_size_1_to_2_persons',
        'Household_size_3_persons_Households',
        'Household_size_3_to_5_persons_Households',
        'Household_size_4_persons_Households',
    ],
}

use_cols = [column for group in column_groups.values() for column in group]


In [None]:
# Load the 2011 district census, reading only the columns named in `use_cols`.
ind_dist_11 = pd.read_csv(
    'data_files/raw_data/india-districts-census-2011.csv',
    usecols=use_cols,
)
# Show (rows, columns) as a quick check on the load.
ind_dist_11.shape

In [None]:
# Load the district centroid file (joined to the census frame in a later cell).
dist_latlong = pd.read_csv(
    'data_files/raw_data/district wise centroids.csv',
)
# Show (rows, columns) as a quick check on the load.
dist_latlong.shape

(594, 4)

### Some important steps to avoid losing data.

In [None]:
# Normalise the district names in both frames so they can serve as join keys:
# remove everything from the first " (" onwards, then lower-case the rest.
# The original notes record "+4 cols" for the suffix removal and "+2 cols" for
# the lower-casing - presumably the extra matches each step gained; TODO confirm.
paren_suffix = r' \(.*'

dist_latlong['District'] = (
    dist_latlong['District']
    .str.replace(paren_suffix, '', regex=True)
    .str.lower()
)
ind_dist_11['District name'] = (
    ind_dist_11['District name']
    .str.replace(paren_suffix, '', regex=True)
    .str.lower()
)

In [None]:
# Inner-join the centroid frame with the census frame on the normalised
# district name; districts missing from either side are dropped.
# NOTE(review): the join key is the district name alone. If the same district
# name exists in more than one state, every cross-state pair is emitted, which
# would attach the wrong coordinates/state to some census rows - confirm
# whether a state key should be part of the join.
df = pd.merge(
    dist_latlong,
    ind_dist_11,
    how='inner',
    left_on='District',
    right_on='District name',
)

In [None]:
# Show (rows, columns) of the merged frame.
df.shape

(522, 37)

In [None]:
# Remove the census-side key columns, which per the original note hold the
# same data as columns already in the frame, and title-case `District` again
# after it was lower-cased for the join.
# The result is reassigned rather than mutated in place, and errors='ignore'
# is passed so re-running this cell does not raise KeyError once the columns
# are gone.
# NOTE(review): str.title() capitalises each word; it will not necessarily
# reproduce the original spelling of every district name.
df = (
    df
    .drop(columns=['District name', 'State name'], errors='ignore')
    .assign(District=lambda frame: frame['District'].str.title())
)

### Export the final `df`.

In [None]:
# df.to_csv('data_files/India_census_2011.csv', index=False)

# Build a state-centric dataset for analysis

In [10]:
# Build a state-level table: total every numeric district column per state,
# then attach each state's centroid coordinates.
dist_df = pd.read_csv('../data/District_census_2011.csv')
states_centroids = pd.read_csv('../data/raw/state wise centroids_2011.csv')

# Aggregate once, then join once.
# The district-level Latitude/Longitude are dropped before summing because a
# sum of coordinates is meaningless; the state centroids supply those columns.
# as_index=False keeps `State` as an ordinary column, so the merge below joins
# on a real column instead of relying on an index level named 'State'.
state_totals = (
    dist_df
    .drop(columns=['Latitude', 'Longitude'])
    .groupby('State', as_index=False)
    .sum(numeric_only=True)
)

# Inner join: states missing from the centroid file are left out. The earlier
# version also merged before aggregating, which summed the centroid
# coordinates only to discard them and would have inflated the totals if the
# centroid file listed a state more than once.
state_df = state_totals.merge(states_centroids, how='inner', on='State')

# Export the dataset.
# NOTE(review): state_df has a default integer index here, so index=True
# writes it as an extra unnamed first column. Kept as in the original so the
# exported file layout does not change; switch to index=False if no reader
# depends on that column.
state_df.to_csv('../data/State_census_2011.csv', index=True)