# Using Naive Bayes to Predict What State You're From (If You Live in India)

In [41]:
import numpy as np
import pandas as pd
import requests

base_url = 'http://digital-library.census.ihsn.org/index.php/api/tables/'

I have some metadata prepared in advance in a CSV file. The India Census API uses codes for many of its fields. Labels for some of those codes are returned with each request, but others are shared across tables and have to be looked up from this metadata.

In [42]:
# Load the state / district / sub-district / town-village directory.
# The code -> name lookup series are all derived from this table.
# A code of 0 at any level means ALL.
directory_csv = 'data/PC11_TV_DIR.csv'
sdsdtv = pd.read_csv(directory_csv)

# States map: state code -> state name.
# State-level rows are the ones whose district, sub-district and
# town-village codes are all 0; their 'Town-Village Name' column is
# used as the state name.
is_state_row = (
    (sdsdtv['District Code'] == 0)
    & (sdsdtv['Sub District Code'] == 0)
    & (sdsdtv['Town-Village Code'] == 0)
)
states = (
    sdsdtv.loc[is_state_row, ['State Code', 'Town-Village Name']]
    .set_index('State Code')['Town-Village Name']
)
states.at[0] = 'ALL'  # code 0 is the whole-country aggregate
states = states.sort_index().rename('State Name')

# Districts map: district code -> district name.
# District-level rows have sub-district and town-village codes of 0.
is_district_row = (
    (sdsdtv['Sub District Code'] == 0)
    & (sdsdtv['Town-Village Code'] == 0)
)
districts = (
    sdsdtv.loc[is_district_row, ['District Code', 'Town-Village Name']]
    # keep one name per code so the lookup index is unique
    .drop_duplicates(subset=['District Code'])
    .set_index('District Code')['Town-Village Name']
)
districts.at[0] = 'All'  # code 0 is the aggregate over all districts
districts = districts.sort_index().rename('District Name')

# Sub Districts map: sub-district code -> sub-district name.
# Sub-district-level rows have a town-village code of 0.
is_sub_district_row = sdsdtv['Town-Village Code'] == 0
sub_districts = (
    sdsdtv.loc[is_sub_district_row, ['Sub District Code', 'Town-Village Name']]
    # keep one name per code so the lookup index is unique
    .drop_duplicates(subset=['Sub District Code'])
    .set_index('Sub District Code')['Town-Village Name']
)
sub_districts.at[0] = 'All'  # code 0 is the aggregate over all sub-districts
sub_districts = sub_districts.sort_index().rename('Sub District Name')

# Town/Villages map: town-village code -> town-village name.
# Every row of the directory is a candidate; the first name seen for a
# code wins so that the lookup index is unique.
town_villages = (
    sdsdtv[['Town-Village Code', 'Town-Village Name']]
    .drop_duplicates(subset=['Town-Village Code'])
    .set_index('Town-Village Code')['Town-Village Name']
)
town_villages.at[0] = 'All'  # code 0 is the aggregate over all towns/villages
# Name the series after what it holds. It was previously labelled
# 'Sub District Name', a copy-paste slip from the sub-district map.
town_villages = town_villages.sort_index().rename('Town-Village Name')

In [43]:
# Lookup series shared by every table: API column name -> (code -> label).
# These come from the local directory file rather than from the API's
# per-table feature info.
common_features = dict(
    state=states,
    district=districts,
    subdistrict=sub_districts,
    town=town_villages,
)

def get_dataset(table, **kwargs):
    """Fetch a census table and replace coded column values with labels.

    Two requests are made: one to the table's ``info`` endpoint to learn
    the code -> label lists of its features, and one to its ``data``
    endpoint to fetch the rows.

    Parameters
    ----------
    table : str
        Table identifier used in the URL path (e.g. ``'PC11_C01'``).
    **kwargs
        ``year`` (default ``'2011'``) selects the year segment of the URL
        path. Every other keyword is sent as a query-string parameter on
        the data request.

    Returns
    -------
    pandas.DataFrame
        Built from the ``data`` field of the data response. Any column
        whose name matches a known feature has its codes replaced by the
        corresponding labels.

    Raises
    ------
    requests.HTTPError
        If either request comes back with an error status.
    KeyError
        If a mapped column contains a code that has no label.
    """
    year = kwargs.pop('year', '2011')
    # base_url may already end with '/'; strip it so the path has no '//'.
    root = base_url.rstrip('/')
    timeout = 60  # seconds; avoids hanging forever on a stalled connection

    # Get info of dataset (features)
    info_response = requests.get(f'{root}/info/{year}/{table}', timeout=timeout)
    info_response.raise_for_status()
    features = info_response.json()['result']['result_']['features']
    feature_map = {
        feature['feature_name']: pd.Series(
            data=[c['label'] for c in feature['code_list']],
            index=[c['code'] for c in feature['code_list']])
        for feature in features
    }
    # Locally built lookups take precedence over the API-provided ones.
    feature_map |= common_features

    # Get data. Passing params lets requests build and URL-encode the
    # query string; an empty dict adds nothing to the URL.
    data_response = requests.get(
        f'{root}/data/{year}/{table}', params=kwargs, timeout=timeout)
    data_response.raise_for_status()
    body = data_response.json()

    # Map codes to labels for every known feature present in the dataset.
    # NOTE(review): assumes codes in the data have the same type as the
    # label index (e.g. both ints) -- confirm against the API.
    df = pd.DataFrame(body['data'])
    for feature, labels in feature_map.items():
        if feature in df:
            # Bind labels now; .at raises KeyError on an unknown code.
            df[feature] = df[feature].map(
                lambda code, labels=labels: labels.at[code])

    return df

In [44]:
# Total population (as of 2011): request only the 'value' field with
# every filter set to code 0, then take the value of the first row.
population = get_dataset(
    'PC11_C01',
    state='0',
    urbrur='0',
    geo_level='0',
    sex='0',
    religion='0',
    fields='value',
)['value'].at[0]
print('Total Population:', population)

Total Population: 1210854977


In [45]:
# Population per state (state codes 1-35), indexed by state name.
# NOTE(review): this rebinds `states`, which earlier held the state
# code -> name map. `common_features` still references that earlier
# object, but re-running the cell that builds `common_features` after
# this one would pick up the population series instead.
state_population_query = {
    'state': '1-35',
    'urbrur': '0',
    'geo_level': '1',
    'sex': '0',
    'religion': '0',
    'fields': 'state,value',
}
states = (
    get_dataset('PC11_C01', **state_population_query)
    .set_index('state')['value']
    .rename('state')
)
states

state
JAMMU & KASHMIR               12541302
HIMACHAL PRADESH               6864602
PUNJAB                        27743338
CHANDIGARH                     1055450
UTTARAKHAND                   10086292
HARYANA                       25351462
NCT OF DELHI                  16787941
RAJASTHAN                     68548437
UTTAR PRADESH                199812341
BIHAR                        104099452
SIKKIM                          610577
ARUNACHAL PRADESH              1383727
NAGALAND                       1978502
MANIPUR                        2855794
MIZORAM                        1097206
TRIPURA                        3673917
MEGHALAYA                      2966889
ASSAM                         31205576
WEST BENGAL                   91276115
JHARKHAND                     32988134
ODISHA                        41974218
CHHATTISGARH                  25545198
MADHYA PRADESH                72626809
GUJARAT                       60439692
DAMAN & DIU                     243247
DADRA & NAGAR HAVEL

In [48]:
# Share of each religion (codes 1-6) in the country as a whole:
# nationwide head-count per religion divided by the total population.
religion_query = {
    'state': '0',
    'sex': '0',
    'urbrur': '0',
    'religion': '1-6',
    'fields': 'religion,value',
}
religion = (
    get_dataset('PC11_C01', **religion_query)
    .set_index('religion')['value']
    .rename('religion')
    .div(population)
)
religion

religion
Hindu        0.797996
Muslim       0.142251
Christian    0.022975
Sikh         0.017205
Buddhist     0.006973
Jain         0.003677
Name: religion, dtype: float64

In [51]:
# Share of each religion within each state.
n_states, n_religions = 35, 6
religion_by_state_query = {
    'sex': '0',
    'urbrur': '0',
    'religion': '1-6',
    'state': '1-35',
    'district': '0',
    # one row per (state, religion) pair
    'limit': n_states * n_religions,
}
religion_given_state = (
    get_dataset('PC11_C01', **religion_by_state_query)
    .set_index(['state', 'religion'])['value']
)
# Divide each (state, religion) count by that state's population; the
# in-place form keeps the row order of the left-hand series.
religion_given_state /= states
religion_given_state

state                      religion
JAMMU & KASHMIR            Hindu       0.284394
HIMACHAL PRADESH           Hindu       0.951660
PUNJAB                     Hindu       0.384890
CHANDIGARH                 Hindu       0.807782
UTTARAKHAND                Hindu       0.829704
                                         ...   
LAKSHADWEEP                Jain        0.000171
KERALA                     Jain        0.000134
TAMIL NADU                 Jain        0.001237
PUDUCHERRY                 Jain        0.001122
ANDAMAN & NICOBAR ISLANDS  Jain        0.000081
Name: value, Length: 210, dtype: float64