In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Column labels applied to the sheet; the ".2" columns hold the counts for
# "two or more languages" and the ".3" columns for "three or more languages".
column_names = [
    'State', 'District', 'Name', 'TRU', 'Age-group',
    'Persons.2', 'Males.2', 'Females.2',
    'Persons.3', 'Males.3', 'Females.3',
]
# Load the workbook, skipping its first 5 rows and replacing the header
# row with the labels above.
df = pd.read_excel(
    'DDW-C18-0000.xlsx',
    skiprows=5,
    names=column_names,
)

In [3]:
# Keep the identifying columns plus the person counts for
# "two or more languages" (Persons.2) and "three or more languages" (Persons.3).
df = df.loc[:, ['State', 'Name', 'TRU', 'Age-group', 'Persons.2', 'Persons.3']]

In [4]:
# Keep only the rows where both 'TRU' and 'Age-group' are 'Total'; every other
# row (urban/rural splits, individual age groups) is dropped.
is_overall_total = df['TRU'].eq('Total') & df['Age-group'].eq('Total')
idx = df[~is_overall_total].index
df = df.drop(index=idx)

In [5]:
# Renumber the remaining rows from 0 and discard the old index labels.
df = df.reset_index(drop=True)

In [6]:
# Preview the first five filtered rows.
df.head(n=5)

Unnamed: 0,State,Name,TRU,Age-group,Persons.2,Persons.3
0,0,INDIA,Total,Total,314988770,86009580
1,1,JAMMU & KASHMIR,Total,Total,6183190,2096220
2,2,HIMACHAL PRADESH,Total,Total,1242705,347286
3,3,PUNJAB,Total,Total,13035224,7829913
4,4,CHANDIGARH,Total,Total,579920,321979


In [7]:
# People speaking exactly two languages:
# (two or more languages) minus (three or more languages).
df['only2lang'] = df['Persons.2'].sub(df['Persons.3'])

In [8]:
# Keep an independent copy of the prepared frame for the ratio calculations below.
org_df = df.copy(deep=True)

PREPROCESSING CENSUS DATA

In [9]:
# Read only the columns needed from the census workbook.
censuscols = ['State', 'Level', 'Name', 'TRU', 'TOT_P']
census_data = pd.read_excel(
    'DDW_PCA0000_2011_Indiastatedist.xlsx',
    usecols=censuscols,
)

In [10]:
# Drop the Rural/Urban breakdown rows and every district-level row; neither is
# needed for the calculations that follow.
is_rural_or_urban = census_data['TRU'].isin(['Rural', 'Urban'])
is_district = census_data['Level'].eq('DISTRICT')
a = census_data[is_rural_or_urban | is_district].index
census_data = census_data.drop(index=a)

In [11]:
# Renumber the remaining census rows from 0 and discard the old index labels.
census_data = census_data.reset_index(drop=True)

### RATIO OF 2-LANGUAGE TO 1-LANGUAGE SPEAKERS

In [12]:
# People speaking exactly one language:
#   total population (census 'TOT_P') minus people speaking two or more
#   languages ('Persons.2'), using the copy of the language frame saved earlier.
#
# The two frames come from different workbooks and were filtered independently,
# so the population is looked up by state code instead of relying on both
# frames having the same row order and length after reset_index. State codes
# are coerced to numbers on both sides so that e.g. '01' and 1 still match.
# A state code that is absent from the census data yields NaN rather than a
# silently misaligned value.
population_by_state = pd.Series(
    census_data['TOT_P'].to_numpy(),
    index=pd.to_numeric(census_data['State']),
)
org_df['only1lang'] = (
    pd.to_numeric(org_df['State']).map(population_by_state) - org_df['Persons.2']
)

In [13]:
# Preview the first five rows with the new single-language column.
org_df.head(n=5)

Unnamed: 0,State,Name,TRU,Age-group,Persons.2,Persons.3,only2lang,only1lang
0,0,INDIA,Total,Total,314988770,86009580,228979190,895866207
1,1,JAMMU & KASHMIR,Total,Total,6183190,2096220,4086970,6358112
2,2,HIMACHAL PRADESH,Total,Total,1242705,347286,895419,5621897
3,3,PUNJAB,Total,Total,13035224,7829913,5205311,14708114
4,4,CHANDIGARH,Total,Total,579920,321979,257941,475530


In [14]:
# Ratio of exactly-two-language speakers to exactly-one-language speakers.
org_df['r2to1'] = org_df['only2lang'].div(org_df['only1lang'])

In [15]:
# Order the rows by ratio, smallest first.
org_df = org_df.sort_values(by='r2to1', ascending=True)

In [16]:
# Start with an empty frame (no rows, no columns).
newdf2 = pd.DataFrame(data=None)

In [17]:
# org_df is sorted by 'r2to1' in ascending order at this point, so the lowest
# ratios are at the top and the highest at the bottom.
# Columns are selected by label (not by position) so the result does not depend
# on where 'State' and 'r2to1' sit in the frame. Rows with a missing ratio
# are excluded first, because NaN sorts to the end and would otherwise be
# reported as the "highest" ratios.
ranked = org_df.dropna(subset=['r2to1'])[['State', 'r2to1']]
high2 = ranked.tail(3).iloc[::-1]    # top 3 ratios, highest first
low2 = ranked.head(3)                # bottom 3 ratios, lowest first

In [18]:
#high2
#low2

In [19]:
# Stack the three highest and the three lowest ratios into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# pd.concat is used instead; ignore_index=True renumbers the rows 0..n-1, which
# replaces the separate reset_index(drop=True) call.
newdf2 = pd.concat([high2, low2], ignore_index=True)

In [20]:
# Display the combined table: the three highest ratios followed by the three lowest.
newdf2

Unnamed: 0,State,r2to1
0,35,1.499862
1,30,1.157976
2,12,0.938953
3,8,0.105877
4,9,0.114661
5,22,0.133643


In [21]:
# Save the combined table to disk without the row index.
newdf2.to_csv(path_or_buf='2-to-1-ratio.csv', index=False)