## 1) Percentage of the population of India that speaks only one language, exactly two languages, and three or more languages

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so deprecation or dtype warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

In [2]:
# Column labels applied to the C-18 language sheet; the ".2" / ".3" suffixes
# distinguish the second-language and third-language count columns.
column_names = [
    'State', 'District', 'Name', 'TRU', 'Age-group',
    'Persons.2', 'Males.2', 'Females.2',
    'Persons.3', 'Males.3', 'Females.3',
]
# Load the workbook, skipping its first five rows and using the labels above.
df = pd.read_excel('DDW-C18-0000.xlsx', skiprows=5, names=column_names)

In [3]:
# Keep only the identifier columns and the two "Persons" count columns.
df = df.loc[:, ['State', 'Name', 'TRU', 'Age-group', 'Persons.2', 'Persons.3']]
# Discard the Rural/Urban breakdowns and every age band other than 'Total',
# leaving one all-ages, all-areas row per region.
is_rural_or_urban = df['TRU'].isin(['Urban', 'Rural'])
is_age_breakdown = df['Age-group'] != 'Total'
idx = df.index[is_rural_or_urban | is_age_breakdown]
df = df.drop(index=idx)

In [4]:
# Renumber the surviving rows 0..n-1, discarding the old index.
df = df.reset_index(drop=True)

In [5]:
# Preview the first five rows of the filtered language data.
df.head(n=5)

Unnamed: 0,State,Name,TRU,Age-group,Persons.2,Persons.3
0,0,INDIA,Total,Total,314988770,86009580
1,1,JAMMU & KASHMIR,Total,Total,6183190,2096220
2,2,HIMACHAL PRADESH,Total,Total,1242705,347286
3,3,PUNJAB,Total,Total,13035224,7829913
4,4,CHANDIGARH,Total,Total,579920,321979


In [6]:
# (rows, columns) of the filtered language data, as a quick size check.
df.shape

(36, 6)

PREPROCESSING CENSUS POPULATION DATA

In [7]:
# Columns to read from the census population workbook.
censuscols = ['State', 'Level', 'Name', 'TRU', 'TOT_P']
census_data = pd.read_excel(
    io=r'DDW_PCA0000_2011_Indiastatedist.xlsx',
    usecols=censuscols,
)

In [8]:
# Remove the Rural/Urban breakdown rows and all district-level rows,
# keeping only the 'Total' rows above district level.
is_area_breakdown = census_data['TRU'].isin(['Rural', 'Urban'])
is_district = census_data['Level'] == 'DISTRICT'
a = census_data.index[is_area_breakdown | is_district]
census_data = census_data.drop(index=a)

In [9]:
# Renumber the surviving rows 0..n-1, discarding the old index.
census_data = census_data.reset_index(drop=True)

In [10]:
# Show the first row before relabelling it (the national total, labelled 'India').
print(census_data.loc[0])
# Upper-case the national row's labels so its 'Name' matches the 'INDIA' spelling
# used in the language data. The columns are addressed by label rather than by
# position so the assignment does not depend on the workbook's column order.
census_data.loc[0, ['Level', 'Name']] = ['INDIA', 'INDIA']


State             0
Level         India
Name          India
TRU           Total
TOT_P    1210854977
Name: 0, dtype: object


In [11]:
# Region names as they appear in each source.
ls1 = df['Name'].to_numpy()            # names in the C-18 language file
ls2 = census_data['Name'].to_numpy()   # names in the census population file

# Names present in the language file but absent from the census file;
# an empty list means every language-file name has a census counterpart.
[name for name in ls1 if name not in ls2]

[]

In [12]:
# Preview the first five rows of the filtered census data.
census_data.head(n=5)

Unnamed: 0,State,Level,Name,TRU,TOT_P
0,0,INDIA,INDIA,Total,1210854977
1,1,STATE,JAMMU & KASHMIR,Total,12541302
2,2,STATE,HIMACHAL PRADESH,Total,6864602
3,3,STATE,PUNJAB,Total,27743338
4,4,STATE,CHANDIGARH,Total,1055450


CALCULATING PERCENTAGES OF LANGUAGE SPEAKERS

In [13]:
# Work on an independent (deep) copy so the derived columns added below
# do not modify `df`.
newdf = df.copy(deep=True)

##### In the C18 file
The column "Number speaking second language" is the NUMBER OF PEOPLE SPEAKING TWO OR MORE LANGUAGES.

The column "Number speaking third language" is the NUMBER OF PEOPLE SPEAKING THREE OR MORE LANGUAGES.


In [14]:
#COMMENT--
# The "second language" column in C18 cannot mean "exactly two languages", because that
# reading gives some states percentages above 100; it has to mean "TWO OR MORE LANGUAGES".

In [15]:
# Number of speakers in each category.
# Look up each region's census population by its 'Name' (the key whose match
# between the two files is checked earlier in the notebook) instead of relying
# on the two frames happening to share the same row order. A region with no
# census match yields NaN rather than another region's population.
population_by_name = census_data.set_index('Name')['TOT_P']
# Exactly one language = total population minus those speaking two or more.
newdf['only1lang'] = newdf['Name'].map(population_by_name) - newdf['Persons.2']
# Exactly two languages = (two or more) minus (three or more).
newdf['only2lang'] = newdf['Persons.2'] - newdf['Persons.3']

In [16]:
# Attach each region's census population to the language-speakers frame.
# The value is looked up by 'Name' rather than copied positionally, so a
# difference in row order between the two frames cannot pair a region with
# the wrong population; an unmatched name yields NaN.
newdf['censuspopulation'] = newdf['Name'].map(
    census_data.set_index('Name')['TOT_P']
)

In [17]:
# Preview the first five rows with the derived speaker counts and population.
newdf.head(n=5)

Unnamed: 0,State,Name,TRU,Age-group,Persons.2,Persons.3,only1lang,only2lang,censuspopulation
0,0,INDIA,Total,Total,314988770,86009580,895866207,228979190,1210854977
1,1,JAMMU & KASHMIR,Total,Total,6183190,2096220,6358112,4086970,12541302
2,2,HIMACHAL PRADESH,Total,Total,1242705,347286,5621897,895419,6864602
3,3,PUNJAB,Total,Total,13035224,7829913,14708114,5205311,27743338
4,4,CHANDIGARH,Total,Total,579920,321979,475530,257941,1055450


In [18]:
# Express each speaker count as a percentage of the region's census population.
# 'percent-three' is derived from 'Persons.3', i.e. three or more languages.
population = newdf['censuspopulation']
for percent_col, count_col in (
    ('percent-one', 'only1lang'),
    ('percent-two', 'only2lang'),
    ('percent-three', 'Persons.3'),
):
    newdf[percent_col] = (newdf[count_col] / population) * 100

In [19]:
# Rename the 'State' column to the 'state-code' header used in the output file.
newdf = newdf.rename(columns={'State': 'state-code'})

In [20]:
# Keep only the state code and the three percentage columns for the output.
output_columns = ['state-code', 'percent-one', 'percent-two', 'percent-three']
newdf = newdf.loc[:, output_columns]

In [21]:
# Write the result to CSV without the DataFrame index column.
newdf.to_csv(path_or_buf='percent-india.csv', index=False)