## 3) Rural and urban population speaking only one language, exactly two languages, and three or more languages

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

In [2]:
# Widen pandas' display limits so wide/long frames are printed without truncation.
for option_name, option_value in (
    ('display.max_columns', 30),
    ('display.max_rows', 10000),
    ('display.width', 400),
):
    pd.set_option(option_name, option_value)

# Labels applied to the columns of the loaded sheet.
# 'Persons.2' / 'Persons.3' are used further down as the counts of people
# speaking at least two / at least three languages.
column_names = [
    'State', 'District', 'Name', 'TRU', 'Age-group',
    'Persons.2', 'Males.2', 'Females.2',
    'Persons.3', 'Males.3', 'Females.3',
]

# Load the language workbook, skipping the first five rows of the sheet
# (presumably title/header rows -- TODO confirm against the file).
df = pd.read_excel('DDW-C18-0000.xlsx', names=column_names, skiprows=5)

In [3]:
# Display the column labels to confirm the custom names were applied.
df.columns

Index(['State', 'District', 'Name', 'TRU', 'Age-group', 'Persons.2', 'Males.2', 'Females.2', 'Persons.3', 'Males.3', 'Females.3'], dtype='object')

In [4]:
# Keep only the columns the analysis uses.
required_columns = ['State','Name','TRU','Age-group','Persons.2','Persons.3']
df = df[required_columns]

# Keep the Rural/Urban rows of the all-ages ('Total') age group: drop the
# combined 'Total' TRU rows and every row belonging to a specific age group.
is_combined_tru = df['TRU'] == 'Total'
is_specific_age_group = df['Age-group'] != 'Total'
idx = df.index[(is_combined_tru | is_specific_age_group).values]
df = df.drop(index=idx)

In [5]:
# Renumber the remaining rows 0..n-1 (the old index is discarded) so the frame
# can later be combined row-by-row with the census frame.
df = df.reset_index(drop=True)

In [6]:
# Preview the filtered language table: one Rural and one Urban row per State.
df.head()

Unnamed: 0,State,Name,TRU,Age-group,Persons.2,Persons.3
0,0,INDIA,Rural,Total,162641485,35383989
1,0,INDIA,Urban,Total,152347285,50625591
2,1,JAMMU & KASHMIR,Rural,Total,4167238,1258724
3,1,JAMMU & KASHMIR,Urban,Total,2015952,837496
4,2,HIMACHAL PRADESH,Rural,Total,981518,280817


PREPROCESSING CENSUS POPULATION DATA

In [7]:
# Columns to read from the census population workbook; 'TOT_P' is used below
# as the population total of each row.
censuscols = ['State', 'Level', 'Name', 'TRU', 'TOT_P']
census_data = pd.read_excel(
    r'DDW_PCA0000_2011_Indiastatedist.xlsx',
    usecols=censuscols,
)

In [8]:
# Keep only the Rural/Urban rows above district level: drop the combined
# 'Total' TRU rows and every DISTRICT-level row.
is_combined_tru = census_data['TRU'] == 'Total'
is_district_row = census_data['Level'] == 'DISTRICT'
a = census_data.index[(is_combined_tru | is_district_row).values]
census_data = census_data.drop(index=a)

In [9]:
# Renumber the remaining rows 0..n-1 (the old index is discarded) so the frame
# can later be combined row-by-row with the language frame.
census_data = census_data.reset_index(drop=True)

In [10]:
# Preview the filtered census table: one Rural and one Urban row per State.
census_data.head()

Unnamed: 0,State,Level,Name,TRU,TOT_P
0,0,India,India,Rural,833748852
1,0,India,India,Urban,377106125
2,1,STATE,JAMMU & KASHMIR,Rural,9108060
3,1,STATE,JAMMU & KASHMIR,Urban,3433242
4,2,STATE,HIMACHAL PRADESH,Rural,6176050


In [11]:
# Sanity check: urban-to-rural population ratio for India. Row 1 is the India
# Urban row and row 0 the India Rural row (see the preview above); the last
# column is TOT_P.
census_data.iloc[1,-1]/census_data.iloc[0,-1]

0.4523018221799003

### CALCULATING PERCENTAGES OF LANGUAGE SPEAKERS

In [12]:
# Work on a copy so the derived columns added below do not modify `df`.
newdf = df.copy()

In [13]:
#NUMBER OF SPEAKERS OF EACH CATEGORY
# The subtraction below pairs rows of `newdf` and `census_data` purely by index
# label. Both frames were filtered independently, so verify that they share the
# same index and list the same (State, TRU) rows in the same order; otherwise
# the result would be silently wrong or NaN.
if not (
    newdf.index.equals(census_data.index)
    and np.array_equal(newdf['State'].values, census_data['State'].values)
    and np.array_equal(newdf['TRU'].values, census_data['TRU'].values)
):
    raise ValueError('language and census tables are not aligned row-for-row on State/TRU')

# Persons.2 counts people speaking at least two languages, so the remainder of
# the population speaks exactly one.
newdf['only1lang'] = census_data['TOT_P'] - newdf['Persons.2']   #EXACTLY ONE LANG
# At-least-two minus at-least-three leaves exactly two.
newdf['only2lang'] = newdf['Persons.2'] - newdf['Persons.3']    #EXACTLY TWO LANGUAGES

In [14]:
# Preview the derived exactly-one / exactly-two language speaker counts.
newdf.head()

Unnamed: 0,State,Name,TRU,Age-group,Persons.2,Persons.3,only1lang,only2lang
0,0,INDIA,Rural,Total,162641485,35383989,671107367,127257496
1,0,INDIA,Urban,Total,152347285,50625591,224758840,101721694
2,1,JAMMU & KASHMIR,Rural,Total,4167238,1258724,4940822,2908514
3,1,JAMMU & KASHMIR,Urban,Total,2015952,837496,1417290,1178456
4,2,HIMACHAL PRADESH,Rural,Total,981518,280817,5194532,700701


In [15]:
# List the columns now available on the working frame.
newdf.columns

Index(['State', 'Name', 'TRU', 'Age-group', 'Persons.2', 'Persons.3', 'only1lang', 'only2lang'], dtype='object')

In [16]:
# Percentage of each row's census population (TOT_P) in each speaker category.
# Every row is a single State x Rural/Urban combination, so these are the
# rural and urban percentages, computed row-wise.
# NOTE(review): the division pairs rows of `newdf` and `census_data` by index
# label only -- it assumes both frames list the same rows in the same order.
row_population = census_data['TOT_P']
for pct_column, count_column in (
    ('rp1', 'only1lang'),    # exactly one language
    ('rp2', 'only2lang'),    # exactly two languages
    ('rp3', 'Persons.3'),    # three languages or more
):
    newdf[pct_column] = (newdf[count_column] / row_population)*100

In [17]:
#CONVERTING rural and urban as columns(to ease calculation) and calculating URBAN TO RURAL RATIOS
def f1(df1,lang=''):
    """Pivot alternating Rural/Urban rows into one row per state.

    ``df1`` must hold two consecutive rows per state -- the first (even
    position) is treated as the rural row and the second (odd position) as
    the urban row -- and must contain ``State`` and ``Name`` columns. The
    second-to-last column is copied into the ``*-percentage`` columns and the
    last column into the ``rural`` / ``urban`` columns.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with alternating rural/urban rows as described above.
    lang : str, optional
        Suffix appended to ``'ru'`` to name the urban-to-rural ratio column.

    Returns
    -------
    pandas.DataFrame
        Columns ``State``, ``Name``, ``rural-percentage``,
        ``urban-percentage``, ``rural``, ``urban`` and ``'ru' + lang``
        (``urban / rural``), one row per pair of input rows.

    Raises
    ------
    ValueError
        If the row count is odd, or if the two rows of a pair do not carry
        the same ``State`` value (the rows are not strict per-state pairs).
    """
    if len(df1) % 2 != 0:
        raise ValueError('expected an even number of rows (one rural and one urban row per state)')

    rural_rows = df1.iloc[::2]     # even positions: rural row of each pair
    urban_rows = df1.iloc[1::2]    # odd positions: urban row of each pair

    # Guard against silently pairing rows that belong to different states.
    if not (rural_rows['State'].values == urban_rows['State'].values).all():
        raise ValueError('rows are not ordered as consecutive rural/urban pairs per State')

    # Identifiers are taken from the rural row of each pair rather than from
    # unique(), so repeated State/Name values cannot change the row count.
    fd = pd.DataFrame({
        'State': rural_rows['State'].values,
        'Name': rural_rows['Name'].values,
        'rural-percentage': rural_rows.iloc[:, -2].values,   # second-to-last column
        'urban-percentage': urban_rows.iloc[:, -2].values,
        'rural': rural_rows.iloc[:, -1].values,              # last column
        'urban': urban_rows.iloc[:, -1].values,
    })

    fd['ru'+lang] = fd['urban']/fd['rural']         # urban-to-rural ratio of the last column

    return fd

In [18]:
# One frame per speaker category. The column order matters: f1 reads the
# second-to-last column as the percentage and the last column as the count.
id_columns = ['State', 'Name']
one_lang_df = newdf[id_columns + ['rp1', 'only1lang']]
two_lang_df = newdf[id_columns + ['rp2', 'only2lang']]
three_lang_df = newdf[id_columns + ['rp3', 'Persons.3']]

In [19]:
# Pivot each category frame to one row per state; the suffix names the
# urban-to-rural ratio column ('ru1', 'ru2', 'ru3').
# NOTE: every input name is overwritten by its output, so this cell must not be
# re-run without first re-running the cells that build its inputs.
one_lang_df, two_lang_df, three_lang_df = (
    f1(frame, suffix)
    for frame, suffix in (
        (one_lang_df, '1'),
        (two_lang_df, '2'),
        (three_lang_df, '3'),
    )
)

# For the census frame the last column is TOT_P, so 'ru' is the urban-to-rural
# population ratio. Its second-to-last column is TRU, so the '*-percentage'
# columns of this result hold the Rural/Urban labels, not percentages.
census_data = f1(census_data)

### P-val calculation

In [20]:
# Urban-to-rural ratios as NumPy arrays, one value per state:
# speaker ratios for each language category ...
ru1 = one_lang_df['ru1'].to_numpy()
ru2 = two_lang_df['ru2'].to_numpy()
ru3 = three_lang_df['ru3'].to_numpy()
# ... and the population ratio taken from the census frame.
ru = census_data['ru'].to_numpy()

In [21]:
#CALCULATING P-VALUE
# One-sample t-test per state: the sample for each state is its three
# urban-to-rural speaker ratios (one per language category, stacked along
# axis 0); the hypothesised mean is that state's urban-to-rural population ratio.
# SciPy documents that an array-like `popmean` must have length 1 along `axis`,
# so it is passed as a (1, n_states) row rather than a bare 1-D array, which
# some SciPy releases reject. np.ravel keeps the p-values 1-D on releases that
# would otherwise return them with shape (1, n_states).
# NOTE(review): each test uses only three observations -- confirm this design is intended.
ratio_samples = np.vstack([ru1, ru2, ru3])           # shape (3, n_states)
expected_ratio = np.reshape(ru, (1, -1))             # shape (1, n_states)
ls_p = np.ravel(stats.ttest_1samp(ratio_samples, popmean=expected_ratio, axis=0)[1])

In [22]:
# Attach the per-state p-values to every category frame (the same values are
# used for all three, since one test is run per state).
for category_frame in (one_lang_df, two_lang_df, three_lang_df):
    category_frame['p-value'] = ls_p

In [23]:
# Reduce every frame to the columns, and column order, written to the CSV files.
output_columns = ['State', 'urban-percentage', 'rural-percentage', 'p-value']
one_lang_df = one_lang_df[output_columns]
two_lang_df = two_lang_df[output_columns]
three_lang_df = three_lang_df[output_columns]

In [24]:
# Write one CSV per speaker category (a: one language, b: two, c: three or
# more), without the index column.
for category_frame, csv_path in (
    (one_lang_df, 'geography-india-a.csv'),
    (two_lang_df, 'geography-india-b.csv'),
    (three_lang_df, 'geography-india-c.csv'),
):
    category_frame.to_csv(csv_path, index=False)

### TEST ANALYSIS

NULL HYPOTHESIS: FOR EACH STATE (AND FOR INDIA AS A WHOLE), THE MEAN URBAN/RURAL SPEAKER RATIO ACROSS THE THREE LANGUAGE CATEGORIES IS THE SAME AS THE URBAN/RURAL POPULATION RATIO OF THAT SAME STATE
ALTERNATE HYPOTHESIS: FOR THAT STATE, THE MEAN URBAN/RURAL SPEAKER RATIO IS "NOT" THE SAME AS ITS URBAN/RURAL POPULATION RATIO

Conditions of analysis (alpha = level of significance):
(i) if p <= alpha, then we REJECT the NULL HYPOTHESIS at level of significance alpha
(ii) if p > alpha, then we FAIL TO REJECT the NULL HYPOTHESIS at level of significance alpha

In [25]:
# State ids with their p-values. All three category frames carry the same
# p-value column, so the one-language frame is used as the source.
data = one_lang_df.loc[:, ['State', 'p-value']]

In [28]:
# Significance levels to report on. For each level, list the state ids whose
# p-value is at or below it (null hypothesis rejected), or say that none is.
alpha = [0.05,0.2,0.3,0.4]
for i in alpha:
    rejected_states = data.loc[data['p-value'] <= i, 'State'].values
    if len(rejected_states) > 0:
        for z in rejected_states:
            print('{} STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: {}'.format(z,i))
    else:
        print('ALL STATES-- HAVE FAIL TO REJECT NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: {}'.format(i))
    # Separator between significance levels.
    print('*****************')
    print()
        

ALL STATES-- HAVE FAIL TO REJECT NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.05
*****************

ALL STATES-- HAVE FAIL TO REJECT NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.2
*****************

2 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
5 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
6 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
8 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
9 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
23 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
*****************

0 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.4
2 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.4
5 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.4
6 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.4
7 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGIN

In [27]:
#FINISHED