# 2) Males and females who speak only one language, exactly two languages, and three or more languages

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

In [2]:
# Notebook display limits: up to 30 columns and 10000 rows, 400 characters wide
for option_name, option_value in (
    ('display.max_columns', 30),
    ('display.max_rows', 10000),
    ('display.width', 400),
):
    pd.set_option(option_name, option_value)

In [3]:
# Column labels applied to the sheet, in order.
# NOTE(review): the '.2' / '.3' suffixes presumably mean "speaks at least 2 / at
# least 3 languages" (later cells subtract them that way) — confirm against the workbook.
column_names = [
    'State', 'District', 'Name', 'TRU', 'Age-group',
    'Persons.2', 'Males.2', 'Females.2',
    'Persons.3', 'Males.3', 'Females.3',
]
# Read the workbook, skipping its first 5 rows and using the labels above
df = pd.read_excel('DDW-C18-0000.xlsx', skiprows=5, names=column_names)

In [4]:
df.columns

Index(['State', 'District', 'Name', 'TRU', 'Age-group', 'Persons.2', 'Males.2', 'Females.2', 'Persons.3', 'Males.3', 'Females.3'], dtype='object')

In [5]:
# Keep only the aggregate rows (TRU == 'Total' and Age-group == 'Total'), which
# discards the Rural/Urban rows and the per-age-group rows, and narrow the frame
# to the columns used downstream.
wanted_columns = ['State', 'Name', 'TRU', 'Age-group',
                  'Males.2', 'Females.2', 'Males.3', 'Females.3']
is_total_row = (df['TRU'] == 'Total') & (df['Age-group'] == 'Total')
df = df.loc[is_total_row, wanted_columns]

In [6]:
# Renumber the remaining rows from 0, discarding the old index labels
df = df.reset_index(drop=True)

In [7]:
# df.head()

PREPROCESSING CENSUS POPULATION DATA

In [8]:
# Only these columns are read from the census workbook
censuscols = ['State', 'Level', 'Name', 'TRU', 'TOT_P', 'TOT_M', 'TOT_F']
census_path = r'DDW_PCA0000_2011_Indiastatedist.xlsx'
census_data = pd.read_excel(census_path, usecols=censuscols)

In [9]:
# Rows with TRU in (Rural, Urban) or Level == DISTRICT are not needed: drop them
is_unwanted = census_data['TRU'].isin(['Rural', 'Urban']) | census_data['Level'].eq('DISTRICT')
census_data = census_data.drop(index=census_data.index[is_unwanted])

In [10]:
# Renumber the remaining census rows from 0, discarding the old index labels
census_data = census_data.reset_index(drop=True)

In [11]:
# census_data.head()

In [12]:
# Male/female ratio of the total population, one value per census row
census_data['mf'] = census_data['TOT_M'].div(census_data['TOT_F'])


CALCULATING PERCENTAGES OF LANGUAGE SPEAKERS

In [13]:
# Work on an independent (deep) copy so the derived columns do not touch df
newdf = df.copy(deep=True)

In [14]:
# Number of speakers in each category.
# NOTE(review): census_data and newdf are combined by row index here, so this
# assumes both frames list the same states in the same order — confirm.
newdf = newdf.assign(
    # exactly one language = total population minus the 'Males.2' / 'Females.2' counts
    only1lang_m=census_data['TOT_M'].sub(newdf['Males.2']),
    only1lang_f=census_data['TOT_F'].sub(newdf['Females.2']),
    # exactly two languages = '.2' counts minus '.3' counts
    only2lang_m=newdf['Males.2'].sub(newdf['Males.3']),
    only2lang_f=newdf['Females.2'].sub(newdf['Females.3']),
)


In [15]:
# df.shape,census_data.shape

In [16]:
# Both files list the states in the same relative order, so the two frames can be combined row by row (by index) directly.

In [17]:
# Share of the total male / female population (in percent) for each language
# category: 1 = exactly one language, 2 = exactly two, 3 = the '.3' counts.
for male_pct, female_pct, male_count, female_count in (
    ('mp-1', 'fp-1', 'only1lang_m', 'only1lang_f'),
    ('mp-2', 'fp-2', 'only2lang_m', 'only2lang_f'),
    ('mp-3', 'fp-3', 'Males.3', 'Females.3'),
):
    newdf[male_pct] = newdf[male_count].div(census_data['TOT_M']).mul(100)
    newdf[female_pct] = newdf[female_count].div(census_data['TOT_F']).mul(100)

In [18]:
newdf.columns

Index(['State', 'Name', 'TRU', 'Age-group', 'Males.2', 'Females.2', 'Males.3', 'Females.3', 'only1lang_m', 'only1lang_f', 'only2lang_m', 'only2lang_f', 'mp-1', 'fp-1', 'mp-2', 'fp-2', 'mp-3', 'fp-3'], dtype='object')

In [19]:
# Male/female speaker ratio within each language category
for ratio_col, male_col, female_col in (
    ('mf1', 'only1lang_m', 'only1lang_f'),
    ('mf2', 'only2lang_m', 'only2lang_f'),
    ('mf3', 'Males.3', 'Females.3'),
):
    newdf[ratio_col] = newdf[male_col].div(newdf[female_col])
# Male/female ratio of the whole population, taken from the census frame by row index
newdf['mf'] = census_data['mf']

In [20]:
def _ratio_pvalue(row):
    """Return the one-sample t-test p-value for a single row.

    The sample is the row's three per-category male/female ratios
    (mf1, mf2, mf3); the hypothesised mean is the row's own population
    male/female ratio (mf).
    """
    category_ratios = [row.mf1, row.mf2, row.mf3]
    return stats.ttest_1samp(category_ratios, popmean=row.mf).pvalue


newdf['p-value'] = newdf.apply(_ratio_pvalue, axis=1)

In [21]:
# newdf['p-value']

In [22]:
# newdf

In [23]:
# One frame per language category: State, male %, female % and the common p-value
one_lang_df = newdf.loc[:, ['State', 'mp-1', 'fp-1', 'p-value']]
two_lang_df = newdf.loc[:, ['State', 'mp-2', 'fp-2', 'p-value']]
three_lang_df = newdf.loc[:, ['State', 'mp-3', 'fp-3', 'p-value']]

In [24]:
# Give all three output frames the same descriptive header
for frame in (one_lang_df, two_lang_df, three_lang_df):
    frame.columns = ['State', 'male-percentage', 'female-percentage', 'p-value']

In [25]:
# three_lang_df.head()

In [26]:
# Writing to CSVs: one file per language category, without the index column
for frame, csv_name in (
    (one_lang_df, 'gender-india-a.csv'),
    (two_lang_df, 'gender-india-b.csv'),
    (three_lang_df, 'gender-india-c.csv'),
):
    frame.to_csv(csv_name, index=False)

#### TEST ANALYSIS

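NULL HYPOTHESIS: FOR EACH STATE, THE MEAN OF THE MALE/FEMALE SPEAKER RATIOS ACROSS THE THREE LANGUAGE CATEGORIES IS THE SAME AS THE MALE/FEMALE RATIO OF THAT STATE'S TOTAL POPULATION
ALTERNATE HYPOTHESIS: FOR EACH STATE, THE MEAN OF THE MALE/FEMALE SPEAKER RATIOS ACROSS THE THREE LANGUAGE CATEGORIES IS NOT THE SAME AS THE MALE/FEMALE RATIO OF THAT STATE'S TOTAL POPULATION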

Decision rule:
(i) if p <= alpha, we REJECT the NULL HYPOTHESIS at significance level alpha
(ii) if p > alpha, we FAIL TO REJECT the NULL HYPOTHESIS at significance level alpha

In [29]:
# Keep just the state ids and their p-values for the significance checks
data = one_lang_df.loc[:, ['State', 'p-value']]

In [34]:
# Significance levels to evaluate. For each one, report every state id whose
# p-value is at or below the level (null hypothesis rejected), or a single
# line saying that no state rejects it.
alpha = [0.05,0.2,0.3,0.4]
for level in alpha:
    rejected_ids = data.loc[data['p-value'] <= level, 'State'].to_numpy()
    if len(rejected_ids) > 0:
        for state_id in rejected_ids:
            print('{} STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: {}'.format(state_id,level))
    else:
        print('ALL STATES-- HAVE FAIL TO REJECT NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: {}'.format(level))
    print('*****************')
    print()
        

ALL STATES-- HAVE FAIL TO REJECT NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.05
*****************

ALL STATES-- HAVE FAIL TO REJECT NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.2
*****************

5 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
6 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
8 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
9 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
19 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
23 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.3
*****************

0 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.4
2 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.4
5 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.4
6 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGINFICANCE: 0.4
8 STATE-ID HAS REJECTED NULL HYPOTHESIS with LEVEL OF SIGI

In [28]:
#FINISHED