## 7) The top three most spoken languages in each region

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# File codes of the state/UT sheets, grouped by the region each one is assigned to.
north_c = [
    '0100', '0200', '0300', '0400', '0500', '0600', '0700',
]
west_c = [
    '0800', '2400', '2500', '2600', '2700', '3000',
]
central_c = [
    '0900', '2200', '2300',
]
east_c = [
    '1000', '1900', '2000', '2100',
]
south_c = [
    '2800', '2900', '3100', '3200', '3300', '3400',
]
north_east_c = [
    '1100', '1200', '1300', '1400', '1500', '1600', '1700', '1800', '3500',
]

In [3]:
# Sanity check: total number of file codes across the six region lists.
state_count = sum(
    len(codes)
    for codes in (north_c, west_c, central_c, east_c, south_c, north_east_c)
)
print('TOTAL NO.OF STATES: ', state_count)

TOTAL NO.OF STATES:  35


In [4]:
# All file codes as one nested list: one inner list per region, in the order
# north, west, central, east, south, north-east.
region_codes = [
    north_c,
    west_c,
    central_c,
    east_c,
    south_c,
    north_east_c,
]

In [5]:
# Display the nested list (one inner list of file codes per region) to verify its contents.
region_codes

[['0100', '0200', '0300', '0400', '0500', '0600', '0700'],
 ['0800', '2400', '2500', '2600', '2700', '3000'],
 ['0900', '2200', '2300'],
 ['1000', '1900', '2000', '2100'],
 ['2800', '2900', '3100', '3200', '3300', '3400'],
 ['1100', '1200', '1300', '1400', '1500', '1600', '1700', '1800', '3500']]

## READING ALL STATE/UT FILES 

In [6]:
# Column labels applied to every sheet when it is read (17 columns in three groups).
col_names = [
    # the language itself
    'Code', 'region', 'Code.1', 'Name', 'Persons', 'Males', 'Females',
    # first subsidiary language
    'Code.2', '1 st subsidiary languages', 'Persons.2', 'Males.2', 'Females.2',
    # second subsidiary language
    'Code.3', '2nd subsidiary languages', 'Persons.3', 'Males.3', 'Females.3',
]

In [7]:
# Accumulators for the per-state/UT DataFrames: one list for each part of the question.
mother_tongue_df, all_df = [], []   # part-a frames, part-b frames

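READING EACH STATE/UT FILE AND SUMMING SPEAKERS PER LANGUAGE (PART A: MOTHER-TONGUE SPEAKERS ONLY; PART B: MOTHER-TONGUE + SECOND + THIRD LANGUAGE SPEAKERS)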

In [8]:
# Build one DataFrame per state/UT file, in region_codes order:
#   mother_tongue_df -> speakers by mother tongue only            (part A)
#   all_df           -> mother tongue + 2nd + 3rd language totals (part B)
# Start from empty lists so re-running this cell does not append every state twice.
all_df.clear()
mother_tongue_df.clear()

for region in region_codes:
    for code in region:
        # Forward slash on purpose: the previous backslash literal contained '\D', an
        # invalid escape sequence, and a backslash separator only resolves on Windows.
        f_name = 'c17/DDW-C17-' + code + '.XLSX'                       # NAME OF THE FILE
        raw = pd.read_excel(f_name, skiprows=5, names=col_names)       # READING FILE ONE BY ONE

        # MOTHER-TONGUE ROWS: keep only rows where all five columns are filled in.
        # dropna() returns a new frame, so nothing is modified through a column slice.
        only_mt = raw[['Code', 'region', 'Code.1', 'Name', 'Persons']].dropna()

        # TOTAL SECOND / THIRD LANGUAGE SPEAKERS, summed per language code
        l2 = (
            raw[['Code.2', 'Persons.2']]
            .dropna()
            .groupby('Code.2')
            .agg({'Persons.2': 'sum'})
            .reset_index()
        )
        l3 = (
            raw[['Code.3', 'Persons.3']]
            .dropna()
            .groupby('Code.3')
            .agg({'Persons.3': 'sum'})
            .reset_index()
        )

        # Attach the 2nd / 3rd language totals to each mother-tongue row by language code
        combined = only_mt.merge(l2, how='left', left_on='Code.1', right_on='Code.2')
        combined = combined.merge(l3, how='left', left_on='Code.1', right_on='Code.3')

        # ADDING SPEAKERS OF RESPECTIVE LANG.
        # The left merges leave NaN where a language has no 2nd/3rd language entry;
        # treat that as 0 so the mother-tongue count is not turned into NaN and lost.
        combined['Persons'] = (
            combined['Persons']
            + combined['Persons.2'].fillna(0)
            + combined['Persons.3'].fillna(0)
        )

        # SORTING BASED ON NUMBER OF SPEAKERS, then storing for part B / part A
        all_df.append(combined.sort_values('Persons', ascending=False))
        mother_tongue_df.append(only_mt.sort_values('Persons', ascending=False))

### CLASSIFYING STATES/UT DATAFRAMES INTO RESPECTIVE REGIONS

In [9]:
# REGION SLICING
# One slice per region, selecting that region's consecutive entries in a list that
# holds one item per file code in region_codes order. The boundaries are derived
# from the lengths of the code lists instead of being hard-coded, so they stay
# correct if a code is ever added to or moved between regions.
regions = []
region_start = 0
for codes in region_codes:
    regions.append(slice(region_start, region_start + len(codes)))
    region_start += len(codes)

In [10]:
# Containers for the per-region top-three-language tables.
newdf, newdf_mt = [], []   # all speakers, mother tongue only

In [11]:
# Region labels, in the same order as the entries of `regions`.
names = ['North', 'West', 'Central', 'East', 'South', 'North-East']


def _top_three(state_frames, region_name):
    """Return the three most spoken languages across the given state/UT frames.

    state_frames: list of DataFrames that each have 'Name' and 'Persons' columns.
    region_name:  label stored in the 'region' column of the result.

    Returns a new DataFrame with columns Name, Persons, region, holding at most
    three rows ordered by descending speaker count.
    """
    totals = (
        pd.concat(state_frames)                    # CONCATENATING RESPECTIVE REGION STATES
        .groupby('Name')
        .agg({'Persons': 'sum'})                   # SUMMING SPEAKERS per language
        .sort_values('Persons', ascending=False)
        .reset_index()
    )
    # head(3).assign(...) creates a fresh frame rather than writing a column into a slice.
    return totals.head(3).assign(region=region_name)


# Start empty so re-running this cell does not append every region a second time.
newdf.clear()
newdf_mt.clear()

for i, region_slice in enumerate(regions):
    newdf.append(_top_three(all_df[region_slice], names[i]))                # PARTB
    newdf_mt.append(_top_three(mother_tongue_df[region_slice], names[i]))   # PARTA

In [12]:
# Stack the per-region tables row-wise into one DataFrame for each list.
finaldf_mt = pd.concat(newdf_mt, axis=0)
finaldf = pd.concat(newdf, axis=0)

### CREATING THE RESULTANT DATAFRAME

In [13]:
# PART A: distinct region labels (in order of first appearance) and the flat
# array of language names taken from finaldf_mt.
reg_mt = finaldf_mt['region'].unique()
t_mt = finaldf_mt['Name'].values
# PART B: the same two arrays taken from finaldf.
reg = finaldf['region'].unique()
t = finaldf['Name'].values

In [14]:
# Empty result tables sharing one column layout: a region plus three language columns.
result_columns = ['region', 'language-1', 'language-2', 'language-3']
resultdf_mt = pd.DataFrame(columns=result_columns)
resultdf = pd.DataFrame(columns=result_columns)

In [15]:
# Fill one row per region: the region label followed by three language names.
# t / t_mt are flat arrays, so the languages for row i start at offset 3*i.
for i in range(6):
    first = i * 3
    resultdf_mt.loc[i] = [reg_mt[i], t_mt[first], t_mt[first + 1], t_mt[first + 2]]
    resultdf.loc[i] = [reg[i], t[first], t[first + 1], t[first + 2]]

In [16]:
# resultdf

In [17]:
# resultdf_mt

In [18]:
# Export both result tables as CSV files without the index column.
for frame, csv_name in (
    (resultdf_mt, 'region-india-a.csv'),
    (resultdf, 'region-india-b.csv'),
):
    frame.to_csv(csv_name, index=False)