In [1]:
import pandas as pd
from functools import reduce
import xlsxwriter
import pprint

#### 1. Read the financial inclusion file
##### a. Navigate to the right working dir
##### b. Read all the sheets/tabs of the excel file

In [29]:
import os

# NOTE: machine-specific absolute path; point `folder` at the directory that holds the workbook.
folder = '/Users/anshulsaxena/MicroSave/cohort 3/contingency plan/Data Sources/wip'
# Later cells open `file_name` relative to the working directory, so switch to it here.
os.chdir(folder)
file_name = 'World Bank data 7 July MSC R4 countries.xlsx' # This file has been taken from the World Bank site
path = os.path.join(folder, file_name)

# sheet_name=None reads every sheet into a dict keyed by sheet name;
# index_col=0 uses the first column of each sheet as the row index.
df_read_all_sheets = pd.read_excel(path, sheet_name=None, index_col=0)

# Ask which sheet holds the indicator data; strip stray whitespace from the typed answer.
df_read_sheet = input(f"Enter the sheet name to be read from the workbook {file_name}: ").strip()
if df_read_sheet not in df_read_all_sheets:
    # Fail with the list of valid names instead of a bare KeyError on the lookup below.
    raise KeyError(
        f"Sheet {df_read_sheet!r} not found in {file_name}; "
        f"available sheets: {list(df_read_all_sheets)}"
    )

# The selected sheet is the raw (wide-format) table used by the following cells.
df_raw_fin_data = df_read_all_sheets[df_read_sheet]
df_raw_fin_data.head()

Enter the sheet name to be read from the workbook World Bank data 7 July MSC R4 countries.xlsx:  Data


Unnamed: 0,Country Name,Country Code,Series Name,Series Code,2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019]
0,Bangladesh,BGD,"Population, total",SP.POP.TOTL,156256276,157970840.0,159670593,161356039,163046161
1,Bangladesh,BGD,Population growth (annual %),SP.POP.GROW,1.11728,1.0913,1.07024,1.05004,1.042
2,Bangladesh,BGD,Income share held by lowest 20%,SI.DST.FRST.20,..,8.6,..,..,..
3,Bangladesh,BGD,GDP (current US$),NY.GDP.MKTP.CD,1.95079e+11,221415000000.0,2.49711e+11,2.74039e+11,3.02571e+11
4,Bangladesh,BGD,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,6.55264,7.1135,7.28418,7.86371,8.15268


#### 2. Tidying the data
##### The World Bank data is converted from wide to long format, which is the shape Tableau needs for visualization
##### a. Drop unnecessary columns
##### b. Compress the different 'year' columns into a single 'year' column

In [35]:
# Columns dropped from the raw table before it is reshaped.
unselected_cols = ["Country Code"]

# Identifier columns that are kept as-is (id_vars) when the year columns are melted.
selected_cols = [
    "Country Name",
    "Series Code",
    "Series Name",
]

In [38]:
# Reshape the raw table from wide (one column per year) to long
# (one row per country / series / year), then shorten the column names.
# Built as a single non-mutating chain so the cell can be re-run safely.
df_fin_data_long_format = (
    df_raw_fin_data
    .drop(columns=unselected_cols)
    .melt(id_vars=selected_cols, var_name='year', value_name='value')  # wide -> long
    .rename(columns={
        "Country Name": "country",
        "Series Name": "indicator",
        "Series Code": "code",
    })
)

df_fin_data_long_format.head(10)

Unnamed: 0,country,code,indicator,year,value
0,Bangladesh,SP.POP.TOTL,"Population, total",2015 [YR2015],156256276
1,Bangladesh,SP.POP.GROW,Population growth (annual %),2015 [YR2015],1.11728
2,Bangladesh,SI.DST.FRST.20,Income share held by lowest 20%,2015 [YR2015],..
3,Bangladesh,NY.GDP.MKTP.CD,GDP (current US$),2015 [YR2015],1.95079e+11
4,Bangladesh,NY.GDP.MKTP.KD.ZG,GDP growth (annual %),2015 [YR2015],6.55264
5,Bangladesh,NY.GDP.DEFL.KD.ZG,"Inflation, GDP deflator (annual %)",2015 [YR2015],5.87278
6,Bangladesh,NV.IND.TOTL.ZS,"Industry (including construction), value added...",2015 [YR2015],26.8314
7,Bangladesh,NE.EXP.GNFS.ZS,Exports of goods and services (% of GDP),2015 [YR2015],17.3367
8,Bangladesh,NE.IMP.GNFS.ZS,Imports of goods and services (% of GDP),2015 [YR2015],24.7493
9,Bangladesh,NE.GDI.TOTL.ZS,Gross capital formation (% of GDP),2015 [YR2015],28.8867


#### 3. Writing the long formatted data back to the file
##### We'll write the new long formatted table back to the excel file
##### Since ExcelWriter overwrites the previous version of the file, we have to re-insert the remaining sheets as well
##### This step is an intermediate step. After executing this step, our file's "long format" sheet will be ready for tableau viz

In [43]:
# Re-create the workbook (path is relative to the current working directory) using
# XlsxWriter as the engine. ExcelWriter replaces the existing file, so the sheets that
# were read earlier are written back alongside the new 'long format' sheet.
# The context manager saves and closes the file on exit, even if a write fails;
# it replaces writer.save(), which is no longer available in pandas 2.x.
with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
    df_fin_data_long_format.to_excel(writer, sheet_name='long format', index=False)
    df_read_all_sheets[df_read_sheet].to_excel(writer, sheet_name=df_read_sheet)
    df_read_all_sheets['Series - Metadata'].to_excel(writer, sheet_name='Series - Metadata')

#### 4. Segregating the financial inclusion indicators into different column headers
##### The World Bank's data has all the indicators listed in a single column, called `indicator`
##### This makes it difficult to focus on any single indicator for analysis or visualization
##### So, in this step, we'll segregate the indicators (and their values) into separate columns, one per indicator

#####  4a. Group the data based on different indicators 

In [44]:
# One group per series code; each group holds that indicator's rows for every country and year.
grouped = df_fin_data_long_format.groupby('code')

##### 4b. Formatting each group and prepping for the final processing step
##### A lot of pre-processing is happening in this step!
##### We will re-label the 'value' column with the indicator name. Look at how we have done that!


In [53]:
# Build one dataframe per indicator, collected into a list.
# For each group, the generic "value" column is renamed to the indicator's name
# (taken from the first row of the group's 'indicator' column), and the
# 'code' / 'indicator' columns are dropped, leaving country, year and the named value column.
# rename/drop return new frames here, so the frames yielded by the groupby are not
# mutated in place.
lst_indicator = [
    group
    .rename(columns={"value": group["indicator"].iloc[0]})
    .drop(columns=["code", "indicator"])
    for _, group in grouped
]

#### 5. Final processing step
##### We will now combine all the dataframes collected in the list of dataframes above
##### to finally have all the indicator columns and their values in a single dataframe

In [54]:
def _merge_on_country_year(left, right):
    """Join two indicator dataframes on their shared 'country' and 'year' columns."""
    return pd.merge(left, right, on=['country', 'year'])


# Fold the list of per-indicator frames into one frame, merging them pairwise from left to right.
df_final_processed_data = reduce(_merge_on_country_year, lst_indicator)


#### 6. Writing back to the file

In [59]:
# Re-create the workbook (path is relative to the current working directory) using
# XlsxWriter as the engine. ExcelWriter replaces the existing file, so every sheet that
# should survive is written again: the long-format table, the final processed table,
# and the sheets that were read earlier.
# The context manager saves and closes the file on exit, even if a write fails;
# it replaces writer.save(), which is no longer available in pandas 2.x.
with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
    df_fin_data_long_format.to_excel(writer, sheet_name='long format', index=False)
    df_final_processed_data.to_excel(writer, sheet_name='final processed data', index=False)
    df_read_all_sheets[df_read_sheet].to_excel(writer, sheet_name=df_read_sheet)
    df_read_all_sheets['Series - Metadata'].to_excel(writer, sheet_name='Series - Metadata')