In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [1]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import pandas as pd
import numpy as np
# Folder holding the Section 6 Part 2 workbooks for every survey round
SECTION_DIR = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\25. Section 6 Part 2"

# Full workbook location per round (2012, 2012-1.5, 2013, 2014)
EI_2012 = SECTION_DIR + r"\2012_s6p3_m.xlsx"
EI_2012_5 = SECTION_DIR + r"\2012_5_s7p3.xlsx"
EI_2013 = SECTION_DIR + r"\2013_s6p2_m.xlsx"
EI_2014 = SECTION_DIR + r"\2014_s6p2_m.xlsx"

# Load each workbook into its own dataframe (default sheet, default header row)
df_2012 = pd.read_excel(io=EI_2012)
df_2012_5 = pd.read_excel(io=EI_2012_5)
df_2013 = pd.read_excel(io=EI_2013)
df_2014 = pd.read_excel(io=EI_2014)



In [2]:
#This code block will be used to standardize column names across the years to avoid discrepancies during the merging process.

# Bring the 2012 questionnaire codes in line with the 2014 reference names.

# Month token used in the 2012 column codes -> month suffix used by the 2014 names
month_suffix_2012 = {
    'MAR': 'm14', 'FEB': 'f14', 'JAN': 'j14', 'DEC': 'd13', 'NOV': 'n13', 'OCT': 'o13',
    'SEP': 's13', 'AUG': 'a13', 'JUL': 'jul13', 'JUN': 'jun13', 'MAY': 'm13', 'APR': 'ap13',
}

rename_2012 = {
    # person identifier
    'pid': 'r_pid',
    # single-answer questions that have a 2014 counterpart
    'S6P3AQ2': 's6p2_q2',
    'S6P3AQ3': 's6p2_q3',
    'S6P3AQ5': 's6p2_q6',
    'S6P3AQ6': 's6p2_q7_code',
    'S6P3AQ11': 's6p2_q10',
    'S6P3AQ13': 's6p2_q13',
    'S6P3AQ14': 's6p2_q14_code',
    'S6P3AQ19': 's6p2_q16',
    'S6P3AQ20': 's6p2_q17',
    # work-location questions, which get descriptive names instead of s6p2_* codes
    'S6P3AQ7': 'PNFW_PJ_Wloc',
    'S6P3AQ8': 'PNFW_PJ_Wloc_Type',
    'S6P3AQ9': 'PNFW_PJ_Wloc_Dist',
    'S6P3AQ10': 'PNFW_PJ_Wloc_MoT',
    'S6P3AQ15': 'PNFW_SJ_Wloc',
    'S6P3AQ16': 'PNFW_SJ_Wloc_Type',
    'S6P3AQ17': 'PNFW_SJ_Wloc_Dist',
    'S6P3AQ18': 'PNFW_SJ_Wloc_MoT',
    # geographic identifiers
    'PROVINCE_ID': 'P_ID',
    'DISTRICT_ID': 'D_ID',
    'TEHSIL_ID': 'T_ID',
    'UC_ID': 'UC_ID',
    'MAUZA_ID': 'M_ID',
}

# Monthly Q4 columns -> s6p2_q4_*.
# NOTE(review): January is keyed 'S6P3AQ3JAN' (Q3, not Q4), unlike the other eleven
# months -- presumably this matches the header in the 2012 workbook; confirm.
rename_2012.update({
    ('S6P3AQ3JAN' if month == 'JAN' else f'S6P3AQ4{month}'): f's6p2_q4_{suffix}'
    for month, suffix in month_suffix_2012.items()
})

# Monthly Q12 columns -> s6p2_q11_*.
rename_2012.update({
    f'S6P3AQ12{month}': f's6p2_q11_{suffix}'
    for month, suffix in month_suffix_2012.items()
})

# Labels that are not present in the frame are silently left alone by rename().
df_2012.rename(columns=rename_2012, inplace=True)

# Bring the 2012-1.5 round's column codes in line with the 2014 reference names.

# Month-year token used in the 2012-1.5 column codes -> month suffix used by the 2014 names
month_label_2012_5 = {
    'MAR2012': 'm14', 'FEB2012': 'f14', 'JAN2012': 'j14', 'DEC2011': 'd13',
    'NOV2011': 'n13', 'OCT2011': 'o13', 'SEP2011': 's13', 'AUG2011': 'a13',
    'JUL2011': 'jul13', 'JUN2011': 'jun13', 'MAY2011': 'm13', 'APR2012': 'ap13',
}

rename_2012_5 = {
    # round / person identifiers
    'Round': 'round',
    'PID1': 'r_pid',
    # single-answer questions that have a 2014 counterpart
    'S7P3Q2': 's6p2_q2',
    'S7P3Q3': 's6p2_q3',
    'S7P3Q5': 's6p2_q6',
    'S7P3Q6': 's6p2_q7_code',
    'S7P3Q7': 's6p2_q10',
    'S7P3Q9': 's6p2_q13',
    'S7P3Q10': 's6p2_q14_code',
    'S7P3Q11': 's6p2_q16',
    'S7P3Q12': 's6p2_q17',
    # geographic identifiers
    'C_PROVINCE': 'P_ID',
    'C_DISTRICT': 'D_ID',
    'C_TEHSIL': 'T_ID',
    'C_UC': 'UC_ID',
    'C_MOUZA': 'M_ID',
}

# Monthly columns: Q4<month><year> -> s6p2_q4_*, Q8<month><year> -> s6p2_q11_*
rename_2012_5.update({
    f'{source_prefix}{label}': f's6p2_{target_question}_{suffix}'
    for source_prefix, target_question in (('Q4', 'q4'), ('Q8', 'q11'))
    for label, suffix in month_label_2012_5.items()
})

# Labels that are not present in the frame are silently left alone by rename().
df_2012_5.rename(columns=rename_2012_5, inplace=True)


# Shift the 2013 monthly columns onto the 2014 reference names (every month suffix
# moves forward by one year: m13 -> m14, ..., m12 -> m13, ap12 -> ap13).
suffix_shift_2013 = {
    'm13': 'm14', 'f13': 'f14', 'j13': 'j14', 'd12': 'd13', 'n12': 'n13', 'o12': 'o13',
    's12': 's13', 'a12': 'a13', 'jul12': 'jul13', 'jun12': 'jun13', 'm12': 'm13', 'ap12': 'ap13',
}

rename_2013 = {
    f's6p2_{question}_{old_suffix}': f's6p2_{question}_{new_suffix}'
    for question in ('q4', 'q11')
    for old_suffix, new_suffix in suffix_shift_2013.items()
}

# 's6p2_q4_m13' / 's6p2_q11_m13' are both a source name (March 2013) and a target name
# (May, renamed from m12). Applying the shift a second time -- e.g. re-running this cell
# without reloading the workbook -- would therefore rename the May column to the March
# name, leaving two '..._m14' columns and no '..._m13' column.
# Guard against that: only rename while the frame still carries at least one label that
# exists solely before the shift (f13, j13, d12, ..., ap12).
labels_only_before_shift = set(rename_2013) - set(rename_2013.values())

if labels_only_before_shift.intersection(df_2013.columns):
    df_2013.rename(columns=rename_2013, inplace=True)


# df_2014 doesn't need renaming as it is the reference

In [8]:
# Define the positional column layout of each survey round, using the standardized (2014) names.
# Every list follows the same slot order; `None` marks a slot for which that round has no column.


# --- Shared segments of the positional layout -------------------------------------------

# Month suffixes in the order the monthly columns appear (March back to April)
_month_suffixes = ['m14', 'f14', 'j14', 'd13', 'n13', 'o13', 's13', 'a13',
                   'jul13', 'jun13', 'm13', 'ap13']

_leading_cols = ['hid', 'round', 'r_pid', 's6p2_q2', 's6p2_q3']
_q4_monthly = [f's6p2_q4_{suffix}' for suffix in _month_suffixes]
_q11_monthly = [f's6p2_q11_{suffix}' for suffix in _month_suffixes]

# q5-q10 and q12-q17 slots: the 2013/2014 rounds fill every slot,
# the 2012 and 2012-1.5 rounds only some of them.
_q5_to_q10_full = ['s6p2_q5', 's6p2_q6', 's6p2_q7_des', 's6p2_q7_code',
                   's6p2_q8_des', 's6p2_q8_code', 's6p2_q9', 's6p2_q10']
_q5_to_q10_partial = [None, 's6p2_q6', None, 's6p2_q7_code', None, None, None, 's6p2_q10']
_q12_to_q17_full = ['s6p2_q12', 's6p2_q13', 's6p2_q14_des', 's6p2_q14_code',
                    's6p2_q15_des', 's6p2_q15_code', 's6p2_q16', 's6p2_q17']
_q12_to_q17_partial = [None, 's6p2_q13', None, 's6p2_q14_code', None, None,
                       's6p2_q16', 's6p2_q17']

# Trailing slots: q18/q19 (2), work-location columns (8), geographic ids (5), household number (1)
_work_location = ['PNFW_PJ_Wloc', 'PNFW_PJ_Wloc_Type', 'PNFW_PJ_Wloc_Dist', 'PNFW_PJ_Wloc_MoT',
                  'PNFW_SJ_Wloc', 'PNFW_SJ_Wloc_Type', 'PNFW_SJ_Wloc_Dist', 'PNFW_SJ_Wloc_MoT']
_geo_ids = ['P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID']

# --- Per-round layouts ------------------------------------------------------------------

mapping_2012 = (
    _leading_cols + _q4_monthly + _q5_to_q10_partial + _q11_monthly + _q12_to_q17_partial
    + [None, None] + _work_location + _geo_ids + [None]
)

mapping_2012_5 = (
    _leading_cols + _q4_monthly + _q5_to_q10_partial + _q11_monthly + _q12_to_q17_partial
    + [None, None] + [None] * 8 + _geo_ids + ['C_HH_NUM']
)

mapping_2013 = (
    _leading_cols + _q4_monthly + _q5_to_q10_full + _q11_monthly + _q12_to_q17_full
    + [None] * 16
)

mapping_2014 = (
    _leading_cols + _q4_monthly + _q5_to_q10_full + _q11_monthly + _q12_to_q17_full
    + ['s6p2_q18', 's6p2_q19'] + [None] * 14
)





In [9]:
# Union of every standardized column name, in first-seen order across the rounds
# (2012, 2012-1.5, 2013, 2014). `None` placeholders are skipped; dict.fromkeys drops
# repeats while preserving insertion order.
all_columns = list(dict.fromkeys(
    name
    for round_mapping in (mapping_2012, mapping_2012_5, mapping_2013, mapping_2014)
    for name in round_mapping
    if name
))



In [10]:
def standardize_and_merge(dfs, mappings, all_columns):
    """Stack several dataframes row-wise into one frame with a shared column set.

    Every input frame contributes exactly ``len(df)`` values to *every* output
    column, so values from one input row always stay on the same output row.
    (Previously a column that was absent from a frame's mapping received no
    filler for that frame, which shifted the data of later frames up to row 0
    and out of line with their identifiers.)

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to stack, in output row order.
    mappings : sequence of list
        One list per frame naming the columns to take from that frame.
        Falsy entries (``None``) are ignored; names are whitespace-stripped.
    all_columns : iterable of str
        Output columns, in output order. Names that occur in a mapping but not
        here are appended after these.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with a default integer index. A column holds NaN
        for the rows of any frame whose mapping does not name it or whose data
        does not contain it.
    """
    # Fix the complete output column set up front so every frame fills every column.
    merged_data = {col: [] for col in all_columns}
    for mapping in mappings:
        for col in mapping:
            if col:
                merged_data.setdefault(col.strip(), [])

    for df, mapping in zip(dfs, mappings):
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        requested = {col.strip() for col in mapping if col}
        n_rows = len(df)

        for ref_col, values in merged_data.items():
            if ref_col in requested and ref_col in df.columns:
                print(f"Appending data for column {ref_col}")
                column = df[ref_col]
                if isinstance(column, pd.DataFrame):
                    # A duplicated label selects a DataFrame. Appending every duplicate
                    # end-to-end would add more than len(df) values and break row
                    # alignment, so keep a single column.
                    print(f"Column {ref_col} is duplicated in DataFrame. Using the first occurrence only.")
                    column = column.iloc[:, 0]
                values.extend(column.tolist())
            else:
                if ref_col in requested:
                    print(f"Column {ref_col} not found in DataFrame. Adding NaNs.")
                # Not mapped for this frame, or mapped but missing: hold the rows with NaN.
                values.extend([np.nan] * n_rows)

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df


In [11]:
# Pair each round's dataframe with its positional mapping, in chronological order
round_inputs = [
    (df_2012, mapping_2012),
    (df_2012_5, mapping_2012_5),
    (df_2013, mapping_2013),
    (df_2014, mapping_2014),
]
dfs = [frame for frame, _ in round_inputs]
mappings = [layout for _, layout in round_inputs]

# Stack all rounds into one frame with the shared column set
merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 'r_pid', 's6p2_q2', 's6p2_q3', 's6p2_q4_j14', 's6p2_q4_f14', 's6p2_q4_m14', 's6p2_q4_ap13', 's6p2_q4_m13', 's6p2_q4_jun13', 's6p2_q4_jul13', 's6p2_q4_a13', 's6p2_q4_s13', 's6p2_q4_o13', 's6p2_q4_n13', 's6p2_q4_d13', 's6p2_q6', 's6p2_q7_code', 'PNFW_PJ_Wloc', 'PNFW_PJ_Wloc_Type', 'PNFW_PJ_Wloc_Dist', 'PNFW_PJ_Wloc_MoT', 's6p2_q10', 's6p2_q11_j14', 's6p2_q11_f14', 's6p2_q11_m14', 's6p2_q11_ap13', 's6p2_q11_m13', 's6p2_q11_jun13', 's6p2_q11_jul13', 's6p2_q11_a13', 's6p2_q11_s13', 's6p2_q11_o13', 's6p2_q11_n13', 's6p2_q11_d13', 's6p2_q13', 's6p2_q14_code', 'PNFW_SJ_Wloc', 'PNFW_SJ_Wloc_Type', 'PNFW_SJ_Wloc_Dist', 'PNFW_SJ_Wloc_MoT', 's6p2_q16', 's6p2_q17', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID']
Appending data for column hid
Appending data for column round
Appending data for column r_pid
Appending data for column s6p2_q2
Appending data for column s6p2_q3
Appending data for column s6p2_q4_m14
Appending data for column s6p2_q

In [12]:
# Give the merged columns their final, descriptive names.

# Standardized month suffix -> month abbreviation used in the final column names
month_abbrev = {
    'm14': 'Mar', 'f14': 'Feb', 'j14': 'Jan', 'd13': 'Dec', 'n13': 'Nov', 'o13': 'Oct',
    's13': 'Sep', 'a13': 'Aug', 'jul13': 'Jul', 'jun13': 'Jun', 'm13': 'May', 'ap13': 'Apr',
}

rename_mapping = {
    'hid': 'HID',
    'round': 'Survey_Round',
    'r_pid': 'PID',
    's6p2_q2': 'PNFW',
    's6p2_q3': 'PNFW_PJ_EmpStatus',
    # q4: one column per month -> PNFW_PJ_DpM_<Mon>
    **{f's6p2_q4_{suffix}': f'PNFW_PJ_DpM_{month}' for suffix, month in month_abbrev.items()},
    's6p2_q5': 'PNFW_PJ_PoP',
    's6p2_q6': 'PNFW_PJ_TE',
    's6p2_q7_des': 'PNFW_PJ_Des',
    's6p2_q7_code': 'PNFW_PJ_Code',
    's6p2_q8_des': 'PNFW_PJ_EA_Des',
    's6p2_q8_code': 'PNFW_PJ_EA_Code',
    's6p2_q9': 'PNFW_SJ_YN',
    's6p2_q10': 'PNFW_SJ_EmpStatus',
    # q11: one column per month -> PNFW_SJ_DpM_<Mon>
    **{f's6p2_q11_{suffix}': f'PNFW_SJ_DpM_{month}' for suffix, month in month_abbrev.items()},
    's6p2_q12': 'PNFW_SJ_PoP',
    's6p2_q13': 'PNFW_SJ_TE',
    's6p2_q14_des': 'PNFW_SJ_Des',
    's6p2_q14_code': 'PNFW_SJ_Code',
    's6p2_q15_des': 'PNFW_SJ_EA_Des',
    's6p2_q15_code': 'PNFW_SJ_EA_Code',
    's6p2_q16': 'PNFW_SJ_HC',
    's6p2_q17': 'PNFW_SJ_pp',
    's6p2_q18': 'PNFW_SJ_pp_12m',
    's6p2_q19': 'PNFW_SJ_pp_Val',
}

# Columns without an entry above keep their current name.
merged_df.rename(columns=rename_mapping, inplace=True)

# Write the merged frame to CSV in the working directory, without the index column
merged_df.to_csv(path_or_buf='merged_Section_6_part_2.csv', index=False)

