In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [2]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import pandas as pd
import numpy as np
# Locations of the migration-module workbooks, one constant per survey round.

EI_2012 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\28. Section 6 Part 5 Migration\2012_s6p5a-5b_m.xlsx"
EI_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\28. Section 6 Part 5 Migration\2013_s6p4_m.xlsx"
EI_2014 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\28. Section 6 Part 5 Migration\2014_s6p4_m.xlsx"

# Load each workbook into its own DataFrame, in survey order, using the
# pandas read_excel defaults.

df_2012, df_2013, df_2014 = (
    pd.read_excel(workbook_path)
    for workbook_path in (EI_2012, EI_2013, EI_2014)
)



In [3]:
# Standardise column names across the survey years so that the same question
# carries the same name in every DataFrame before merging.
# The 2014 names are the reference; the remaining 2012 questions are given
# M_* names.

rename_2012 = {
    'pid': 'r_pid',
    'gender': 's6p4_q2',
    's6p5q9': 's6p4_q3',
    's6p5q10': 's6p4_q4',
    's6p5q11_dist': 's6p4_q5_dc',
    's6p5q11_country': 's6p4_q5_cc',
    's6p5q13_march': 's6p4_q6m14',
    's6p5q13_feb': 's6p4_q6f14',
    's6p5q13_jan': 's6p4_q6j14',
    's6p5q13_dec': 's6p4_q6d13',
    's6p5q13_nov': 's6p4_q6n13',
    's6p5q13_oct': 's6p4_q6o13',
    's6p5q13_sep': 's6p4_q6s13',
    's6p5q13_aug': 's6p4_q6ag13',
    's6p5q13_jul': 's6p4_q6jul13',
    's6p5q13_jun': 's6p4_q6jun13',
    's6p5q13_may': 's6p4_q6m13',
    's6p5q13_apr': 's6p4_q6ap13',
    's6p5q14': 's6p4_q7',
    's6p5q15': 's6p4_q8',
    's6p5q4': 'M_Rel_HoHH',
    's6p5q5': 'M_NBiV_YN',
    's6p5q6': 'M_NB_Loc',
    's6p5q7_dist': 'M_NB_D',
    's6p5q7_country': 'M_NB_C',
    's6p5q8': 'M_OV_12m',
    's6p5q12': 'M_Status',
    's6p5q16': 'M_RemHH_12m',
    's6p5q17': 'M_RemHH_Val',
    's6p5q18': 'M_Rem_Method',
    's6p5q19': 'M_RemCost_YN',
    's6p5q20': 'M_Cost',
    's6p5q21': 'M_Pconn',
    'PROVINCE_ID': 'P_ID',
    'DISTRICT_ID': 'D_ID',
    'TEHSIL_ID': 'T_ID',
    'UC_ID': 'UC_ID',
    'MAUZA_ID': 'M_ID',
}

# The 2013 month columns are shifted onto the 2014 names. Note that
# 's6p4_q6m13' is both a source name (-> 's6p4_q6m14') and a target name
# (<- 's6p4_q6m12'), so applying this mapping twice would rename the wrong column.
rename_2013 = {
    's6p4_q6m13': 's6p4_q6m14',
    's6p4_q6f13': 's6p4_q6f14',
    's6p4_q6j13': 's6p4_q6j14',
    's6p4_q6d12': 's6p4_q6d13',
    's6p4_q6n12': 's6p4_q6n13',
    's6p4_q6o12': 's6p4_q6o13',
    's6p4_q6s12': 's6p4_q6s13',
    's6p4_q6ag12': 's6p4_q6ag13',
    's6p4_q6jul12': 's6p4_q6jul13',
    's6p4_q6jun12': 's6p4_q6jun13',
    's6p4_q6m12': 's6p4_q6m13',
    's6p4_q6ap12': 's6p4_q6ap13',
}


def _rename_once(df, column_map):
    """Rename the columns of ``df`` in place unless that has already happened.

    Re-running a notebook cell must not change the result. A plain
    ``df.rename`` is not safe here when a name appears both as a key and as a
    value of ``column_map``: the second run would rename the freshly created
    column again and leave two columns with the same label.

    The rename is applied only while at least one "unambiguous" old name
    (a key that is not also a target name) is still present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are renamed in place.
    column_map : dict
        Mapping of old column name -> new column name.
    """
    new_names = set(column_map.values())
    still_has_old_names = any(
        old in df.columns for old in column_map if old not in new_names
    )
    if still_has_old_names:
        df.rename(columns=column_map, inplace=True)


_rename_once(df_2012, rename_2012)
_rename_once(df_2013, rename_2013)


# df_2014 doesn't need renaming as it is the reference

In [4]:
# Positional column mappings, one list per survey year.
# Every list has one slot per column of the unified layout; a slot holds the
# (already standardised) column name, or None when that year has no such column.
# All three lists therefore have the same length.

mapping_2012 = [
    'hid', 'round', None, 'r_pid', 's6p4_q2', 's6p4_q3', 's6p4_q4', 's6p4_q5_dc',
    's6p4_q5_cc', 's6p4_q6m14', 's6p4_q6f14', 's6p4_q6j14', 's6p4_q6d13', 's6p4_q6n13',
    's6p4_q6o13', 's6p4_q6s13', 's6p4_q6ag13', 's6p4_q6jul13', 's6p4_q6jun13', 's6p4_q6m13',
    's6p4_q6ap13', 's6p4_q7', 's6p4_q8', 'M_Rel_HoHH', 'M_NBiV_YN', 'M_NB_Loc', 'M_NB_D',
    'M_NB_C', 'M_OV_12m', 'M_Status', 'M_RemHH_12m', 'M_RemHH_Val', 'M_Rem_Method',
    'M_RemCost_YN', 'M_Cost', 'M_Pconn', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID'
]

# 2013 and 2014 provide the same leading columns and none of the trailing
# 2012-only ones, so both lists are the shared prefix padded with None up to
# the 2012 length (the hand-written lists were one and two slots short).
_columns_2013_2014 = [
    'hid', 'round', 's6p4_qa', 'r_pid', 's6p4_q2', 's6p4_q3', 's6p4_q4', 's6p4_q5_dc',
    's6p4_q5_cc', 's6p4_q6m14', 's6p4_q6f14', 's6p4_q6j14', 's6p4_q6d13', 's6p4_q6n13',
    's6p4_q6o13', 's6p4_q6s13', 's6p4_q6ag13', 's6p4_q6jul13', 's6p4_q6jun13', 's6p4_q6m13',
    's6p4_q6ap13', 's6p4_q7', 's6p4_q8'
]
_n_missing_slots = len(mapping_2012) - len(_columns_2013_2014)

mapping_2013 = _columns_2013_2014 + [None] * _n_missing_slots
mapping_2014 = _columns_2013_2014 + [None] * _n_missing_slots





In [5]:
# Union of every mapped column name in first-seen order: 2012 names first,
# then anything new from 2013, then anything new from 2014.
# None placeholders (and any other falsy entries) are skipped.
all_columns = list(dict.fromkeys(
    name
    for year_mapping in (mapping_2012, mapping_2013, mapping_2014)
    for name in year_mapping
    if name
))



In [6]:
def _coalesce_duplicates(frame):
    """Collapse same-named columns into one value per row (first non-missing)."""
    rows = frame.to_numpy(dtype=object)
    return [
        next((value for value in row if not pd.isna(value)), np.nan)
        for row in rows
    ]


def standardize_and_merge(dfs, mappings, all_columns):
    """Stack several DataFrames vertically under one shared column layout.

    Rows keep their order: all rows of ``dfs[0]``, then ``dfs[1]``, and so on.
    For every output column each DataFrame contributes exactly ``len(df)``
    values, so a value always stays on the row it came from:

    * the column is listed in that DataFrame's mapping and exists in the
      DataFrame -> its values are copied;
    * otherwise (``None`` slot / not listed / not present) -> NaN for each row.

    If a DataFrame carries the same column label more than once, the
    duplicates are coalesced row-wise to the first non-missing value instead
    of being appended one after another.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to stack, already using standardised column names.
    mappings : sequence of sequence of (str or None)
        One mapping per DataFrame naming the columns to take from it.
        ``None`` (or any falsy entry) marks a column that frame lacks.
        Surrounding whitespace in names is ignored.
    all_columns : sequence of str
        Desired output column order. Mapped names missing from it are
        appended at the end.

    Returns
    -------
    pandas.DataFrame
        One row per input row, one column per unified column name.

    Raises
    ------
    ValueError
        If ``dfs`` and ``mappings`` differ in length (zipping them would
        silently drop inputs).
    """
    if len(dfs) != len(mappings):
        raise ValueError(
            f"Got {len(dfs)} DataFrames but {len(mappings)} mappings; "
            "each DataFrame needs exactly one mapping."
        )

    # Fix the full output layout up front so every DataFrame fills every column.
    columns = list(dict.fromkeys(all_columns))
    known = set(columns)
    wanted_per_df = []
    for mapping in mappings:
        wanted = [col.strip() for col in mapping if col]
        wanted_per_df.append(set(wanted))
        for name in wanted:
            if name not in known:
                known.add(name)
                columns.append(name)

    merged_data = {name: [] for name in columns}

    for df, wanted in zip(dfs, wanted_per_df):
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        n_rows = len(df)
        for name in columns:
            if name in wanted and name in df.columns:
                print(f"Appending data for column {name}")
                values = df[name]
                if isinstance(values, pd.DataFrame):
                    # Duplicate label: df[name] is a frame, not a Series.
                    print(f"Column {name} is duplicated in DataFrame. Using the first non-missing value per row.")
                    merged_data[name].extend(_coalesce_duplicates(values))
                else:
                    merged_data[name].extend(values.tolist())
            else:
                if name in wanted:
                    print(f"Column {name} not found in DataFrame. Adding NaNs.")
                # Pad so later DataFrames' values cannot slide up onto these rows.
                merged_data[name].extend([np.nan] * n_rows)

    return pd.DataFrame(merged_data, columns=columns)


In [7]:
# Pair each year's DataFrame with its own mapping, then stack them in survey order.
yearly_inputs = [
    (df_2012, mapping_2012),
    (df_2013, mapping_2013),
    (df_2014, mapping_2014),
]
dfs = [frame for frame, _ in yearly_inputs]
mappings = [year_mapping for _, year_mapping in yearly_inputs]

merged_df = standardize_and_merge(dfs=dfs, mappings=mappings, all_columns=all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 's6p4_q2', 'r_pid', 'M_Rel_HoHH', 'M_NBiV_YN', 'M_NB_Loc', 'M_NB_D', 'M_NB_C', 'M_OV_12m', 's6p4_q3', 's6p4_q4', 's6p4_q5_dc', 's6p4_q5_cc', 'M_Status', 's6p4_q6j14', 's6p4_q6f14', 's6p4_q6m14', 's6p4_q6ap13', 's6p4_q6m13', 's6p4_q6jun13', 's6p4_q6jul13', 's6p4_q6ag13', 's6p4_q6s13', 's6p4_q6o13', 's6p4_q6n13', 's6p4_q6d13', 's6p4_q7', 's6p4_q8', 'M_RemHH_12m', 'M_RemHH_Val', 'M_Rem_Method', 'M_RemCost_YN', 'M_Cost', 'M_Pconn', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID']
Appending data for column hid
Appending data for column round
Appending data for column r_pid
Appending data for column s6p4_q2
Appending data for column s6p4_q3
Appending data for column s6p4_q4
Appending data for column s6p4_q5_dc
Appending data for column s6p4_q5_cc
Appending data for column s6p4_q6m14
Appending data for column s6p4_q6f14
Appending data for column s6p4_q6j14
Appending data for column s6p4_q6d13
Appending data for column s6p4_q6n13
Append

In [8]:
# Final, human-readable column names for the merged file.
# Columns not listed here keep their current name.
_gone_by_month = {
    's6p4_q6m14': 'M_Gone_Mar',
    's6p4_q6f14': 'M_Gone_Feb',
    's6p4_q6j14': 'M_Gone_Jan',
    's6p4_q6d13': 'M_Gone_Dec',
    's6p4_q6n13': 'M_Gone_Nov',
    's6p4_q6o13': 'M_Gone_Oct',
    's6p4_q6s13': 'M_Gone_Sep',
    's6p4_q6ag13': 'M_Gone_Aug',
    's6p4_q6jul13': 'M_Gone_Jul',
    's6p4_q6jun13': 'M_Gone_Jun',
    's6p4_q6m13': 'M_Gone_May',
    's6p4_q6ap13': 'M_Gone_Apr',
}

rename_mapping = {
    'hid': 'HID',
    'round': 'Survey_Round',
    's6p4_qa': 'M_QA',
    'r_pid': 'PID',
    's6p4_q2': 'M_Gender',
    's6p4_q3': 'M_Loc',
    's6p4_q4': 'M_Loc_Diff',
    's6p4_q5_dc': 'M_Loc_DC',
    's6p4_q5_cc': 'M_Loc_CC',
    **_gone_by_month,
    's6p4_q7': 'M_Res',
    's6p4_q8': 'M_Jloc',
}

# Relabel in place: look each current name up in the mapping, fall back to itself.
merged_df.columns = [rename_mapping.get(name, name) for name in merged_df.columns]

# Write the merged table to CSV in the working directory, without the row index.
merged_df.to_csv('merged_Section_6_part_5.csv', index=False)

