In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [2]:
#since there are 2 files for this section in multiple years, we will make a standardized single file for sectional merging

import pandas as pd
import numpy as np

# Read the two workbooks that together make up this 2014 section.
part_a_path = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\23. Section 5 Part 3 Goods and inputs purch on credit\2014_s5p3a_m.xlsx"
part_b_path = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\23. Section 5 Part 3 Goods and inputs purch on credit\2014_s5p3b_m.xlsx"
df1 = pd.read_excel(part_a_path)
df2 = pd.read_excel(part_b_path)

# Stack the rows of the second workbook underneath the first (a row-wise
# concatenation, not a key-based join) and renumber the index from 0.
merged_df = pd.concat([df1, df2], ignore_index=True)

# Remove leftover columns: first those whose label contains 'Unnamed',
# then any remaining column whose label contains a space.
unnamed_labels = merged_df.columns[merged_df.columns.str.contains('Unnamed', case=True)]
merged_df.drop(columns=unnamed_labels, inplace=True)
spaced_labels = merged_df.columns[merged_df.columns.str.contains(' ', case=False)]
merged_df.drop(columns=spaced_labels, inplace=True)

# Persist the combined section. index=True also writes the row index as an
# unnamed first column of the CSV.
merged_df.to_csv('2014_s5p3ap3b_m.csv', index=True)

In [5]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import pandas as pd
import numpy as np
# Workbook locations, one per survey round.
EI_2012_5 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\26. Section 6 Part 3 NAE Male Proprietor\2012_5_s7p4.xlsx"
EI_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\26. Section 6 Part 3 NAE Male Proprietor\2013_s6p3a_m.xlsx"
EI_2014 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\26. Section 6 Part 3 NAE Male Proprietor\2014_s6p3a_m.xlsx"

# Load each workbook into its own DataFrame (read in the order listed).
df_2012_5, df_2013, df_2014 = (
    pd.read_excel(workbook_path)
    for workbook_path in (EI_2012_5, EI_2013, EI_2014)
)



In [6]:
#This code block will be used to standardize column names across the years to avoid discrepancies during the merging process.

# Rename columns in df to df_2014 column names


# Column labels of the 2012.5 workbook -> the names used by the 2014 layout
# (or a descriptive name where 2014 has no matching column).
RENAME_2012_5_TO_2014 = {
    'Round': 'round',
    'S7P4ID': 's6p3a_bid',
    'PID1': 'r_pid',
    'S7P4Q4': 's6p3a_q3',
    'S7P4Q5': 's6p3a_q6',
    'S7P4_NAME': 'ENA_Bname',
    'S7P4Q6': 'ENA_ST',
    'S7P4Q7': 'ENA_ST_SHH',
    'S7P4Q8': 'ENA_ST_HH',
    'S7P4Q9': 'ENA_ST_NP',
    'S7P4Q10': 'ENA_Bspendings',
    'C_PROVINCE': 'P_ID',
    'C_DISTRICT': 'D_ID',
    'C_TEHSIL': 'T_ID',
    'C_UC': 'UC_ID',
    'C_MOUZA': 'M_ID',
}

# Column labels of the 2013 workbook -> the 2014 question numbering.
# Several targets are also source labels (e.g. q5 -> q6 while q6 -> q7).
# pandas looks every ORIGINAL label up once, so the renames do not chain
# within a single call -- but they WOULD shift again if applied a second time.
RENAME_2013_TO_2014 = {
    's6p3a__bid': 's6p3a_bid',
    's6p3a_q5': 's6p3a_q6',
    's6p3a_q6': 's6p3a_q7',
    's6p3a_q7': 's6p3a_q8',
    's6p3a_q8': 's6p3a_q9',
    's6p3a_q9': 's6p3a_q10',
    's6p3a_q10a': 's6p3a_q13',
    's6p3a_q10b': 's6p3a_q14',
    's6p3a_q11a': 's6p3a_q15',
    's6p3a_q11b': 's6p3a_q16',
    's6p3a_q12a': 's6p3a_q17',
    's6p3a_q12b': 's6p3a_q18',
    's6p3a_q12c': 's6p3a_q19',
    's6p3a_q13a': 's6p3a_q20',
    's6p3a_q13b': 's6p3a_q21',
    's6p3a_q13c': 's6p3a_q22',
    's6p3a_q14': 's6p3a_q23',
    's6p3a_q15': 's6p3a_q24',
    's6p3a_q16': 's6p3a_q25',
    's6p3a_q17': 's6p3a_q26',
}


def _rename_columns_once(df, column_map):
    """Rename ``df``'s columns in place, at most once per loaded DataFrame.

    The 2013 mapping is not idempotent: running it on an already-renamed
    frame would move every question column one more step. A flag stored in
    ``df.attrs`` records that the rename has been applied, so re-executing
    this cell is a no-op. Re-loading the workbook yields a fresh DataFrame
    without the flag, so the rename is applied again as expected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are rewritten in place.
    column_map : dict
        Mapping of existing column label -> new column label.
    """
    if df.attrs.get('columns_standardized'):
        return
    df.rename(columns=column_map, inplace=True)
    df.attrs['columns_standardized'] = True


_rename_columns_once(df_2012_5, RENAME_2012_5_TO_2014)
_rename_columns_once(df_2013, RENAME_2013_TO_2014)


# df_2014 doesn't need renaming as it is the reference

In [7]:
# Define one position-aligned list of standardized column names per survey round.
# Every list has the same number of slots; a given slot refers to the same
# column position in each round.
# A slot is None where that round has no corresponding column.


# 2012.5 round: nine leading slots, twenty empty slots, then the columns that
# are only listed for this round.
mapping_2012_5 = (
    ['hid', 'round', None, 's6p3a_bid', 'r_pid', 's6p3a_q3', None, None, 's6p3a_q6']
    + [None] * 20
    + [
        'ENA_Bname', 'ENA_ST', 'ENA_ST_SHH', 'ENA_ST_HH', 'ENA_ST_NP', 'ENA_Bspendings',
        'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID', 'C_HH_NUM',
    ]
)

# 2014 round (the reference layout): identifiers, questions q3..q26, then
# twelve empty slots where only the 2012.5 list has names.
mapping_2014 = (
    ['hid', 'round', 's6p3a_qa', 's6p3a_bid', 'r_pid']
    + [f's6p3a_q{question_no}' for question_no in range(3, 27)]
    + [None] * 12
)

# 2013 round: identical to the 2014 list except that q5, q11 and q12 are
# empty slots.
_EMPTY_IN_2013 = {'s6p3a_q5', 's6p3a_q11', 's6p3a_q12'}
mapping_2013 = [
    None if name in _EMPTY_IN_2013 else name
    for name in mapping_2014
]





In [8]:
# Ordered union of every non-empty column name across the three mappings.
# dict.fromkeys keeps only the first occurrence of each name while preserving
# the order in which names are first seen (2012.5, then 2013, then 2014).
all_columns = list(dict.fromkeys(
    name
    for year_mapping in (mapping_2012_5, mapping_2013, mapping_2014)
    for name in year_mapping
    if name
))



In [9]:
def standardize_and_merge(dfs, mappings, all_columns):
    """Stack several DataFrames row-wise into one frame with a common column set.

    For every input frame, each output column receives exactly ``len(df)``
    values: the frame's own data when the column is named in that frame's
    mapping and exists in the frame, otherwise NaN. Filling every column for
    every frame keeps all values of an output row tied to the same source row.
    (Previously a column whose mapping slot was None got no values for that
    frame, so values from later frames slid up into rows of earlier frames.)

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to stack, in output row order.
    mappings : iterable of list
        One list per frame, naming the columns to take from it. ``None`` /
        empty entries are ignored; names are stripped of surrounding
        whitespace. List positions are not used, only the names.
    all_columns : list
        Output columns, in order. Names that appear in a mapping but not here
        are appended after these, in order of first appearance.

    Returns
    -------
    pandas.DataFrame
        One row per input row (frames concatenated in the given order), with
        a fresh 0..n-1 index. Empty if there are no inputs and no columns.

    Raises
    ------
    ValueError
        If ``dfs`` and ``mappings`` differ in length.

    Notes
    -----
    If a requested label occurs more than once in a frame, only the first
    occurrence is used. Appending every duplicate end-to-end would make that
    column longer than the others and break row alignment.
    """
    dfs = list(dfs)
    mappings = list(mappings)
    if len(dfs) != len(mappings):
        raise ValueError(
            f"Expected one mapping per DataFrame, got {len(dfs)} DataFrames "
            f"and {len(mappings)} mappings."
        )

    # Names requested from each frame: cleaned and de-duplicated so that a name
    # repeated inside one mapping cannot be appended twice.
    requested_per_df = [
        list(dict.fromkeys(name.strip() for name in mapping if name))
        for mapping in mappings
    ]

    # Final column order: all_columns first, then any extra mapped names.
    columns = list(dict.fromkeys(
        [*all_columns, *(name for requested in requested_per_df for name in requested)]
    ))
    merged_data = {col: [] for col in columns}

    for df, requested in zip(dfs, requested_per_df):
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        n_rows = len(df)
        requested_names = set(requested)

        for col in columns:
            if col not in requested_names:
                # Not part of this frame's layout: pad so later frames start
                # on the correct row.
                merged_data[col].extend([np.nan] * n_rows)
                continue

            if col not in df.columns:
                print(f"Column {col} not found in DataFrame. Adding NaNs.")
                merged_data[col].extend([np.nan] * n_rows)
                continue

            print(f"Appending data for column {col}")
            values = df[col]
            if isinstance(values, pd.DataFrame):
                # Duplicate labels make df[col] a DataFrame; keep one series
                # so the column still contributes exactly n_rows values.
                print(f"Column {col} is duplicated in DataFrame. Using the first occurrence.")
                values = values.iloc[:, 0]
            merged_data[col].extend(values.tolist())

    return pd.DataFrame(merged_data, columns=columns)


In [10]:
# Pair each round's DataFrame with its column mapping, oldest round first,
# then split the pairs into the two parallel lists the merge function expects.
round_inputs = [
    (df_2012_5, mapping_2012_5),
    (df_2013, mapping_2013),
    (df_2014, mapping_2014),
]
dfs = [frame for frame, _ in round_inputs]
mappings = [column_mapping for _, column_mapping in round_inputs]

merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'round', 'hid', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID', 'C_HH_NUM', 's6p3a_bid', 'ENA_Bname', 'r_pid', 's6p3a_q3', 's6p3a_q6', 'ENA_ST', 'ENA_ST_SHH', 'ENA_ST_HH', 'ENA_ST_NP', 'ENA_Bspendings']
Appending data for column hid
Appending data for column round
Appending data for column s6p3a_bid
Appending data for column r_pid
Appending data for column s6p3a_q3
Appending data for column s6p3a_q6
Appending data for column ENA_Bname
Appending data for column ENA_ST
Appending data for column ENA_ST_SHH
Appending data for column ENA_ST_HH
Appending data for column ENA_ST_NP
Appending data for column ENA_Bspendings
Appending data for column P_ID
Appending data for column D_ID
Appending data for column T_ID
Appending data for column UC_ID
Appending data for column M_ID
Appending data for column C_HH_NUM
Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 's6p3a_qa', 's6p3a_bid', 'r_pid', 's6p3a_q3', 's6p3a_q4', 's6p3a_q6', 's6p3a_q

In [11]:
# Final, human-readable labels for the merged file. Columns that are not
# listed here keep their current names.
rename_mapping = {
    # identifiers
    'hid':       'HID',
    'round':     'Survey_Round',
    's6p3a_qa':  'NAE_YN',
    's6p3a_bid': 'NAE_Bid',
    'r_pid':     'PID',
    # questions q3 .. q26
    's6p3a_q3':  'NAE_Otype',
    's6p3a_q4':  'NAE_BReg',
    's6p3a_q5':  'NAE_NoBReg_Res',
    's6p3a_q6':  'NAE_E_IoO',
    's6p3a_q7':  'NAE_E_ST',
    's6p3a_q8':  'NAE_SC_S1',
    's6p3a_q9':  'NAE_SC_S2',
    's6p3a_q10': 'NAE_Einit_Val',
    's6p3a_q11': 'NAE_BDS_Acc',
    's6p3a_q12': 'NAE_BDS_Name',
    's6p3a_q13': 'NAE_SEP_NoPTotal',
    's6p3a_q14': 'NAE_SEP_PD',
    's6p3a_q15': 'NAE_UFL_NoP',
    's6p3a_q16': 'NAE_UFL_PD',
    's6p3a_q17': 'NAE_PHL_NoP',
    's6p3a_q18': 'NAE_PHL_PD',
    's6p3a_q19': 'NAE_PHL_AL',
    's6p3a_q20': 'NAE_CHL_NoP',
    's6p3a_q21': 'NAE_CHL_PD',
    's6p3a_q22': 'NAE_CHL_AL',
    's6p3a_q23': 'NAE_Loc',
    's6p3a_q24': 'NAE_PS',
    's6p3a_q25': 'NAE_LocPS',
    's6p3a_q26': 'NAE_OpT',
}

# Apply the final labels, then write the result without the row index.
merged_df = merged_df.rename(columns=rename_mapping)
merged_df.to_csv('merged_Section_6_part_3.csv', index=False)

