In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roaster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roaster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roaster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks”
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [24]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import pandas as pd
import numpy as np
# Store excel file locations to variables

credit_2012 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\22. Section 5 Part 2\2012_5_s6p2.xlsx"
credit_2012_5 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\22. Section 5 Part 2\2012_s5p2_m.xlsx"
credit_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\22. Section 5 Part 2\2013_s5p2_m.xlsx"
credit_2014 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\22. Section 5 Part 2\2014_s5p2_m.xlsx"

# Read excel files 
df_2012 = pd.read_excel(credit_2012)
df_2012_5 = pd.read_excel(credit_2012_5)
df_2013 = pd.read_excel(credit_2013)
df_2014 = pd.read_excel(credit_2014)



In [25]:
#This code block will be used to standardize column names across the years to avoid discrepancies during the merging process.

# Rename columns in df_2013 to df_2014 column names
df_2013.rename(columns={
    's5p2_q7a': 's5p2_q8',
    's5p2_q8': 's5p2_q10',
    's5p2_q9': 's5p2_q11',
    's5p2_q10': 's5p2_q12',
    's5p2_q11_m': 's5p2_q13_m',
    's5p2_q11_y': 's5p2_q13_y',
    's5p2_q12': 's5p2_q14',
    's5p2_q13': 's5p2_q15',
    's5p2_q14': 's5p2_q16',
    's5p2_q15': 's5p2_q17',
    's5p2_q16': 's5p2_q18',
    's5p2_q17': 's5p2_q19',
    's5p2_q18': 's5p2_q20',
    's5p2_q19': 's5p2_q21'
}, inplace=True)

# Rename columns in df_2012_5 to df_2014 column names
df_2012_5.rename(columns={
    'Round': 'round',
    'S6P2_LOAN_NUM': 'loan_number',
    'S6P2Q1': 's5p2_q1',
    'S6P2Q2': 's5p2_q2',
    'S6P2Q3': 's5p2_q3',
    'S6P2Q4_MONTH': 's5p2_q4_m',
    'S6P2Q4_YEAR': 's5p2_q4_y',
    'S6P2Q5': 's5p2_q5',
    'S6P2Q6': 's5p2_q6',
    'S6P2Q7': 's5p2_q7',
    'S6P2Q8': 's5p2_q10',
    'S6P2Q9': 's5p2_q11',
    'S6P2Q10': 's5p2_q12',
    'S6P2Q11_MONTH': 's5p2_q13_m',
    'S6P2Q11_YEAR': 's5p2_q13_y',
    'S6P2Q12': 's5p2_q16',
    'S6P2Q13': 's5p2_q17',
    'S6P2Q14': 's5p2_q19',
    'S6P2Q15': 's5p2_q20',
    'S6P2Q16': 's5p2_q21',
    'S6P2Q17': 'Loan_Src_Formal',
    'S6P2Q18': 'Loan_Src_Formal_SC',
    'S6P2Q19A': 'Crop_SL_MI',
    'S6P2Q19B1': 'Crop_SL_MI_A',
    'S6P2Q19B2': 'Crop_SL_MI_AM',
    'S6P2Q19C': 'Crop_SL_MI_Val',
    'S6P2Q19D': 'Crop_SL_SI',
    'S6P2Q19E1': 'Crop_SL_SI_A',
    'S6P2Q19E2': 'Crop_SL_SI_AM',
    'S6P2Q19F': 'Crop_SL_SI_Val',
    'C_PROVINCE': 'P_ID',
    'C_DISTRICT': 'D_ID',
    'C_TEHSIL': 'T_ID',
    'C_MAUZA': 'M_ID'
}, inplace=True)

# Rename columns in df_2012 to df_2014 column names
df_2012.rename(columns={
    'LOAN_ID': 'loan_number',
    'S5P2Q1': 's5p2_q1',
    'S5P2Q2': 's5p2_q2',
    'S5P2Q3': 's5p2_q3',
    'S5P2Q4_MONTH': 's5p2_q4_m',
    'S5P2Q4_YEAR': 's5p2_q4_y',
    'S5P2Q5': 's5p2_q5',
    'S5P2Q6': 's5p2_q6',
    'S5P2Q7': 's5p2_q7',
    'S5P2Q8': 's5p2_q10',
    'S5P2Q9': 's5p2_q11',
    'S5P2Q10': 's5p2_q12',
    'S5P2Q11_MONTH': 's5p2_q13_m',
    'S5P2Q11_YEAR': 's5p2_q13_y',
    'S5P2Q12': 's5p2_q16',
    'S5P2Q13': 's5p2_q17',
    'S5P2Q14': 's5p2_q19',
    'S5P2Q15': 's5p2_q20',
    'S5P2Q16': 's5p2_q21',
    'S5P2Q17': 'Loan_Src_InFormal',
    'S5P2Q18': 'Loan_Src_InFormal_SC',
    'S5P2Q19A': 'Crop_SL_MI',
    'S5P2Q19B': 'Crop_SL_MI_A',
    'S5P2Q19C': 'Crop_SL_MI_Val',
    'S5P2Q19D': 'Crop_SL_SI',
    'S5P2Q19E': 'Crop_SL_SI_A',
    'S5P2Q19F': 'Crop_SL_SI_Val',
    'PROVINCE_ID': 'P_ID',
    'DISTRICT_ID': 'D_ID',
    'TEHSIL_ID': 'T_ID',
    'UC_ID': 'UC_ID',
    'MAUZA_ID': 'M_ID'
}, inplace=True)


# df_2014 doesn't need renaming as it is the reference

In [26]:
# Define the column mappings based on the provided positions
# Make dictionaries for each year with updated names
# replace with 'None' where there are no columns
#Here are the updated mapping lists for the given datasets:


mapping_2012 = [
    'hid', 'round', None, None, 'loan_number', 's5p2_q1', 's5p2_q2', 's5p2_q3', 's5p2_q4_m', 's5p2_q4_y',
    's5p2_q5', 's5p2_q6', 's5p2_q7', None, None, 's5p2_q10', 's5p2_q11', 's5p2_q12', 's5p2_q13_m', 's5p2_q13_y',
    None, None, 's5p2_q16', 's5p2_q17', None, 's5p2_q19', 's5p2_q20', 's5p2_q21', None, None,
    'Loan_Src_InFormal', 'Loan_Src_InFormal_SC', 'Crop_SL_MI', 'Crop_SL_MI_A', None, 'Crop_SL_MI_Val', 'Crop_SL_SI', 'Crop_SL_SI_A', None,
    'Crop_SL_SI_Val', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID', None
]

mapping_2012_5 = [
    'hid', 'round', None, None, 'loan_number', 's5p2_q1', 's5p2_q2', 's5p2_q3', 's5p2_q4_m', 's5p2_q4_y',
    's5p2_q5', 's5p2_q6', 's5p2_q7', None, None, 's5p2_q10', 's5p2_q11', 's5p2_q12', 's5p2_q13_m', 's5p2_q13_y',
    None, None, 's5p2_q16', 's5p2_q17', None, 's5p2_q19', 's5p2_q20', 's5p2_q21', 'Loan_Src_Formal',
    'Loan_Src_Formal_SC',  None, None, 'Crop_SL_MI',	'Crop_SL_MI_A',	'Crop_SL_MI_AM',	'Crop_SL_MI_Val',
    'Crop_SL_SI',	'Crop_SL_SI_A',	'Crop_SL_SI_AM',	'Crop_SL_SI_Val'
    'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID', None
]


mapping_2013= [
    'hid', 'round', 's5p2_qa', 's5p2_qb', 'loan_number', 's5p2_q1', 's5p2_q2', 's5p2_q3', 's5p2_q4_m', 's5p2_q4_y',
    's5p2_q5', 's5p2_q6', 's5p2_q7', 's5p2_q8', None, 's5p2_q10', 's5p2_q11', 's5p2_q12', 's5p2_q13_m', 's5p2_q13_y',
    's5p2_q14', 's5p2_q15', 's5p2_q16', 's5p2_q17', 's5p2_q18', 's5p2_q19', 's5p2_q20', 's5p2_q21',
    None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
]


mapping_2014 = [
    'hid', 'round', 's5p2_qa', 's5p2_qb', 'loan_number', 's5p2_q1', 's5p2_q2', 's5p2_q3', 's5p2_q4_m', 's5p2_q4_y',
    's5p2_q5', 's5p2_q6', 's5p2_q7', 's5p2_q8', 's5p2_q9', 's5p2_q10', 's5p2_q11', 's5p2_q12', 's5p2_q13_m', 's5p2_q13_y',
    's5p2_q14', 's5p2_q15', 's5p2_q16', 's5p2_q17', 's5p2_q18', 's5p2_q19', 's5p2_q20', 's5p2_q21',
    None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
]




In [27]:
# Create a list of all possible columns in the correct order
all_columns = []

for col in mapping_2012:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2012_5:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2013:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2014:
    if col and col not in all_columns:
        all_columns.append(col)



In [28]:
def standardize_and_merge(dfs, mappings, all_columns):
    merged_data = {col: [] for col in all_columns}

    for df, mapping in zip(dfs, mappings):
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        for i, col in enumerate(mapping):
            if col:
                ref_col = col.strip()  # Remove leading/trailing whitespace
                if ref_col not in merged_data:
                    merged_data[ref_col] = []
                if ref_col in df.columns:
                    print(f"Appending data for column {ref_col}")
                    if isinstance(df[ref_col], pd.Series):
                        merged_data[ref_col].extend(df[ref_col].tolist())
                    elif isinstance(df[ref_col], pd.DataFrame):
                        print(f"Column {ref_col} is duplicated in DataFrame. Appending data for each duplicate.")
                        for _, series in df[ref_col].items():
                            merged_data[ref_col].extend(series.tolist())
                else:
                    print(f"Column {ref_col} not found in DataFrame. Adding NaNs.")
                    merged_data[ref_col].extend([np.nan] * len(df))
    
    max_len = max(len(v) for v in merged_data.values())
    for key in merged_data:
        col_len = len(merged_data[key])
        if col_len < max_len:
            merged_data[key].extend([np.nan] * (max_len - col_len))

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df


In [29]:
# Usage with dataframes and mappings
dfs = [df_2012, df_2012_5, df_2013, df_2014]
mappings = [mapping_2012, mapping_2012_5, mapping_2013, mapping_2014]

merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'Round', 'hid', 'C_PROVINCE', 'C_DISTRICT', 'C_TEHSIL', 'C_UC', 'C_MOUZA', 'C_HH_NUM', 'S6P2_LOAN_NUM', 'S6P2Q1', 'S6P2Q2', 'S6P2Q3', 'S6P2Q4_MONTH', 'S6P2Q4_YEAR', 'S6P2Q5', 'S6P2Q6', 'S6P2Q7', 'S6P2Q8', 'S6P2Q9', 'S6P2Q10', 'S6P2Q11_MONTH', 'S6P2Q11_YEAR', 'S6P2Q12', 'S6P2Q13', 'S6P2Q14', 'S6P2Q15', 'S6P2Q16', 'S6P2Q17', 'S6P2Q18', 'S6P2Q19A', 'S6P2Q19B1', 'S6P2Q19B2', 'S6P2Q19C', 'S6P2Q19D', 'S6P2Q19E1', 'S6P2Q19E2', 'S6P2Q19F']
Appending data for column hid
Column round not found in DataFrame. Adding NaNs.
Column loan_number not found in DataFrame. Adding NaNs.
Column s5p2_q1 not found in DataFrame. Adding NaNs.
Column s5p2_q2 not found in DataFrame. Adding NaNs.
Column s5p2_q3 not found in DataFrame. Adding NaNs.
Column s5p2_q4_m not found in DataFrame. Adding NaNs.
Column s5p2_q4_y not found in DataFrame. Adding NaNs.
Column s5p2_q5 not found in DataFrame. Adding NaNs.
Column s5p2_q6 not found in DataFrame. Adding NaNs.
Column s5p

In [30]:
# Rename columns for the merged file (if needed)
rename_mapping = {
    'hid': 'HID',
    'round': 'Survey_Round',
    'loan_number': 'Loan_ID',
    's5p2_q1': 'Loan_A',
    's5p2_q2': 'Loan_src',
    's5p2_q3': 'Loan_Purp',
    's5p2_q4_m': 'Loan_M',
    's5p2_q4_y': 'Loan_Y',
    's5p2_q5': 'Loan_Int',
    's5p2_q6': 'Loan_AnInt',
    's5p2_q7': 'Loan_AF',
    's5p2_q8': 'Loan_AF_Res',
    's5p2_q9': 'Loan_Ins',
    's5p2_q10': 'Loan_Coll',
    's5p2_q11': 'Loan_Coll_Type',
    's5p2_q12': 'Loan_Rpy_InTime',
    's5p2_q13_m': 'Loan_Rpy_M',
    's5p2_q13_y': 'Loan_Rpy_Y',
    's5p2_q14': 'Loan_rpy',
    's5p2_q15': 'Loan_rpy_ResNo',
    's5p2_q16': 'Loan_Rpy_Done',
    's5p2_q17': 'Loan_Rpy_TPSF',
    's5p2_q18': 'Loan_Rpy_T_Kind',
    's5p2_q19': 'Loan_Lforgive',
    's5p2_q20': 'Loan_Lforgive_A',
    's5p2_q21': 'Loan_Rpy_TTBP'
}

merged_df.rename(columns=rename_mapping, inplace=True)

# Save the merged dataframe to a CSV file
merged_df.to_csv('merged_Section_5_part_2.csv', index=False)

