In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roaster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roaster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roaster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks‚Äù
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [1]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import pandas as pd
import numpy as np
# Store excel file locations to variables

path_2012 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\15. Section 3 Part 2 Household Asset\2012_s3p2_m.xlsx"
path_2012_5 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\15. Section 3 Part 2 Household Asset\2012_5_s3p2.xlsx"
path_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\15. Section 3 Part 2 Household Asset\2013_s3p2_m.xlsx"
path_2014 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\15. Section 3 Part 2 Household Asset\2014_s3p2_m.xlsx"
# Read excel files 
df_2012 = pd.read_excel(path_2012)
df_2012_5 = pd.read_excel(path_2012_5)
df_2013 = pd.read_excel(path_2013)
df_2014 = pd.read_excel(path_2014)



In [2]:
#This code block will be used to standardize column names across the years to avoid discrepancies during the merging process.

# Rename columns in df_2013 to df_2014 column names

df_2012.rename(columns={
    'S3P2Q3': 's3p2_q2',
    'S3P2Q4': 's3p2_q3',
    'S3P2Q5': 's3p2_q4',
    'PROVINCE_ID': 'P_ID',
    'DISTRICT_ID': 'D_ID',
    'TEHSIL_ID': 'T_ID',
    'UC_ID': 'UC_ID	',
    'MAUZA_ID': 'M_ID'
}, inplace=True)

df_2012_5.rename(columns={
    'Round': 'round',
    'S3P2Q2': 'assets_id',
    'S3P2_NAME': 'asset_name',
    'S3P2Q3': 's3p2_q2',
    'S3P2Q4': 's3p2_q3',
    'S3P2Q5': 's3p2_q4',
    'C_PROVINCE': 'P_ID',
    'C_DISTRICT': 'D_ID',
    'C_TEHSIL': 'T_ID',
    'C_UC': 'UC_ID',
    'C_MOUZA': 'M_ID',
}, inplace=True)


df_2013.rename(columns={

    's3p2_q1': 'asset_name',
    's3p2_q2': 's3p2_q2',
    's3p2_q4': 's3p2_q3',
    's3p2_q5': 's3p2_q4',
    's3p2_q3': 'Ass_ALL_Part',
    's3p2_q6': '2013_s3p2_q6',
    's3p2_q7': '2013_s3p2_q7',
    's3p2_q8': '2013_s3p2_q8',
    's3p2_q9': '2013_s3p2_q9',
}, inplace=True)


# df_2014 doesn't need renaming as it is the reference

In [3]:
# Define the column mappings based on the provided positions
# Make dictionaries for each year with updated names
# replace with 'None' where there are no columns
#Here are the updated mapping lists for the given datasets:


mapping_2012 = [
    "hid", "round", "s3p1_sr", "None", "None", "s3p1_q2", "s3p1_q3", "s3p1_q4",
    "s3p1_q5", "None", "None", "None", "None", "None", "P_ID", "D_ID", "T_ID",
    "UC_ID", "M_ID", "None", "None", "None", "None"
]

mapping_2012_5 = [
    "hid", "round", "s3p1_sr", "s3p1_name", "None", "None", "s3p1_q3", "s3p1_q4",
    "s3p1_q5", "None", "None", "None", "None", "None", "P_ID", "D_ID", "T_ID", "UC_ID", 
    "M_ID", "C_HH_NUM", "Ass_Y_OldestU", "Ass_EP_Cap", "Ass_EP_U"
]

mapping_2013 = [
    "hid", "round", "s3p1_sr", "s3p1_name", "None", "s3p1_q2", "s3p1_q3", "s3p1_q4",
    "s3p1_q5", "s3p1_q6", "s3p1_q7", "s3p1_q8", "s3p1_q9", "Ass_ALL_Part", "None",
    "None", "None", "None", "None", "None", "None", "None", "None"
]

mapping_2014 = [
    "hid", "round", "s3p1_sr", "s3p1_name", "s3p1_q1", "s3p1_q2", "s3p1_q3", "s3p1_q4",
    "s3p1_q5", "s3p1_q6", "s3p1_q7", "s3p1_q8", "s3p1_q9", "None", "None", "None",
    "None", "None", "None", "None", "None", "None", "None"
]



In [4]:
# Create a list of all possible columns in the correct order
all_columns = []
for col in mapping_2012:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2012_5:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2013:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2014:
    if col and col not in all_columns:
        all_columns.append(col)



In [5]:

# Function to standardize and merge DataFrames
def standardize_and_merge(dfs, mappings, all_columns):
    merged_data = {col: [] for col in all_columns}

    for df, mapping in zip(dfs, mappings):
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        for i, col in enumerate(mapping):
            if col:
                ref_col = col
                if ref_col not in merged_data:
                    merged_data[ref_col] = []
                if col in df.columns:
                    print(f"Appending data for column {col}")
                    merged_data[ref_col].extend(df[col].tolist())
                else:
                    print(f"Column {col} not found in DataFrame. Adding NaNs.")
                    merged_data[ref_col].extend([np.nan] * len(df))
    
    max_len = max(len(v) for v in merged_data.values())
    for key in merged_data:
        col_len = len(merged_data[key])
        if col_len < max_len:
            merged_data[key].extend([np.nan] * (max_len - col_len))

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df

In [6]:
# Usage with dataframes and mappings
dfs = [df_2012, df_2012_5, df_2013, df_2014]
mappings = [ mapping_2012, mapping_2012_5, mapping_2013, mapping_2014]

merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 'asset_id', 's3p2_q2', 's3p2_q3', 's3p2_q4', 'P_ID', 'D_ID', 'T_ID', 'UC_ID\t', 'M_ID']
Appending data for column hid
Appending data for column round
Column s3p1_sr not found in DataFrame. Adding NaNs.
Column None not found in DataFrame. Adding NaNs.
Column None not found in DataFrame. Adding NaNs.
Column s3p1_q2 not found in DataFrame. Adding NaNs.
Column s3p1_q3 not found in DataFrame. Adding NaNs.
Column s3p1_q4 not found in DataFrame. Adding NaNs.
Column s3p1_q5 not found in DataFrame. Adding NaNs.
Column None not found in DataFrame. Adding NaNs.
Column None not found in DataFrame. Adding NaNs.
Column None not found in DataFrame. Adding NaNs.
Column None not found in DataFrame. Adding NaNs.
Column None not found in DataFrame. Adding NaNs.
Appending data for column P_ID
Appending data for column D_ID
Appending data for column T_ID
Column UC_ID not found in DataFrame. Adding NaNs.
Appending data for column M_ID
Column 

In [7]:
# Rename columns for the merged file (if needed)
rename_mapping = {
    'hid': 'HID',
    'round': 'Survey_Round',
    'assets_id': 'Ass_C',
    'asset_name': 'Ass_N',
    's3p2_q1': 'Ass_PR',
    's3p2_q2': 'Ass_CR',
    's3p2_q3': 'Ass_HH_no',
    's3p2_q4': 'Ass_Val_T',
    's3p2_q5': 'Ass_Rep',
    's3p2_q6': 'Ass_S',
    's3p2_q7': 'Ass_S_Val',
    's3p2_q8': 'Ass_P',
    's3p2_q9': 'Ass_P_Val'
}

merged_df.rename(columns=rename_mapping, inplace=True)

# Save the merged dataframe to a CSV file
merged_df.to_csv('merged_Section_3_part_2.csv', index=False)

