In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [None]:
'''
The following code block merges the Section 2 Part 9 (cost of livestock care and labour) files across the years.
* First, we read the respective files and store them as data frames.
* Next, we define the column mappings, which were worked out manually beforehand.
* Once the mappings are applied per the set rules, we obtain the merged dataset covering all the years.
'''

In [21]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import numpy as np
import pandas as pd

# Folder that holds the Section 2 Part 9 workbooks. It is written once here so
# that running the notebook from another machine or folder only needs this one
# line to be edited (previously the same directory was repeated in every path).
SECTION_DIR = (
    r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge"
    r"\11. Section 2 Part 9 Cost of livestock care and labour"
)

# Store excel file locations to variables (one workbook per survey round).
# Raw f-strings keep the backslashes literal, so each value is exactly the
# same path string as before.
agri_2012 = rf"{SECTION_DIR}\2012_S2P9.xlsx"
agri_2012_5 = rf"{SECTION_DIR}\2012_1.5_s1p9_q20ato25.xlsx"
agri_2013 = rf"{SECTION_DIR}\2013_s2p9cont_m.xlsx"
agri_2014 = rf"{SECTION_DIR}\2014_s2p11_m.xlsx"

# Read excel files into one DataFrame per survey round
df_2012 = pd.read_excel(agri_2012)
df_2012_5 = pd.read_excel(agri_2012_5)
df_2013 = pd.read_excel(agri_2013)
df_2014 = pd.read_excel(agri_2014)


In [28]:
# Standardize the column names of every survey round before merging, so that
# the same question carries the same column name in each DataFrame.

# Old-name -> new-name table for the 2012 round.
renames_2012 = {
    'labor_cost': 's2p11_q1',
    'building': 's2p11_q3',
    'electricity_gas': 's2p11_q4',
    'tools_machinary': 's2p11_q5',
    'veterinary_medicines': 's2p11_q6',
    'other_cost': 's2p11_q11',
    'PROVINCE_ID': 'P_ID',
    'DISTRICT_ID': 'D_ID',
    'TEHSIL_ID': 'T_ID',
    'UC_ID': 'UC_ID',
    'MAUZA_ID': 'M_ID',
}

# Old-name -> new-name table for the 2012-1.5 round.
renames_2012_5 = {
    'Round' : 'round',
    'S1P9Q20M_RS': 's2p11_q1',
    'S1P9Q20M_KIND': 's2p11_q2',
    'S1P9Q21': 's2p11_q3',
    'S1P9Q22': 's2p11_q4',
    'S1P9Q23': 's2p11_q5',
    'S1P9Q24': 's2p11_q6',
    'S1P9Q25': 's2p11_q11',
    'S1P9Q20G': 'HL_M_no',
    'S1P9Q20H': 'HL_M_Days',
    'S1P9Q20I': 'HL_M_HpD',
    'S1P9Q20J': 'HL_W_no',
    'S1P9Q20K': 'HL_W_Days',
    'S1P9Q20L': 'HL_W_Hpd',
    'C_PROVINCE': 'P_ID',
    'C_DISTRICT': 'D_ID',
    'C_TEHSIL': 'T_ID',
    'C_UC': 'UC_ID',
    'C_MOUZA': 'M_ID',
    'S1P9Q20A': 'FL_M_no',
    'S1P9Q20B': 'FL_M_Days',
    'S1P9Q20C': 'FL_M_HpD',
    'S1P9Q20D': 'FL_W_no',
    'S1P9Q20E': 'FL_W_Days',
    'S1P9Q20F': 'FL_W_Hpd'
}

# Old-name -> new-name table for the 2013 round.
renames_2013 = {
    's2p9_cash': 's2p11_q1',
    's2p9_kind': 's2p11_q2',
    's2p9_q8': 's2p11_q3',
    's2p9_q9': 's2p11_q4',
    's2p9_q10': 's2p11_q5',
    's2p9_q11': 's2p11_q6',
    's2p9_q12': 's2p11_q11',
    's2p9_q7a': 'HL_M_no',
    's2p9_q7b': 'HL_M_Days',
    's2p9_q7c': 'HL_M_HpD',
    's2p9_q7d': 'HL_W_no',
    's2p9_q7e': 'HL_W_Days',
    's2p9_q7f': 'HL_W_Hpd'
}

# Apply each table to its DataFrame. The rename is done in place, so the
# existing df_* objects themselves are modified; names that are not present in
# a DataFrame are simply ignored by pandas.
for frame, renames in (
    (df_2012, renames_2012),
    (df_2012_5, renames_2012_5),
    (df_2013, renames_2013),
):
    frame.rename(columns=renames, inplace=True)

# df_2014 is the reference layout, so its columns are left untouched.

In [34]:
# Column layouts for each survey round, expressed with the standardized names.
# Each layout is a plain list; a None entry marks a slot for which that round
# has no column.

# Fragments that recur in several layouts.
_id_cols = ["hid", "round"]
_q3_to_q6 = ["s2p11_q3", "s2p11_q4", "s2p11_q5", "s2p11_q6"]
_hl_cols = ["HL_M_no", "HL_M_Days", "HL_M_HpD", "HL_W_no", "HL_W_Days", "HL_W_Hpd"]
_fl_cols = ["FL_M_no", "FL_M_Days", "FL_M_HpD", "FL_W_no", "FL_W_Days", "FL_W_Hpd"]
_geo_cols = ["P_ID", "D_ID", "T_ID", "UC_ID", "M_ID"]

# 2012: no s2p11_q2 and no HL_*/FL_* columns; has the geography identifiers.
mapping_2012 = (
    _id_cols
    + ["s2p11_q1", None]
    + _q3_to_q6
    + [None] * 3
    + ["s2p11_q11"]
    + [None] * 7
    + _geo_cols
    + [None] * 7
)

# 2012-1.5: the widest layout (HL_*, geography, C_HH_NUM and FL_* columns).
mapping_2012_5 = (
    _id_cols
    + ["s2p11_q1", "s2p11_q2"]
    + _q3_to_q6
    + [None] * 3
    + ["s2p11_q11"]
    + _hl_cols
    + _geo_cols
    + ["C_HH_NUM"]
    + _fl_cols
)

# 2013: cost questions plus the HL_* columns; nothing after them.
mapping_2013 = (
    _id_cols
    + ["s2p11_q1", "s2p11_q2"]
    + _q3_to_q6
    + [None] * 3
    + ["s2p11_q11"]
    + _hl_cols
    + [None] * 12
)

# 2014 (the reference naming): the only round with s2p11_q8 to s2p11_q10.
mapping_2014 = (
    _id_cols
    + ["s2p11_q1", "s2p11_q2"]
    + _q3_to_q6
    + ["s2p11_q8", "s2p11_q9", "s2p11_q10", "s2p11_q11"]
    + [None] * 18
)


In [35]:
# Every standardized column name that appears in any round's layout, listed
# once each in the order it is first encountered (2012, 2012-1.5, 2013, 2014).
# None placeholders are skipped; dict.fromkeys drops repeats while preserving
# insertion order.
all_columns = list(
    dict.fromkeys(
        name
        for layout in (mapping_2012, mapping_2012_5, mapping_2013, mapping_2014)
        for name in layout
        if name
    )
)



In [36]:

# Function to standardize and merge DataFrames
def standardize_and_merge(dfs, mappings, all_columns, verbose=True):
    """Stack several DataFrames row-wise onto one shared set of columns.

    Each DataFrame contributes ``len(df)`` rows to the result. For every
    output column, a DataFrame supplies its own values when the column is
    named in its mapping and exists in the DataFrame; otherwise that
    DataFrame's rows are filled with NaN for the column. Padding is done per
    DataFrame so that all values from one source row stay on the same output
    row.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        The frames to stack, in output order.
    mappings : sequence of list
        One list per DataFrame with the column names to take from it.
        ``None`` (or any falsy entry) is ignored; the position of an entry
        within the list has no effect.
    all_columns : list of str
        Output columns in the desired order. Names that appear in a mapping
        but not here are appended after these, in first-seen order.
    verbose : bool, default True
        Print a progress line per DataFrame and per column.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with the columns described above. An empty
        DataFrame is returned when there are no columns at all.
    """
    # zip() stops at the shorter input, so only paired frames are considered.
    pairs = list(zip(dfs, mappings))

    # Settle the full column list before touching any data; a column that is
    # first seen on a later frame must still get NaNs for the earlier frames.
    columns = list(dict.fromkeys(all_columns))
    known = set(columns)
    for _, mapping in pairs:
        for col in mapping:
            if col and col not in known:
                known.add(col)
                columns.append(col)

    merged_data = {col: [] for col in columns}

    for df, mapping in pairs:
        if verbose:
            print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        wanted = {col for col in mapping if col}
        n_rows = len(df)
        for col in columns:
            if col in wanted and col in df.columns:
                if verbose:
                    print(f"Appending data for column {col}")
                merged_data[col].extend(df[col].tolist())
            else:
                if verbose and col in wanted:
                    print(f"Column {col} not found in DataFrame. Adding NaNs.")
                # Keep this frame's rows aligned: one NaN per row of the frame.
                merged_data[col].extend([np.nan] * n_rows)

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df

In [37]:
# Pair each survey round's DataFrame with its column layout, then split the
# pairs back into the two parallel lists that standardize_and_merge expects.
rounds = [
    (df_2012, mapping_2012),
    (df_2012_5, mapping_2012_5),
    (df_2013, mapping_2013),
    (df_2014, mapping_2014),
]
dfs = [frame for frame, _ in rounds]
mappings = [layout for _, layout in rounds]

# Stack all rounds into a single DataFrame over the shared column list.
merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['hid', 'round', 's2p11_q3', 's2p11_q4', 's2p11_q1', 's2p11_q5', 's2p11_q6', 's2p11_q11', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID']
Appending data for column hid
Appending data for column round
Appending data for column s2p11_q1
Appending data for column s2p11_q3
Appending data for column s2p11_q4
Appending data for column s2p11_q5
Appending data for column s2p11_q6
Appending data for column s2p11_q11
Appending data for column P_ID
Appending data for column D_ID
Appending data for column T_ID
Appending data for column UC_ID
Appending data for column M_ID
Processing DataFrame with columns: ['Unnamed: 0', 'round', 'hid', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID', 'C_HH_NUM', 'FL_M_no', 'FL_M_Days', 'FL_M_HpD', 'FL_W_no', 'FL_W_Days', 'FL_W_Hpd', 'HL_M_no', 'HL_M_Days', 'HL_M_HpD', 'HL_W_no', 'HL_W_Days', 'HL_W_Hpd', 's2p11_q1', 's2p11_q2', 's2p11_q3', 's2p11_q4', 's2p11_q5', 's2p11_q6', 's2p11_q11']
Appending data for column hid
Appending data for column r

In [38]:
# Output names for the merged file. Columns that are not listed here keep
# the names they already have.
rename_mapping = dict([
    ('hid', 'HID'),
    ('round', 'Survey_Round'),
    ('s2p11_q1', 'L_Cash'),
    ('s2p11_q2', 'L_Kind'),
    ('s2p11_q3', 'Cost_Build'),
    ('s2p11_q4', 'Cost_Elec_Gas'),
    ('s2p11_q5', 'Cost_Tools_Mcha'),
    ('s2p11_q6', 'Cost_vet_meds'),
    ('s2p11_q8', 'Cost_Art_Ins'),
    ('s2p11_q9', 'Cost_Breed'),
    ('s2p11_q10', 'Cost_Trans'),
    ('s2p11_q11', 'Cost_Other'),
])

# Renamed in place, so merged_df itself now carries the output names.
merged_df.rename(columns=rename_mapping, inplace=True)

# Write the merged data to CSV (path is relative to the current working
# directory); index=False leaves the row index out of the file.
output_csv = 'merged_Section_2_part_9.csv'
merged_df.to_csv(output_csv, index=False)

