In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [None]:
'''
The following code blocks merge the Section 2 Part 11 files (own and paid livestock work during the last 12 months) across the years.
* First, we read the respective files and store them as data frames
* Next, we define the column mappings, which were worked out manually beforehand
* Once the columns are mapped per the set rules, we obtain a single merged dataset for this section across the years.
'''

In [16]:
#since there are 2 files for this section in multiple years, we will make a standardized single file for sectional merging

import pandas as pd
import numpy as np

# Load the two 2014 workbooks for this section (parts s2p12 and s2p14).
df1 = pd.read_excel(r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\13. Section 2 Part 11 Own and Paid livestock work during last 12m\2014_s2p12_m.xlsx")
df2 = pd.read_excel(r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\13. Section 2 Part 11 Own and Paid livestock work during last 12m\2014_s2p14_m.xlsx")

# Stack the two files row-wise. pd.concat matches on column names only: the rows
# of df1 are followed by the rows of df2, and a column that exists in just one
# file is NaN on the other file's rows.
# NOTE(review): this is NOT a key-based join. If one row per respondent is
# wanted, a merge on the shared identifier columns may be intended — confirm.
merged_df = pd.concat([df1, df2], ignore_index=True)

# Drop redundant columns: first those whose name contains 'Unnamed', then those
# whose name contains a space.
unnamed_cols = merged_df.columns[merged_df.columns.str.contains('Unnamed', case=True)]
merged_df = merged_df.drop(columns=unnamed_cols)
spaced_cols = merged_df.columns[merged_df.columns.str.contains(' ', case=False)]
merged_df = merged_df.drop(columns=spaced_cols)

# Save the combined frame. index=False keeps the meaningless RangeIndex out of
# the file; with index=True it is written as a nameless first column and comes
# back as 'Unnamed: 0' when the CSV is read again.
# NOTE(review): the file is written relative to the working directory, while the
# next cell reads a file of this name from the section folder — confirm that the
# notebook is run from that folder.
merged_df.to_csv('2014_s2p12 & s2p14.csv', index=False)

In [17]:
# This code block stores the input file paths in variables to keep the code neat
# The stored paths are passed to the read_csv function and the results are stored as dataframes

import pandas as pd
import numpy as np
# Locations of the per-year CSV inputs for this section.
agri_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\13. Section 2 Part 11 Own and Paid livestock work during last 12m\2013_s2p10 & s2p11.csv"
agri_2014 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\13. Section 2 Part 11 Own and Paid livestock work during last 12m\2014_s2p12 & s2p14.csv"

# Load each CSV into its own DataFrame (2013 first, then 2014).
df_2013, df_2014 = map(pd.read_csv, (agri_2013, agri_2014))



In [18]:
# Bring the 2013 column names in line with the 2014 names so that both years
# share one vocabulary before they are stacked. The 2014 frame is the reference
# and is left untouched.

# Identifier columns keep their names.
id_cols_2013_to_2014 = {name: name for name in ('hid', 'round', 'r_pid')}

# s2p10 questions 1-5 come in pairs (_i, _ii) and map onto s2p12_q1 .. s2p12_q10.
own_work_2013_to_2014 = {
    f"s2p10_q{activity}_{suffix}": f"s2p12_q{2 * (activity - 1) + step}"
    for activity in range(1, 6)
    for step, suffix in enumerate(("i", "ii"), start=1)
}

# s2p11 questions 1-5 come in triples (a, b, c) and map onto s2p14_q2 .. s2p14_q16.
paid_work_2013_to_2014 = {
    f"s2p11_q{activity}{suffix}": f"s2p14_q{3 * (activity - 1) + step}"
    for activity in range(1, 6)
    for step, suffix in enumerate("abc", start=2)
}

# Question 6 of both 2013 parts has no 2014 code in this mapping, so it goes
# straight to a descriptive name.
sixth_question_2013 = {
    's2p10_q6_i': 'O_OA_HpD',
    's2p10_q6_ii': 'O_OA_D_T',
    's2p11_q6a': 'P_OA_HpD',
    's2p11_q6b': 'P_OA_D_T',
    's2p11_q6c': 'P_OA_WpD',
}

df_2013.rename(
    columns={
        **id_cols_2013_to_2014,
        **own_work_2013_to_2014,
        **paid_work_2013_to_2014,
        **sixth_question_2013,
    },
    inplace=True,
)

In [19]:
# Per-year column mappings, one list per year and both the same length.
# Each entry is the standardized column name to take from that year's frame;
# None marks a slot for which that year has no column.

key_columns = ["hid", "round", "r_pid"]
own_work_columns = [f"s2p12_q{number}" for number in range(1, 11)]
other_animal_columns = ["O_OA_HpD", "O_OA_D_T", "P_OA_HpD", "P_OA_D_T", "P_OA_WpD"]

# 2013 has no s2p14_q1, s2p14_q17 or s2p14_q18, but does carry the O_OA_* / P_OA_* columns.
mapping_2013 = (
    key_columns
    + own_work_columns
    + [None]
    + [f"s2p14_q{number}" for number in range(2, 17)]
    + [None, None]
    + other_animal_columns
)

# 2014 has the full s2p14_q1 .. s2p14_q18 range and none of the O_OA_* / P_OA_* columns.
mapping_2014 = (
    key_columns
    + own_work_columns
    + [f"s2p14_q{number}" for number in range(1, 19)]
    + [None] * len(other_animal_columns)
)



In [20]:
# Union of the column names used by either year, keeping first-seen order
# (2013's names first, then any name that only 2014 has). Empty slots (None)
# are skipped; dict.fromkeys removes repeats while preserving order.
all_columns = list(
    dict.fromkeys(name for name in mapping_2013 + mapping_2014 if name)
)



In [21]:

# Function to standardize and merge DataFrames
def standardize_and_merge(dfs, mappings, all_columns):
    """Stack several DataFrames row-wise onto one shared set of columns.

    Frames are processed in order and each contributes exactly ``len(df)``
    rows to every output column, so values from one frame always stay on
    that frame's rows.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames to stack, already renamed to the standardized column names.
    mappings : iterable of list
        One list per frame (paired with ``dfs`` by position) naming the
        columns to take from that frame. Falsy entries such as ``None`` are
        skipped; a name listed more than once is taken only once.
    all_columns : list
        Output columns, in order. A mapped name missing from this list is
        appended after these columns in first-seen order.

    Returns
    -------
    pandas.DataFrame
        One row per input row. A cell is NaN when the column is not in that
        frame's mapping or is not present in that frame.
    """
    frame_mapping_pairs = list(zip(dfs, mappings))

    # Fix the full set of output columns before any data is appended, so that
    # every column can be kept in step frame by frame.
    target_columns = list(dict.fromkeys(
        list(all_columns)
        + [col for _, mapping in frame_mapping_pairs for col in mapping if col]
    ))
    merged_data = {col: [] for col in target_columns}

    for df, mapping in frame_mapping_pairs:
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        n_rows = len(df)
        filled = set()

        for col in mapping:
            if not col or col in filled:
                continue
            filled.add(col)
            if col in df.columns:
                print(f"Appending data for column {col}")
                merged_data[col].extend(df[col].tolist())
            else:
                print(f"Column {col} not found in DataFrame. Adding NaNs.")
                merged_data[col].extend([np.nan] * n_rows)

        # Pad the columns this frame did not supply. Doing this per frame
        # (rather than once at the end) stops a later frame's values from
        # sliding up onto an earlier frame's rows.
        for col in target_columns:
            if col not in filled:
                merged_data[col].extend([np.nan] * n_rows)

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df

In [22]:
# Pair each year's mapping with its DataFrame by position (2013 first, then
# 2014) and stack both years onto the shared column list.
mappings = [mapping_2013, mapping_2014]
dfs = [df_2013, df_2014]

merged_df = standardize_and_merge(dfs=dfs, mappings=mappings, all_columns=all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 'r_pid', 's2p12_q1', 's2p12_q2', 's2p12_q3', 's2p12_q4', 's2p12_q5', 's2p12_q6', 's2p12_q7', 's2p12_q8', 's2p12_q9', 's2p12_q10', 'O_OA_HpD', 'O_OA_D_T', 's2p14_q2', 's2p14_q3', 's2p14_q4', 's2p14_q5', 's2p14_q6', 's2p14_q7', 's2p14_q8', 's2p14_q9', 's2p14_q10', 's2p14_q11', 's2p14_q12', 's2p14_q13', 's2p14_q14', 's2p14_q15', 's2p14_q16', 'P_OA_HpD', 'P_OA_D_T', 'P_OA_WpD']
Appending data for column hid
Appending data for column round
Appending data for column r_pid
Appending data for column s2p12_q1
Appending data for column s2p12_q2
Appending data for column s2p12_q3
Appending data for column s2p12_q4
Appending data for column s2p12_q5
Appending data for column s2p12_q6
Appending data for column s2p12_q7
Appending data for column s2p12_q8
Appending data for column s2p12_q9
Appending data for column s2p12_q10
Appending data for column s2p14_q2
Appending data for column s2p14_q3
Appending data for column s2p14_q4
Appendi

In [23]:
# Final column names for the merged file.
# The s2p12 codes become O_<activity>_<measure> and the s2p14 codes become
# P_<activity>_<measure>, for the same five activity tags in the same order.
# NOTE(review): presumably O_/P_ stand for own/paid work and HpD / D_T / WpD are
# per-activity measures — confirm against the questionnaire.
activity_tags = ["LvsC", "Milk", "DC", "Graz", "MedC"]

# s2p12_q1 .. s2p12_q10: two consecutive questions per activity.
own_work_names = {
    f"s2p12_q{2 * slot + step + 1}": f"O_{tag}_{measure}"
    for slot, tag in enumerate(activity_tags)
    for step, measure in enumerate(("HpD", "D_T"))
}

# s2p14_q2 .. s2p14_q16: three consecutive questions per activity.
paid_work_names = {
    f"s2p14_q{3 * slot + step + 2}": f"P_{tag}_{measure}"
    for slot, tag in enumerate(activity_tags)
    for step, measure in enumerate(("HpD", "D_T", "WpD"))
}

rename_mapping = {
    'hid': 'HID',
    'round': 'Survey_Round',
    'r_pid': 'PID',
    **own_work_names,
    's2p14_q1': 'Emp_Status',
    **paid_work_names,
    's2p14_q17': 'P_Inc_D_T',
    's2p14_q18': 'P_Inc_W_T',
}

# Apply the final names in place; columns without an entry in rename_mapping
# keep their current name.
merged_df.rename(rename_mapping, axis='columns', inplace=True)

# Write the merged section to CSV without the row index.
merged_df.to_csv(path_or_buf='merged_Section_2_part_11.csv', index=False)

