In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [2]:
 import pandas as pd
# Convert the 2013 Stata extract to CSV so it can be merged with the 2014 file.
# index=False keeps pandas' positional row index out of the file; without it the
# index is written as an unnamed first column, which shows up as 'Unnamed: 0'
# when the CSV is read back with read_csv.
# NOTE(review): the CSV is written to the current working directory, while the
# merge cell reads a file of the same name from the MaleMerge folder —
# presumably it is copied there by hand; confirm.
data = pd.read_stata(r"C:\Users\warra\Desktop\Freelance\data\data\2013_data\Male\100_s11p1_m.dta")
data.to_csv('2013_s11p1_m.csv', index=False)

In [4]:
# This code block stores the CSV file paths in variables to keep the code neat.
# Each stored path is then passed to pd.read_csv and loaded as a dataframe.

import pandas as pd
import numpy as np
# Per-year CSV exports of Section 8 Part 1 (Participation in social safety nets).
# Each path is kept in its own variable and loaded straight into a dataframe.

# 2013 file
PiSSN_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\32. Section 8 Part 1 Participation in social safety nets\2013_s11p1_m.csv"
df_2013 = pd.read_csv(PiSSN_2013)

# 2014 file
PiSSN_2014 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\32. Section 8 Part 1 Participation in social safety nets\2014_s8p1_m.csv"
df_2014 = pd.read_csv(PiSSN_2014)



In [6]:
# Column names are standardised across the survey years so that the same
# question ends up in the same column when the files are merged.
#
# Nothing is renamed in this cell: df_2014 is the reference and needs no
# renaming, and the per-year column lists in the next cell name the columns
# each year contributes.
# (The stray, undefined `ht` expression that was the only statement here has
# been removed; evaluating it raises NameError.)

In [7]:
# Per-year column layouts, written as two parallel 24-slot lists.
# A slot holds the column name that year's file provides, or None when that
# year has no column for the slot. standardize_and_merge only reads the names;
# None slots are skipped.

mapping_2013 = (
    ['hid', 'round', None, 's8p1_q1', 'r_pid']
    + [None] * 9  # slots that only the 2014 list fills (s8p1_q3 .. s8p1_q11)
    + ['s8p1_q12']
    + ['Prgm_Name', 'Prgm_Aware', 'Prgm_AiT', 'HH_Assist_1y', 'HHM_DecCS',
       'Cash', 'Wheat_kg', 'OtherFood', 'OtherIK']
)

mapping_2014 = (
    ['hid', 'round', 's8p1_qa', 's8p1_q1', 'r_pid']
    + ['s8p1_q3', 's8p1_q4', 's8p1_q5', 's8p1_q6', 's8p1_q7',
       's8p1_q8', 's8p1_q9', 's8p1_q10', 's8p1_q11', 's8p1_q12']
    + [None] * 9  # slots that only the 2013 list fills (Prgm_Name .. OtherIK)
)





In [10]:
# Ordered union of the column names listed for either year: empty slots (None)
# are skipped and every name is kept at the position of its first appearance,
# 2013 names first, then any names that only 2014 adds.
all_columns = list(
    dict.fromkeys(name for name in mapping_2013 + mapping_2014 if name)
)



In [11]:
def standardize_and_merge(dfs, mappings, all_columns):
    """Stack several dataframes row-wise into one frame with a shared column set.

    Each dataframe is paired with its mapping (by position in ``dfs`` /
    ``mappings``). For every non-empty name in the mapping, the values of that
    column are appended to the output column of the same name; a name listed
    in the mapping but absent from the dataframe contributes NaNs instead.

    After each dataframe has been processed, every output column is padded
    with NaN up to the number of rows seen so far. This keeps rows aligned:
    a column that one dataframe does not supply is NaN for exactly that
    dataframe's rows, rather than being filled with the next dataframe's
    values.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to stack, in output row order.
    mappings : sequence of sequences
        One entry per frame; each lists the column names to take from that
        frame. Falsy entries (e.g. ``None``) are skipped. Names are stripped
        of surrounding whitespace before use, and a name repeated within one
        mapping is used only once.
    all_columns : iterable of str
        Output columns, in order. Names found in a mapping but not listed
        here are appended after these, NaN-filled for earlier frames.

    Returns
    -------
    pandas.DataFrame
        One row per input row (frames in the given order) and one column per
        collected name, with NaN where a frame did not supply the column.

    Notes
    -----
    If a frame carries the same column label more than once, only the first
    occurrence is used, so that the frame still contributes exactly
    ``len(df)`` values per column.
    """
    merged_data = {col: [] for col in all_columns}
    total_rows = 0  # rows contributed by the frames processed so far

    for df, mapping in zip(dfs, mappings):
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        rows_before = total_rows
        total_rows += len(df)
        handled = set()  # names already taken from this frame

        for col in mapping:
            if not col:
                continue
            ref_col = col.strip()  # Remove leading/trailing whitespace
            if ref_col in handled:
                # A repeated name would append this frame's values twice.
                continue
            handled.add(ref_col)

            if ref_col not in merged_data:
                # Column first seen here: earlier frames have no value for it.
                merged_data[ref_col] = [np.nan] * rows_before

            if ref_col in df.columns:
                print(f"Appending data for column {ref_col}")
                values = df[ref_col]
                if isinstance(values, pd.DataFrame):
                    # Duplicate labels select a DataFrame; appending every
                    # duplicate would add more values than the frame has rows.
                    print(f"Column {ref_col} is duplicated in DataFrame. Keeping the first occurrence.")
                    values = values.iloc[:, 0]
                merged_data[ref_col].extend(values.tolist())
            else:
                print(f"Column {ref_col} not found in DataFrame. Adding NaNs.")
                merged_data[ref_col].extend([np.nan] * len(df))

        # Columns this frame did not supply are NaN for this frame's rows.
        for column_values in merged_data.values():
            column_values.extend([np.nan] * (total_rows - len(column_values)))

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df


In [12]:
# Merge the two survey years. The dataframes and their mappings are listed in
# the same order so that each frame is paired with its own mapping.
dfs = [df_2013, df_2014]
mappings = [mapping_2013, mapping_2014]

merged_df = standardize_and_merge(
    dfs=dfs,
    mappings=mappings,
    all_columns=all_columns,
)

Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 's8p1_q1', 'Prgm_Name', 'Prgm_Aware', 'Prgm_AiT', 'HH_Assist_1y', 'r_pid', 'HHM_DecCS', 'Cash', 'Wheat_kg', 'OtherFood', 'OtherIK', 's8p1_q12']
Appending data for column hid
Appending data for column round
Appending data for column s8p1_q1
Appending data for column r_pid
Appending data for column s8p1_q12
Appending data for column Prgm_Name
Appending data for column Prgm_Aware
Appending data for column Prgm_AiT
Appending data for column HH_Assist_1y
Appending data for column HHM_DecCS
Appending data for column Cash
Appending data for column Wheat_kg
Appending data for column OtherFood
Appending data for column OtherIK
Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 's8p1_qa', 's8p1_q1', 'r_pid', 's8p1_q3', 's8p1_q4', 's8p1_q5', 's8p1_q6', 's8p1_q7', 's8p1_q8', 's8p1_q9', 's8p1_q10', 's8p1_q11', 's8p1_q12']
Appending data for column hid
Appending data for column round
Appending data for column s8p1_qa
Ap

In [13]:
# Final headers for the merged file: each raw column name on the left is
# replaced by the label on the right. Columns not listed keep their names.
rename_mapping = {
    "hid":      "HID",
    "round":    "Survey_Round",
    "s8p1_qa":  "HH_Benefit",
    "s8p1_q1":  "Prgm_Code",
    "r_pid":    "PID",
    "s8p1_q3":  "HH_Benefit_m",
    "s8p1_q4":  "HH_Benefit_y",
    "s8p1_q5":  "HH_Benefit_freq",
    "s8p1_q6":  "Assist_Type",
    # NOTE(review): double underscore in the label below — confirm it is intended.
    "s8p1_q7":  "Amt__BenefitR_ET",
    "s8p1_q8":  "Amt_CashR_1y",
    "s8p1_q9":  "Cash_Maj_Spent",
    "s8p1_q10": "ItemRIK_1y_Val",
    "s8p1_q11": "Musage_WOFIKP",
    "s8p1_q12": "Prgm_Perception",
}

# Apply the new headers to merged_df itself (in place).
merged_df.rename(columns=rename_mapping, inplace=True)

# Write the result to CSV without the row index.
merged_df.to_csv("merged_Section_8_part_1.csv", index=False)

