In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roaster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roaster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roaster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [None]:
'''
The following code block will merge all roaster files across the years.
* First, we are going to read the respective files and store them as data frames
* Next, we are going to define column mappings that I have already figured out via manual methods
* Once the mappings are applied per the set rules, we will see the new roaster dataset merged across the years.
'''

In [4]:
# The 2013 round stores this section in two files (Rabi and Kharif). Combine them
# into a single standardized file so it can be used for the sectional merge.

import pandas as pd

# Load the Rabi file
df1 = pd.read_excel(r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\Section 2 Part 5 TIME ALLOCATED TO OWN FARM ACTIVITIES DURING RABI AND KHARIF\2013_RABI_s2p6a_m.xlsx")

# Load the Kharif file
df2 = pd.read_excel(r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\Section 2 Part 5 TIME ALLOCATED TO OWN FARM ACTIVITIES DURING RABI AND KHARIF\2013_KHARIF_s2p6b_m.xlsx")

# Stack the two files row-wise (this is a concatenation, not a key-based join):
# rows of the first file come first, then rows of the second, and a column that
# exists in only one file is NaN on the other file's rows.
# NOTE(review): if both files describe the same respondents, a key-based
# pd.merge may be what is wanted here - confirm against the questionnaire.
merged_df = pd.concat([df1, df2], ignore_index=True)

# Drop redundant columns: first those whose name contains 'Unnamed', then those
# whose name contains a space.
merged_df = merged_df.drop(
    columns=merged_df.columns[merged_df.columns.str.contains('Unnamed', case=True)]
)
merged_df = merged_df.drop(
    columns=merged_df.columns[merged_df.columns.str.contains(' ', case=False)]
)

# Save the combined dataframe to a CSV file.
# index=False: writing the row index would add a nameless first column that
# comes back as 'Unnamed: 0' when the file is read again with pd.read_csv.
# NOTE(review): the file is written to the current working directory, while the
# next cell reads this file name from an absolute folder - confirm they match.
merged_df.to_csv('2013_s2p6a_s2pb6.csv', index=False)

In [5]:
# Keep the input file locations in named variables so the read calls stay short.
# Both inputs are CSV files; each one is loaded into its own dataframe.

import pandas as pd

# CSV file locations, one per survey year
agri_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\Section 2 Part 5 TIME ALLOCATED TO OWN FARM ACTIVITIES DURING RABI AND KHARIF\2013_s2p6a_s2pb6.csv"
agri_2014 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\Section 2 Part 5 TIME ALLOCATED TO OWN FARM ACTIVITIES DURING RABI AND KHARIF\2014_s2p5_S2P6.csv"

# Read the CSV files (2013 first, then 2014)
df_2013, df_2014 = [pd.read_csv(csv_path) for csv_path in (agri_2013, agri_2014)]


In [6]:
# Standardize the 2013 column names to the names used by the 2014 file, so the
# two years line up during the merging process.
#
# Naming rule: in 2013 every question k (1-10) of a part has two columns,
# '<part>_q<k>_i' and '<part>_q<k>_ii'. They become questions 2k-1 and 2k of
# the matching target part: s2p6a -> s2p5 and s2p6b -> s2p6.
columns_2013_to_2014 = {
    f'{old_part}_q{question}_{suffix}': f'{new_part}_q{2 * question - shift}'
    for old_part, new_part in (('s2p6a', 's2p5'), ('s2p6b', 's2p6'))
    for question in range(1, 11)
    for suffix, shift in (('i', 1), ('ii', 0))
}

# Rename in place; names that are not present in df_2013 are ignored by rename.
df_2013.rename(columns=columns_2013_to_2014, inplace=True)


# df_2014 doesn't need renaming as it is the reference

In [7]:
# Positional column mappings, one list per survey year.
# The same position in every list refers to the same output column; a position
# holds None when that year has no column for it.

# Question columns q1..q20 of each part, shared by both years
_s2p5_questions = [f's2p5_q{number}' for number in range(1, 21)]
_s2p6_questions = [f's2p6_q{number}' for number in range(1, 21)]

# 2013 has no counterpart for the two '_qa' columns, hence the None entries
mapping_2013 = [
    'hid', 'round', None, 'r_pid',
    *_s2p5_questions,
    None,
    *_s2p6_questions,
]

mapping_2014 = [
    'hid', 'round', 's2p5_qa', 'r_pid',
    *_s2p5_questions,
    's2p6_qa',
    *_s2p6_questions,
]




In [8]:
# Ordered union of the column names in both mappings: empty placeholders (None)
# are skipped and every name is kept once, at the position of its first
# occurrence (2013 entries first, then any name that only 2014 has).
all_columns = list(dict.fromkeys(
    name for name in mapping_2013 + mapping_2014 if name
))



In [9]:
import numpy as np

def standardize_and_merge(dfs, mappings, ref_mapping, df_names):
    """
    Stack dataframes row-wise under the column names of a reference mapping.

    Each dataframe comes with a positional mapping: the position of a column
    name in that mapping selects the output name found at the same position in
    ``ref_mapping``. Rows keep the order of ``dfs`` (all rows of the first
    dataframe, then the second, ...). Where a dataframe has no value for an
    output column, its rows hold NaN in that column.

    Parameters:
    dfs (list of pd.DataFrame): List of dataframes to be merged.
    mappings (list of list): List of mappings corresponding to each dataframe.
        Entries are column names of that dataframe, or None for positions the
        dataframe has no column for.
    ref_mapping (list): Reference mapping to standardize the column names.
        Falsy entries (e.g. None) are placeholders without an output column.
    df_names (list): List of dataframe names, used as a prefix
        (``"<name>_<column>"``) for columns without a reference name.

    Returns:
    pd.DataFrame: The merged dataframe with standardized column names.
        Reference columns come first in reference order, followed by prefixed
        extra columns in the order they were met. Columns whose name contains
        'Unnamed' are removed.

    Raises:
    IndexError: If a column sits at a position of its mapping that lies
        beyond the end of ``ref_mapping``.
    """

    # One value list per output column; reference columns are created up front
    # so they always exist and keep the reference order.
    merged_data = {col: [] for col in ref_mapping if col}

    # Rule kept from the original implementation: a column that is missing
    # from its own mapping is carried over (with the dataframe-name prefix)
    # only when the reference mapping contains at least one falsy placeholder.
    # NOTE(review): confirm this is intended; with a reference mapping that has
    # no placeholders, unmapped columns are dropped.
    keep_unmapped = any(not ref_col for ref_col in ref_mapping)

    # Number of rows contributed by the dataframes processed so far. Every
    # value list is padded up to this offset before new values are appended, so
    # values always land on the rows of the dataframe they came from.
    row_offset = 0

    for df, mapping, df_name in zip(dfs, mappings, df_names):
        for col in df.columns:
            if col in mapping:
                ref_col = ref_mapping[mapping.index(col)]
                # No reference name at this position: keep the column under a
                # dataframe-specific name instead.
                target = ref_col if ref_col else f"{df_name}_{col}"
            elif keep_unmapped:
                target = f"{df_name}_{col}"
            else:
                continue

            values = merged_data.setdefault(target, [])
            if len(values) > row_offset:
                # This output column was already filled from the current
                # dataframe; keep the first source and skip the duplicate.
                continue

            # Earlier dataframes had no data for this column: fill their rows
            # with NaN so the new values start at this dataframe's first row.
            values.extend([np.nan] * (row_offset - len(values)))
            values.extend(df[col].tolist())

        row_offset += len(df)

    # Columns absent from the last dataframe(s) are padded to the full length.
    for values in merged_data.values():
        values.extend([np.nan] * (row_offset - len(values)))

    # Convert the merged_data dictionary to a DataFrame
    merged_df = pd.DataFrame.from_dict(merged_data)

    # Remove columns containing 'Unnamed'. A plain membership test is used so
    # that an empty frame or non-string column labels do not raise.
    kept_columns = [col for col in merged_df.columns if 'Unnamed' not in str(col)]
    merged_df = merged_df.loc[:, kept_columns]

    return merged_df


In [10]:
# Inputs for the merge, one entry per survey year and in the same order in
# every list. The 2014 mapping doubles as the reference mapping.
dfs = [df_2013, df_2014]
mappings = [mapping_2013, mapping_2014]
ref_mapping = mapping_2014
df_name = ['2013', '2014']

merged_df = standardize_and_merge(
    dfs=dfs,
    mappings=mappings,
    ref_mapping=ref_mapping,
    df_names=df_name,
)


In [12]:
# Give the merged columns their final names and write the result to disk.
#
# The question columns follow one pattern per part: part s2p5 gets the prefix
# 'R', part s2p6 the prefix 'K'. Within a part, each activity owns two
# consecutive questions: the odd-numbered one takes the activity's own suffix
# (the existing 'HpD' / 'Hpd' spelling is kept as is), the even-numbered one
# takes 'TD'.
activity_suffixes = [
    ('LandPrep', 'HpD'),
    ('Sowing', 'HpD'),
    ('Irr', 'HpD'),
    ('FertiApp', 'Hpd'),
    ('PestiApp', 'Hpd'),
    ('Weeding', 'Hpd'),
    ('HPS', 'Hpd'),
    ('Thresh', 'Hpd'),
    ('TnS', 'Hpd'),
    ('Prune', 'Hpd'),
]

# Identifier columns first, then the generated question columns
rename_mapping = {
    'hid': 'HID',
    'round': 'Survey_Round',
    'r_pid': 'PID',
}
for part, prefix in (('s2p5', 'R'), ('s2p6', 'K')):
    for position, (activity, odd_suffix) in enumerate(activity_suffixes, start=1):
        rename_mapping[f'{part}_q{2 * position - 1}'] = f'{prefix}_{activity}_{odd_suffix}'
        rename_mapping[f'{part}_q{2 * position}'] = f'{prefix}_{activity}_TD'

# Columns without an entry in rename_mapping keep their current name
merged_df.rename(columns=rename_mapping, inplace=True)

# Save the merged dataframe to a CSV file (without the row index)
merged_df.to_csv('merged_Section_2_part_5.csv', index=False)

