In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [None]:
'''
The following code blocks merge the Section 2 Part 7 (labour and machinery used in Rabi and Kharif) files across the years.
* First, we read the respective files and store them as data frames
* Next, we define the column mappings, which were worked out manually beforehand
* Once the mappings are applied per the set rules, we get a single dataset for this section that spans all the years.
'''

In [1]:
#since there are 2 files for this section in multiple years, we will make a standardized single file for sectional merging

import pandas as pd

# Load the first file
df1 = pd.read_excel(r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\9. Section 2 Part 7 LABOUR AND MACHINERY USED IN RABI AND KHARIF\2013_s2p5_m.xlsx")

# Load the second file
df2 = pd.read_excel(r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\9. Section 2 Part 7 LABOUR AND MACHINERY USED IN RABI AND KHARIF\2013_s2p5_q11_q12_m.xlsx")

# Merge the two files based on a common column
merged_df = pd.concat([df1, df2], ignore_index=True)

# Drop redundant columns
merged_df.drop(merged_df.columns[merged_df.columns.str.contains('Unnamed', case=True)], axis=1, inplace=True)
merged_df.drop(merged_df.columns[merged_df.columns.str.contains(' ', case=False)], axis=1, inplace=True)
# Save the merged dataframe to a CSV file
merged_df.to_csv('2013_S2P6.csv', index=True)

In [2]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import pandas as pd

# Store excel file locations to variables

agri_2012 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\9. Section 2 Part 7 LABOUR AND MACHINERY USED IN RABI AND KHARIF\2012_s2p5_m.xlsx"
agri_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\9. Section 2 Part 7 LABOUR AND MACHINERY USED IN RABI AND KHARIF\2013_S2P6.csv"

# Read excel files 
df_2012 = pd.read_excel(agri_2012)
df_2013 = pd.read_csv(agri_2013)


In [5]:
#This code block will be used to standardize column names across the years to avoid discrepancies during the merging process.

# Rename columns in df
df_2012.rename(columns={
    'crop_code': 'crop_code',
    's2p5q1b1': 's2p5_q1a_i',
    's2p5q1b2': 's2p5_q1a_ii',
    's2p5q1b3': 's2p5_q1a_iii',
    's2p5q1c1': 's2p5_q1b_i',
    's2p5q1c2': 's2p5_q1b_ii',
    's2p5q1c3': 's2p5_q1b_iii',
    's2p5q2b1': 's2p5_q2a_i',
    's2p5q2b2': 's2p5_q2a_ii',
    's2p5q2b3': 's2p5_q2a_iii',
    's2p5q2c1': 's2p5_q2b_i',
    's2p5q2c2': 's2p5_q2b_ii',
    's2p5q2c3': 's2p5_q2b_iii',
    's2p5q3b1': 's2p5_q3a_i',
    's2p5q3b2': 's2p5_q3a_ii',
    's2p5q3b3': 's2p5_q3a_iii',
    's2p5q4b1': 's2p5_q4a_i',
    's2p5q4b2': 's2p5_q4a_ii',
    's2p5q4b3': 's2p5_q4a_iii',
    's2p5q5b1': 's2p5_q5a_i',
    's2p5q5b2': 's2p5_q5a_ii',
    's2p5q5b3': 's2p5_q5a_iii',
    's2p5q5c1': 's2p5_q5b_i',
    's2p5q5c2': 's2p5_q5b_ii',
    's2p5q5c3': 's2p5_q5b_iii',
    's2p5q6b1': 's2p5_q6a_i',
    's2p5q6b2': 's2p5_q6a_ii',
    's2p5q6b3': 's2p5_q6a_iii',
    's2p5q6c1': 's2p5_q6b_i',
    's2p5q6c2': 's2p5_q6b_ii',
    's2p5q6c3': 's2p5_q6b_iii',
    's2p5q7b1': 's2p5_q7a_i',
    's2p5q7b2': 's2p5_q7a_ii',
    's2p5q7b3': 's2p5_q7a_iii',
    's2p5q7c1': 's2p5_q7b_i',
    's2p5q7c2': 's2p5_q7b_ii',
    's2p5q7c3': 's2p5_q7b_iii',
    's2p5q8b1': 's2p5_q8a_i',
    's2p5q8b2': 's2p5_q8a_ii',
    's2p5q8b3': 's2p5_q8a_iii',
    's2p5q8c1': 's2p5_q8b_i',
    's2p5q8c2': 's2p5_q8b_ii',
    's2p5q8c3': 's2p5_q8b_iii',
    's2p5q9b1': 's2p5_q9a_i',
    's2p5q9b2': 's2p5_q9a_ii',
    's2p5q9b3': 's2p5_q9a_iii',
    's2p5q9c1': 's2p5_q9b_i',
    's2p5q9c2': 's2p5_q9b_ii',
    's2p5q9c3': 's2p5_q9b_iii',
    'area': 'Area',
    'unit': 'Unit',
    's2p5q1a1': 'LandPrep_FL_NoP',
    's2p5q1a2': 'LandPrep_FL_NoD',
    's2p5q2a1': 'Sowing_FL_NoP',
    's2p5q2a2': 'Sowing_FL_NoD',
    's2p5q3a1': 'Irr_FL_NoP',
    's2p5q3a2': 'Irr_FL_NoD',
    's2p5q4a1': 'FA_FL_NoP',
    's2p5q4a2': 'FA_FL_NoD',
    's2p5q5a1': 'PA_CHL_NoP',
    's2p5q5a2': 'PA_FL_NoD',
    's2p5q6a1': 'Weed_FL_NoP',
    's2p5q6a2': 'Weed_FL_NoD',
    's2p5q7a1': 'HPS_FL_NoP',
    's2p5q7a2': 'HPS_FL_NoD',
    's2p5q8a1': 'Thresh_FL_NoP',
    's2p5q8a2': 'Thresh_FL_NoD',
    's2p5q9a1': 'TnS_FL_NoP',
    's2p5q9a2': 'TnS_FL_NoD'
}, inplace=True)


# df_2013 doesn't need renaming as it is the reference

In [6]:
# Define the column mappings based on the provided positions
# Make dictionaries for each year with updated names
# replace with 'None' where there are no columns
#Here are the updated mapping lists for the given datasets:


mapping_2012 = [
    "hid", "round", None, "crop_code", 
    "s2p5_q1a_i", "s2p5_q1a_ii", "s2p5_q1a_iii", "s2p5_q1b_i", "s2p5_q1b_ii", "s2p5_q1b_iii", 
    "s2p5_q2a_i", "s2p5_q2a_ii", "s2p5_q2a_iii", "s2p5_q2b_i", "s2p5_q2b_ii", "s2p5_q2b_iii", 
    "s2p5_q3a_i", "s2p5_q3a_ii", "s2p5_q3a_iii", "s2p5_q4a_i", "s2p5_q4a_ii", "s2p5_q4a_iii", 
    "s2p5_q5a_i", "s2p5_q5a_ii", "s2p5_q5a_iii", "s2p5_q5b_i", "s2p5_q5b_ii", "s2p5_q5b_iii", 
    "s2p5_q6a_i", "s2p5_q6a_ii", "s2p5_q6a_iii", "s2p5_q6b_i", "s2p5_q6b_ii", "s2p5_q6b_iii", 
    "s2p5_q7a_i", "s2p5_q7a_ii", "s2p5_q7a_iii", "s2p5_q7b_i", "s2p5_q7b_ii", "s2p5_q7b_iii", 
    "s2p5_q8a_i", "s2p5_q8a_ii", "s2p5_q8a_iii", "s2p5_q8b_i", "s2p5_q8b_ii", "s2p5_q8b_iii", 
    "s2p5_q9a_i", "s2p5_q9a_ii", "s2p5_q9a_iii", "s2p5_q9b_i", "s2p5_q9b_ii", "s2p5_q9b_iii", 
    None, None, None, None, None, None, None, 
    "Area", "Unit", 
    "LandPrep_FL_NoP", "LandPrep_FL_NoD", 
    "Sowing_FL_NoP", "Sowing_FL_NoD", 
    "Irr_FL_NoP", "Irr_FL_NoD", 
    "FA_FL_NoP", "FA_FL_NoD", 
    "PA_CHL_NoP", "PA_FL_NoD", 
    "Weed_FL_NoP", "Weed_FL_NoD", 
    "HPS_FL_NoP", "HPS_FL_NoD", 
    "Thresh_FL_NoP", "Thresh_FL_NoD", 
    "TnS_FL_NoP", "TnS_FL_NoD"
]


mapping_2013 = [
    "hid", "round", "crop_name", "crop_code", 
    "s2p5_q1a_i", "s2p5_q1a_ii", "s2p5_q1a_iii", "s2p5_q1b_i", "s2p5_q1b_ii", "s2p5_q1b_iii", 
    "s2p5_q2a_i", "s2p5_q2a_ii", "s2p5_q2a_iii", "s2p5_q2b_i", "s2p5_q2b_ii", "s2p5_q2b_iii", 
    "s2p5_q3a_i", "s2p5_q3a_ii", "s2p5_q3a_iii", "s2p5_q4a_i", "s2p5_q4a_ii", "s2p5_q4a_iii", 
    "s2p5_q5a_i", "s2p5_q5a_ii", "s2p5_q5a_iii", "s2p5_q5b_i", "s2p5_q5b_ii", "s2p5_q5b_iii", 
    "s2p5_q6a_i", "s2p5_q6a_ii", "s2p5_q6a_iii", "s2p5_q6b_i", "s2p5_q6b_ii", "s2p5_q6b_iii", 
    "s2p5_q7a_i", "s2p5_q7a_ii", "s2p5_q7a_iii", "s2p5_q7b_i", "s2p5_q7b_ii", "s2p5_q7b_iii", 
    "s2p5_q8a_i", "s2p5_q8a_ii", "s2p5_q8a_iii", "s2p5_q8b_i", "s2p5_q8b_ii", "s2p5_q8b_iii", 
    "s2p5_q9a_i", "s2p5_q9a_ii", "s2p5_q9a_iii", "s2p5_q9b_i", "s2p5_q9b_ii", "s2p5_q9b_iii", 
    "s2p5_q10a_i", "s2p5_q10a_ii", "s2p5_q10a_iii", "s2p5_q11", 
    "s2p5_q12_i", "s2p5_q12_ii", "s2p5_q12_iii", 
    None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
]


In [8]:
# Create a list of all possible columns in the correct order
all_columns = []
for col in mapping_2012:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2013:
    if col and col not in all_columns:
        all_columns.append(col)



In [9]:
import numpy as np

def standardize_and_merge(dfs, mappings, ref_mapping, df_names):
    """
    Standardize and merge dataframes based on reference mapping.

    Parameters:
    dfs (list of pd.DataFrame): List of dataframes to be merged.
    mappings (list of list): List of mappings corresponding to each dataframe.
    ref_mapping (list): Reference mapping to standardize the column names.
    df_names (list): List of dataframe names.

    Returns:
    pd.DataFrame: The merged dataframe with standardized column names.
    """
    
    # Create a dictionary to store columns from all dataframes
    merged_data = {col: [] for col in ref_mapping if col}
    # Track already included columns
    included_cols = set(merged_data.keys())
    
    max_len = 0  # To track the maximum length of columns

    # Iterate through each dataframe and its corresponding mapping
    for df, mapping, df_name in zip(dfs, mappings, df_names):
        for i, col in enumerate(df.columns):
            if col in mapping:
                ref_col = ref_mapping[mapping.index(col)]
                if ref_col:  # Reference column is not None
                    if ref_col in merged_data:
                        merged_data[ref_col].extend(df[col].tolist())
                    else:
                        merged_data[ref_col] = df[col].tolist()
                    max_len = max(max_len, len(merged_data[ref_col]))
                else:
                    # For columns in the dataframes but not in the reference mapping
                    new_col_name = f"{df_name}_{col}"
                    if new_col_name not in included_cols:
                        merged_data[new_col_name] = df[col].tolist()
                        included_cols.add(new_col_name)
                        max_len = max(max_len, len(merged_data[new_col_name]))
            else:
                # Handle columns not present in the mapping
                for j, ref_col in enumerate(ref_mapping):
                    if not ref_col:
                        new_col_name = f"{df_name}_{col}"
                        if new_col_name not in included_cols:
                            merged_data[new_col_name] = df[col].values.tolist() 
                            included_cols.add(new_col_name)
                            max_len = max(max_len, len(merged_data[new_col_name]))

    # Ensure all columns have the same length
    for key in merged_data:
        col_len = len(merged_data[key])
        if col_len < max_len:
            merged_data[key].extend([np.nan] * (max_len - col_len))

    # Convert the merged_data dictionary to a DataFrame
    merged_df = pd.DataFrame.from_dict(merged_data)
    
    # Remove columns containing 'Unnamed'
    merged_df = merged_df.loc[:, ~merged_df.columns.str.contains('Unnamed')]

    return merged_df


In [11]:
# List of dataframes and their mappings
dfs = [df_2012, df_2013]
mappings = [mapping_2012, mapping_2013]
ref_mapping = mapping_2013
df_name= ['2012', '2013']
merged_df = standardize_and_merge(dfs, mappings, ref_mapping, df_name)


In [12]:
# Rename columns for the merged file (if needed)
rename_mapping = {
    'hid': 'HID',
    'round': 'Survey_Round',
    'crop_name': 'CN',
    'crop_code': 'CC',
    's2p5_q1a_i': 'LandPrep_CHL_NoP',
    's2p5_q1a_ii': 'LandPrep_CHL_NoD',
    's2p5_q1a_iii': 'LandPrep_CHL_CPP',
    's2p5_q1b_i': 'LandPrep_M',
    's2p5_q1b_ii': 'LandPrep_M_Status',
    's2p5_q1b_iii': 'LandPrep_CoM',
    's2p5_q2a_i': 'Sowing_CHL_NoP',
    's2p5_q2a_ii': 'Sowing_CHL_NoD',
    's2p5_q2a_iii': 'Sowing_CHL_CPP',
    's2p5_q2b_i': 'Sowing_M',
    's2p5_q2b_ii': 'Sowing_M_Status',
    's2p5_q2b_iii': 'Sowing_CoM',
    's2p5_q3a_i': 'Irr_CHL_NoP',
    's2p5_q3a_ii': 'Irr_CHL_NoD',
    's2p5_q3a_iii': 'Irr_CHL_CPP',
    's2p5_q4a_i': 'FA_CHL_NoP',
    's2p5_q4a_ii': 'FA_CHL_NoD',
    's2p5_q4a_iii': 'FA_CHL_CPP',
    's2p5_q5a_i': 'PA_CHL_NoP',
    's2p5_q5a_ii': 'PA_CHL_NoD',
    's2p5_q5a_iii': 'PA_CHL_CPP',
    's2p5_q5b_i': 'PA_M',
    's2p5_q5b_ii': 'PA_M_Status',
    's2p5_q5b_iii': 'PA_CoM',
    's2p5_q6a_i': 'Weed_CHL_NoP',
    's2p5_q6a_ii': 'Weed_CHL_NoD',
    's2p5_q6a_iii': 'Weed_CHL_CPP',
    's2p5_q6b_i': 'Weed_M',
    's2p5_q6b_ii': 'Weed_M_Status',
    's2p5_q6b_iii': 'Weed_CoM',
    's2p5_q7a_i': 'HPS_CHL_NoP',
    's2p5_q7a_ii': 'HPS_CHL_NoD',
    's2p5_q7a_iii': 'HPS_CHL_CPP',
    's2p5_q7b_i': 'HPS_M',
    's2p5_q7b_ii': 'HPS_M_Status',
    's2p5_q7b_iii': 'HPS_CoM',
    's2p5_q8a_i': 'Thresh_CHL_NoP',
    's2p5_q8a_ii': 'Thresh_CHL_NoD',
    's2p5_q8a_iii': 'Thresh_CHL_CPP',
    's2p5_q8b_i': 'Thresh_M',
    's2p5_q8b_ii': 'Thresh_M_Status',
    's2p5_q8b_iii': 'Thresh_CoM',
    's2p5_q9a_i': 'TnS_CHL_NoP',
    's2p5_q9a_ii': 'TnS_CHL_NoD',
    's2p5_q9a_iii': 'TnS_CHL_CPP',
    's2p5_q9b_i': 'TnS_M',
    's2p5_q9b_ii': 'TnS_M_Status',
    's2p5_q9b_iii': 'TnS_CoM',
    's2p5_q10a_i': 'Prune_CHL_NoP',
    's2p5_q10a_ii': 'Prune_CHL_NoD',
    's2p5_q10a_iii': 'Prune_CHL_CPP',
    's2p5_q11': 'PHL',
    's2p5_q12_i': 'PHL_NoP',
    's2p5_q12_ii': 'PHL_NoM',
    's2p5_q12_iii': 'PHL_CPP'
}

merged_df.rename(columns=rename_mapping, inplace=True)

# Save the merged dataframe to a CSV file
merged_df.to_csv('merged_Section_2_part_7.csv', index=False)

