In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [None]:
'''
The following code block will merge all roster files across the years.
* First, we are going to read the respective files and store them as data frames
* Next, we are going to define column mappings that I have already worked out manually
* Once the mappings are applied per the set rules, we will inspect the new roster dataset merged across the years.
'''

In [9]:
# Some years ship this section as two separate files. Here the two 2014 workbooks
# (s2p9 and s2p10) are stacked into one standardized file for the sectional merge.

import pandas as pd

# Load the first file
df1 = pd.read_excel(r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\10. Section 2 Part 8 Ownership, Sale and Purchase of Livestock 12m\Livestock owned + cost of livestock\2014_s2p9_m.xlsx")
# Load the second file
df2 = pd.read_excel(r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\10. Section 2 Part 8 Ownership, Sale and Purchase of Livestock 12m\Livestock owned + cost of livestock\2014_s2p10_m.xlsx")

# Rename the second workbook's name column to 'animal_name' before stacking.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
df2 = df2.rename(columns={'s2p10_name': 'animal_name'})

# Stack the two files row-wise. This is a concat, not a key-based join: a column
# present in only one file is NaN on the rows that came from the other file.
merged_df = pd.concat([df1, df2], ignore_index=True)

# Drop redundant columns: any header containing 'Unnamed' and any header containing
# a space. na=False makes a non-string header count as "no match" rather than
# putting NaN into the boolean mask.
column_names = merged_df.columns
is_redundant = (
    column_names.str.contains('Unnamed', case=True, na=False)
    | column_names.str.contains(' ', case=False, na=False)
)
merged_df = merged_df.drop(columns=column_names[is_redundant])

# Save the stacked dataframe to a CSV file. index=False: writing the row index
# would come back as an 'Unnamed: 0' column on the next read_csv, which is exactly
# the kind of column removed just above.
# NOTE(review): this writes to the current working directory, while the next cell
# reads 2014_S2P8.csv from the data folder - confirm the notebook runs from there.
merged_df.to_csv('2014_S2P8.csv', index=False)

In [12]:
# Each survey round gets its file location stored in a variable, and the file is
# loaded into a dataframe straight away. 2012 and 2013 are Excel workbooks;
# 2012-1.5 and 2014 are CSV files.

import pandas as pd

# 2012
agri_2012 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\10. Section 2 Part 8 Ownership, Sale and Purchase of Livestock 12m\Livestock owned + cost of livestock\2012_s2p6_m.xlsx"
df_2012 = pd.read_excel(agri_2012)

# 2012-1.5
agri_2012_5 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\10. Section 2 Part 8 Ownership, Sale and Purchase of Livestock 12m\Livestock owned + cost of livestock\2012_1.5_S2P8_q1to5b&18ato19c.csv"
df_2012_5 = pd.read_csv(agri_2012_5)

# 2013
agri_2013 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\10. Section 2 Part 8 Ownership, Sale and Purchase of Livestock 12m\Livestock owned + cost of livestock\2013_s2p9_m.xlsx"
df_2013 = pd.read_excel(agri_2013)

# 2014 (the combined CSV named 2014_S2P8.csv)
agri_2014 = r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\10. Section 2 Part 8 Ownership, Sale and Purchase of Livestock 12m\Livestock owned + cost of livestock\2014_S2P8.csv"
df_2014 = pd.read_csv(agri_2014)



In [13]:
# Standardize column names across the years so that the same question carries the
# same label in every frame before merging. The 2014 names are the reference.

# 2012 -> 2014 names
# NOTE(review): the recorded run output lists a column named ' ' between s2p9_q3
# and s2p9_q5 and reports s2p9_q4 as missing, so 'S2P6Q1D' may not be the actual
# header in the 2012 workbook - confirm.
columns_2012_to_2014 = {
    'S2P6Q1A': 's2p9_q1',
    'S2P6Q1B': 's2p9_q2',
    'S2P6Q1C': 's2p9_q3',
    'S2P6Q1D': 's2p9_q4',
    'S2P6Q1E': 's2p9_q5',
    'S2P6Q1F': 's2p9_q6',
    'S2P6Q2A': 's2p9_q7',
    'S2P6Q2B': 's2p9_q8',
    'S2P6Q3A': 's2p9_q9',
    'S2P6Q3B': 's2p9_q10',
    'S2P6Q4A': 's2p9_q11',
    'S2P6Q4B': 's2p9_q12',
    'S2P6Q5A': 's2p9_q13',
    'S2P6Q5B': 's2p9_q14',
    'S2P6Q6A': 's2p10_q1',
    'S2P6Q6B': 's2p10_q2',
    'S2P6Q6C': 's2p10_q4',
    'S2P6Q6D': 's2p10_q6',
    'S2P6Q6E': 's2p10_q7',
    'S2P6Q6F': 's2p10_q9',
    'PROVINCE_ID': 'P_ID',
    'DISTRICT_ID': 'D_ID',
    'TEHSIL_ID': 'T_ID',
    'UC_ID': 'UC_ID',
    'MAUZA_ID': 'M_ID',
}
df_2012 = df_2012.rename(columns=columns_2012_to_2014)

# 2012-1.5 -> 2014 names
# NOTE(review): province becomes 'C_ID' here but 'P_ID' for 2012, so the two rounds
# end up in different output columns - confirm that is intended.
columns_2012_5_to_2014 = {
    'hid': 'hid',
    'Round': 'round',
    'Code': 'animal_code',
    'S1P9_NAME': 'animal_name',
    'S1P9Q1A': 's2p9_q1',
    'S1P9Q1B': 's2p9_q2',
    'S1P9Q1C': 's2p9_q3',
    'S1P9Q1D': 's2p9_q4',
    'S1P9Q1E': 's2p9_q5',
    'S1P9Q1F': 's2p9_q6',
    'S1P9Q2ANUM': 's2p9_q7',
    'S1P9Q2B': 's2p9_q8',
    'S1P9Q5ANUM': 's2p9_q9',
    'S1P9Q5B': 's2p9_q10',
    'S1P9Q3ANUM': 's2p9_q11',
    'S1P9Q3B': 's2p9_q12',
    'S1P9Q4ANUM': 's2p9_q13',
    'S1P9Q4B': 's2p9_q14',
    'S1P9Q18A': 's2p10_q1',
    'S1P9Q18B': 's2p10_q2',
    'S1P9Q18C': 's2p10_q4',
    'S1P9Q19A': 's2p10_q6',
    'S1P9Q19B': 's2p10_q7',
    'S1P9Q19C': 's2p10_q9',
    'C_PROVINCE': 'C_ID',
    'C_DISTRICT': 'D_ID',
    'C_TEHSIL': 'T_ID',
    'C_UC': 'UC_ID',
    'C_MOUZA': 'M_ID',
}
df_2012_5 = df_2012_5.rename(columns=columns_2012_5_to_2014)

# 2013 -> 2014 names
# NOTE(review): 's2p9_q1c' maps to q2 and 's2p9_q1b' to q3 (b and c are swapped
# relative to the other years) - confirm this matches the 2013 questionnaire.
columns_2013_to_2014 = {
    'hid': 'hid',
    'round': 'round',
    'animal_code': 'animal_code',
    'animal_name': 'animal_name',
    's2p9_q1a': 's2p9_q1',
    's2p9_q1c': 's2p9_q2',
    's2p9_q1b': 's2p9_q3',
    's2p9_q1d': 's2p9_q4',
    's2p9_q1e': 's2p9_q5',
    's2p9_q1f': 's2p9_q6',
    's2p9_q2a': 's2p9_q7',
    's2p9_q2b': 's2p9_q8',
    's2p9_q3a': 's2p9_q9',
    's2p9_q3b': 's2p9_q10',
    's2p9_q4a': 's2p9_q11',
    's2p9_q4b': 's2p9_q12',
    's2p9_q5a': 's2p9_q13',
    's2p9_q5b': 's2p9_q14',
    's2p9_q6a': 's2p10_q1',
    's2p9_q6b': 's2p10_q2',
    's2p9_q6c': 's2p10_q4',
    's2p9_q6d': 's2p10_q6',
    's2p9_q6e': 's2p10_q7',
    's2p9_q6f': 's2p10_q9',
}
df_2013 = df_2013.rename(columns=columns_2013_to_2014)

# df_2014 is the reference naming, so it is left as-is.

In [17]:
# Column mappings, one list per survey round, laid out position by position.
# Each entry is the standardized column name to take from that round's dataframe.
# None marks a position for which that round has no column.
# The merge step skips None entries and looks the remaining names up in the frame.


mapping_2012 = [
    'hid', 'round', None, 'animal_code', None, 's2p9_q1', 's2p9_q2', 's2p9_q3', 's2p9_q4', 's2p9_q5', 's2p9_q6',
    's2p9_q7', 's2p9_q8', 's2p9_q9', 's2p9_q10', 's2p9_q11', 's2p9_q12', 's2p9_q13', 's2p9_q14', 's2p10_q1', 's2p10_q2',
    None, 's2p10_q4', None, 's2p10_q6', 's2p10_q7', None, 's2p10_q9', None, None, None, None, None, None, None, 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID', None
]

mapping_2012_5 = [
    'hid', 'round', None, 'animal_code', 'animal_name', 's2p9_q1', 's2p9_q2', 's2p9_q3', 's2p9_q4', 's2p9_q5', 's2p9_q6',
    's2p9_q7', 's2p9_q8', 's2p9_q9', 's2p9_q10', 's2p9_q11', 's2p9_q12', 's2p9_q13', 's2p9_q14', 's2p10_q1', 's2p10_q2',
    None, 's2p10_q4', None, 's2p10_q6', 's2p10_q7', None, 's2p10_q9', None, None, None, None, None, None, None, 'C_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID', 'C_HH_NUM'
]

mapping_2013 = [
    'hid', 'round', None, 'animal_code', 'animal_name', 's2p9_q1', 's2p9_q2', 's2p9_q3', 's2p9_q4', 's2p9_q5', 's2p9_q6',
    's2p9_q7', 's2p9_q8', 's2p9_q9', 's2p9_q10', 's2p9_q11', 's2p9_q12', 's2p9_q13', 's2p9_q14', 's2p10_q1', 's2p10_q2',
    None, 's2p10_q4', None, 's2p10_q6', 's2p10_q7', None, 's2p10_q9', None, None, None, None, None, None, None, None, None, None, None, None, None
]

mapping_2014 = [
    'hid', 'round', 's2p9_qa', 'animal_code', 'animal_name', 's2p9_q1', 's2p9_q2', 's2p9_q3', 's2p9_q4', 's2p9_q5', 's2p9_q6',
    's2p9_q7', 's2p9_q8', 's2p9_q9', 's2p9_q10', 's2p9_q11', 's2p9_q12', 's2p9_q13', 's2p9_q14', 's2p10_q1', 's2p10_q2',
    's2p10_q3', 's2p10_q4', 's2p10_q5', 's2p10_q6', 's2p10_q7', 's2p10_q8', 's2p10_q9', 's2p10_q10', 's2p10_q11', 's2p10_q12',
    's2p10_q13', 's2p10_q14', 's2p10_q15', 's2p10_q16', None, None, None, None, None, None
]


In [18]:
# Union of every mapped column name, kept in first-seen order: the rounds are
# walked 2012, 2012-1.5, 2013, 2014, and None placeholders are skipped.
# dict.fromkeys de-duplicates while preserving insertion order.
all_columns = list(dict.fromkeys(
    name
    for year_mapping in (mapping_2012, mapping_2012_5, mapping_2013, mapping_2014)
    for name in year_mapping
    if name
))



In [23]:

# Function to standardize and merge DataFrames
def standardize_and_merge(dfs, mappings, all_columns, verbose=True):
    """Stack several DataFrames row-wise onto one shared set of columns.

    Each frame in ``dfs`` is paired with the mapping at the same index in
    ``mappings``. For every frame, every output column receives exactly
    ``len(df)`` values, so the rows of different frames never shift against
    each other:

    * a name listed in the frame's mapping and present in the frame contributes
      that column's values;
    * a name listed in the mapping but absent from the frame contributes NaN;
    * an output column the mapping does not list contributes NaN.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to stack, in output row order.
    mappings : sequence of sequences
        One mapping per frame. Entries are column names; falsy entries (None)
        are skipped. Only the names matter here, not their positions.
    all_columns : sequence of str
        Output columns, in output order. A mapped name missing from this
        sequence is appended after these, in first-seen order.
    verbose : bool, default True
        Print the same per-frame / per-column progress messages as before.

    Returns
    -------
    pandas.DataFrame
        One row per input row (frames concatenated in order), one column per
        output column.
    """
    # Imported locally: no cell in this notebook imports numpy at the top level,
    # so relying on a global `np` only worked through leftover kernel state.
    import numpy as np

    def log(message):
        if verbose:
            print(message)

    pairs = list(zip(dfs, mappings))

    merged_data = {col: [] for col in all_columns}
    # Mapped names that are missing from all_columns still get an output column.
    for _, mapping in pairs:
        for col in mapping:
            if col and col not in merged_data:
                merged_data[col] = []

    for df, mapping in pairs:
        log(f"Processing DataFrame with columns: {df.columns.tolist()}")
        n_rows = len(df)
        filled = set()  # guards against a name that appears twice in one mapping

        for col in mapping:
            if not col or col in filled:
                continue
            filled.add(col)
            if col in df.columns:
                log(f"Appending data for column {col}")
                merged_data[col].extend(df[col].tolist())
            else:
                log(f"Column {col} not found in DataFrame. Adding NaNs.")
                merged_data[col].extend([np.nan] * n_rows)

        # Pad the columns this frame does not supply *now*, not once at the very
        # end: padding at the end would let a later frame's values slide up into
        # this frame's rows.
        for col, values in merged_data.items():
            if col not in filled:
                values.extend([np.nan] * n_rows)

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df

In [24]:
# Pair each round's dataframe with its column mapping, then merge them all.
year_inputs = [
    (df_2012, mapping_2012),
    (df_2012_5, mapping_2012_5),
    (df_2013, mapping_2013),
    (df_2014, mapping_2014),
]
dfs = [frame for frame, _ in year_inputs]
mappings = [year_mapping for _, year_mapping in year_inputs]

merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 'animal_code', 's2p9_q1', 's2p9_q2', 's2p9_q3', ' ', 's2p9_q5', 's2p9_q6', 's2p9_q7', 's2p9_q8', 's2p9_q9', 's2p9_q10', 's2p9_q11', 's2p9_q12', 's2p9_q13', 's2p9_q14', 's2p10_q1', 's2p10_q2', 's2p10_q4', 's2p10_q6', 's2p10_q7', 's2p10_q9', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID']
Appending data for column hid
Appending data for column round
Appending data for column animal_code
Appending data for column s2p9_q1
Appending data for column s2p9_q2
Appending data for column s2p9_q3
Column s2p9_q4 not found in DataFrame. Adding NaNs.
Appending data for column s2p9_q5
Appending data for column s2p9_q6
Appending data for column s2p9_q7
Appending data for column s2p9_q8
Appending data for column s2p9_q9
Appending data for column s2p9_q10
Appending data for column s2p9_q11
Appending data for column s2p9_q12
Appending data for column s2p9_q13
Appending data for column s2p9_q14
Appending data for column s2p10_q1
Appending data for 

In [25]:
# Final labels for the merged file. Columns that are not keys of this dict keep
# their current names.
rename_mapping = {
    # identifiers
    'hid': 'HID',
    'round': 'Survey_Round',
    'animal_code': 'AC',
    'animal_name': 'AN',
    # s2p9 questions
    's2p9_q1': 'Own_HH_AA',
    's2p9_q2': 'Own_AA_no',
    's2p9_q3': 'Own_Type',
    's2p9_q4': 'Own_HH_AA_Val',
    's2p9_q5': 'Own_HH_YA_No',
    's2p9_q6': 'Own_HH_YA_Val',
    's2p9_q7': 'SSHC_Qty',
    's2p9_q8': 'SSHC_val',
    's2p9_q9': 'GLS_Qty',
    's2p9_q10': 'GLS_val',
    's2p9_q11': 'GIA_Qty',
    's2p9_q12': 'GIA_val',
    's2p9_q13': 'Qty_P',
    's2p9_q14': 'Val_P',
    # s2p10 questions
    's2p10_q1': 'CLF_F1_Type',
    's2p10_q2': 'CLF_F1_Src',
    's2p10_q3': 'CLF_F1_AvgQty',
    's2p10_q4': 'CLF_F1_Cost',
    's2p10_q5': 'CLF_F1_Days',
    's2p10_q6': 'CLF_F2_Type',
    's2p10_q7': 'CLF_F2_Src',
    's2p10_q8': 'CLF_F2_AvgQty',
    's2p10_q9': 'CLF_F2_Cost',
    's2p10_q10': 'CLF_F2_Days',
    's2p10_q11': 'CLF_F3_Type',
    's2p10_q12': 'CLF_F3_Src',
    's2p10_q13': 'CLF_F3_AvgQty',
    's2p10_q14': 'CLF_F3_Cost',
    's2p10_q15': 'CLF_F3_Days',
    's2p10_q16': 'CLF_LFS',
}

merged_df = merged_df.rename(columns=rename_mapping)

# Write the merged section to CSV without the row index.
merged_df.to_csv('merged_Section_2_part_8.csv', index=False)

