In [None]:
'''
This notebook contains the code to merge all male data across the years based on sections.
There are datasets for 2012, 2012-1.5, 2013 and 2014
The different sections that will be merged are as follows:
**2012**
1. Cover
2. Roaster
3. Section 1: Education (All men 18 and above)
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks
10. Section 8: Community Participation and Social Network Membership

**2013**
1. Cover
2. Roaster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Health
10. Section 8: Political Participation and Governance

**2014**
1. Cover
2. Roaster
3. Section 1: Education: Males 19 years and older
4. Section 2: Agriculture
5. Section 3: Assets
6. Section 4: Consumption and Expenditure
7. Section 5: Credit
8. Section 6: Employment and Income
9. Section 7: Economic Events/Shocks”
10. Section 8: Participation in Social Safety Net
11. Section 9: Siblings
12. Section 10: Transfers
13. Section 11: Health and Nutrition

'''

In [None]:
'''
All files in the male folder will be converted to xlsx format for readability
This will be done for all male files across the years
'''

import pandas as pd
import glob

# Path where the .dta files are located
# We will use just once cell for each conversion since just the path is changing
folder_path = r'C:\Users\warra\Downloads\data\data\2014_data\Male'

# Get a list of all .dta files in the specified directory
file_list = glob.glob(folder_path + '/*.dta')

# Loop through the list of files
for file in file_list:
    # Read the .dta file into a pandas DataFrame
    df = pd.read_stata(file, convert_categoricals=False)
    
    # Define the output file name by replacing .dta with .xlsx
    output_file = file.replace('.dta', '.xlsx')
    
    # Write the DataFrame to an Excel file
    df.to_excel(output_file)


In [None]:
'''
The following code block will merge all roaster files across the years.
* First, we are going to read the respective files and store them as data frames
* Next, we are going to define column mappings that I have already figured out via manual methods
* Once the mappings are done per the set rules, we will see the new roaster dataset across the years 1.
'''

In [2]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import pandas as pd

# Store excel file locations to variables (change it as per your path to file)
agri_2012=r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\6. Section 2 Part 4 INPUT USE IN RABI\2012_s2p4_m.xlsx"
agri_2013=r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\6. Section 2 Part 4 INPUT USE IN RABI\2013_s2p4_m.xlsx"
agri_2014=r"C:\Users\warra\Desktop\Freelance\data\data\MaleMerge\6. Section 2 Part 4 INPUT USE IN RABI\2014_s2p4_m.xlsx"

# Read excel files
df_2012 = pd.read_excel(agri_2012)
df_2013 = pd.read_excel(agri_2013)
df_2014 = pd.read_excel(agri_2014)


In [3]:
'''
This code block will be used to standardize column names across the years to avoid discrepancies during the merging process.
For example in the roaster data for 2013 rq21 and rq23 are not the same as rq21 and rq23 in 2014 data, but they have the same variable names. Hence, we decide to rename such columns beforehand
We will add the updated name to the mapping dictionaries instead of the original names.

'''

# Rename columns in df

df_2012.rename(columns={
    'crop_code': 's2p4_q1',
    's2p4q3a': 's2p4_q2',
    's2p4q4a': 's2p4_q3',
    's2p4q5a': 's2p4_q4',
    's2p4q2a': 's2p4_q5',
    's2p4q6a': 's2p4_q8',
    'area': 'Area',
    'unit': 'Unit',
    's2p4q2b': 'Exp_Seed_Own',
    's2p4q3b': 'Exp_Pest_Own',
    's2p4q4b': 'Exp_Fert_Own',
    's2p4q5b': 'Exp_Irr_Own',
    's2p4q6b': 'Exp_Misc_Own',
}, inplace=True)


df_2013.rename(columns={
    'crop_code': 's2p4_q1',
    's2p4_q1a': 's2p4_q2',
    's2p4_q2a': 's2p4_q3',
    's2p4_q3a': 's2p4_q4',
    's2p4_q4e': 's2p4_q5',
    's2p4_q5a': 's2p4_q8',
    's2p4_q4f': 'Exp_Seed_Own',
    's2p4_q1b': 'Exp_Pest_Own',
    's2p4_q2b': 'Exp_Fert_Own',
    's2p4_q3b': 'Exp_Irr_Own',
    's2p4_q5b': 'Exp_Misc_Own',
    's2p4_q4a': 'Exp_Seed_T_Pq',
    's2p4_q4a_kg': 'Exp_Seed_T_PqKG',
    's2p4_q4b': 'Exp_Seed_T_Pu',
    's2p4_q4c': 'Exp_Seed_T_Oq',
    's2p4_q4c_kg': 'Exp_Seed_T_OqKG',
    's2p4_q4d': 'Exp_Seed_T_Ou',
}, inplace=True)
# df_2014 doesn't need renaming as it is the reference

In [14]:
# Define the column mappings based on the provided positions
# Make dictionaries for each year with updated names
# replace with 'None' where there are no columns

mapping_2012 = [
    'hid', 'round', 's2p4_q1', 's2p4_q2', 's2p4_q3', 's2p4_q4', 's2p4_q5', None, None,
    's2p4_q8', 'Area', 'Unit', 'Exp_Seed_Own', 'Exp_Pest_Own', 'Exp_Fert_Own', 'Exp_Irr_Own', 'Exp_Misc_Own', None, None,
    None, None, None, None
]

mapping_2013 = [
    'hid', 'round', 's2p4_q1', 's2p4_q2', 's2p4_q3', 's2p4_q4', 's2p4_q5', 's2p4_q8', None, None,
    None, None, 'Exp_Seed_Own', 'Exp_Pest_Own', 'Exp_Fert_Own', 'Exp_Irr_Own', 'Exp_Misc_Own', 'Exp_Seed_T_Pq',
    'Exp_Seed_T_PqKG', 'Exp_Seed_T_Pu', 'Exp_Seed_T_Oq', 'Exp_Seed_T_OqKG', 'Exp_Seed_T_Ou'
]

mapping_2014 = [
    'hid', 'round', 's2p4_q1', 's2p4_q2', 's2p4_q3', 's2p4_q4', 's2p4_q5', 's2p4_q6', 's2p4_q7', 's2p4_q8',
    None, None, None, None, None, None, None, None,
    None, None, None, None, None
]


In [15]:
# Create a list of all possible columns in the correct order
all_columns = []
for col in mapping_2012:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2013:
    if col and col not in all_columns:
        all_columns.append(col)


In [16]:

# Function to standardize and merge DataFrames
def standardize_and_merge(dfs, mappings, all_columns):
    merged_data = {col: [] for col in all_columns}

    for df, mapping in zip(dfs, mappings):
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        for i, col in enumerate(mapping):
            if col:
                ref_col = col
                if col in df.columns:
                    print(f"Appending data for column {col}")
                    merged_data[ref_col].extend(df[col].tolist())
                else:
                    print(f"Column {col} not found in DataFrame. Adding NaNs.")
                    merged_data[ref_col].extend([np.nan] * len(df))
    
    max_len = max(len(v) for v in merged_data.values())
    for key in merged_data:
        col_len = len(merged_data[key])
        if col_len < max_len:
            merged_data[key].extend([np.nan] * (max_len - col_len))

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df

In [17]:
# Usage with dataframes and mappings
dfs = [df_2012, df_2013, df_2014]
mappings = [mapping_2012, mapping_2013, mapping_2014]

merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 's2p4_q1', 'Area', 'Unit', 's2p4_q5', 'Exp_Seed_Own', 's2p4_q2', 'Exp_Pest_Own', 's2p4_q3', 'Exp_Fert_Own', 's2p4_q4', 'Exp_Irr_Own', 's2p4_q8', 'Exp_Misc_Own']
Appending data for column hid
Appending data for column round
Appending data for column s2p4_q1
Appending data for column s2p4_q2
Appending data for column s2p4_q3
Appending data for column s2p4_q4
Appending data for column s2p4_q5
Appending data for column s2p4_q8
Appending data for column Area
Appending data for column Unit
Appending data for column Exp_Seed_Own
Appending data for column Exp_Pest_Own
Appending data for column Exp_Fert_Own
Appending data for column Exp_Irr_Own
Appending data for column Exp_Misc_Own
Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 'crop_name', 's2p4_q1', 's2p4_q2', 'Exp_Pest_Own', 's2p4_q3', 'Exp_Fert_Own', 's2p4_q4', 'Exp_Irr_Own', 'Exp_Seed_T_Pq', 'Exp_Seed_T_PqKG', 'Exp_Seed_T_Pu', 'Exp_Seed_T_Oq', 'Exp_Seed_T

KeyError: 's2p4_q6'

In [22]:
# Rename columns for the merged file (if needed)
rename_mapping = {
        'hid': 'HID',
        'round': 'Survey_Round',
        's2p4_q1': 'CC',
        's2p4_q2': 'Exp_Pest_T',
        's2p4_q3': 'Exp_Fert_T',
        's2p4_q4': 'Exp_Irr_T',
        's2p4_q5': 'Exp_Seed_T',
        's2p4_q6': 'Exp_Lab',
        's2p4_q7': 'Exp_Mech',
        's2p4_q8': 'Exp_Misc',
        # Add other renamings here
    }

merged_df.rename(columns=rename_mapping, inplace=True)

# Save the merged dataframe to a CSV file
merged_df.to_csv('merged_Section_2_part_4.csv', index=False)

