In [1]:
# This code block stores the file paths in variables to keep the code tidy.
# Those variables are then passed to pd.read_csv, and the results are stored as DataFrames.

import pandas as pd
import numpy as np

# Directory that holds the three yearly CSV extracts
# (change it to match the location of the files on your machine).
DATA_DIR = r"C:\Users\warra\Desktop\Freelance\data\data\FemaleMerge\1. Merging by Parts\10. Section 2 Part 5b Non-Agricultural Enterprises Expenditure & Income"

# Full path of each year's CSV file. The directory is written once above so
# that moving the data only requires editing a single line.
file_2012 = DATA_DIR + "\\" + "2012_s2p5b_f.csv"
file_2013 = DATA_DIR + "\\" + "2013_s2p5b_f.csv"
file_2014 = DATA_DIR + "\\" + "2014_s2p5b_f.csv"

# Read the CSV files into one DataFrame per year
df_2012 = pd.read_csv(file_2012)
df_2013 = pd.read_csv(file_2013)
df_2014 = pd.read_csv(file_2014)


In [2]:
'''
This code block standardizes column names across the years to avoid discrepancies during the merging process.
For example, in the roster data, rq21 and rq23 in 2013 do not mean the same thing as rq21 and rq23 in 2014, even though they share the same variable names. Hence, we rename such columns beforehand.
The mapping lists defined afterwards use the updated names instead of the original names.

'''

# --- 2012: translate the "part2q<N>" names to the "s2p5b_q<N>" names -------
columns_2012 = {'business_id': 's2p5b_business_id'}

# Questions 2-5 keep their number ...
columns_2012.update({f'part2q{n}': f's2p5b_q{n}' for n in range(2, 6)})
# ... while questions 7-15 are shifted by five (part2q7 -> s2p5b_q12, ..., part2q15 -> s2p5b_q20).
columns_2012.update({f'part2q{n}': f's2p5b_q{n + 5}' for n in range(7, 16)})

# Columns that get a name of their own, plus the location identifiers.
columns_2012.update({
    'part2q6': 'NAEEI_CPro_Val',
    'part2q16': '2012_part2q16',
    'PROVINCE_ID': 'P_ID',
    'DISTRICT_ID': 'D_ID',
    'TEHSIL_ID': 'T_ID',
    'UC_ID': 'UC_ID',
    'MAUZA_ID': 'M_ID',
})

df_2012.rename(columns=columns_2012, inplace=True)

# --- 2013: only the business id and the person id need new names -----------
columns_2013 = {
    'business_id': 's2p5b_business_id',
    'r_pid': 'PID',
}
df_2013.rename(columns=columns_2013, inplace=True)

# --- 2014: used as the reference naming, so nothing is renamed --------------

In [3]:
# Updated mappings
#
# One list per survey year. The three lists have the same length and are
# aligned by position; None marks a slot for which that year's list names
# no column.

# Segments shared between the years
_id_cols = ['hid', 'round', 's2p5b_business_id']
_q2_to_q5 = [f's2p5b_q{n}' for n in range(2, 6)]
_q6_to_q11 = [
    's2p5b_q6_n', 's2p5b_q6_c',
    's2p5b_q7_n', 's2p5b_q7_c',
    's2p5b_q8_n', 's2p5b_q8_c',
    's2p5b_q9', 's2p5b_q10', 's2p5b_q11',
]
_q12_to_q20 = [f's2p5b_q{n}' for n in range(12, 21)]
_only_2012 = ['NAEEI_CPro_Val', '2012_part2q16', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID']

mapping_2012 = (
    _id_cols + [None] + _q2_to_q5 + [None] * len(_q6_to_q11)
    + _q12_to_q20 + [None] + _only_2012
)

mapping_2013 = (
    _id_cols + ['s2p5b_q1'] + _q2_to_q5 + _q6_to_q11
    + _q12_to_q20 + ['PID'] + [None] * len(_only_2012)
)

mapping_2014 = (
    _id_cols + ['s2p5b_q1'] + _q2_to_q5 + _q6_to_q11
    + _q12_to_q20 + [None] + [None] * len(_only_2012)
)


In [4]:
# Ordered union of every column named in any year's mapping.
# None placeholders are skipped, and each name keeps the position of its
# first appearance (2012 is scanned first, then 2013, then 2014).
# dict.fromkeys preserves insertion order while dropping repeats, so a single
# pass replaces three copy-pasted loops and their repeated list lookups.
all_columns = list(dict.fromkeys(
    col
    for mapping in (mapping_2012, mapping_2013, mapping_2014)
    for col in mapping
    if col
))



In [5]:
def standardize_and_merge(dfs, mappings, all_columns, verbose=True):
    """Stack several DataFrames row-wise into one frame with a shared column layout.

    Each DataFrame is paired with the mapping at the same position. A mapping
    is a list of column names in which ``None`` (or an empty string) means
    "this frame does not provide that column". For every output column, a
    frame contributes its own values when the column is both named in its
    mapping and present in the frame; otherwise it contributes one NaN per
    row. Every frame therefore adds exactly ``len(df)`` entries to every
    column, which keeps values on the rows they came from.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to stack, in the order their rows should appear.
    mappings : sequence of sequences of (str or None)
        One mapping per frame. Names are stripped of surrounding whitespace.
        If ``dfs`` and ``mappings`` differ in length, the extra items are
        ignored (``zip`` semantics).
    all_columns : sequence of str
        Output columns, in output order. Names that appear in a mapping but
        not here are appended after these.
    verbose : bool, default True
        Print a progress line per frame and per column.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with a fresh 0..N-1 index.

    Notes
    -----
    If a frame carries the same column label more than once, only the first
    occurrence is used; appending every duplicate would lengthen that column
    beyond the frame's row count and shift all later rows.
    """
    pairs = list(zip(dfs, mappings))

    # Clean each mapping once: drop placeholders and strip stray whitespace.
    cleaned = [
        [col.strip() for col in mapping if col and col.strip()]
        for _, mapping in pairs
    ]

    # Settle the full output layout before any data is appended, so that no
    # column can start late and end up shifted against the others.
    columns = list(dict.fromkeys(all_columns))
    known = set(columns)
    for names in cleaned:
        for name in names:
            if name not in known:
                known.add(name)
                columns.append(name)

    merged_data = {col: [] for col in columns}

    for (df, _), names in zip(pairs, cleaned):
        wanted = set(names)
        n_rows = len(df)
        if verbose:
            print(f"Processing DataFrame with columns: {df.columns.tolist()}")

        for col in columns:
            if col in wanted and col in df.columns:
                values = df[col]
                if isinstance(values, pd.DataFrame):
                    # Duplicated label: df[col] is a frame, not a series.
                    print(f"Column {col} is duplicated in DataFrame. Using the first occurrence.")
                    values = values.iloc[:, 0]
                if verbose:
                    print(f"Appending data for column {col}")
                merged_data[col].extend(values.tolist())
            else:
                if verbose and col in wanted:
                    print(f"Column {col} not found in DataFrame. Adding NaNs.")
                # Pad with one NaN per row of THIS frame, so the next frame's
                # values start on the next frame's rows.
                merged_data[col].extend([np.nan] * n_rows)

    return pd.DataFrame(merged_data, columns=columns)


In [6]:
# Pair every yearly DataFrame with the mapping that describes it, then split
# the pairs back into the two parallel lists the merge function expects.
yearly_inputs = [
    (df_2012, mapping_2012),
    (df_2013, mapping_2013),
    (df_2014, mapping_2014),
]
dfs = [frame for frame, _ in yearly_inputs]
mappings = [mapping for _, mapping in yearly_inputs]

merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'hid', 'round', 's2p5b_business_id', 's2p5b_q2', 's2p5b_q3', 's2p5b_q4', 's2p5b_q5', 'NAEEI_CPro_Val', 's2p5b_q12', 's2p5b_q13', 's2p5b_q14', 's2p5b_q15', 's2p5b_q16', 's2p5b_q17', 's2p5b_q18', 's2p5b_q19', 's2p5b_q20', '2012_part2q16', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID']
Appending data for column hid
Appending data for column round
Appending data for column s2p5b_business_id
Appending data for column s2p5b_q2
Appending data for column s2p5b_q3
Appending data for column s2p5b_q4
Appending data for column s2p5b_q5
Appending data for column s2p5b_q12
Appending data for column s2p5b_q13
Appending data for column s2p5b_q14
Appending data for column s2p5b_q15
Appending data for column s2p5b_q16
Appending data for column s2p5b_q17
Appending data for column s2p5b_q18
Appending data for column s2p5b_q19
Appending data for column s2p5b_q20
Appending data for column NAEEI_CPro_Val
Appending data for column 2012_part2q16
Appending data for co

In [7]:
# Final column names for the merged file, as (current name, final name) pairs.
# Columns that are not listed here keep the name they already have.
_final_names = [
    ('hid', 'HID'),
    ('round', 'Survey_Round'),
    ('s2p5b_business_id', 'NAEEI_Bid'),
    ('s2p5b_q1', 'NAEEI_SRP_TS'),
    ('s2p5b_q2', 'NAEEI_TR'),
    ('s2p5b_q3', 'NAEEI_HHShare'),
    ('s2p5b_q4', 'NAEEI_HHShare_NP'),
    ('s2p5b_q5', 'NAEEI_HHShare_NPR'),
    ('s2p5b_q6_n', 'NAEEI_Fraw_Name'),
    ('s2p5b_q6_c', 'NAEEI_Fraw_Code'),
    ('s2p5b_q7_n', 'NAEEI_Sraw_Name'),
    ('s2p5b_q7_c', 'NAEEI_Sraw_Code'),
    ('s2p5b_q8_n', 'NAEEI_Traw_Name'),
    ('s2p5b_q8_c', 'NAEEI_Traw_Code'),
    ('s2p5b_q9', 'NAEEI_Fraw_Val'),
    ('s2p5b_q10', 'NAEEI_Sraw_Val'),
    ('s2p5b_q11', 'NAEEI_Traw_Val'),
    ('s2p5b_q12', 'NAEEI_OMI_Val'),
    ('s2p5b_q13', 'NAEEI_Fuel_Val'),
    ('s2p5b_q14', 'NAEEI_Sal_Val'),
    ('s2p5b_q15', 'NAEEI_Rent_Val'),
    ('s2p5b_q16', 'NAEEI_SnP_Val'),
    ('s2p5b_q17', 'NAEEI_PnT_Val'),
    ('s2p5b_q18', 'NAEEI_LnRF_Val'),
    ('s2p5b_q19', 'NAEEI_InPT_Val'),
    ('s2p5b_q20', 'NAEEI_OE_Val'),
]
rename_mapping = dict(_final_names)

# Apply the new names to the merged frame in place
merged_df.rename(columns=rename_mapping, inplace=True)

# NOTE: the clean-up below sits inside a string literal, so it is NOT executed.
'''
# Drop redundant columns
merged_df.drop(merged_df.columns[merged_df.columns.str.contains('Unnamed', case=True)], axis=1, inplace=True)
merged_df.drop(merged_df.columns[merged_df.columns.str.contains(' ', case=False)], axis=1, inplace=True)
'''

# Write the merged DataFrame to a CSV file, leaving out the row index
merged_df.to_csv('8. merged_merged_EnI_NAE Expenditure and Income.csv', index=False)
