In [3]:
# This cell stores the input file paths in variables to keep the code tidy.
# The stored paths are passed to pd.read_csv and the results are kept as DataFrames.

import pandas as pd
import numpy as np
# Directory holding the per-year CSV extracts.
# It is defined once so the notebook can be pointed at another location by
# editing a single line instead of three.
DATA_DIR = r"C:\Users\warra\Desktop\Freelance\data\data\CommunityMerge\1. Merging by Parts\7. S3 Natural Disasters"

# Full paths of the yearly CSV files (raw f-strings keep the backslashes literal,
# so the resulting paths are the same strings as before).
Com_2012 = rf"{DATA_DIR}\2012_com_s8.csv"
Com_2013 = rf"{DATA_DIR}\2013_s3_focus.csv"
Com_2014 = rf"{DATA_DIR}\2014_s4_community.csv"

# Read the CSV files, one DataFrame per survey year
df_2012 = pd.read_csv(Com_2012)
df_2013 = pd.read_csv(Com_2013)
df_2014 = pd.read_csv(Com_2014)



In [4]:
# Harmonise column names across the survey years so the frames can be merged
# without name discrepancies. Columns not listed in a table keep their names.

# 2012 source column -> shared column name
column_renames_2012 = {
    'incident': 's4_event_no',
    'S8Q3': 's4_q3',
    'S8Q4': 's4_q4',
    'S8Q1': 'ND_M_5y',
    'S8Q2A': 'ND_y1',
    'S8Q2B': 'ND_y2',
    'S8Q2C': 'ND_y3',
    'community': 'Community',
    'fid': 'FID',
    'PROVINCE_ID': 'P_ID',
    'DISTRICT_ID': 'D_ID',
    'TEHSIL_ID': 'T_ID',
    'MAUZA_ID': 'M_ID',
}

# 2013 source column -> shared column name
column_renames_2013 = {
    'event_id': 's4_event_no',
    'event_name': 's4_event_description',
    's3_q1': 's4_q1',
    's3_q2a': 's4_q2a',
    's3_q2b': 's4_q2b',
    's3_q2c': 's4_q2c',
    's3_q3': 's4_q3',
    's3_q4': 's4_q4',
    's3_q5': 's4_q5',
    's3_q6': 's4_q6',
    's3_q7': 's4_q7',
    's3_q8': 's4_q8',
    's3_q9': 's4_q9',
    's3_q10': 's4_q10',
}

# Apply each table to its frame in place (the frames are modified, not copied).
for year_frame, year_renames in (
    (df_2012, column_renames_2012),
    (df_2013, column_renames_2013),
):
    year_frame.rename(columns=year_renames, inplace=True)

# df_2014 is the naming reference, so it is left untouched.

In [5]:
# Positional column mappings, one list per survey year.
# Every list has 27 slots; a slot holds the standardized column name for that
# year, or None where the year has no column in that position.

# Names shared by the 2013 and 2014 layouts, split around the slot that only
# 2014 fills ('s4_q5a').
_leading_names = [
    'cid', 's4_event_no', 's4_event_description',
    's4_q1', 's4_q2a', 's4_q2b', 's4_q2c',
    's4_q3', 's4_q4', 's4_q5',
]
_closing_names = ['s4_q6', 's4_q7', 's4_q8', 's4_q9', 's4_q10']
_unused_tail = [None] * 11

mapping_2012 = (
    ['cid', 's4_event_no']
    + [None] * 5
    + ['s4_q3', 's4_q4']
    + [None] * 7
    + ['ND_M_5y', 'ND_y1', 'ND_y2', 'ND_y3', 'Community']
    + ['FID', 'P_ID', 'D_ID', 'T_ID', 'UC_ID', 'M_ID']
)

# 2013 has no 's4_q5a' column, so that slot is None.
mapping_2013 = [*_leading_names, None, *_closing_names, *_unused_tail]

# 2014 is the reference layout and fills the 's4_q5a' slot.
mapping_2014 = [*_leading_names, 's4_q5a', *_closing_names, *_unused_tail]





In [6]:
# Every named column across the three yearly mappings, de-duplicated and kept
# in first-seen order (2012 first, then 2013, then 2014). None placeholders
# are skipped. dict.fromkeys keeps insertion order, so it serves as an
# ordered de-duplication.
all_columns = list(dict.fromkeys(
    name
    for year_mapping in (mapping_2012, mapping_2013, mapping_2014)
    for name in year_mapping
    if name
))



In [7]:
def standardize_and_merge(dfs, mappings, all_columns):
    """Stack several DataFrames vertically under a common set of column names.

    Each DataFrame contributes exactly ``len(df)`` rows to *every* output
    column, so values from different frames always stay on their own rows.
    A column receives a frame's data only when the column is named in that
    frame's mapping and exists in the frame; otherwise the frame's rows are
    filled with NaN for that column.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to stack, in output row order.
    mappings : sequence of sequence
        One entry per frame, paired with ``dfs`` by position. Each entry lists
        the column names to take from that frame; falsy entries (e.g. ``None``)
        are ignored and surrounding whitespace in names is stripped.
    all_columns : sequence of str
        Output columns, in order. Names that appear in a used mapping but not
        here are appended after these, in the order they are first seen.

    Returns
    -------
    pandas.DataFrame
        One row per input row (frames stacked in order), one column per
        output column. Empty input yields an empty DataFrame.

    Notes
    -----
    Progress messages are printed for each frame and each column taken from it.
    If a frame has several columns with the same label, only the first one is
    used so that the frame still contributes ``len(df)`` rows.
    """
    # Pair each frame with its cleaned list of requested column names.
    # zip() stops at the shorter of dfs/mappings, as before.
    frame_requests = [
        (df, [col.strip() for col in mapping if col])
        for df, mapping in zip(dfs, mappings)
    ]

    # Output columns: all_columns first, then any extra mapped names.
    merged_data = {col: [] for col in all_columns}
    for _, names in frame_requests:
        for name in names:
            merged_data.setdefault(name, [])

    for df, names in frame_requests:
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        n_rows = len(df)
        requested = set(names)
        # Walk over EVERY output column, not just the mapped ones, so that all
        # columns grow by n_rows and rows stay aligned across columns.
        for name, values in merged_data.items():
            if name in requested and name in df.columns:
                print(f"Appending data for column {name}")
                values.extend(_aligned_column_values(df, name))
            else:
                if name in requested:
                    print(f"Column {name} not found in DataFrame. Adding NaNs.")
                values.extend([np.nan] * n_rows)

    return pd.DataFrame.from_dict(merged_data)


def _aligned_column_values(df, name):
    """Return column ``name`` of ``df`` as a list with exactly ``len(df)`` items."""
    column = df[name]
    if isinstance(column, pd.DataFrame):
        # Duplicate labels make df[name] a DataFrame; appending every duplicate
        # would add more than len(df) values and shift all later rows.
        print(f"Column {name} is duplicated in DataFrame. Using the first occurrence only.")
        column = column.iloc[:, 0]
    return column.tolist()


In [8]:
# Pair every yearly frame with its own mapping, then split the pairs into the
# two parallel lists that standardize_and_merge expects.
year_inputs = [
    (df_2012, mapping_2012),
    (df_2013, mapping_2013),
    (df_2014, mapping_2014),
]
dfs = [frame for frame, _mapping in year_inputs]
mappings = [mapping for _frame, mapping in year_inputs]

merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'round', 'cid', 'Community', 's4_event_no', 'ND_M_5y', 'ND_y1', 'ND_y2', 'ND_y3', 's4_q3', 's4_q4', 'FID', 'M_ID', 'P_ID', 'D_ID', 'T_ID', 'UC_ID']
Appending data for column cid
Appending data for column s4_event_no
Appending data for column s4_q3
Appending data for column s4_q4
Appending data for column ND_M_5y
Appending data for column ND_y1
Appending data for column ND_y2
Appending data for column ND_y3
Appending data for column Community
Appending data for column FID
Appending data for column P_ID
Appending data for column D_ID
Appending data for column T_ID
Appending data for column UC_ID
Appending data for column M_ID
Processing DataFrame with columns: ['Unnamed: 0', 'cid', 's4_event_no', 's4_event_description', 's4_q1', 's4_q2a', 's4_q2b', 's4_q2c', 's4_q3', 's4_q4', 's4_q5', 's4_q6', 's4_q7', 's4_q8', 's4_q9', 's4_q10']
Appending data for column cid
Appending data for column s4_event_no
Appending data for column s4_event_descrip

In [9]:
# Final column names for the merged file: standardized s4_* names -> output
# labels. Columns not listed here keep their current names.
rename_mapping = {
    'cid': 'CID',
    's4_event_no': 'ND_Eventno',
    's4_event_description': 'ND_EventDesc',
    's4_q1': 'ND_M_y',
    's4_q2a': 'ND_m1',
    's4_q2b': 'ND_m2',
    's4_q2c': 'ND_m3',
    's4_q3': 'ND_ABC',
    's4_q4': 'ND_AM',
    's4_q5': 'ND_FG',
    's4_q5a': 'ND_PG',
    's4_q6': 'ND_LG',
    's4_q7': 'ND_PNGO',
    's4_q8': 'ND_FNGO',
    's4_q9': 'ND_Oth',
    's4_q10': 'ND_UseAssist'
}

merged_df = merged_df.rename(columns=rename_mapping)

# Drop redundant columns from the frame that is actually saved.
# The cleanup previously targeted df_2014, which had no effect on the output
# file and modified an input frame after it had already been merged.
# Rule unchanged: drop columns whose name contains 'Unnamed' or a space.
redundant_columns = [
    col for col in merged_df.columns
    if 'Unnamed' in str(col) or ' ' in str(col)
]
merged_df = merged_df.drop(columns=redundant_columns)

# Save the merged dataframe to a CSV file
merged_df.to_csv('6. merged_S4_Natural Disasters.csv', index=False)

