In [None]:
'''
This notebook contains the code to merge the Prices data across all years, section by section.
Datasets are available for 2012, 2013 and 2014.
'''

In [1]:
# Folder that holds the input CSV files (split across adjacent raw-string
# literals purely for readability; the resulting path is unchanged).
data_folder = (
    r"C:\Users\warra\Desktop\Freelance\data\data\PriceMerge"
    r"\2. MERGED CSV Sections"
    r"\2. Section 1 Basic Information about District, UC and M Level Respondents"
)

# Input files, listed in the order in which their rows will be stacked.
file_names = [
    "1 merged_Prices_Basic-Information-about-District-Level-Respondents.csv",
    "2 merged_Prices_Basic-Information-about-Union-Council-Level-Respondents.csv",
    "3 merged_Prices_Basic-Information-about-Mouza-Level-Respondents.csv",
]

In [None]:
"""
The following cell
    * Combines multiple CSV files by stacking their rows into a single table
      (no key-based join is performed).

    * Inputs (variables defined in an earlier cell):
     - data_folder (str): The folder path where the data files are located.
     - file_names (list of str): List of all the file names to be merged.

    * Output:
     - The merged data is saved to a CSV file whose name is hard-coded in the cell;
       the cell takes no output_file or merge_keys parameters.

    * Returns:
     - None
"""

In [3]:
import pandas as pd
import os

# Read every input CSV that exists on disk. The frames are collected in a list
# and concatenated once afterwards: growing a DataFrame with pd.concat inside
# the loop re-copies all earlier rows on each pass, and concatenating onto an
# empty DataFrame is deprecated in recent pandas releases.
frames = []
for file_name in file_names:
    file_path = os.path.join(data_folder, file_name)  # Create the full file path
    print(f"Looking for file: {file_path}")  # Debug print
    if os.path.exists(file_path):
        print(f"File found: {file_path}")  # Debug print
        # Read all columns as strings so mixed-type columns are not coerced
        frames.append(pd.read_csv(file_path, dtype=str))
    else:
        # Missing files are reported and skipped (best effort), not fatal
        print(f"File not found: {file_path}")  # Debug print

# Stack the rows of all files that were found; fall back to an empty frame
# when none were, so the emptiness check below still applies.
merged_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Proceed only if there is at least one row of data
if not merged_df.empty:
    # Alphabetical column order; the leading boolean in the sort key pushes
    # columns whose name starts with a digit to the end.
    sorted_columns = sorted(merged_df.columns, key=lambda x: (x[0].isdigit(), x))

    # Reorder, then drop columns and rows that are entirely empty. Chained
    # (non-inplace) calls keep this step safe to re-run.
    merged_df = (
        merged_df[sorted_columns]
        .dropna(axis=1, how='all')
        .dropna(axis=0, how='all')
    )

    # Save the merged and processed data to a new CSV file (written to the
    # current working directory, since the name carries no folder component)
    merged_df.to_csv('2. Basic Information about District, UC and M Level Respondents.csv', index=False)
    print("Merged data saved successfully!")
else:
    print("No data to merge and save.")


Looking for file: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\2. Section 1 Basic Information about District, UC and M Level Respondents\1 merged_Prices_Basic-Information-about-District-Level-Respondents.csv
File found: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\2. Section 1 Basic Information about District, UC and M Level Respondents\1 merged_Prices_Basic-Information-about-District-Level-Respondents.csv
Looking for file: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\2. Section 1 Basic Information about District, UC and M Level Respondents\2 merged_Prices_Basic-Information-about-Union-Council-Level-Respondents.csv
File found: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\2. Section 1 Basic Information about District, UC and M Level Respondents\2 merged_Prices_Basic-Information-about-Union-Council-Level-Respondents.csv
Looking for file: C:\Users\warra\Desktop\F