In [7]:
import pandas as pd
import os

# Folder that holds the merged CSV sections. The location is machine-specific, so it can
# be overridden through the PRICE_MERGE_DATA_DIR environment variable; the original
# path stays the default, so behavior is unchanged when the variable is not set.
data_folder = os.environ.get(
    "PRICE_MERGE_DATA_DIR",
    r"C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections",
)

# List of all the file names to be merged.
# NOTE(review): every processing cell below reads this same list, yet the outputs recorded
# under them show runs against '1. merged_cover.csv' and
# '3. merged_Prices_Prices-Of-Consumption-Items.csv' as well — confirm the list matches
# the cell you are about to run.
file_names = ['2. merged_Prices_Basic-Information-about-District, UC and M-Level-Respondents.csv']

In [4]:
# for a single unique identifier

import os
import pandas as pd

# Load every input file into its own DataFrame and concatenate once at the end;
# growing merged_df with pd.concat inside the loop re-copies all earlier rows each pass.
# NOTE(review): file_names and data_folder come from the configuration cell. The output
# recorded under this cell shows a run against '1. merged_cover.csv' — confirm
# file_names is set accordingly before re-running.
loaded_frames = []

for file_name in file_names:
    file_path = os.path.join(data_folder, file_name)  # Create the full file path
    print(f"Looking for file: {file_path}")  # Debug print
    if not os.path.exists(file_path):
        # Missing files are reported and skipped rather than treated as fatal
        print(f"File not found: {file_path}")  # Debug print
        continue

    print(f"File found: {file_path}")  # Debug print
    # Read every column as text first so mixed-type columns load without dtype guessing
    df = pd.read_csv(file_path, dtype=str)

    # Convert fully numeric columns to double precision (float64).
    # No downcast is requested: float32 cannot hold 9-digit identifiers such as CID
    # exactly and loses precision once values are summed.
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], errors='raise').astype('float64')
        except ValueError:
            # Keep non-convertible columns as strings
            continue

    loaded_frames.append(df)

# An empty DataFrame signals "nothing loaded" to the processing step that follows
merged_df = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# Proceed only if merged_df is not empty
if not merged_df.empty:
    # Define the merge keys
    merge_keys = ['CID']
    # Filter out merge keys that are not in the DataFrame
    available_merge_keys = [key for key in merge_keys if key in merged_df.columns]
    missing_merge_keys = [key for key in merge_keys if key not in merged_df.columns]
    if not available_merge_keys:
        # groupby() with an empty key list fails with an opaque pandas error; fail early instead
        raise ValueError(f"None of the merge keys {merge_keys} are present in the data; cannot aggregate.")
    if missing_merge_keys:
        print(f"Warning: merge keys not found in the data and ignored: {missing_merge_keys}")

    def aggregate_data(group):
        """Collapse all rows sharing one merge-key combination into a single row.

        Parameters
        ----------
        group : pd.DataFrame
            Rows of one key combination; must still contain the key columns.

        Returns
        -------
        pd.Series
            One entry per column of ``group``: the first value for merge keys, the sum
            for numeric columns, and the distinct non-missing values joined with ', '
            for all other columns. A column with no values in the group stays missing.

        NOTE(review): numeric columns that look like identifiers but are not merge keys
        (e.g. Survey_Round, P_ID, D_ID in the recorded output) are summed as well —
        confirm that is intended.
        """
        agg_dict = {}
        for col in group.columns:
            values = group[col]
            if col in available_merge_keys:
                # Keys are identical within a group, so the first one represents it
                agg_dict[col] = values.iloc[0]
            elif pd.api.types.is_numeric_dtype(values):
                # min_count=1 keeps an all-missing group missing instead of reporting 0
                agg_dict[col] = values.sum(min_count=1)
            else:
                distinct_text = values.dropna().astype(str).unique()
                # No text at all stays missing rather than becoming an empty string
                agg_dict[col] = ', '.join(distinct_text) if len(distinct_text) else float('nan')
        return pd.Series(agg_dict)

    # Check for initial duplicates.
    # Missing keys compare equal in duplicated(), so rows with missing keys can be listed too.
    initial_duplicates = merged_df[merged_df.duplicated(subset=available_merge_keys, keep=False)]
    if not initial_duplicates.empty:
        print(f"Initial duplicates found based on keys {available_merge_keys}:")
        print(initial_duplicates)

    # Handle rows with missing keys separately: they are not aggregated, only appended at the end
    complete_keys_df = merged_df.dropna(subset=available_merge_keys)
    incomplete_keys_df = merged_df[merged_df[available_merge_keys].isnull().any(axis=1)]

    # Continue with the rows whose merge keys are all present
    merged_df = complete_keys_df

    # Aggregate each key group. Iterating over the groups hands aggregate_data the key
    # columns explicitly instead of relying on groupby.apply passing them along.
    if not merged_df.empty:
        aggregated_rows = [
            aggregate_data(group).to_dict()
            for _, group in merged_df.groupby(available_merge_keys)
        ]
        merged_df = pd.DataFrame(aggregated_rows)

    # Check for duplicates after aggregation
    post_aggregation_duplicates = merged_df[merged_df.duplicated(subset=available_merge_keys, keep=False)]
    if not post_aggregation_duplicates.empty:
        print(f"Post-aggregation duplicates found based on keys {available_merge_keys}:")
        print(post_aggregation_duplicates)

    # Reorder columns to have available merge keys first and then the rest alphabetically
    remaining_columns = [col for col in merged_df.columns if col not in available_merge_keys]
    # Sort the remaining columns alphabetically; put columns starting with digits at the end
    # (x[:1] also tolerates an empty column name)
    sorted_columns = sorted(remaining_columns, key=lambda x: (x[:1].isdigit(), x))

    # Define the final column order with available merge keys first
    column_order = available_merge_keys + sorted_columns
    # Reorder the DataFrame columns according to the defined order
    merged_df = merged_df[column_order]

    # Append rows with incomplete keys to the end of the DataFrame
    if incomplete_keys_df.empty:
        final_df = merged_df.reset_index(drop=True)
    else:
        final_df = pd.concat([merged_df, incomplete_keys_df], ignore_index=True)

    # Remove columns and rows that are entirely empty. This runs on the final frame
    # because the concat above would otherwise bring dropped columns back.
    final_df = (
        final_df
        .dropna(axis=1, how='all')
        .dropna(axis=0, how='all')
        .reset_index(drop=True)
    )

    # Save the merged and processed data to a new CSV file
    final_df.to_csv('1. merged_cover_CID_Level.csv', index=False)
    print("Merged data saved successfully!")
else:
    print("No data to merge and save.")


Looking for file: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\1. merged_cover.csv
File found: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\1. merged_cover.csv
Initial duplicates found based on keys ['CID']:
             CID  Survey_Round  P_Name  P_ID    D_Name  D_ID           T_Name  \
0    101010101.0           2.0  PUNJAB   1.0     KASUR   1.0          CHUNIAN   
1    101010202.0           2.0  PUNJAB   1.0     KASUR   1.0          CHUNIAN   
2    101010203.0           2.0  PUNJAB   1.0     KASUR   1.0          CHUNIAN   
3    101020304.0           2.0  PUNJAB   1.0     KASUR   1.0          PATTOKI   
4    102030405.0           2.0  PUNJAB   1.0   BHAKKAR   2.0          BHAKKAR   
..           ...           ...     ...   ...       ...   ...              ...   
143  318467071.0           3.0     KPK   3.0  NOWSHERA  18.0  NOWSHERA TEHSIL   
144  318467172.0           3.0     KPK   3.0  NOWSHERA  18.0  NOWSHERA TEHSIL   

In [6]:
#CID, Price_ID

import os
import pandas as pd

# Load every input file into its own DataFrame and concatenate once at the end;
# growing merged_df with pd.concat inside the loop re-copies all earlier rows each pass.
# NOTE(review): file_names and data_folder come from the configuration cell. The output
# recorded under this cell shows a run against
# '3. merged_Prices_Prices-Of-Consumption-Items.csv' — confirm file_names is set
# accordingly before re-running.
loaded_frames = []

for file_name in file_names:
    file_path = os.path.join(data_folder, file_name)  # Create the full file path
    print(f"Looking for file: {file_path}")  # Debug print
    if not os.path.exists(file_path):
        # Missing files are reported and skipped rather than treated as fatal
        print(f"File not found: {file_path}")  # Debug print
        continue

    print(f"File found: {file_path}")  # Debug print
    # Read every column as text first so mixed-type columns load without dtype guessing
    df = pd.read_csv(file_path, dtype=str)

    # Convert fully numeric columns to double precision (float64).
    # No downcast is requested: float32 cannot hold 9-digit identifiers such as CID
    # exactly and loses precision once values are summed.
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], errors='raise').astype('float64')
        except ValueError:
            # Keep non-convertible columns as strings
            continue

    loaded_frames.append(df)

# An empty DataFrame signals "nothing loaded" to the processing step that follows
merged_df = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# Proceed only if merged_df is not empty
if not merged_df.empty:
    # Define the merge keys
    merge_keys = ['CID', 'Price_ID']  # Example of multiple keys
    # Filter out merge keys that are not in the DataFrame
    available_merge_keys = [key for key in merge_keys if key in merged_df.columns]
    missing_merge_keys = [key for key in merge_keys if key not in merged_df.columns]
    if not available_merge_keys:
        # groupby() with an empty key list fails with an opaque pandas error; fail early instead
        raise ValueError(f"None of the merge keys {merge_keys} are present in the data; cannot aggregate.")
    if missing_merge_keys:
        print(f"Warning: merge keys not found in the data and ignored: {missing_merge_keys}")

    def aggregate_data(group):
        """Collapse all rows sharing one merge-key combination into a single row.

        Parameters
        ----------
        group : pd.DataFrame
            Rows of one key combination; must still contain the key columns.

        Returns
        -------
        pd.Series
            One entry per column of ``group``: the first value for merge keys, the sum
            for numeric columns, and the distinct non-missing values joined with ', '
            for all other columns. A column with no values in the group stays missing.

        NOTE(review): numeric columns that look like codes but are not merge keys
        (e.g. Survey_Round, Item_Code in the recorded output) are summed as well —
        confirm that is intended.
        """
        agg_dict = {}
        for col in group.columns:
            values = group[col]
            if col in available_merge_keys:
                # Keys are identical within a group, so the first one represents it
                agg_dict[col] = values.iloc[0]
            elif pd.api.types.is_numeric_dtype(values):
                # min_count=1 keeps an all-missing group missing instead of reporting 0
                agg_dict[col] = values.sum(min_count=1)
            else:
                distinct_text = values.dropna().astype(str).unique()
                # No text at all stays missing rather than becoming an empty string
                agg_dict[col] = ', '.join(distinct_text) if len(distinct_text) else float('nan')
        return pd.Series(agg_dict)

    # Check for initial duplicates.
    # Missing keys compare equal in duplicated(), so rows with missing keys can be listed too.
    initial_duplicates = merged_df[merged_df.duplicated(subset=available_merge_keys, keep=False)]
    if not initial_duplicates.empty:
        print(f"Initial duplicates found based on keys {available_merge_keys}:")
        print(initial_duplicates)

    # Handle rows with missing keys separately: they are not aggregated, only appended at the end
    complete_keys_df = merged_df.dropna(subset=available_merge_keys)
    incomplete_keys_df = merged_df[merged_df[available_merge_keys].isnull().any(axis=1)]

    # Continue with the rows whose merge keys are all present
    merged_df = complete_keys_df

    # Aggregate each key group. Iterating over the groups hands aggregate_data the key
    # columns explicitly instead of relying on groupby.apply passing them along.
    if not merged_df.empty:
        aggregated_rows = [
            aggregate_data(group).to_dict()
            for _, group in merged_df.groupby(available_merge_keys)
        ]
        merged_df = pd.DataFrame(aggregated_rows)

    # Check for duplicates after aggregation
    post_aggregation_duplicates = merged_df[merged_df.duplicated(subset=available_merge_keys, keep=False)]
    if not post_aggregation_duplicates.empty:
        print(f"Post-aggregation duplicates found based on keys {available_merge_keys}:")
        print(post_aggregation_duplicates)

    # Reorder columns to have available merge keys first and then the rest alphabetically
    remaining_columns = [col for col in merged_df.columns if col not in available_merge_keys]
    # Sort the remaining columns alphabetically; put columns starting with digits at the end
    # (x[:1] also tolerates an empty column name)
    sorted_columns = sorted(remaining_columns, key=lambda x: (x[:1].isdigit(), x))

    # Define the final column order with available merge keys first
    column_order = available_merge_keys + sorted_columns
    # Reorder the DataFrame columns according to the defined order
    merged_df = merged_df[column_order]

    # Append rows with incomplete keys to the end of the DataFrame
    if incomplete_keys_df.empty:
        final_df = merged_df.reset_index(drop=True)
    else:
        final_df = pd.concat([merged_df, incomplete_keys_df], ignore_index=True)

    # Remove columns and rows that are entirely empty. This runs on the final frame
    # because the concat above would otherwise bring dropped columns back.
    final_df = (
        final_df
        .dropna(axis=1, how='all')
        .dropna(axis=0, how='all')
        .reset_index(drop=True)
    )

    # Save the merged and processed data to a new CSV file
    final_df.to_csv('3. merged_Prices_Prices-Of-Consumption-Items_CID-Price_ID-Levels.csv', index=False)
    print("Merged data saved successfully!")
else:
    print("No data to merge and save.")


Looking for file: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\3. merged_Prices_Prices-Of-Consumption-Items.csv
File found: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\3. merged_Prices_Prices-Of-Consumption-Items.csv
Initial duplicates found based on keys ['CID', 'Price_ID']:
               CID  Survey_Round  Item_Code     Unit  Qty_Unit  D_Prices  \
0      101010203.0           1.0        1.0        1       1.0      60.0   
1      111284344.0           1.0        1.0        1       1.0      45.0   
2      108192930.0           1.0        1.0        1       1.0      50.0   
3      107172728.0           1.0        1.0        1       1.0      70.0   
4      109233435.0           1.0        1.0        1       1.0      45.0   
...            ...           ...        ...      ...       ...       ...   
16935  318467172.0           3.0       94.0  Package       1.0     -44.0   
16936  319477273.0           3.0       94.0  Packa

In [8]:
# CID, DLR_Caste, UCLR_Caste

import os
import pandas as pd

# Load every input file into its own DataFrame and concatenate once at the end;
# growing merged_df with pd.concat inside the loop re-copies all earlier rows each pass.
# file_names and data_folder come from the configuration cell.
loaded_frames = []

for file_name in file_names:
    file_path = os.path.join(data_folder, file_name)  # Create the full file path
    print(f"Looking for file: {file_path}")  # Debug print
    if not os.path.exists(file_path):
        # Missing files are reported and skipped rather than treated as fatal
        print(f"File not found: {file_path}")  # Debug print
        continue

    print(f"File found: {file_path}")  # Debug print
    # Read every column as text first so mixed-type columns load without dtype guessing
    df = pd.read_csv(file_path, dtype=str)

    # Convert fully numeric columns to double precision (float64).
    # No downcast is requested: float32 cannot hold 9-digit identifiers such as CID
    # exactly and loses precision once values are summed.
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], errors='raise').astype('float64')
        except ValueError:
            # Keep non-convertible columns as strings
            continue

    loaded_frames.append(df)

# An empty DataFrame signals "nothing loaded" to the processing step that follows
merged_df = pd.concat(loaded_frames, ignore_index=True) if loaded_frames else pd.DataFrame()

# Proceed only if merged_df is not empty
if not merged_df.empty:
    # Define the merge keys
    merge_keys = ['CID', 'DLR_Caste', 'UCLR_Caste']  # Example of multiple keys
    # Filter out merge keys that are not in the DataFrame
    available_merge_keys = [key for key in merge_keys if key in merged_df.columns]
    missing_merge_keys = [key for key in merge_keys if key not in merged_df.columns]
    if not available_merge_keys:
        # groupby() with an empty key list fails with an opaque pandas error; fail early instead
        raise ValueError(f"None of the merge keys {merge_keys} are present in the data; cannot aggregate.")
    if missing_merge_keys:
        print(f"Warning: merge keys not found in the data and ignored: {missing_merge_keys}")

    def aggregate_data(group):
        """Collapse all rows sharing one merge-key combination into a single row.

        Parameters
        ----------
        group : pd.DataFrame
            Rows of one key combination; must still contain the key columns.

        Returns
        -------
        pd.Series
            One entry per column of ``group``: the first value for merge keys, the sum
            for numeric columns, and the distinct non-missing values joined with ', '
            for all other columns. A column with no values in the group stays missing.

        NOTE(review): numeric columns that look like date parts but are not merge keys
        (e.g. DLR_DOI_d, DLR_DOI_m, DLR_DOI_y in the recorded output) are summed as
        well — confirm that is intended.
        """
        agg_dict = {}
        for col in group.columns:
            values = group[col]
            if col in available_merge_keys:
                # Keys are identical within a group, so the first one represents it
                agg_dict[col] = values.iloc[0]
            elif pd.api.types.is_numeric_dtype(values):
                # min_count=1 keeps an all-missing group missing instead of reporting 0
                agg_dict[col] = values.sum(min_count=1)
            else:
                distinct_text = values.dropna().astype(str).unique()
                # No text at all stays missing rather than becoming an empty string
                agg_dict[col] = ', '.join(distinct_text) if len(distinct_text) else float('nan')
        return pd.Series(agg_dict)

    # Check for initial duplicates.
    # Missing keys compare equal in duplicated(), so rows with missing keys can be listed too.
    initial_duplicates = merged_df[merged_df.duplicated(subset=available_merge_keys, keep=False)]
    if not initial_duplicates.empty:
        print(f"Initial duplicates found based on keys {available_merge_keys}:")
        print(initial_duplicates)

    # Handle rows with missing keys separately: they are not aggregated, only appended at the end
    complete_keys_df = merged_df.dropna(subset=available_merge_keys)
    incomplete_keys_df = merged_df[merged_df[available_merge_keys].isnull().any(axis=1)]

    # Continue with the rows whose merge keys are all present
    merged_df = complete_keys_df

    # Aggregate each key group. Iterating over the groups hands aggregate_data the key
    # columns explicitly instead of relying on groupby.apply passing them along.
    if not merged_df.empty:
        aggregated_rows = [
            aggregate_data(group).to_dict()
            for _, group in merged_df.groupby(available_merge_keys)
        ]
        merged_df = pd.DataFrame(aggregated_rows)

    # Check for duplicates after aggregation
    post_aggregation_duplicates = merged_df[merged_df.duplicated(subset=available_merge_keys, keep=False)]
    if not post_aggregation_duplicates.empty:
        print(f"Post-aggregation duplicates found based on keys {available_merge_keys}:")
        print(post_aggregation_duplicates)

    # Reorder columns to have available merge keys first and then the rest alphabetically
    remaining_columns = [col for col in merged_df.columns if col not in available_merge_keys]
    # Sort the remaining columns alphabetically; put columns starting with digits at the end
    # (x[:1] also tolerates an empty column name)
    sorted_columns = sorted(remaining_columns, key=lambda x: (x[:1].isdigit(), x))

    # Define the final column order with available merge keys first
    column_order = available_merge_keys + sorted_columns
    # Reorder the DataFrame columns according to the defined order
    merged_df = merged_df[column_order]

    # Append rows with incomplete keys to the end of the DataFrame
    if incomplete_keys_df.empty:
        final_df = merged_df.reset_index(drop=True)
    else:
        final_df = pd.concat([merged_df, incomplete_keys_df], ignore_index=True)

    # Remove columns and rows that are entirely empty. This runs on the final frame
    # because the concat above would otherwise bring dropped columns back.
    final_df = (
        final_df
        .dropna(axis=1, how='all')
        .dropna(axis=0, how='all')
        .reset_index(drop=True)
    )

    # Save the merged and processed data to a new CSV file
    final_df.to_csv('2. merged_Prices_Basic-Information-about-Respondents_CID-DLR_Caste-UCLR_Caste-Levels.csv', index=False)
    print("Merged data saved successfully!")
else:
    print("No data to merge and save.")


Looking for file: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\2. merged_Prices_Basic-Information-about-District, UC and M-Level-Respondents.csv
File found: C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\2. MERGED CSV Sections\2. merged_Prices_Basic-Information-about-District, UC and M-Level-Respondents.csv
Initial duplicates found based on keys ['CID', 'DLR_Caste', 'UCLR_Caste']:
              CID  DLR_Caste   DLR_DOI  DLR_DOI_d  DLR_DOI_m  DLR_DOI_y  \
2     101010101.0      -88.0  5/5/2013       23.0        6.0     2014.0   
3     101010101.0      -88.0  5/5/2013       23.0        6.0     2014.0   
10    101010203.0       87.0  5/5/2013       27.0        6.0     2014.0   
11    101010203.0       87.0  5/5/2013       27.0        6.0     2014.0   
14    101020304.0      -88.0  5/5/2013       31.0        5.0     2014.0   
...           ...        ...       ...        ...        ...        ...   
1268  102040708.0        NaN       NaN        NaN   