In [None]:
'''
This notebook contains the code to merge all PRICES data across the years based on sections.
There are datasets for 2012, 2013 and 2014

'''

In [2]:
'''
All files in the respective folder will be converted to CSV format for readability
This will be done for all male files across the years
'''

import pandas as pd
import glob

# Path where the .dta files are located
folder_path = r'C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\1. Merging by Parts\1. Cover'

# Get a list of all .dta files in the specified directory
file_list = glob.glob(folder_path + '/*.dta')

# Loop through the list of files
for file in file_list:
    # Read the .dta file into a pandas DataFrame
    df = pd.read_stata(file, convert_categoricals=False)
    
    # Define the output file name by replacing .dta with .xlsx
    output_file = file.replace('.dta', '.csv')
    
    # Write the DataFrame to an Excel file
    df.to_csv(output_file)


In [None]:
'''
The following code block will merge all "cover" files across the years.
* First, we are going to read the respective files and store them as data frames
* Next, we are going to define column mappings that I have already figured out via manual methods
'''

In [1]:
# This code block stores file paths to variables to make the code neat
# The stored variables are called in the read_excel function and stored as dataframes

import pandas as pd

# Store excel file locations to variables (change it as per your path to file)

path_2013=r"C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\1. Merging by Parts\1. Cover\2013_cover_prices.csv"
path_2014=r"C:\Users\warra\Desktop\Freelance\data\data\PriceMerge\1. Merging by Parts\1. Cover\2014_cover_page_price.csv"

# Read csv files

df_2013 = pd.read_csv(path_2013)
df_2014 = pd.read_csv(path_2014)


In [2]:
'''
This code block will standardize column names across the years to avoid discrepancies during the merging process.
For example, the cover data for 2013 is noth the same as in 2014 data, but they have the same data under their differently named variables. Hence, we decide to rename such columns beforehand
We will add the updated name to the mapping dictionaries instead of the original names.
'''

# Rename columns in df_2013
df_2013.rename(columns={
    'cid': 'cid',
    'round': 'round',
    'q1': 'province',
    'q2': 'province_id',
    'q3': 'district',
    'q4': 'district_id',
    'q5': 'tehsil',
    'q6': 'tehsil_id',
    'q7': 'union_council',
    'q8': 'uc_id',
    'q10': 'mauza_id'
}, inplace=True)

# df_2014 doesn't need renaming as it is the reference

In [4]:
# Updated mappings
mapping_2013 = [
    'cid', 'round', 'province', 'province_id', 'district', 
    'district_id', 'tehsil', 'tehsil_id', 'union_council', 
    'uc_id', 'mauza_id'
]

mapping_2014 = [
    'cid', 'round', 'province', 'province_id', 'district', 
    'district_id', 'tehsil', 'tehsil_id', 'union_council', 
    'uc_id', 'mauza_id'
]


In [5]:
# Create a list of all possible columns in the correct order
all_columns = []
   
for col in mapping_2013:
    if col and col not in all_columns:
        all_columns.append(col)
for col in mapping_2014:
    if col and col not in all_columns:
        all_columns.append(col)



In [6]:
def standardize_and_merge(dfs, mappings, all_columns):
    merged_data = {col: [] for col in all_columns}

    for df, mapping in zip(dfs, mappings):
        print(f"Processing DataFrame with columns: {df.columns.tolist()}")
        for i, col in enumerate(mapping):
            if col:
                ref_col = col.strip()  # Remove leading/trailing whitespace
                if ref_col not in merged_data:
                    merged_data[ref_col] = []
                if ref_col in df.columns:
                    print(f"Appending data for column {ref_col}")
                    if isinstance(df[ref_col], pd.Series):
                        merged_data[ref_col].extend(df[ref_col].tolist())
                    elif isinstance(df[ref_col], pd.DataFrame):
                        print(f"Column {ref_col} is duplicated in DataFrame. Appending data for each duplicate.")
                        for _, series in df[ref_col].items():
                            merged_data[ref_col].extend(series.tolist())
                else:
                    print(f"Column {ref_col} not found in DataFrame. Adding NaNs.")
                    merged_data[ref_col].extend([np.nan] * len(df))
    
    max_len = max(len(v) for v in merged_data.values())
    for key in merged_data:
        col_len = len(merged_data[key])
        if col_len < max_len:
            merged_data[key].extend([np.nan] * (max_len - col_len))

    merged_df = pd.DataFrame.from_dict(merged_data)
    return merged_df


In [7]:
# Usage with dataframes and mappings
dfs = [ df_2013, df_2014]
mappings = [mapping_2013, mapping_2014]

merged_df = standardize_and_merge(dfs, mappings, all_columns)

Processing DataFrame with columns: ['Unnamed: 0', 'cid', 'round', 'province', 'province_id', 'district', 'district_id', 'tehsil', 'tehsil_id', 'union_council', 'uc_id', 'mauza_id']
Appending data for column cid
Appending data for column round
Appending data for column province
Appending data for column province_id
Appending data for column district
Appending data for column district_id
Appending data for column tehsil
Appending data for column tehsil_id
Appending data for column union_council
Appending data for column uc_id
Appending data for column mauza_id
Processing DataFrame with columns: ['Unnamed: 0', 'cid', 'round', 'province', 'province_id', 'district', 'district_id', 'tehsil', 'tehsil_id', 'union_council', 'uc_id', 'mauza_id']
Appending data for column cid
Appending data for column round
Appending data for column province
Appending data for column province_id
Appending data for column district
Appending data for column district_id
Appending data for column tehsil
Appending dat

In [8]:
# Rename columns for the merged file (if needed)
rename_mapping = {
    'cid': 'CID',
    'round': 'Survey_Round',
    'province': 'P_Name',
    'province_id': 'P_ID',
    'district': 'D_Name',
    'district_id': 'D_ID',
    'tehsil': 'T_Name',
    'tehsil_id': 'T_ID',
    'union_council': 'UC_Name',
    'uc_id': 'UC_ID',
    'mauza_id': 'M_ID'
}

merged_df.rename(columns=rename_mapping, inplace=True)

# Save the merged dataframe to a CSV file
merged_df.to_csv('merged_cover.csv', index=False)
