In [46]:
# import packages
import pandas as pd
from pandas import json_normalize
import json

In [47]:
file_paths = {
    'Bangalore': 'Raw_Datasets/bangalore_cars.xlsx',
    'Chennai': 'Raw_Datasets/chennai_cars.xlsx',
    'Delhi': 'Raw_Datasets/delhi_cars.xlsx',
    'Hyderabad': 'Raw_Datasets/hyderabad_cars.xlsx',
    'Jaipur': 'Raw_Datasets/jaipur_cars.xlsx',
    'Kolkata': 'Raw_Datasets/kolkata_cars.xlsx'
}

In [48]:
dataframes = []

def add_city_and_concatenate(file_paths):
    for city,path in file_paths.items():
        df = pd.read_excel(path)
        df["city"] = city
        dataframes.append(df)
    return pd.concat(dataframes,ignore_index=True)

concatenated_df = add_city_and_concatenate(file_paths)


In [49]:
def convert_jsonstr_to_dict(df):
    columns_to_be_checked = ['new_car_overview', 'new_car_feature', 'new_car_specs']
    for column in columns_to_be_checked:
        df[column] = df[column].apply(lambda x: eval(x) if isinstance(x,str) else x)
    return df

In [50]:
def format_new_car_overview(record):
    temp_dict = {}
    top_items = record.get("top",[])
    for item in top_items:
        key = item.get("key")
        value = item.get("value")
        temp_dict[key] = value
    return temp_dict

In [51]:
def format_new_car_feature(record):
    temp_list = []
    top_items = record.get("top", [])
    for item in top_items:
        temp_list.append(item['value'])

    data_items = record.get("data", [])
    for item in data_items:
        sub_items = item.get("list", [])
        for sub_item in sub_items:
            temp_list.append(sub_item['value'])

    return temp_list


In [52]:
def format_new_car_specs(record):
    temp_dict = {}

    top_items = record.get("top",[])
    for item in top_items:
        if 'key' in item and 'value' in item:
            temp_dict[item['key']] = item['value']

    data_items = record.get("data",[])
    for item in data_items:
        sub_items = item.get("list",[])
        for item in sub_items:
            if 'key' in item and 'value' in item:
                temp_dict[item['key']] = item['value']

    return temp_dict

In [53]:
def apply_data_extraction_to_columns(formatted_df):
    formatted_df['new_car_detail'] = formatted_df['new_car_detail'].apply(lambda x: eval(x) if isinstance(x,str) else x)
    formatted_df['new_car_overview'] = formatted_df['new_car_overview'].apply(lambda x: format_new_car_overview(x) if isinstance(x,dict) else x)
    formatted_df['new_car_feature'] = formatted_df['new_car_feature'].apply(lambda x: format_new_car_feature(x) if isinstance(x,dict) else x)
    formatted_df['new_car_specs'] = formatted_df['new_car_specs'].apply(lambda x: format_new_car_specs(x) if isinstance(x,dict) else x)
    return formatted_df

In [54]:
concatenated_df = add_city_and_concatenate(file_paths)
formatted_df = convert_jsonstr_to_dict(concatenated_df)
extracted_df = apply_data_extraction_to_columns(formatted_df)

# Flatten each nested column and create separate DataFrames
car_details = pd.DataFrame(extracted_df["new_car_detail"].tolist())
car_overview = pd.DataFrame(extracted_df["new_car_overview"].tolist())
car_specs = pd.DataFrame(extracted_df["new_car_specs"].tolist())

# Drop nested columns from original df
extracted_df = extracted_df.drop(columns=["new_car_detail", "new_car_overview", "new_car_specs"])

#Now concat flattened indivial columns to original dataframe
flattened_df  = pd.concat([extracted_df,car_details,car_overview,car_specs],axis=1)

In [55]:
flattened_df

Unnamed: 0,new_car_feature,car_links,city,it,ft,bt,km,transmission,ownerNo,owner,...,Front Brake Type,Rear Brake Type,Top Speed,Acceleration,Tyre Type,No Door Numbers,Cargo Volumn,Wheel Size,Alloy Wheel Size,Ground Clearance Unladen
0,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/used-car-details/used...,Bangalore,0,Petrol,Hatchback,120000,Manual,3,3rd Owner,...,Ventilated Disc,Drum,150 Kmph,15.05 Seconds,"Tubeless, Radial",5,235-litres,,,
1,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/buy-used-car-details/...,Bangalore,0,Petrol,SUV,32706,Manual,2,2nd Owner,...,Ventilated Disc,Drum,,,"Tubeless,Radial",4,352-litres,16,16,
2,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/used-car-details/used...,Bangalore,0,Petrol,Hatchback,11949,Manual,1,1st Owner,...,Disc,Drum,150 kmph,14.3 Seconds,Tubeless,5,242-litres,14,14,
3,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/buy-used-car-details/...,Bangalore,0,Petrol,Sedan,17794,Manual,1,1st Owner,...,Disc,Drum,172km/hr,14.2 Seconds,"Tubeless,Radial",4,407-litres,14,14,
4,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/used-car-details/used...,Bangalore,0,Diesel,SUV,60000,Manual,1,1st Owner,...,Ventilated Disc,Solid Disc,190 Kmph,12 Seconds,"Tubeless,Radial",5,353-litres,16,16,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16733,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/used-car-details/used...,Kolkata,0,Petrol,Hatchback,10000,Manual,1,1st Owner,...,Ventilated Disc,Drum,,,"Tubeless, Radial",5,313,,,
16734,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/used-car-details/used...,Kolkata,0,Petrol,Hatchback,120000,Manual,1,1st Owner,...,Solid Disc,Drum,140 kmph,19 Seconds,Tubeless Tyres,5,177-litres,,,
16735,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/used-car-details/used...,Kolkata,0,Petrol,Sedan,50000,Automatic,3,3rd Owner,...,Ventilated Disc,Solid Disc,230km/hr,8.8 Seconds,"Tubeless,Radial",4,475-litres,17,17,
16736,"[Power Steering, Power Windows Front, Air Cond...",https://www.cardekho.com/used-car-details/used...,Kolkata,0,Petrol,Hatchback,40000,Manual,1,1st Owner,...,Ventilated Disc,Drum,156 Kmph,15 Seconds,"Tubeless,Radial",5,236-liters,14,14,


In [56]:
flattened_df.to_excel("processed_df.xlsx",engine="openpyxl",index=False)