In [1]:
# importing dependencies

import pandas as pd
import numpy as np
import ast

In [2]:
# show every column when a frame is displayed, so wide frames are not truncated
pd.options.display.max_columns = None

# parquet chunk processed by this notebook
file_path = 'C:/Users/aksha/OneDrive/Desktop/CARS-FINAL_YEAR_PROJECT/DATA/03_All_parquet_files/data_part_5.parquet'

# read the chunk with the fastparquet engine and preview the first rows
df = pd.read_parquet(file_path, engine="fastparquet")
df.head()

Unnamed: 0.1,Unnamed: 0,image_list,new_used,car_name,mileage,price,price_drop,deal_type,key_specs,basics,features,other_features,vehicle_history,seller_name,seller_rating,people_count_seller_rating,seller_address,seller_site,car_rating,people_count_car_rating
0,1947,,,,,,,,,,,,,,,,,,,
1,758,,,,,"\n $21,252\n",\n $216 price drop\n,\n Good Deal \n,,,,,,,,,,,,
2,233,['https://platform.cstatic-images.com/xlarge/i...,Used,2021 Chevrolet Tahoe LT,"49,943 mi.","\n $46,995\n","\n $1,000 price drop\n",\n Good Deal | $424 under\n,,{'Exterior color': '\n Midnight Blue Metall...,{'Convenience': '\n\nHeated Seats\nKeyless Ent...,"['\n1.5 KW Heater/defrost Air System\n', '\n1....","{'Accidents or damage': 'None reported', '1-ow...",\n Lexus of Arlington\n,4.9,\n(426 reviews)\n,"1510 W Dundee Rd, Palatine, IL 60074",https://www.lexusofarlington.com/?utm_source=c...,4.1,(47 reviews)
3,412,['https://platform.cstatic-images.com/xlarge/i...,Used,2023 Mazda CX-5 Signature,"16,621 mi.","\n $29,999\n",\n $354 price drop\n,\n Good Deal | $542 under\n,,{'Exterior color': '\n Rhodium White Metall...,{'Convenience': '\n\nAdaptive Cruise Control\n...,"['\n12V power outlets 3 12V power outlets\n', ...","{'Accidents or damage': 'None reported', '1-ow...",\n DCH Ford of Eatontown\n,4.8,\n(661 reviews)\n,"85 NJ-36, Eatontown, NJ 07724",https://www.dchfordofeatontown.com/?utm_source...,4.3,(13 reviews)
4,145,['https://platform.cstatic-images.com/xlarge/i...,Used,2023 Chevrolet Silverado 1500 Custom,"30,277 mi.","\n $34,500\n",\n $500 price drop\n,\n Fair Deal \n,,"{'Exterior color': '\n Red Hot\n ', 'Inter...",{'Convenience': '\n\nKeyless Entry\nKeyless St...,"['\n10-Way Power Driver Seat w/Lumbar\n', '\n1...","{'Accidents or damage': 'None reported', '1-ow...",\n AutoMax Preowned Marlborough\n,5.0,\n(194 reviews)\n,"400 Maple St. Rt. 85, Marlborough, MA 01752",http://www.automaxpreowned.com?utm_source=cars...,3.8,(8 reviews)


#### COLUMN DESCRIPTION

1. `Unnamed: 0` : Column is redundant and not useful.
2. `image_list` : contains links to images of that particular car.
3. `new_used` : indicates whether the car is new, used, or certified.
4. `car_name` : contains the name of the car and some additional info such as the model year.
5. `mileage` : the odometer reading.
6. `price` : price of the car in USD.
7. `price_drop` : tells whether (and by how much) the price of the car has dropped since the date of listing.
8. `deal_type` : contains the deal category, e.g. good, fair, etc.
9. `key_specs` : contains some features that are present in electric cars only; hence this column is null for non-electric cars.
10. `basics` : contains info such as the car's interior color, exterior color, etc.
11. `features` : contains info such as convenience, safety and seating.
12. `other_features` : gives comprehensive info about the remaining features of the car.
13. `vehicle_history` : gives info such as 1-owner vehicle, accidents or damage, etc.
14. `seller_rating` : the rating given to the seller; it is not specific to the car.
15. `people_count_seller_rating` : the number of people who rated the seller; not specific to a particular car.
16. `seller_site` : link to the seller's website.
17. `seller_address` : contains the zip code, state and city of the seller.
18. `car_rating` : rating of the car; it is not specific to that row but is the rating of all such cars in the data.
19. `people_count_car_rating` : number of people who rated the car.

In [3]:
# quick summary of data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67192 entries, 0 to 67191
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  67192 non-null  int64  
 1   image_list                  53371 non-null  object 
 2   new_used                    54268 non-null  object 
 3   car_name                    54268 non-null  object 
 4   mileage                     53549 non-null  object 
 5   price                       65680 non-null  object 
 6   price_drop                  29991 non-null  object 
 7   deal_type                   40648 non-null  object 
 8   key_specs                   1037 non-null   object 
 9   basics                      54268 non-null  object 
 10  features                    53415 non-null  object 
 11  other_features              52910 non-null  object 
 12  vehicle_history             33739 non-null  object 
 13  seller_name                 538

#### Observations
1. Need to remove the `Unnamed: 0` column.
2. The data types of some columns are not correct.
3. Need to expand the columns that contain data in JSON-like (dict/list) format, such as `basics`, `features`, `key_specs`, etc.

In [4]:
# number of duplicate rows in the dataset (a row counts as a duplicate when it
# repeats an earlier row across every column)
print(df.duplicated().sum())

610


In [5]:
# dropping duplicates in place; the original row labels are kept (the index is not reset)
df.drop_duplicates(inplace=True)

In [6]:
# Dropping the not useful column ('Unnamed: 0'), in place
df.drop(columns='Unnamed: 0',inplace=True)

##### The main goal of this notebook is to expand the JSON-like (dict/list) columns.

In [7]:
# The nested columns come back from the parquet file as plain strings holding
# Python-literal text (e.g. "{'Exterior color': ...}"), not as real dicts/lists.
# ast.literal_eval turns that text back into dict/list objects.
#
# Only str cells are parsed. Anything else is passed through unchanged:
#   * missing cells (None, or a float NaN placeholder) would make literal_eval raise;
#   * already-parsed dicts/lists would make literal_eval raise when this cell is re-run.

def _parse_literal_cell(cell):
    """Return the Python object encoded in `cell` when it is a string, else `cell` itself."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


for nested_col in ['key_specs', 'basics', 'features', 'other_features', 'vehicle_history']:
    df[nested_col] = df[nested_col].apply(_parse_literal_cell)

In [8]:
# <------------------------------ Expanding the Basics column ------------------------------------>


# start from fresh, independent lists: one per key pulled out of `basics`
Exterior_color, Interior_color, Drivetrain, MPG, Fuel_Type = [], [], [], [], []
Transmission, Engine, VIN, Stock_no, Mileage = [], [], [], [], []


def expending_basics(basics_col):
    """
    Append one row's `basics` values to the module-level column lists.

    Exactly one item is appended to each of the ten lists on every call, so the
    lists stay aligned with the rows they are built from.

    Parameters
    ----------
    basics_col : dict or any non-dict placeholder (None, NaN, ...)
        The parsed `basics` entry of a single row.

    Returns
    -------
    None
        The result is the side effect on Exterior_color, Interior_color,
        Drivetrain, MPG, Fuel_Type, Transmission, Engine, VIN, Stock_no and
        Mileage, which must already exist at module level.
    """
    # (destination list, key looked up in the basics dict)
    destinations = (
        (Exterior_color, 'Exterior color'),
        (Interior_color, 'Interior color'),
        (Drivetrain, 'Drivetrain'),
        (MPG, 'MPG'),
        (Fuel_Type, 'Fuel type'),
        (Transmission, 'Transmission'),
        (Engine, 'Engine'),
        (VIN, 'VIN'),
        (Stock_no, 'Stock #'),
        (Mileage, 'Mileage'),
    )

    # An explicit type check replaces the former bare `except:` clauses, which
    # also swallowed unrelated errors (including KeyboardInterrupt).
    has_values = isinstance(basics_col, dict)
    for bucket, key in destinations:
        # a missing key or a non-dict entry is recorded as NaN
        bucket.append(basics_col.get(key, np.nan) if has_values else np.nan)

# feed each row's basics entry through the extractor, in row order
for basics_entry in df['basics'].to_numpy():
    expending_basics(basics_entry)

In [9]:
# sanity check on expending_basics: print the length of each list; they should all match
for basics_values in (Exterior_color, Interior_color, Drivetrain, MPG, Fuel_Type,
                      Transmission, Engine, VIN, Stock_no, Mileage):
    print(len(basics_values))

66582
66582
66582
66582
66582
66582
66582
66582
66582
66582


In [10]:
# add the basics-derived columns at positions 8..17, in this order
basics_columns = [
    ('Exterior_color', Exterior_color),
    ('Interior_color', Interior_color),
    ('Drivetrain', Drivetrain),
    ('MPG', MPG),
    ('Fuel_Type', Fuel_Type),
    ('Transmission', Transmission),
    ('Engine', Engine),
    ('VIN', VIN),
    ('Stock_no', Stock_no),
    ('Mileage', Mileage),
]
for position, (column_name, column_values) in enumerate(basics_columns, start=8):
    df.insert(position, column_name, column_values)

In [11]:
# <------------------------------ Expanding the Features column ------------------------------------>


# fresh, independent lists: one per category key read from `features`
Convenience, Entertainment, Exterior, Safety, Seating = [], [], [], [], []


def expending_featues(col):
    """
    Append one row's `features` values to the module-level column lists.

    Exactly one item is appended to each of the five lists on every call, so
    the lists stay aligned with the rows they are built from.

    Parameters
    ----------
    col : dict or any non-dict placeholder (None, NaN, ...)
        The parsed `features` entry of a single row.

    Returns
    -------
    None
        The result is the side effect on Convenience, Entertainment, Exterior,
        Safety and Seating, which must already exist at module level.
    """
    # (destination list, key looked up in the features dict)
    destinations = (
        (Convenience, 'Convenience'),
        (Entertainment, 'Entertainment'),
        (Exterior, 'Exterior'),
        (Safety, 'Safety'),
        (Seating, 'Seating'),
    )

    # An explicit type check replaces the former bare `except:` clauses, which
    # also swallowed unrelated errors (including KeyboardInterrupt).
    has_values = isinstance(col, dict)
    for bucket, key in destinations:
        # a missing key or a non-dict entry is recorded as NaN
        bucket.append(col.get(key, np.nan) if has_values else np.nan)

# feed each row's features entry through the extractor, in row order
for features_entry in df['features'].to_numpy():
    expending_featues(features_entry)

In [12]:
# sanity check on expending_featues: print the length of each list; they should all match
for features_values in (Convenience, Entertainment, Exterior, Safety, Seating):
    print(len(features_values))

66582
66582
66582
66582
66582


In [13]:
# add the features-derived columns at positions 18..22, in this order
features_columns = [
    ('Convenience', Convenience),
    ('Entertainment', Entertainment),
    ('Exterior', Exterior),
    ('Safety', Safety),
    ('Seating', Seating),
]
for position, (column_name, column_values) in enumerate(features_columns, start=18):
    df.insert(position, column_name, column_values)

In [14]:
# <------------------------------ Expanding the Vehicle History column ------------------------------------>

# fresh, independent lists: one per key read from `vehicle_history`
Accidents_or_damage, Clean_title, one_owner_vehicle = [], [], []
Personal_use_only, Open_recall = [], []


def expending_vehicle_history(col):
    """
    Append one row's `vehicle_history` values to the module-level column lists.

    Exactly one item is appended to each of the five lists on every call, so
    the lists stay aligned with the rows they are built from.

    Parameters
    ----------
    col : dict or any non-dict placeholder (None, NaN, ...)
        The parsed `vehicle_history` entry of a single row.

    Returns
    -------
    None
        The result is the side effect on Accidents_or_damage, Clean_title,
        one_owner_vehicle, Personal_use_only and Open_recall, which must
        already exist at module level.
    """
    # (destination list, key looked up in the vehicle-history dict)
    destinations = (
        (Accidents_or_damage, 'Accidents or damage'),
        (Clean_title, 'Clean title'),
        (one_owner_vehicle, '1-owner vehicle'),
        (Personal_use_only, 'Personal use only'),
        (Open_recall, 'Open recall'),
    )

    # An explicit type check replaces the former bare `except:` clauses, which
    # also swallowed unrelated errors (including KeyboardInterrupt).
    has_values = isinstance(col, dict)
    for bucket, key in destinations:
        # a missing key or a non-dict entry is recorded as NaN
        bucket.append(col.get(key, np.nan) if has_values else np.nan)

# feed each row's vehicle-history entry through the extractor, in row order
for history_entry in df['vehicle_history'].to_numpy():
    expending_vehicle_history(history_entry)

In [15]:
# sanity check on expending_vehicle_history: print the length of each list; they should all match
for history_values in (Accidents_or_damage, Clean_title, one_owner_vehicle,
                       Personal_use_only, Open_recall):
    print(len(history_values))

66582
66582
66582
66582
66582


In [16]:
# add the vehicle-history-derived columns at positions 23..27, in this order
history_columns = [
    ('Accidents_or_damage', Accidents_or_damage),
    ('Clean_title', Clean_title),
    ('one_owner_vehicle', one_owner_vehicle),
    ('Personal_use_only', Personal_use_only),
    ('Open_recall', Open_recall),
]
for position, (column_name, column_values) in enumerate(history_columns, start=23):
    df.insert(position, column_name, column_values)

In [17]:
# <------------------------------ Expanding the Key_specs column ------------------------------------>

# fresh, independent lists: one per value read from `key_specs`
Fuel_Type_e, mpge, level2_charging, dc_fast_charging = [], [], [], []
battery_capacity, excepted_range, battery_range_score = [], [], []

def expending_key_specs(col):
    """
    Append one row's `key_specs` values to the module-level column lists.

    Exactly one item is appended to each of the seven lists on every call, so
    the lists stay aligned with the rows they are built from.

    Lookup order for every destination list:
      1. the exact scraped key (label plus its full tooltip text);
      2. otherwise, any string key whose first non-empty line equals the short
         label (e.g. 'Battery range score');
      3. otherwise NaN.

    Step 2 exists because the exact keys embed long tooltip text. The
    battery-range-score key even ends with a rating-specific sentence
    ("Outstanding – ..."), so entries with different tooltip wording would
    presumably never match exactly. NOTE(review): confirm against the data.

    Parameters
    ----------
    col : dict or any non-dict placeholder (None, NaN, ...)
        The parsed `key_specs` entry of a single row.

    Returns
    -------
    None
        The result is the side effect on Fuel_Type_e, mpge, level2_charging,
        dc_fast_charging, battery_capacity, excepted_range and
        battery_range_score, which must already exist at module level.
    """
    # (destination list, exact scraped key, short label the key starts with)
    destinations = (
        (Fuel_Type_e, 'Fuel type', 'Fuel type'),
        (mpge, '\nMPGe\nMPGe\nMiles per gallon-equivalent is how the EPA provides efficiency ratings for battery-electric vehicles in a way that can be used in comparison with gasoline-powered vehicles. Actual mileage will vary depending on driving conditions, driving habits, elevation changes, weather, accessory usage (lights, climate control), vehicle condition and other factors.\n\n  Related: Top 10 Most Efficient Electric Cars\n\n\n\n', 'MPGe'),
        (level2_charging, '\nLevel 2 charging\nLevel 2 charging\nCharge time estimates are based on using a 240-volt charging circuit charging from empty to 100% battery capacity.  Level 2 is the fastest way to charge at home, though charging times can vary and are dependent on factors such as the capabilities of the charging circuit, charging equipment and the vehicle’s onboard charger. Level 2 charging time provided by Chrome Data, a JD Power company.\n', 'Level 2 charging'),
        (dc_fast_charging, '\nDC fast charging\nDC fast charging\nDC fast charging is the fastest way to charge and only available at pay-for-use public charging stations, though some EVs come with complimentary charging for a limited time.  Real-world DC fast charging times can vary greatly, even on the same vehicle, because of this type of charging’s sensitivities to ambient and battery conditions. DC fast charging time provided by Chrome Data, a JD Power company.\n', 'DC fast charging'),
        (battery_capacity, '\nBattery capacity\nBattery capacity\nBattery capacity is measured in kilowatt-hours, which is a measure of how much energy is used over time. A 70-kWh battery has more energy capacity than a 50-kWh battery and would result in a longer driving range if all other factors were equal. But more battery capacity doesn’t always mean longer range because of differences in energy consumption from vehicle to vehicle. Battery capacity provided by Chrome Data, a JD Power company.\n', 'Battery capacity'),
        (excepted_range, "\nExpected range\nExpected range\nExpected Range is Recurrent's prediction of the distance this vehicle can travel on a full charge. This value is based on the age, mileage, and location of this vehicle, and the battery pack of this vehicle model and of vehicles in similar model classes. Actual range will vary depending on the condition of this vehicle's battery pack, how you drive, driving conditions and other factors.\n", 'Expected range'),
        (battery_range_score, "\nBattery range score\nBattery range score\nThe battery range score is based on this vehicle's current expected range relative to the vehicles expected range when new.\nOutstanding – The expected range exceeds what’s normal when new for this make, model, and battery size\n", 'Battery range score'),
    )

    # An explicit type check replaces the former bare `except:` clauses, which
    # also swallowed unrelated errors (including KeyboardInterrupt).
    if not isinstance(col, dict):
        for bucket, _exact_key, _label in destinations:
            bucket.append(np.nan)
        return

    # Index the entries by the first non-empty line of their key. On a label
    # clash the first entry wins (setdefault), keeping the result deterministic.
    by_label = {}
    for raw_key, value in col.items():
        if isinstance(raw_key, str):
            label = raw_key.strip().split('\n', 1)[0].strip()
            by_label.setdefault(label, value)

    for bucket, exact_key, label in destinations:
        if exact_key in col:
            # exact match keeps the previous behaviour for keys that already worked
            bucket.append(col[exact_key])
        else:
            bucket.append(by_label.get(label, np.nan))

# feed each row's key_specs entry through the extractor, in row order
for key_specs_entry in df['key_specs'].to_numpy():
    expending_key_specs(key_specs_entry)

# add the key_specs-derived columns at positions 28..34, in this order
key_specs_columns = [
    ('Fuel_Type_e', Fuel_Type_e),
    ('mpge', mpge),
    ('level2_charging', level2_charging),
    ('dc_fast_charging', dc_fast_charging),
    ('battery_capacity', battery_capacity),
    ('excepted_range', excepted_range),
    ('battery_range_score', battery_range_score),
]
for position, (column_name, column_values) in enumerate(key_specs_columns, start=28):
    df.insert(position, column_name, column_values)

In [18]:
# sanity check on expending_key_specs: print the length of each list; they should all match
for key_specs_values in (Fuel_Type_e, mpge, level2_charging, dc_fast_charging,
                         battery_capacity, excepted_range, battery_range_score):
    print(len(key_specs_values))

66582
66582
66582
66582
66582
66582
66582


In [19]:
# columns left out when judging whether a row is essentially empty
non_essential_cols = [
    'image_list', 'mileage', 'new_used', 'price_drop',
    'Fuel_Type_e', 'mpge', 'level2_charging', 'dc_fast_charging',
    'battery_capacity', 'excepted_range', 'battery_range_score',
    'vehicle_history', 'deal_type', 'key_specs',
    'Exterior_color', 'Interior_color', 'Drivetrain', 'MPG', 'Fuel_Type',
    'Transmission', 'Engine', 'VIN', 'Stock_no', 'Mileage',
    'Convenience', 'Entertainment', 'Exterior', 'Safety', 'Seating',
    'Accidents_or_damage', 'Clean_title', 'one_owner_vehicle',
    'Personal_use_only', 'Open_recall',
    'other_features', 'seller_site',
]

# whatever remains after the drop is treated as the important columns
imp_cols = df.drop(columns=non_essential_cols)

# number of missing important values in each row
fltr = imp_cols.isna().sum(axis=1)

# rows missing 6 or more (i.e. more than 5) important values are removed from the full frame
sparse_row_labels = fltr.index[fltr >= 6]
inter_df = df.drop(index=sparse_row_labels)

In [20]:
# empty rows -> dropped
# (the rows of df with 6 or more missing important values, i.e. those excluded from inter_df)
df.loc[imp_cols[fltr >= 6].index]

Unnamed: 0,image_list,new_used,car_name,mileage,price,price_drop,deal_type,key_specs,Exterior_color,Interior_color,Drivetrain,MPG,Fuel_Type,Transmission,Engine,VIN,Stock_no,Mileage,Convenience,Entertainment,Exterior,Safety,Seating,Accidents_or_damage,Clean_title,one_owner_vehicle,Personal_use_only,Open_recall,Fuel_Type_e,mpge,level2_charging,dc_fast_charging,battery_capacity,excepted_range,battery_range_score,basics,features,other_features,vehicle_history,seller_name,seller_rating,people_count_seller_rating,seller_address,seller_site,car_rating,people_count_car_rating
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,"\n $21,252\n",\n $216 price drop\n,\n Good Deal \n,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
12,,,,,"\n $16,958\n",\n $986 price drop\n,\n Fair Deal \n,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
23,,,,,"\n $31,505\n",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
27,,,,,"\n $30,987\n",,\n Good Deal \n,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67172,,,,,"\n $12,979\n",\n $504 price drop\n,\n Good Deal \n,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
67181,,,,,"\n $15,975\n",,\n Good Deal \n,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
67183,,,,,"\n $25,876\n","\n $1,265 price drop\n",\n Fair Deal \n,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
67184,,,,,"\n $37,625\n",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [21]:
# shape and per-column summary of the frame left after removing the sparse rows
print(inter_df.shape)
inter_df.info()

(54217, 46)


<class 'pandas.core.frame.DataFrame'>
Index: 54217 entries, 2 to 67191
Data columns (total 46 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   image_list                  53320 non-null  object 
 1   new_used                    54217 non-null  object 
 2   car_name                    54217 non-null  object 
 3   mileage                     53498 non-null  object 
 4   price                       54217 non-null  object 
 5   price_drop                  22257 non-null  object 
 6   deal_type                   32589 non-null  object 
 7   key_specs                   1035 non-null   object 
 8   Exterior_color              54217 non-null  object 
 9   Interior_color              54217 non-null  object 
 10  Drivetrain                  54217 non-null  object 
 11  MPG                         52737 non-null  object 
 12  Fuel_Type                   53132 non-null  object 
 13  Transmission                54217 no

In [22]:
# write the filtered frame to a parquet file (fastparquet engine) without the index;
# the path is relative, so the file is created in the current working directory
inter_df.to_parquet('POST_CLEANING_1_DATA_PART_5.parquet',index=False,engine='fastparquet')