Let's see our train.csv to see what we are dealing with!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('./datasets/train.csv', index_col='id')
train_df.head(10)

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500
5,Audi,A6 2.0T Sport,2018,40950,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,White,–,None reported,Yes,29950
6,Audi,A8 L 3.0T,2016,62200,Gasoline,333.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Black,Black,None reported,Yes,28500
7,Chevrolet,Silverado 1500 1LZ,2016,102604,E85 Flex Fuel,355.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,White,Gray,None reported,Yes,12500
8,Ford,F-150 XLT,2020,38352,Gasoline,2.7L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,Snowflake White Pearl Metallic,Black,None reported,Yes,62890
9,BMW,M4 Base,2015,74850,Gasoline,425.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Black,Blue,None reported,Yes,4000


In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         188533 non-null  object
 1   model         188533 non-null  object
 2   model_year    188533 non-null  int64 
 3   milage        188533 non-null  int64 
 4   fuel_type     183450 non-null  object
 5   engine        188533 non-null  object
 6   transmission  188533 non-null  object
 7   ext_col       188533 non-null  object
 8   int_col       188533 non-null  object
 9   accident      186081 non-null  object
 10  clean_title   167114 non-null  object
 11  price         188533 non-null  int64 
dtypes: int64(3), object(9)
memory usage: 18.7+ MB


# **Data Cleaning**

We find that there a lot of object columns with a number of missing (NaN) values. This next section will explore data to clean and make values possible to provide meaningful parameters in our model. This includes: 
- Dropping unneccessary columns
- Converting columns that have non-numerical datatypes to numerical datatypes for analysis.
- Dropping/imputing NaN values  

In [4]:
# Number of missing values (NaN)
train_df.isnull().sum()

brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

In [5]:
unique_counts = train_df.nunique()
unique_counts

brand             57
model           1897
model_year        34
milage          6651
fuel_type          7
engine          1117
transmission      52
ext_col          319
int_col          156
accident           2
clean_title        1
price           1569
dtype: int64

In [6]:
# The misspelling really annoyed me :D
train_df.rename(columns={"milage":"mileage"}, inplace=True)
train_df

Unnamed: 0_level_0,brand,model,model_year,mileage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500
...,...,...,...,...,...,...,...,...,...,...,...,...
188528,Cadillac,Escalade ESV Platinum,2017,49000,Gasoline,420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,Beige,None reported,Yes,27500
188529,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018,28600,Gasoline,385.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes,30000
188530,Mercedes-Benz,AMG GLC 63 Base 4MATIC,2021,13650,Gasoline,469.0HP 4.0L 8 Cylinder Engine Gasoline Fuel,7-Speed A/T,White,Black,None reported,Yes,86900
188531,Audi,S5 3.0T Prestige,2022,13895,Gasoline,3.0L,1-Speed Automatic,Daytona Gray Pearl Effect,Black,None reported,,84900


## Transmission

Let's explore the different types of transmission types in this data types and try to catergorize the many types into a few broad catergories.

In [7]:
train_df['transmission'].unique()

array(['A/T', 'Transmission w/Dual Shift Mode', '7-Speed A/T',
       '8-Speed A/T', '10-Speed Automatic', '1-Speed A/T', '6-Speed A/T',
       '10-Speed A/T', '9-Speed A/T', '8-Speed Automatic',
       '9-Speed Automatic', '5-Speed A/T', 'Automatic',
       '7-Speed Automatic with Auto-Shift', 'CVT Transmission',
       '5-Speed M/T', 'M/T', '6-Speed M/T', '6-Speed Automatic',
       '4-Speed Automatic', '7-Speed M/T', '2-Speed A/T',
       '1-Speed Automatic', 'Automatic CVT', '4-Speed A/T',
       '6-Speed Manual', 'Transmission Overdrive Switch',
       '8-Speed Automatic with Auto-Shift', '7-Speed Manual',
       '7-Speed Automatic', '9-Speed Automatic with Auto-Shift',
       '6-Speed Automatic with Auto-Shift',
       '6-Speed Electronically Controlled Automatic with O', 'F', 'CVT-F',
       '8-Speed Manual', 'Manual', '–', '2', '6 Speed At/Mt',
       '5-Speed Automatic', '2-Speed Automatic', '8-SPEED A/T', '7-Speed',
       'Variable', 'Single-Speed Fixed Gear', '8-SPEED AT',


We can classify the broad catergories as follows:
- **Automatic**
- **Manual**
- **CVT**
- **Dual** (For electric vehicles)
- **Other** (This will be to cover all those transmission descriptions so that we don't ignore/drop such rows)

In [8]:
# Defining function to catergorize transmission
def catergorize_transmission(transmission):
    if 'A/T' or 'Automatic' in transmission:
        return 'Automatic'
    elif 'M/T' or 'Manual' in transmission:
        return 'Manual'
    elif 'CVT' in transmission:
        return 'CVT'
    elif 'Dual' in transmission:
        return 'Dual'
    else:
        return 'Other'

In [9]:
train_df['transmission_cat'] = train_df['transmission'].apply(catergorize_transmission)
train_df.head()

Unnamed: 0_level_0,brand,model,model_year,mileage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price,transmission_cat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200,Automatic
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999,Automatic
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900,Automatic
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000,Automatic
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500,Automatic


In [10]:
train_df = train_df.drop(columns='transmission')
train_df

Unnamed: 0_level_0,brand,model,model_year,mileage,fuel_type,engine,ext_col,int_col,accident,clean_title,price,transmission_cat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,Yellow,Gray,None reported,Yes,4200,Automatic
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,Silver,Beige,At least 1 accident or damage reported,Yes,4999,Automatic
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,Blue,Gray,None reported,Yes,13900,Automatic
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Black,Black,None reported,Yes,45000,Automatic
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Black,Beige,None reported,Yes,97500,Automatic
...,...,...,...,...,...,...,...,...,...,...,...,...
188528,Cadillac,Escalade ESV Platinum,2017,49000,Gasoline,420.0HP 6.2L 8 Cylinder Engine Gasoline Fuel,White,Beige,None reported,Yes,27500,Automatic
188529,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018,28600,Gasoline,385.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,White,Black,At least 1 accident or damage reported,Yes,30000,Automatic
188530,Mercedes-Benz,AMG GLC 63 Base 4MATIC,2021,13650,Gasoline,469.0HP 4.0L 8 Cylinder Engine Gasoline Fuel,White,Black,None reported,Yes,86900,Automatic
188531,Audi,S5 3.0T Prestige,2022,13895,Gasoline,3.0L,Daytona Gray Pearl Effect,Black,None reported,,84900,Automatic


In [None]:
def age(model_year):
    age = 2024 - model_year
    if age == 0:
        return train_df
    return age

In [None]:
train_df['age'] = train_df['model_year'].apply(age)
train_df.head()

TypeError: age() missing 1 required positional argument: 'mileage'

In [43]:
train_df['annual_avg_mileage'] = round(train_df['mileage'] / train_df['age'])
train_df = train_df.drop(columns='age')
train_df.head(3)

Unnamed: 0_level_0,brand,model,model_year,mileage,fuel_type,engine,ext_col,int_col,accident,clean_title,price,transmission_cat,annual_avg_mileage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,Yellow,Gray,None reported,Yes,4200,Automatic,12529.411765
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,Silver,Beige,At least 1 accident or damage reported,Yes,4999,Automatic,6511.363636
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,Blue,Gray,None reported,Yes,13900,Automatic,6215.045455


In [46]:
train_df.loc[(train_df['model_year'] == 2024)]

Unnamed: 0_level_0,brand,model,model_year,mileage,fuel_type,engine,ext_col,int_col,accident,clean_title,price,transmission_cat,annual_avg_mileage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5570,Lexus,NX 300h Base,2024,16425,Hybrid,2.5L I4 16V MPFI DOHC Hybrid,Black,Gray,None reported,Yes,44700,Automatic,id 0 0.077113 1 0.114660 2 ...
6143,BMW,330 i xDrive,2024,8700,Gasoline,255.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Blue,White,None reported,Yes,33000,Automatic,id 0 0.040845 1 0.060733 2 ...
7164,BMW,M3 CS,2024,31850,Gasoline,543.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Black,Black,None reported,Yes,22550,Automatic,id 0 0.149531 1 0.222339 2 ...
7227,Ford,Expedition Timberline,2024,23300,Gasoline,440.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,Gray,Black,None reported,Yes,69500,Automatic,id 0 0.109390 1 0.162653 2 ...
7413,BMW,M3 CS,2024,34254,Gasoline,543.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,White,Black,None reported,Yes,63900,Automatic,id 0 0.160817 1 0.239120 2 ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
178467,BMW,M3 CS,2024,3415,Gasoline,543.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Black,Black,None reported,Yes,179900,Automatic,id 0 0.016033 1 0.023839 2 ...
180713,BMW,M3 CS,2024,48200,Gasoline,543.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Black,Black,None reported,Yes,67000,Automatic,id 0 0.226291 1 0.336475 2 ...
184683,RAM,1500 Big Horn,2024,18385,Gasoline,395.0HP 5.7L 8 Cylinder Engine Gasoline Fuel,White,Black,None reported,Yes,57000,Automatic,id 0 0.086315 1 0.128342 2 ...
185471,INFINITI,QX56 Base,2024,7100,Gasoline,400.0HP 5.6L 8 Cylinder Engine Gasoline Fuel,Black,Black,None reported,Yes,74900,Automatic,id 0 0.033333 1 0.049564 2 ...


In [45]:
max_avg_mileage = np.max(train_df['annual_avg_mileage'])
min_avg_mileage = np.min(train_df['annual_avg_mileage'])
median_avg_mileage = np.median(train_df['annual_avg_mileage'])

print(f"Max: {max_avg_mileage}")
print(f"Min: {min_avg_mileage}")
print(f"Median: {median_avg_mileage}")


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().