In [None]:
# Importing the libraries
import pandas                as pd
import numpy                 as np
import matplotlib.pyplot     as plt
import plotly.graph_objects  as go
import plotly.express        as px
import matplotlib.ticker     as ticker
import scipy.optimize
from sklearn.model_selection import train_test_split
from sklearn.linear_model    import LinearRegression
from sklearn.metrics         import mean_squared_error,r2_score, mean_absolute_error, mean_squared_error
from sklearn.impute          import SimpleImputer
from sklearn                 import linear_model
from sklearn.preprocessing   import StandardScaler
from sklearn.ensemble        import RandomForestRegressor
!pip install category_encoders
import category_encoders as ce


In [None]:
# Importing the csv file from the local location
UsedCarDF = pd.read_csv('/content/sample_data/vehicles.csv')

In [None]:
UsedCarDF

In [None]:
# Check for rows that have null values in the column year. I don't want to buy a car which does not have a year listed on it
UsedCarDF[UsedCarDF['year'].isnull()]

In [None]:
# Delete the records where Year is null
UsedCarDF.dropna(subset=['year'], inplace=True)

In [None]:
# We could see that the null records for year are deleted from the data frame
UsedCarDF

In [None]:
# Check for other important columns where data is null and delete them from the data frame
UsedCarDF[UsedCarDF['title_status'].isnull()]

In [None]:
# Delete the records where title_status is null
UsedCarDF.dropna(subset=['title_status'], inplace=True)


In [None]:
UsedCarDF

In [None]:
# Delete the records where VIN is null
UsedCarDF.dropna(subset=['VIN'], inplace=True)

In [None]:
UsedCarDF

In [None]:
# Remove price outliers with the 1.5 * IQR rule: keep only the rows whose price lies
# within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] (no fixed dollar thresholds are applied)

def remove_outliers_iqr(df, column):
  """Return the rows of ``df`` whose ``column`` value passes the 1.5 * IQR rule.

  A row is kept when its value lies within
  [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], both ends inclusive, where Q1 and Q3 are
  the 25th and 75th percentiles of ``df[column]``. Rows whose value is NaN
  fail the range check and are dropped.

  Parameters:
    df: pandas DataFrame to filter; it is not modified.
    column: name of the numeric column the bounds are computed from.

  Returns:
    A new, independent DataFrame holding only the in-range rows.
  """
  q1 = df[column].quantile(0.25)
  q3 = df[column].quantile(0.75)
  iqr = q3 - q1
  lower_bound = q1 - 1.5 * iqr
  upper_bound = q3 + 1.5 * iqr
  in_range = df[column].between(lower_bound, upper_bound)
  # .copy() detaches the result from df, so callers can add columns to it
  # without pandas raising SettingWithCopyWarning.
  return df[in_range].copy()

# Apply to your dataframe
UsedCarDFFiltered = remove_outliers_iqr(UsedCarDF, 'price')
UsedCarDFFiltered


In [None]:
UsedCarDFFiltered.dtypes

In [None]:
# Models only understand numerical feature values, so each condition string is
# mapped to an integer category code (label encoding; missing values get -1).
# The scatter plot then shows how the condition of the car relates to its price.
# .assign() builds a new frame, so the column is not written into a filtered
# slice of UsedCarDF (which would raise pandas' SettingWithCopyWarning).
UsedCarDFFiltered = UsedCarDFFiltered.assign(
    condition_codes=UsedCarDFFiltered['condition'].astype('category').cat.codes
)
plt.scatter(UsedCarDFFiltered['condition_codes'], UsedCarDFFiltered['price'])
plt.xlabel('condition (Encoded)')
plt.ylabel('Price')
# Label every code with the condition it stands for; both unique() calls list
# their values in order of first appearance, so ticks and labels line up.
plt.xticks(UsedCarDFFiltered['condition_codes'].unique(), UsedCarDFFiltered['condition'].unique())
plt.show()

In [None]:

# From the plot we can see that buyers prefer cars whose condition is "excellent"

In [None]:
# Encode title_status as integer category codes and plot the codes against price
title_statuses = UsedCarDFFiltered['title_status']
UsedCarDFFiltered['title_status_codes'] = title_statuses.astype('category').cat.codes
title_status_codes = UsedCarDFFiltered['title_status_codes']

plt.scatter(title_status_codes, UsedCarDFFiltered['price'])
plt.xlabel('title_status (Encoded)')
plt.ylabel('Price')
# One tick per code, labelled with the title_status it stands for
plt.xticks(title_status_codes.unique(), title_statuses.unique())
plt.show()


In [None]:
# From the plot we can see that buyers prefer cars whose title_status is "clean"

In [None]:
# Encode paint_color as integer category codes and plot the codes against price
paint_colors = UsedCarDFFiltered['paint_color']
UsedCarDFFiltered['paint_color_codes'] = paint_colors.astype('category').cat.codes
paint_color_codes = UsedCarDFFiltered['paint_color_codes']

plt.scatter(paint_color_codes, UsedCarDFFiltered['price'])
plt.xlabel('paint_color (Encoded)')
plt.ylabel('Price')
# One tick per code, labelled with its colour name and rotated for readability
plt.xticks(paint_color_codes.unique(), paint_colors.unique(), rotation=45)
plt.show()

# The colour of the car does not make much difference to the price, so this attribute does not qualify to be a feature.
# NOTE(review): paint_color_codes is nevertheless part of the feature list used for the model further down - confirm which is intended.

In [None]:
# Scatter plot of price against model year
plt.figure(figsize=(20, 15))  # Increased figure size
plt.scatter(UsedCarDFFiltered['year'], UsedCarDFFiltered['price'])

# Place a major tick on every 5th year and print the tick values as integers
ax = plt.gca()
ax.xaxis.set_major_locator(ticker.MultipleLocator(base=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%d'))

plt.xlabel('year')
plt.ylabel('Price')
plt.xticks(rotation=45, fontsize=30)
plt.tight_layout()  # Adjusts subplot parameters for a tight layout
plt.show()

We can clearly see that cars from more recent years are preferred.

In [None]:
# Encode manufacturer as integer category codes and plot the codes against price
plt.figure(figsize=(30, 25))  # Increased figure size
manufacturers = UsedCarDFFiltered['manufacturer']
UsedCarDFFiltered['manufacturer_codes'] = manufacturers.astype('category').cat.codes
manufacturer_codes = UsedCarDFFiltered['manufacturer_codes']

plt.scatter(manufacturer_codes, UsedCarDFFiltered['price'])
plt.xlabel('manufacturer (Encoded)')
plt.ylabel('Price')
# One tick per code, labelled with the manufacturer name
plt.xticks(manufacturer_codes.unique(), manufacturers.unique(), rotation=45, fontsize=20)
plt.show()

We can see that the manufacturer also makes a difference to the price of the car.

In [None]:
# Encode transmission as integer category codes and plot the codes against price
plt.figure(figsize=(15, 6))  # Increased figure size
transmissions = UsedCarDFFiltered['transmission']
UsedCarDFFiltered['transmission_codes'] = transmissions.astype('category').cat.codes
transmission_codes = UsedCarDFFiltered['transmission_codes']

plt.scatter(transmission_codes, UsedCarDFFiltered['price'])
plt.xlabel('transmission (Encoded)')
plt.ylabel('Price')
# One tick per code, labelled with the transmission type
plt.xticks(transmission_codes.unique(), transmissions.unique(), rotation=45, fontsize=15)
plt.show()

There is no noticeable difference in price across transmission types.

In [None]:
# Encode drive as integer category codes and plot the codes against price
plt.figure(figsize=(15, 6))  # Increased figure size
drive_types = UsedCarDFFiltered['drive']
UsedCarDFFiltered['drive_codes'] = drive_types.astype('category').cat.codes
drive_codes = UsedCarDFFiltered['drive_codes']

plt.scatter(drive_codes, UsedCarDFFiltered['price'])
plt.xlabel('drive (Encoded)')
plt.ylabel('Price')
# One tick per code, labelled with the drive type
plt.xticks(drive_codes.unique(), drive_types.unique(), rotation=45, fontsize=15)
plt.show()

There is no noticeable difference in price across the drive types of the car.

In [None]:
# fuel, cylinders and size get the same treatment: encode the column as integer
# category codes (stored as '<column>_codes') and scatter the codes against price.
for categorical_column in ('fuel', 'cylinders', 'size'):
    codes_column = f'{categorical_column}_codes'
    plt.figure(figsize=(15, 6))  # Increased figure size
    UsedCarDFFiltered[codes_column] = UsedCarDFFiltered[categorical_column].astype('category').cat.codes
    plt.scatter(UsedCarDFFiltered[codes_column], UsedCarDFFiltered['price'])
    plt.xlabel(f'{categorical_column} (Encoded)')
    plt.ylabel('Price')
    # One tick per code, labelled with the original value it stands for
    plt.xticks(UsedCarDFFiltered[codes_column].unique(), UsedCarDFFiltered[categorical_column].unique())
    plt.xticks(rotation=45, fontsize=15)
    plt.show()

In [None]:
# Encode model and type as integer category codes (no plot for these two)
for source_column in ('model', 'type'):
    as_category = UsedCarDFFiltered[source_column].astype('category')
    UsedCarDFFiltered[f'{source_column}_codes'] = as_category.cat.codes


In [None]:
# region and state both have high cardinality and also have an impact on the
# price of the car, so they are encoded with the JamesStein encoder, which is
# fitted against the price column.
# NOTE(review): the encoder is fitted on the price of every row before the
# train/test split further down, so test-set prices may leak into these
# encodings - confirm whether that is acceptable.
high_cardinality_columns = ['region', 'state']
listing_prices = UsedCarDFFiltered['price']
encoder = ce.JamesSteinEncoder(cols=high_cardinality_columns)
UsedCarDFFiltered = encoder.fit_transform(UsedCarDFFiltered, listing_prices)
UsedCarDFFiltered

In [None]:
# Feature Engineering and Selection

# Feature Engineering
# car_age is the number of years between the model year and a fixed reference year
REFERENCE_YEAR = 2025
UsedCarDFFiltered['car_age'] = REFERENCE_YEAR - UsedCarDFFiltered['year']  # Create age feature

# Feature Selection
# The order of this list is the column order the scaler and the model are fitted on
FEATURE_COLUMNS = [
    'year', 'condition_codes', 'title_status_codes',
    'odometer', 'manufacturer_codes', 'car_age',
    'fuel_codes', 'cylinders_codes',
    'region', 'model_codes',
    'type_codes', 'state', 'size_codes', 'paint_color_codes', 'drive_codes',
]
features = UsedCarDFFiltered[FEATURE_COLUMNS]
target = UsedCarDFFiltered[['price']]

# Data Scaling
# By standardizing the data (feature scaling), you ensure that all features
# contribute equally to the model, preventing features with larger ranges
# from dominating the learning process.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split - confirm whether fitting on the training split only is preferred.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)





In [None]:
# Train-Test Split: 20% of the rows are held out for testing, with a fixed random_state
split_parts = train_test_split(
    features_scaled,
    target,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Train the model

# Impute NaN values with the mean of each column. The means are learned from
# the training split only and then reused, unchanged, for the test split.
imputer = SimpleImputer(strategy='mean')  # Create an imputer object
X_train = imputer.fit_transform(X_train)  # Fit and transform X_train
X_test = imputer.transform(X_test)

# Model Selection
# Imp note: I tried both Linear Regression and RandomForest and
# RandomForest gives the better results compared to LinearRegression
model = RandomForestRegressor(random_state=42)
# y_train is a single-column frame; scikit-learn expects a 1-D target and emits
# a DataConversionWarning otherwise, so flatten it before fitting.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Make predictions
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

In [None]:
# Baseline: always predict one constant price. The constant (mean or median) is
# taken from the training prices and scored on the same held-out test rows as
# the model, so the baseline MSE is directly comparable with the model's MSE.
train_prices = np.ravel(y_train)
test_prices = np.ravel(y_test)

# Calculate mean/median price
mean_price = train_prices.mean()
median_price = np.median(train_prices)

# Create baseline prediction columns (every row receives the same constant)
UsedCarDFFiltered['baseline_mean'] = mean_price
UsedCarDFFiltered['baseline_median'] = median_price

# Calculate MSE for baseline predictions on the test rows
mse_mean = mean_squared_error(test_prices, np.full(test_prices.shape, mean_price))
mse_median = mean_squared_error(test_prices, np.full(test_prices.shape, median_price))

print(f"Baseline MSE (Mean): {mse_mean}")
print(f"Baseline MSE (Median): {mse_median}")

In [None]:
# Express the RMSE as a percentage of the average listing price
average_listing_price = UsedCarDFFiltered['price'].mean()
percentage_error = (rmse / average_listing_price) * 100
print(f"Percentage Error: {percentage_error:.2f}%")

In [None]:

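# 6. Predict price for a new car with given features
# NOTE: this disabled example is stale - it passes 5 raw values, whereas the model
# above is trained on 15 features that are scaled and imputed before fitting.
#new_car_features = [[2015, 3, 0, 50000, 100]]  # Example features for a new car
#target = UsedCarDFFiltered['price']
#predicted_price = model.predict(new_car_features)
#print(f"Predicted Price: {predicted_price[0]}")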

In [None]:
# Add a predicted_price column for every listing.
# The rows must go through exactly the preprocessing the model was trained
# with: the same feature columns in the same order, then the already-fitted
# scaler, then the imputer that was fitted on the training split.
features_for_prediction = UsedCarDFFiltered[list(features.columns)]
imputed_features = imputer.transform(scaler.transform(features_for_prediction))

# Predict using the scaled and imputed features
UsedCarDFFiltered["predicted_price"] = model.predict(imputed_features)
UsedCarDFFiltered[['year','condition','condition_codes','title_status_codes','title_status','odometer','manufacturer_codes','car_age',
                             'fuel_codes', 'cylinders_codes','region', 'model_codes',
                              'type_codes', 'state','size_codes', 'paint_color_codes','drive_codes','price','predicted_price']]

In [None]:
# Compare the listed price with the predicted price for every row of the frame.
# NOTE(review): predicted_price covers all rows, including the ones the model
# was trained on, so these scores are not held-out test scores.
predicted_price = UsedCarDFFiltered['predicted_price']
actual_price = UsedCarDFFiltered['price']

mse = mean_squared_error(actual_price, predicted_price)
rmse = np.sqrt(mse)
mae = mean_absolute_error(actual_price, predicted_price)
r2 = r2_score(actual_price, predicted_price)

for metric_name, metric_value in (
    ("R-squared", r2),
    ("Mean Absolute Error", mae),
    ("Root Mean Squared Error", rmse),
):
    print(f"{metric_name}: {metric_value}")

# Create scatter plot
import matplotlib.pyplot as plt
plt.scatter(x=actual_price, y=predicted_price)
plt.title("Actual vs. Predicted Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.show()

In [None]:
# note : overlaying two plots on one pair of axes requires us to use this sort of code
# A Scatter trace takes a single one-dimensional x series, so the listings are
# ordered by actual price and plotted against their position in that ordering.
price_comparison = (
    UsedCarDFFiltered[['price', 'predicted_price']]
    .sort_values('price')
    .reset_index(drop=True)
)

fig = go.Figure()
fig.add_trace(go.Scatter(x=price_comparison.index, y=price_comparison['price'],
                         mode="markers", name="actual"))
fig.add_trace(go.Scatter(x=price_comparison.index, y=price_comparison['predicted_price'],
                         mode="lines", name="Predicted"))
fig.update_layout(font_size=20,
                  xaxis_title="Listing (sorted by actual price)",
                  yaxis_title="Price")