In [344]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

import streamlit as st

In [345]:
cars_df = pd.read_csv(Path("used_cars_data.csv"), parse_dates=True, infer_datetime_format=True)
cars_df = cars_df[['mileage','year','model_name','price']]
cars_df.dropna(inplace=True)

In [346]:
cars_df.isnull().sum()

mileage       0
year          0
model_name    0
price         0
dtype: int64

In [337]:
categorical_variables = list(cars_df.dtypes[cars_df.dtypes == "object"].index)
categorical_variables

['model_name']

In [338]:
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = enc.fit_transform(cars_df[categorical_variables])
encoded_df = pd.DataFrame(encoded_data,columns = enc.get_feature_names(categorical_variables))

column_names = list(enc.categories_)
encoded_df



Unnamed: 0,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,model_name_360 spider,model_name_365,model_name_430,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [339]:
numerical_variables_df = cars_df.drop(columns = categorical_variables)
carscleaned_df = pd.concat([numerical_variables_df,encoded_df],axis=1)
carscleaned_df

Unnamed: 0,mileage,year,price,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,1733.0,2020.0,231995.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,239.0,2020.0,366995.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4849.0,2015.0,362750.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1792.0,2017.0,240995.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4341.0,2011.0,187850.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,10500.0,2006.0,124900.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,9964.0,2011.0,185886.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,34123.0,2004.0,116899.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,34123.0,2004.0,116899.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [340]:
model = XGBRegressor()
X = carscleaned_df.drop(['price'],axis=1)
y = carscleaned_df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [341]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [342]:
model.fit(X_train_scaled, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [343]:
import matplotlib.pyplot as plt

y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

results = pd.DataFrame({
    "Prediction": y_pred, 
    "Actual": y_test
}).reset_index(drop=True)
results.head()

Mean Absolute Error: 55305.30026657517


Unnamed: 0,Prediction,Actual
0,150813.390625,149995.0
1,150813.390625,149995.0
2,382104.4375,449500.0
3,106350.125,129000.0
4,402764.40625,1390000.0


In [217]:
X_test_df = X_test.drop(X_test.index[1:])
print(X_test_df)

X_test_scaled_df = X_scaler.transform(X_test_df)

tesing_testing = model.predict(X_test_scaled_df)
#print(testing_testing)
X_test_scaled_df.shape


     mileage  year  model_name_308  model_name_328  model_name_348 tb  \
268    59728  2014             0.0             0.0                0.0   

     model_name_360  model_name_360 Spider  model_name_360 modena  \
268             0.0                    0.0                    0.0   

     model_name_360 modena spyder  model_name_360 spider  ...  \
268                           0.0                    0.0  ...   

     model_name_f355  model_name_f355 spider  model_name_f430  \
268              0.0                     0.0              0.0   

     model_name_f430 spider  model_name_f50  model_name_f8 tributo  \
268                     0.0             0.0                    0.0   

     model_name_ff  model_name_kit car  model_name_mondial  \
268            0.0                 0.0                 0.0   

     model_name_mondial cabriolet  
268                           0.0  

[1 rows x 50 columns]


(1, 50)

In [218]:
testing_testing = model.predict(X_test_df)
print(testing_testing)

[190546.86]


In [219]:
mileage = 59728
year = 2014
model_name = '458 spider'

# Create a dictionary with variable names as keys and variable values as values
data = {'mileage': mileage, 'year': year, 'model_name': model_name}

# Create a dataframe from the dictionary
df = pd.DataFrame(data, index=[0])

# Print the dataframe
print(df)

   mileage  year  model_name
0    59728  2014  458 spider


In [220]:
#enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
new_encoded_data = enc.transform(df[categorical_variables])
encoded_df = pd.DataFrame(new_encoded_data,columns = enc.get_feature_names(categorical_variables))

#enc.transform(df['model'])
encoded_df



Unnamed: 0,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,model_name_360 spider,model_name_365,model_name_430,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [221]:
new_numerical_variables_df = df.drop(columns = categorical_variables)
new_carscleaned_df = pd.concat([new_numerical_variables_df,encoded_df],axis=1)
new_carscleaned_df.head()


Unnamed: 0,mileage,year,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,model_name_360 spider,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,59728,2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [222]:
input_test_scaled = X_scaler.transform(new_carscleaned_df)


In [232]:
prediction = model.predict(input_test_scaled)
print(prediction)

[150813.39]


In [396]:
def ml_function(mileage, year, model_name):
    data = {'mileage': mileage, 'year': year, 'model_name': model_name}
    df = pd.DataFrame(data,index=[0])
    new_encoded_data = enc.transform(df[categorical_variables])
    encoded_df = pd.DataFrame(new_encoded_data,columns = enc.get_feature_names(categorical_variables))
    new_numerical_variables_df = df.drop(columns = categorical_variables)
    new_carscleaned_df = pd.concat([new_numerical_variables_df,encoded_df],axis=1)
    input_test_scaled = X_scaler.transform(new_carscleaned_df)
    prediction = model.predict(input_test_scaled)
    formatted_prediction = "The estimated selling price of your car is ${:,.0f}".format(prediction[0])
    return formatted_prediction

new_carscleaned_df

Unnamed: 0,mileage,year,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,model_name_360 spider,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,59728,2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [397]:
ml_function(59728,2010,'Enzo')



'The estimated selling price of your car is $3,175,735'