In [206]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

from web3 import Web3
import streamlit as st

In [207]:
cars_df = pd.read_csv(Path("used_cars_data.csv"), parse_dates=True, infer_datetime_format=True)
cars_df = cars_df[['mileage','year','model_name','price']]

In [208]:
cars_df.isnull().sum()

mileage       0
year          0
model_name    0
price         0
dtype: int64

In [209]:
categorical_variables = list(cars_df.dtypes[cars_df.dtypes == "object"].index)
categorical_variables

['model_name']

In [210]:
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_data = enc.fit_transform(cars_df[categorical_variables])
encoded_df = pd.DataFrame(encoded_data,columns = enc.get_feature_names(categorical_variables))

column_names = list(enc.categories_)
encoded_df



Unnamed: 0,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,model_name_360 spider,model_name_365,model_name_430,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [211]:
numerical_variables_df = cars_df.drop(columns = categorical_variables)
carscleaned_df = pd.concat([numerical_variables_df,encoded_df],axis=1)
carscleaned_df

Unnamed: 0,mileage,year,price,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,1733,2020,231995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,239,2020,366995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4849,2015,362750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1792,2017,240995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4341,2011,187850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,10500,2006,124900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
290,9964,2011,185886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,34123,2004,116899,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
292,34123,2004,116899,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [212]:
model = XGBRegressor()
X = carscleaned_df.drop(['price'],axis=1)
y = carscleaned_df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X.loc[269]

mileage                         59728.0
year                             2014.0
model_name_308                      0.0
model_name_328                      0.0
model_name_348 tb                   0.0
model_name_360                      0.0
model_name_360 Spider               0.0
model_name_360 modena               0.0
model_name_360 modena spyder        0.0
model_name_360 spider               0.0
model_name_365                      0.0
model_name_430                      0.0
model_name_430 Scuderia             0.0
model_name_456M                     0.0
model_name_458 Italia               0.0
model_name_458 italia               0.0
model_name_458 spider               1.0
model_name_488                      0.0
model_name_488 gtb                  0.0
model_name_488 spider               0.0
model_name_550                      0.0
model_name_575                      0.0
model_name_575M                     0.0
model_name_599 GTB Fiorano          0.0
model_name_612 Scaglietti           0.0


In [213]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [214]:
model.fit(X_train_scaled, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [215]:
import matplotlib.pyplot as plt

y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

results = pd.DataFrame({
    "Prediction": y_pred, 
    "Actual": y_test
}).reset_index(drop=True)
results.head()

Mean Absolute Error: 55305.30026657517


Unnamed: 0,Prediction,Actual
0,150813.390625,149995
1,150813.390625,149995
2,382104.4375,449500
3,106350.125,129000
4,402764.40625,1390000


In [216]:
y_test

268     149995
269     149995
280     449500
110     129000
167    1390000
        ...   
112     330000
38      129900
205     269988
123     299995
181     204900
Name: price, Length: 74, dtype: int64

In [217]:
X_test_df = X_test.drop(X_test.index[1:])
print(X_test_df)

X_test_scaled_df = X_scaler.transform(X_test_df)

tesing_testing = model.predict(X_test_scaled_df)
#print(testing_testing)
X_test_scaled_df.shape


     mileage  year  model_name_308  model_name_328  model_name_348 tb  \
268    59728  2014             0.0             0.0                0.0   

     model_name_360  model_name_360 Spider  model_name_360 modena  \
268             0.0                    0.0                    0.0   

     model_name_360 modena spyder  model_name_360 spider  ...  \
268                           0.0                    0.0  ...   

     model_name_f355  model_name_f355 spider  model_name_f430  \
268              0.0                     0.0              0.0   

     model_name_f430 spider  model_name_f50  model_name_f8 tributo  \
268                     0.0             0.0                    0.0   

     model_name_ff  model_name_kit car  model_name_mondial  \
268            0.0                 0.0                 0.0   

     model_name_mondial cabriolet  
268                           0.0  

[1 rows x 50 columns]


(1, 50)

In [218]:
testing_testing = model.predict(X_test_df)
print(testing_testing)

[190546.86]


In [219]:
mileage = 59728
year = 2014
model_name = '458 spider'

# Create a dictionary with variable names as keys and variable values as values
data = {'mileage': mileage, 'year': year, 'model_name': model_name}

# Create a dataframe from the dictionary
df = pd.DataFrame(data, index=[0])

# Print the dataframe
print(df)

   mileage  year  model_name
0    59728  2014  458 spider


In [220]:
#enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
new_encoded_data = enc.transform(df[categorical_variables])
encoded_df = pd.DataFrame(new_encoded_data,columns = enc.get_feature_names(categorical_variables))

#enc.transform(df['model'])
encoded_df



Unnamed: 0,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,model_name_360 spider,model_name_365,model_name_430,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [221]:
new_numerical_variables_df = df.drop(columns = categorical_variables)
new_carscleaned_df = pd.concat([new_numerical_variables_df,encoded_df],axis=1)
new_carscleaned_df.head()


Unnamed: 0,mileage,year,model_name_308,model_name_328,model_name_348 tb,model_name_360,model_name_360 Spider,model_name_360 modena,model_name_360 modena spyder,model_name_360 spider,...,model_name_f355,model_name_f355 spider,model_name_f430,model_name_f430 spider,model_name_f50,model_name_f8 tributo,model_name_ff,model_name_kit car,model_name_mondial,model_name_mondial cabriolet
0,59728,2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [222]:
input_test_scaled = X_scaler.transform(new_carscleaned_df)


In [232]:
prediction = model.predict(input_test_scaled)
print(prediction)

[150813.39]


In [252]:
def ml_function(mileage, year, model_name):
    data = {'mileage': mileage, 'year': year, 'model_name': model_name}
    df = pd.DataFrame(data,index=[0])
    new_encoded_data = enc.transform(df[categorical_variables])
    encoded_df = pd.DataFrame(new_encoded_data,columns = enc.get_feature_names(categorical_variables))
    new_numerical_variables_df = df.drop(columns = categorical_variables)
    new_carscleaned_df = pd.concat([new_numerical_variables_df,encoded_df],axis=1)
    input_test_scaled = X_scaler.transform(new_carscleaned_df)
    prediction = model.predict(input_test_scaled)
    return print(prediction)
ml_function(mileage, year, model_name)
print(mileage)

[150813.39]
59728




In [269]:
ml_function(59728,2020,'458 spider')

[184912.7]




In [226]:
#def load_contract():
    
#    with open(Path('../##Contract address')) as f:
#        contract_abi = json.load(f)
    
#    contract_address = os.getenv("SMART_CONTRACT_ADDRESS")
    
#    contract = w3.eth.contract(address=contract_address, abi=contract_abi)
#    return contract

if st.button("Estimated Price"):
    car_stats = contract.functions.XXXX().call()
    ml_function(car_stats)
    return y_pred
## gather sellers car parameters, put into dataframe and run model.predict(df)
#   return y_pred

SyntaxError: 'return' outside function (1576428760.py, line 14)