In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data.csv', index_col = 0)

In [3]:
df.head()

Unnamed: 0,title,price,registration,km,displacement,hp,fuel,transmission,category,location,link,brand,town,model
0,Volkswagen T-Cross '19 T-CROSS 1.0CC,19200,04/2019,40066,1000,96,Petrol,Manual,Manual,RETHYMNO 74100,https://www.car.gr/classifieds/cars/view/32659...,Volkswagen,RETHYMNO,T-Cross
1,Renault Clio '19 CLIO DYNAMIC 1.0CC 100HP,18400,12/2019,10059,1000,100,Petrol,Manual,Manual,RETHYMNO 74100,https://www.car.gr/classifieds/cars/view/32613...,Renault,RETHYMNO,Clio
2,Renault Laguna '03 LAGUNA,999,11/2003,188614,1998,155,Petrol,Manual,Manual,RETHYMNO 74100,https://www.car.gr/classifieds/cars/view/26385...,Renault,RETHYMNO,Laguna
3,Renault Megane '04 1.5 ΠΕΤΡΕΛΑΙΟ!!!,5000,01/2004,162800,1500,100,Diesel,Manual,Manual,AGRINIO 30100,https://www.car.gr/classifieds/cars/view/33000...,Renault,AGRINIO,Megane
4,Nissan Navara '11 Long Bed -LE Automatic,25500,09/2011,99300,2500,192,Diesel,Automatic,Automatic,ALIMOS 17456,https://www.car.gr/classifieds/cars/view/11979...,Nissan,ALIMOS,Navara


In [4]:
# features to use when givving a suggestion
# brand, model, price, registration, mileage, cc, hp, fuel, transmission
# brand and model are going to be queryied so are not going into the model
# price, registration, mileage, cc, hp, fuel, transmission

In [5]:
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

In [6]:
dummy = df.query('brand == "Opel"').query('model == "Corsa"').query('price != "Ask price"')
dummy.drop(['title', 'category', 'location', 'brand', 'model', 'link', 'town'], axis = 1, inplace = True)
dummy.reset_index(drop = True, inplace = True)
dummy['price'] = dummy['price'].values.astype('int32')
dummy.shape

(140, 7)

In [7]:
dummy.head()

Unnamed: 0,price,registration,km,displacement,hp,fuel,transmission
0,6600,01/2012,169000,1300,95,Diesel,Manual
1,555,01/2005,170000,1200,75,Petrol,Manual
2,7600,01/2012,90000,1248,95,Diesel,Manual
3,8500,09/2017,130000,1400,95,Petrol,Manual
4,3750,02/2005,267950,1248,75,Diesel,Manual


In [8]:
# fuel, transmission are going to be one hot encoded
fuel_one_hot = OneHotEncoder(drop = 'if_binary').fit_transform(dummy['fuel'].values.reshape(-1, 1))
transmission_one_hot = OneHotEncoder(drop = 'if_binary').fit_transform(dummy['transmission'].values.reshape(-1, 1))

In [9]:
dummy['fuel_type'] = fuel_one_hot.toarray()
dummy['transmission_type'] = transmission_one_hot.toarray()
dummy.drop(['fuel', 'transmission'], axis = 1, inplace = True)

In [10]:
dummy.head()

Unnamed: 0,price,registration,km,displacement,hp,fuel_type,transmission_type
0,6600,01/2012,169000,1300,95,0.0,1.0
1,555,01/2005,170000,1200,75,1.0,1.0
2,7600,01/2012,90000,1248,95,0.0,1.0
3,8500,09/2017,130000,1400,95,1.0,1.0
4,3750,02/2005,267950,1248,75,0.0,1.0


In [11]:
registration_stamp = [datetime.strptime(dummy['registration'][i], '%m/%Y').timestamp() / 3600 for i in range(len(dummy))]
dummy['registration_stamp'] = registration_stamp
dummy.drop('registration', axis = 1, inplace = True)

In [12]:
dummy.head()

Unnamed: 0,price,km,displacement,hp,fuel_type,transmission_type,registration_stamp
0,6600,169000,1300,95,0.0,1.0,368158.0
1,555,170000,1200,75,1.0,1.0,306814.0
2,7600,90000,1248,95,0.0,1.0,368158.0
3,8500,130000,1400,95,1.0,1.0,417837.0
4,3750,267950,1248,75,0.0,1.0,307558.0


In [13]:
dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   price               140 non-null    int32  
 1   km                  140 non-null    int64  
 2   displacement        140 non-null    int64  
 3   hp                  140 non-null    int64  
 4   fuel_type           140 non-null    float64
 5   transmission_type   140 non-null    float64
 6   registration_stamp  140 non-null    float64
dtypes: float64(3), int32(1), int64(3)
memory usage: 7.2 KB


In [14]:
x = dummy.iloc[:, 1:]
y = dummy['price']

linear = LinearRegression()
tree = DecisionTreeRegressor(max_depth = 3)
linear_cv_results = cross_validate(linear, x, y, cv = 3, scoring = 'neg_mean_absolute_error')
tree_cv_results = cross_validate(tree, x, y, cv = 3, scoring = 'neg_mean_absolute_error')

In [33]:
np.array([16000, 1200, 80, 1, 1, 368158.0]).reshape(-1,1)

array([[1.60000e+04],
       [1.20000e+03],
       [8.00000e+01],
       [1.00000e+00],
       [1.00000e+00],
       [3.68158e+05]])

In [44]:
tree.fit(x, y)
tree.predict(np.array([100, 1200, 90, 1, 1, 417837.0]).reshape(1,-1))



array([17000.])

In [15]:
# sorted(cv_results.keys())

In [16]:
print(np.mean(linear_cv_results['test_score']), np.mean(tree_cv_results['test_score']))

-1301.6779038759587 -1246.3426910502658


In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
linear.fit(x_train, y_train);
tree.fit(x_train, y_train);

In [24]:
print(mean_absolute_error(linear.predict(x_test), y_test),
      mean_absolute_error(tree.predict(x_test), y_test))

1271.746829793292 1451.6202902036234


In [25]:
ensemble = (linear.predict(x_test) + tree.predict(x_test)) / 2
print(mean_absolute_error(ensemble, y_test))

1156.471170349015
