In [1]:
import pandas as pd
import glob
import os

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split # module to split our data into train and test sets
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm
import statsmodels.tools

In [2]:
# Importing the cleaned cars data
dataset = "cars_new.csv"
cars_new = pd.read_csv(dataset)

## 3. Feature Engineering
#### Filling Null

In [3]:
# # fill in owners with 1 presenting it only has 1 owner
cars_new['owners'] = cars_new['owners'].fillna(
    value = 1,
    method = None,
    inplace = False
)

In [4]:
#fill in ULEZ with No
cars_new['ULEZ'] = cars_new['ULEZ'].fillna(value = 'No',
                                         method = None,
                                         inplace = False)

In [5]:
#fill in BHP with mean value of different car model
cars_new['BHP'] = cars_new.groupby('car model')['BHP'].transform(lambda x: x.fillna(x.mean()))

In [6]:
# building a function which can see the number of null value in each columns and it's percentage.
def null_vals(dataframe):

    null_vals = dataframe.isnull().sum()
    total_cnt = len(dataframe)
    null_vals = pd.DataFrame(null_vals,columns=['null'])
    null_vals['percent'] = round((null_vals['null']/total_cnt)*100,3)
    return null_vals.sort_values('percent', ascending=False)

null_vals(cars_new)

Unnamed: 0,null,percent
BHP,3,0.011
car model,0,0.0
link,0,0.0
price,0,0.0
mileage,0,0.0
transmission,0,0.0
fuel,0,0.0
owners,0,0.0
body,0,0.0
ULEZ,0,0.0


In [7]:
# Check the null value
null_vals(cars_new)

Unnamed: 0,null,percent
BHP,3,0.011
car model,0,0.0
link,0,0.0
price,0,0.0
mileage,0,0.0
transmission,0,0.0
fuel,0,0.0
owners,0,0.0
body,0,0.0
ULEZ,0,0.0


In [8]:
# the 3 null value in BHP is because there is no BHP in car model Transporter
cars_new[cars_new['BHP'].isnull()]

Unnamed: 0,car model,link,price,mileage,BHP,transmission,fuel,owners,body,ULEZ,engine,year
25326,Volkswagen_Transporter,https://www.autotrader.co.uk/car-details/20220...,29995,8886.0,,Manual,Diesel,1.0,MPV,No,2.0,2018
25327,Volkswagen_Transporter,https://www.autotrader.co.uk/car-details/20210...,37000,550.0,,Manual,Diesel,2.0,SUV,No,2.0,2020
25328,Volkswagen_Transporter,https://www.autotrader.co.uk/car-details/20220...,38850,28000.0,,Manual,Diesel,1.0,Estate,No,2.0,2021


In [9]:
#drop null
cars_new.dropna(inplace = True)

In [10]:
null_vals(cars_new)

Unnamed: 0,null,percent
car model,0,0.0
link,0,0.0
price,0,0.0
mileage,0,0.0
BHP,0,0.0
transmission,0,0.0
fuel,0,0.0
owners,0,0.0
body,0,0.0
ULEZ,0,0.0


#### Bining the data

In [11]:
# change data type
cars_new = cars_new.astype({'price':'int64',
                            'engine':'float64',
                            'year':'category',
                            'transmission':'category',
                            'fuel':'category',
                            'body':'category',
                            'ULEZ':'category',
                            'owners':'category',
                            'car model':'category'})

In [12]:
# # group engine
engine_bucket = [0, 1, 2.0, 6.0 ]
engine_name = ['small','medium','large']

cars_new['engine_bucket'] = pd.cut(cars_new['engine'],
                                   bins = engine_bucket,
                                   labels = engine_name,
                                   include_lowest= True)


In [13]:
cars_new.owners.value_counts()

1.0     18039
2.0      4985
3.0      1959
4.0       827
5.0       405
6.0        90
7.0        45
8.0        24
9.0         9
10.0        5
11.0        2
13.0        1
14.0        1
15.0        1
Name: owners, dtype: int64

In [14]:
# # group owner in terms of the numbers
owner_bucket = [0,1,3,15]
owner_name = ['1','2-3','3-15']

cars_new['owners_bucket'] = pd.cut(cars_new['owners'],
                                   bins = owner_bucket,
                                   labels = owner_name,
                                   include_lowest= True)

In [15]:
#drop the orinal two columns
cars_new.drop(columns = ['link','owners','engine'], inplace = True)

In [16]:
cars_new.head()

Unnamed: 0,car model,price,mileage,BHP,transmission,fuel,body,ULEZ,year,engine_bucket,owners_bucket
0,Volkswagen_Amarok,9960,113000.0,161.0,Manual,Diesel,Pickup,No,2011,medium,2-3
1,Volkswagen_Amarok,13995,50767.0,161.0,Manual,Diesel,Pickup,No,2011,medium,1
2,Volkswagen_Amarok,11500,112000.0,161.0,Manual,Diesel,Pickup,No,2011,medium,2-3
3,Volkswagen_Amarok,12950,128848.0,161.0,Manual,Diesel,Pickup,No,2011,medium,3-15
4,Volkswagen_Amarok,12495,92000.0,161.0,Manual,Diesel,Pickup,No,2011,medium,1


In [17]:
cars_new .columns

Index(['car model', 'price', 'mileage', 'BHP', 'transmission', 'fuel', 'body',
       'ULEZ', 'year', 'engine_bucket', 'owners_bucket'],
      dtype='object')

#### Scaling the data

In [18]:
# the columns need to scale: mileage and BHP
# set index = cars_new.index to fix the index
to_scale = ['mileage','BHP']

st_scaler = StandardScaler()
cars_scaled = st_scaler.fit_transform(cars_new[to_scale], )
cars_scaled = pd.DataFrame(cars_scaled, columns = to_scale, index = cars_new.index)

#### Encoding the data

In [19]:
# the columns need to be encoded
to_one = ['car model','transmission','fuel','ULEZ','owners_bucket','engine_bucket','year']

encoded_cars = pd.get_dummies(cars_new[to_one], columns = to_one, prefix = to_one, drop_first= True)


#### Joining dataframe

In [20]:
#join two dataframes together
cars = encoded_cars.join(cars_scaled, how = 'inner')

In [21]:
# add the 'price' column to the encoded dataframe cars
cars['price'] = cars_new['price']

In [22]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26393 entries, 0 to 26395
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   car model_Volkswagen_Golf        26393 non-null  uint8  
 1   car model_Volkswagen_Passat      26393 non-null  uint8  
 2   car model_Volkswagen_Polo        26393 non-null  uint8  
 3   car model_Volkswagen_Scirocco_1  26393 non-null  uint8  
 4   car model_Volkswagen_T-cross     26393 non-null  uint8  
 5   car model_Volkswagen_T-roc       26393 non-null  uint8  
 6   car model_Volkswagen_Tiguan      26393 non-null  uint8  
 7   car model_Volkswagen_Touareg     26393 non-null  uint8  
 8   car model_Volkswagen_Touran      26393 non-null  uint8  
 9   car model_Volkswagen_Up!         26393 non-null  uint8  
 10  transmission_Manual              26393 non-null  uint8  
 11  fuel_Petrol                      26393 non-null  uint8  
 12  fuel_Petrol Hybrid

#### Spliting the data

In [23]:
# feature_cols to the price
feature_cols = list(cars.columns)
feature_cols.remove('price')

In [24]:
X = cars[feature_cols]
y = cars['price']
X = sm.add_constant(X)

X_train, X_test,y_train, y_test = train_test_split(X,y, test_size =0.2, random_state= 40)

## 4. Modeling

In [25]:
# put the train data in the model
lin_reg = sm.OLS(y_train, X_train)
results = lin_reg.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.878
Model:                            OLS   Adj. R-squared:                  0.878
Method:                 Least Squares   F-statistic:                     4114.
Date:                Thu, 07 Jul 2022   Prob (F-statistic):               0.00
Time:                        10:20:55   Log-Likelihood:            -1.9812e+05
No. Observations:               21114   AIC:                         3.963e+05
Df Residuals:                   21076   BIC:                         3.966e+05
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
const     

In [26]:
# predict the price through train data
X_train['y_pred'] = results.predict(X_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['y_pred'] = results.predict(X_train)


In [27]:
#show the predicted price through the model
X_train['y_pred'].head()

11655     5696.918943
3483      9806.938331
1447     13397.646342
20762    11817.668364
23081    21813.657339
Name: y_pred, dtype: float64

In [28]:
X_train.columns

Index(['const', 'car model_Volkswagen_Golf', 'car model_Volkswagen_Passat',
       'car model_Volkswagen_Polo', 'car model_Volkswagen_Scirocco_1',
       'car model_Volkswagen_T-cross', 'car model_Volkswagen_T-roc',
       'car model_Volkswagen_Tiguan', 'car model_Volkswagen_Touareg',
       'car model_Volkswagen_Touran', 'car model_Volkswagen_Up!',
       'transmission_Manual', 'fuel_Petrol', 'fuel_Petrol Hybrid',
       'fuel_Petrol Plug-in Hybrid', 'ULEZ_ULEZ', 'owners_bucket_2-3',
       'owners_bucket_3-15', 'engine_bucket_medium', 'engine_bucket_large',
       'year_2006', 'year_2007', 'year_2008', 'year_2009', 'year_2010',
       'year_2011', 'year_2012', 'year_2013', 'year_2014', 'year_2015',
       'year_2016', 'year_2017', 'year_2018', 'year_2019', 'year_2020',
       'year_2021', 'mileage', 'BHP', 'y_pred'],
      dtype='object')

In [29]:
# compare the first row of the predicted price and its actual price
y_train[12003]

KeyError: 12003

In [None]:
# use the trained model to predict the price in the test dataset
X_test['y_pred'] = results.predict(X_test)

In [None]:
#rmse of the train data
rmse_train = statsmodels.tools.eval_measures.rmse(y_train, X_train['y_pred'])
print(rmse_train)

In [None]:
#rmse of the test data
rmse_test = statsmodels.tools.eval_measures.rmse(y_test, X_test['y_pred'])
print(rmse_test)

#### Correlation

In [None]:
# show the correlation
cars.corr()

In [None]:
#plot the correlation
plt.figure (figsize = (30,30))
sns.heatmap(cars.corr(),
            annot = True,
            linecolor='black',
            center=0,
            vmin=-1,
            vmax=1,
            cmap = sns.diverging_palette(600,600, as_cmap=True),
            fmt='.4g')
plt.show()