# IMPORTING LIBRARIES

In [None]:
# For now, only the libraries required for the data exploration are imported.
# The ones needed for other types of analysis are imported in the relevant sections.

import numpy as np
import pandas as pd
import os 
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import pathlib
import warnings
warnings.filterwarnings("ignore")

# DIRECTORY SELECTION

In [None]:
# Switch the working directory to the folder in which the raw datasets are kept,
# so that the relative file names passed to pd.read_csv below resolve.

os.getcwd()  # return value is neither stored nor displayed (not the last expression of the cell)
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/06 - Database")

# UPLOADING DATA

In [None]:
df_2018 = pd.read_csv("2018.csv")

In [None]:
df_2021 = pd.read_csv("2021.csv")

# EXPLORATORY DATA ANALYSIS

## Description of the Data

In [None]:
df_2018.head()

In [None]:
df_2021.head()

In [None]:
df_2018.columns

In [None]:
df_2021.columns

In [None]:
df_2018.describe()

In [None]:
df_2021.describe()

In [None]:
df_2018.info()

In [None]:
df_2021.info()

## Missing Values

In [None]:
df_2018.isna().sum()

In [None]:
df_2021.isna().sum()

In [None]:
df_2018.drop_duplicates()

In [None]:
df_2021.drop_duplicates()

In [None]:
# To visualize missing values in the dataset, the heatmaps are utilized.

In [None]:
# One missing-value heatmap per dataset: 2018 first, then 2021.
for frame in (df_2018, df_2021):
    plt.figure(figsize=(20, 7))
    sns.set_style("dark")
    sns.heatmap(frame.isnull())
    plt.title('Visualization of the missing values for each column', y=-0.25)

In [None]:
# It is seen that there are many missing values in the dataset. In addition, it is presumed that some of the columns are 
# not useful at all. Therefore, unnecessary columns, i.e. those that will not be included in the model or that 
# contribute no information at all, are removed. All of the columns are re-checked and renamed in order to ensure coherency.

In [None]:
# Keep the free-text descriptions aside before the column is dropped from df_2021 below.
description_column = df_2021['description']

# Columns that are removed from each dataset.
unused_columns_2018 = ['url', 'VIN', 'size', 'image_url', 'lat', 'long', 'city']
unused_columns_2021 = ['id', 'url', 'region_url', 'VIN', 'size', 'image_url', 'region', 'county',
                       'state', 'description', 'posting_date', 'lat', 'long']

df_2018.drop(columns=unused_columns_2018, inplace=True)
df_2021.drop(columns=unused_columns_2021, inplace=True)

# Rename the 2018 'make' column to 'model', the name used for both datasets from here on.
df_2018.rename(columns={'make': 'model'}, inplace=True)

In [None]:
# Columns such as "ID" or "Vehicle Identification Number (VIN)" can hide repetitions: the same car may have been uploaded
# to the system more than once under different unique numbers. It is therefore expected that drop_duplicates works better
# after the removal of those columns. So, the columns in question are removed; accordingly, the duplicates are re-checked
# and removed from the dataset.

In [None]:
# drop_duplicates() returns a new frame; assign it back so the duplicates are actually removed.
df_2018 = df_2018.drop_duplicates()

In [None]:
# drop_duplicates() returns a new frame; assign it back so the duplicates are actually removed.
df_2021 = df_2021.drop_duplicates()

In [None]:
# Some of the columns play a more important role than the others for the model development, so all the columns
# are re-checked.

# In accordance with the importance of the columns:
    # First, rows with a missing value in "Year", "Manufacturer", "Model", "Fuel" or "Odometer" are removed,
    # since those are the ones playing a crucial role for the price ("Transmission" is handled with the remaining columns).
    # Second, rows in which all of the remaining columns are missing at the same time are removed.
    # Third, rows with at least 5 non-NA values among the remaining columns are kept and the others are removed.

In [None]:
# Columns in which any missing value removes the row.
key_columns = ['year', 'manufacturer', 'model', 'fuel', 'odometer']
# Remaining descriptive columns, filtered jointly.
secondary_columns = ['condition', 'cylinders', 'title_status', 'transmission', 'drive', 'type', 'paint_color']

# The same three in-place filters are applied to both datasets.
for frame in (df_2018, df_2021):
    frame.dropna(subset=key_columns, inplace=True)
    # Remove rows in which every secondary column is missing ...
    frame.dropna(subset=secondary_columns, how='all', inplace=True)
    # ... then keep only rows with at least 5 non-missing secondary columns.
    frame.dropna(subset=secondary_columns, thresh=5, inplace=True)

In [None]:
# At the end of this process, at most 2 values are missing concurrently in any remaining row.

In [None]:
# Latest status of the datasets is re-checked.

In [None]:
print(df_2018.shape)
print(df_2021.shape)

In [None]:
print(df_2018.columns)
print(df_2021.columns)

In [None]:
# The indices are reset.

In [None]:
df_2018 = df_2018.reset_index(drop=True)
df_2021 = df_2021.reset_index(drop=True)

In [None]:
# Latest status of the missing values

In [None]:
# 2018 Dataset

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(12,5))

ax1.bar(df_2018.columns, df_2018.isna().sum(), color='g')
ax1.set_ylabel('Missing values')
ax1.set_ylim(0,df_2018.shape[0])
ax1.tick_params('x',labelrotation=90)

sns.heatmap(df_2018.isna(),yticklabels=False,cbar=False, cmap='Greens',ax=ax2)
plt.show()

In [None]:
# 2021 Dataset

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(12,5))

ax1.bar(df_2021.columns, df_2021.isna().sum(), color='g')
ax1.set_ylabel('Missing values')
ax1.set_ylim(0,df_2021.shape[0])
ax1.tick_params('x',labelrotation=90)

sns.heatmap(df_2021.isna(),yticklabels=False,cbar=False, cmap='Greens',ax=ax2)
plt.show()

In [None]:
# The percentage of the missing values

In [None]:
df_2018.isna().mean().round(4).mul(100).sort_values(ascending=False)

In [None]:
df_2021.isna().mean().round(4).mul(100).sort_values(ascending=False)

In [None]:
# Saving pre-processed data

In [None]:
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/18 - Preprocessed Data")

In [None]:
df_2018.to_csv("2018.csv", index=False)
df_2021.to_csv("2021.csv", index=False)
description_column.to_csv("description_column.csv")

# FEATURES ONE BY ONE

In [None]:
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/18 - Preprocessed Data")

In [None]:
# Reload both pre-processed datasets and the saved descriptions.
# (The second line previously re-read 2018.csv into df_2018, so df_2021 was never reloaded.)
df_2018 = pd.read_csv('2018.csv')
df_2021 = pd.read_csv('2021.csv')
description_column = pd.read_csv('description_column.csv')

## "DESCRIPTION"

### Creation of Word Cloud

In [None]:
pip install WordCloud

In [None]:
pip install PIL

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

In [None]:
# description_column is a Series when it comes straight from df_2021, but a DataFrame
# (saved index column + 'description') once it has been re-loaded from description_column.csv.
# Iterating a DataFrame yields its column NAMES, so select the text column explicitly.
df_word_set = description_column
if isinstance(df_word_set, pd.DataFrame):
    df_word_set = df_word_set['description']
df_word_set = df_word_set.drop_duplicates()

# str.lower() returns a new string, so the lowered text is kept instead of being discarded.
word_set = ','.join(str(x) for x in df_word_set).lower()

# The word-cloud cell reads the text under this name.
word_set_2021 = word_set

In [None]:
# Extra words excluded from the cloud, on top of the stop words shipped with wordcloud.
extra_stopwords = [
    "nan", "one", "minutes", "make", "point", "crew", "cab", "carvana", "carvanaauto", "INS", "text", "money",
    "want", "number", "re", "lot", "car", "customer", "back", "carvanaautoMercedes", "finance", "purchase", "payment",
    "responsible", "monthly payments", "inspection", "wheel", "carvana", "Call", "everything", "made", "minute",
    "trade", "information", "buy", "question", "welcome", "sale", "credit", "put",
    "auto", "guarantee", "tax", "please", "verify",
]

stopwords = set(STOPWORDS)
stopwords |= set(extra_stopwords)

In [None]:
def black_color_func(word, font_size, position,orientation,random_state=None, **kwargs):
    """Return the same near-black HSL colour string for every word.

    All arguments are accepted only to match the call made by the word cloud's
    recolor(color_func=...) and are ignored.
    """
    near_black = "hsl(0,100%, 1%)"
    return near_black

# Settings of the cloud: large white canvas, at most 500 words, fixed seed for a repeatable layout.
cloud_settings = dict(
    stopwords=stopwords,
    width=6000,
    height=4000,
    max_words=500,
    random_state=21,
    relative_scaling=0.5,
    background_color="white",
)

# generate() lays out the words on the instance it is called on.
wordcloud_2021 = WordCloud(**cloud_settings).generate(word_set_2021)

# Repaint every word with the single colour returned by black_color_func.
wordcloud_2021.recolor(color_func=black_color_func)

In [None]:
plt.figure(figsize=(12, 12), facecolor = None)
plt.imshow(wordcloud_2021, interpolation="bilinear")
plt.axis('off')
plt.tight_layout(pad = 0)

### Saving

In [None]:
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/19 - Word Cloud")

In [None]:
# With the default inline notebook backend, a figure drawn in an earlier cell is closed when
# that cell finishes, so a bare plt.savefig() here would write an empty canvas.
# Re-draw the cloud in this cell and save that figure explicitly.
fig = plt.figure(figsize=(12, 12), facecolor=None)
plt.imshow(wordcloud_2021, interpolation="bilinear")
plt.axis('off')
plt.tight_layout(pad=0)
fig.savefig('word_cloud_2021.jpg')

In [None]:
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/18 - Preprocessed Data")

## "PRICE"

### Distribution of Prices

In [None]:
# Series.min()/max() are vectorised and skip NaN; the Python built-ins iterate element by element.
print(df_2018['price'].min())
print(df_2021['price'].min())
print(df_2018['price'].max())
print(df_2021['price'].max())

In [None]:
# Count the 2018 listings per 5,000-wide price band from 1 to 100,000 ...
ranges = [
    df_2018['price'].value_counts(bins=[lower + 1, lower + 5000])
    for lower in range(0, 100000, 5000)
]

# ... plus one final band for everything above 100,000.
ranges.append(df_2018['price'].value_counts(bins=[100001, 4211956308]))

ranges

In [None]:
# Count the 2021 listings per 5,000-wide price band from 1 to 100,000 ...
ranges = [
    df_2021['price'].value_counts(bins=[lower + 1, lower + 5000])
    for lower in range(0, 100000, 5000)
]

# ... plus one final band for everything above 100,000.
ranges.append(df_2021['price'].value_counts(bins=[100001, 3736928712]))

ranges

### Excluding the cars priced above 100,000.00 USD or at 0.00 USD and below

In [None]:
# Excluding the extreme values in the dataset

In [None]:
# Remove listings priced above 100,000 or at/below zero, in one pass per dataset.
price_out_of_range_2018 = (df_2018.price > 100000) | (df_2018.price <= 0)
price_out_of_range_2021 = (df_2021.price > 100000) | (df_2021.price <= 0)

df_2018 = df_2018.drop(df_2018.index[price_out_of_range_2018])
df_2021 = df_2021.drop(df_2021.index[price_out_of_range_2021])

In [None]:
# The indices are reset.

In [None]:
df_2018 = df_2018.reset_index(drop=True)
df_2021 = df_2021.reset_index(drop=True)

### Outlier Analysis

In [None]:
# Series.min()/max() are vectorised and skip NaN; the Python built-ins iterate element by element.
print(df_2018['price'].min())
print(df_2021['price'].min())
print(df_2018['price'].max())
print(df_2021['price'].max())

In [None]:
# The Distribution of the Prices

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(12,6))

ax1.hist(df_2018.price, bins=50, color='b')
ax1.set_ylabel('2018')
ax1.set_xlabel('Price Ranges')
ax1.tick_params('x',labelrotation=90)
ax1.xaxis.set_major_locator(plt.MultipleLocator(5000))
ax1.grid(True)

ax2.hist(df_2021.price, bins=50, color='g')
ax2.set_ylabel('2021')
ax2.set_xlabel('Price Ranges')
ax2.tick_params('x',labelrotation=90)
ax2.xaxis.set_major_locator(plt.MultipleLocator(5000))
ax2.grid(True)

plt.show()

In [None]:
# As seen from the graphs, the data is not normally distributed; therefore, there is no need to apply the Shapiro-Wilk test
# to decide whether the data is normally distributed or not. However, by using a transformation technique, approximate
# normality can be achieved.

In [None]:
from scipy.stats import boxcox
from scipy import stats

In [None]:
df_2018['price_box_cox'], lmbda = boxcox(df_2018['price'], lmbda = None)

In [None]:
df_2021['price_box_cox'], lmbda = boxcox(df_2021['price'], lmbda = None)

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(12,6))

ax1.hist(df_2018.price_box_cox, bins=50, color='b')
ax1.set_ylabel('2018')
ax1.set_xlabel('Price Ranges')
ax1.tick_params('x',labelrotation=90)
ax1.grid(True)

ax2.hist(df_2021.price_box_cox, bins=50, color='g')
ax2.set_ylabel('2021')
ax2.set_xlabel('Price Ranges')
ax2.tick_params('x',labelrotation=90)
ax2.grid(True)

plt.show()

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(12,6))

sns.distplot(df_2018['price_box_cox'], ax = ax1)
sns.distplot(df_2021['price_box_cox'], ax = ax2)

fig.show()

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(12,6))

stats.probplot(df_2018['price_box_cox'], dist="norm", plot = ax1)
stats.probplot(df_2021['price_box_cox'], dist="norm", plot = ax2)

fig.show()

In [None]:
# It seems that the data is presumably normal; therefore, the data can be standardized to be processed 
# for eliminating the outliers.

In [None]:
df_2018['z_scores'] = np.abs(stats.zscore(df_2018['price_box_cox']))

In [None]:
df_2021['z_scores'] = np.abs(stats.zscore(df_2021['price_box_cox']))

In [None]:
df_2018 = df_2018.drop(df_2018[df_2018.z_scores < -2].index)

In [None]:
df_2021 = df_2021.drop(df_2021[df_2021.z_scores < -2].index)

In [None]:
df_2018 = df_2018.drop(df_2018[df_2018.z_scores > 2].index)

In [None]:
df_2021 = df_2021.drop(df_2021[df_2021.z_scores > 2].index)

### Checking Datasets

In [None]:
# Series.min()/max() are vectorised and skip NaN; the Python built-ins iterate element by element.
print(df_2018['price'].min())
print(df_2021['price'].min())
print(df_2018['price'].max())
print(df_2021['price'].max())

In [None]:
print(df_2018.shape)
print(df_2021.shape)

In [None]:
df_2018.columns

In [None]:
df_2021.columns

In [None]:
df_2018.drop(['price_box_cox', 'z_scores'], inplace=True, axis=1)
df_2021.drop(['price_box_cox', 'z_scores'], inplace=True, axis=1)

## "YEARS"

### Distribution of Years

In [None]:
# 'year' is used as a plain number further down (equality tests and age arithmetic), so it must
# stay numeric. The former to_datetime(..., errors='ignore', format='%y') call could only hand the
# column back unchanged when parsing failed (presumably always, for numeric years - TODO confirm),
# and errors='ignore' is deprecated in recent pandas. Coerce to numeric explicitly instead.
df_2018['year'] = pd.to_numeric(df_2018['year'], errors='coerce')

In [None]:
plt.figure(figsize=(16,6))
sns.countplot(x='year',data=df_2018,palette='Blues',dodge=False)
plt.title('Year Distribution for 2018')
plt.tick_params(labelsize=12)
plt.xticks(fontsize=9,rotation=90)
plt.tick_params(axis='y', labelleft=False, labelright=True)
plt.show()

In [None]:
# 'year' is used as a plain number further down (age arithmetic), so it must stay numeric.
# The former to_datetime(..., errors='ignore', format='%y') call could only hand the column back
# unchanged when parsing failed (presumably always, for numeric years - TODO confirm), and
# errors='ignore' is deprecated in recent pandas. Coerce to numeric explicitly instead.
df_2021['year'] = pd.to_numeric(df_2021['year'], errors='coerce')

In [None]:
plt.figure(figsize=(16,6))
sns.countplot(x='year',data=df_2021,palette='Blues',dodge=False)
plt.title('Year Distribution for 2021')
plt.tick_params(labelsize=12)
plt.xticks(fontsize=9,rotation=90)
plt.tick_params(axis='y', labelleft=False, labelright=True)
plt.show()

### Disregarding Outliers

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['year'].value_counts().sort_values())
print(df_2021['year'].value_counts().sort_values())

In [None]:
# It seems that there are some extreme values in the dataset. They are removed.

In [None]:
# Implausible model years found in the 2018 data; all rows carrying one of them are removed.
implausible_years_2018 = [302, 718, 1553, 1740, 1796, 1813]
rows_with_implausible_year = df_2018['year'].isin(implausible_years_2018)
df_2018 = df_2018.drop(df_2018.index[rows_with_implausible_year])

### Creation of The Age Column

In [None]:
# Knowing that numeric data is much easier to deal with, the age column is created
# from the year column, which is ordinal.

In [None]:
df_2018['car_age'] = 2018 - (df_2018['year']-1)
df_2021['car_age'] = 2021 - (df_2021['year']-1)

In [None]:
# Rechecking the distribution

In [None]:
plt.figure(figsize=(16,6))
sns.countplot(x='car_age',data=df_2018, color='Yellow',dodge=False)
plt.title('Age Distribution for 2018')
plt.tick_params(labelsize=12)
plt.xticks(fontsize=9,rotation=90)
plt.tick_params(axis='y', labelleft=False, labelright=True)
plt.show()

In [None]:
plt.figure(figsize=(16,6))
sns.countplot(x='car_age',data=df_2021,color='Black',dodge=False)
plt.title('Age Distribution for 2021')
plt.tick_params(labelsize=12)
plt.xticks(fontsize=9,rotation=90)
plt.tick_params(axis='y', labelleft=False, labelright=True)
plt.show()

In [None]:
# Disregarding the cars that are older than 25 years. In addition, knowing the fact that this analysis is built upon 
# "used car" terminology, the cars that are younger than 1 year old are also removed from the dataset.  

In [None]:
df_2018 = df_2018.loc[~(df_2018.car_age >= 25)]
df_2021 = df_2021.loc[~(df_2021.car_age >= 25)]

In [None]:
df_2018 = df_2018.loc[~(df_2018.car_age < 1)]
df_2021 = df_2021.loc[~(df_2021.car_age < 1)]

In [None]:
df_2018['car_age'] = pd.to_numeric(df_2018['car_age'])
df_2021['car_age'] = pd.to_numeric(df_2021['car_age'])

In [None]:
df_2018.drop(['year'], inplace=True, axis=1)
df_2021.drop(['year'], inplace=True, axis=1)

In [None]:
print(df_2018.shape)
print(df_2021.shape)

## "MANUFACTURER"

### Determination of the number of the manufacturers in the market

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['manufacturer'].value_counts().sort_values())
print(df_2021['manufacturer'].value_counts().sort_values())

In [None]:
plt.figure(figsize=(16, 5))
# Pass the column by keyword: newer seaborn releases treat the first positional argument
# as `data`, not as the x variable.
sns.countplot(x=df_2018['manufacturer'], saturation=5)
plt.title('Number of Cars of Each Manufacturer', y=-0.23)
plt.xticks(rotation=90)

In [None]:
plt.figure(figsize=(16, 5))
# Pass the column by keyword: newer seaborn releases treat the first positional argument
# as `data`, not as the x variable.
sns.countplot(x=df_2021['manufacturer'], saturation=5)
plt.title('Number of Cars of Each Manufacturer', y=-0.23)
plt.xticks(rotation=90)

In [None]:
# Print the distinct manufacturer labels of each dataset (2018 first, then 2021).
cat_col_2018 = ['manufacturer']
cat_col_2021 = ['manufacturer']

for frame, columns in ((df_2018, cat_col_2018), (df_2021, cat_col_2021)):
    for i in columns:
        print(frame[i].unique())

### Standardization of the Manufacturer

In [None]:
# It is seen that some of the manufacturers are written under different names. Therefore, they are fixed.

In [None]:
# Alternative spellings mapped to the label they are merged into.
# NOTE(review): 'chev' is merged into 'chevy'; confirm that 'chevy' (and not e.g. 'chevrolet')
# is the label the rest of the data uses.
manufacturer_aliases = {
    'vw': 'volkswagen',
    'mercedesbenz': 'mercedes-benz',
    'mercedes': 'mercedes-benz',
    'infinity': 'infiniti',
    'harley': 'harley-davidson',
    'chev': 'chevy',
}

# Assign the result back to the frame: calling replace(..., inplace=True) on a selected column
# is a chained in-place update, which does not modify the parent frame under pandas Copy-on-Write.
df_2018['manufacturer'] = df_2018['manufacturer'].replace(manufacturer_aliases)

In [None]:
print(df_2018.shape)
print(df_2021.shape)

## "MODEL"

### Determination of the number of the models in the market

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['model'].value_counts().sort_values())
print(df_2021['model'].value_counts().sort_values())

In [None]:
# Print the distinct model labels of each dataset (2018 first, then 2021).
cat_col_2018 = ['model']
cat_col_2021 = ['model']

for frame, columns in ((df_2018, cat_col_2018), (df_2021, cat_col_2021)):
    for i in columns:
        print(frame[i].unique())

## "ODOMETER"

### Distribution of Odometer

In [None]:
# Series.min()/max() are vectorised and skip NaN; the Python built-ins iterate element by element.
print(df_2018['odometer'].min())
print(df_2021['odometer'].min())
print(df_2018['odometer'].max())
print(df_2021['odometer'].max())

In [None]:
# Count the 2018 listings per 50,000-wide odometer band from 1 to 1,000,000 ...
km_ranges_2018 = [
    df_2018['odometer'].value_counts(bins=[lower + 1, lower + 50000])
    for lower in range(0, 1000000, 50000)
]

# ... plus one final band for readings above 1,000,000.
km_ranges_2018.append(df_2018['odometer'].value_counts(bins=[1000001, 10000000]))

km_ranges_2018

In [None]:
# Count the 2021 listings per 50,000-wide odometer band from 1 to 1,000,000 ...
km_ranges_2021 = [
    df_2021['odometer'].value_counts(bins=[lower + 1, lower + 50000])
    for lower in range(0, 1000000, 50000)
]

# ... plus one final band for readings above 1,000,000.
km_ranges_2021.append(df_2021['odometer'].value_counts(bins=[1000001, 10000000]))

km_ranges_2021

### Disregarding Outliers

In [None]:
# Remove listings whose odometer reading is above 1,000,000 or at/below zero, in one pass per dataset.
odometer_out_of_range_2018 = (df_2018.odometer > 1000000) | (df_2018.odometer <= 0)
odometer_out_of_range_2021 = (df_2021.odometer > 1000000) | (df_2021.odometer <= 0)

df_2018 = df_2018.drop(df_2018.index[odometer_out_of_range_2018])
df_2021 = df_2021.drop(df_2021.index[odometer_out_of_range_2021])

In [None]:
fig,ax=plt.subplots(figsize=(10,6))
df_2018.odometer.hist(bins=50)
plt.xticks(rotation=90)
plt.gca().xaxis.set_major_locator(plt.MultipleLocator(50000))

plt.title('Odometer Distribution for 2018 Dataset')
plt.grid(True)

In [None]:
fig,ax=plt.subplots(figsize=(10,6))
df_2021.odometer.hist(bins=50)
plt.xticks(rotation=90)
plt.gca().xaxis.set_major_locator(plt.MultipleLocator(50000))

plt.title('Odometer Distribution for 2021 Dataset')
plt.grid(True)

In [None]:
print(df_2018.shape)
print(df_2021.shape)

In [None]:
df_2018['odometer_box_cox'], lmbda = boxcox(df_2018['odometer'], lmbda = None)

In [None]:
df_2021['odometer_box_cox'], lmbda = boxcox(df_2021['odometer'], lmbda = None)

In [None]:
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(12,6))

sns.distplot(df_2018['odometer_box_cox'], ax = ax1)
sns.distplot(df_2021['odometer_box_cox'], ax = ax2)

fig.show()

In [None]:
df_2018['z_scores'] = np.abs(stats.zscore(df_2018['odometer_box_cox']))

In [None]:
df_2021['z_scores'] = np.abs(stats.zscore(df_2021['odometer_box_cox']))

In [None]:
# z_scores already holds absolute values (np.abs is applied when the column is created),
# so a "< -2.45" comparison can never match; one upper-bound test trims both tails.
df_2018 = df_2018.drop(df_2018[df_2018.z_scores > 2.45].index)
df_2021 = df_2021.drop(df_2021[df_2021.z_scores > 2.45].index)

In [None]:
# Series.min()/max() are vectorised and skip NaN; the Python built-ins iterate element by element.
print(df_2018['odometer'].min())
print(df_2021['odometer'].min())
print(df_2018['odometer'].max())
print(df_2021['odometer'].max())

In [None]:
print(df_2018.shape)
print(df_2021.shape)

In [None]:
df_2018.columns

In [None]:
df_2021.columns

In [None]:
df_2018.drop(['odometer_box_cox', 'z_scores'], inplace=True, axis=1)
df_2021.drop(['odometer_box_cox', 'z_scores'], inplace=True, axis=1)

## "CONDITION"

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['condition'].value_counts().sort_values())
print(df_2021['condition'].value_counts().sort_values())

In [None]:
print(df_2018['condition'].unique())
print(df_2021['condition'].unique())

In [None]:
# The column is passed by keyword (x=...): newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
plt.figure(figsize=(16, 5))
sns.countplot(x=df_2018['condition'], saturation=5)
plt.title('Number of Cars of Each Condition for 2018', y=-0.23)

plt.figure(figsize=(16, 5))
sns.countplot(x=df_2021['condition'], saturation=5)
plt.title('Number of Cars of Each Condition for 2021', y=-0.23)

In [None]:
Probabilities_Condition_2018 = pd.DataFrame(list(df_2018.condition.value_counts(normalize=True).items()),columns = ['Condition','Probabilities'])
Probabilities_Condition_2018

Probabilities_Condition_2021 = pd.DataFrame(list(df_2021.condition.value_counts(normalize=True).items()),columns = ['Condition','Probabilities'])
Probabilities_Condition_2021

In [None]:
# Fill the missing 'condition' entries by sampling categories with their observed frequencies.
missing_condition_2018 = df_2018.condition.isna()
df_2018.loc[missing_condition_2018, 'condition'] = np.random.choice(
    Probabilities_Condition_2018.Condition,
    size=missing_condition_2018.sum(),
    replace=True,
    p=Probabilities_Condition_2018.Probabilities,
)

missing_condition_2021 = df_2021.condition.isna()
df_2021.loc[missing_condition_2021, 'condition'] = np.random.choice(
    Probabilities_Condition_2021.Condition,
    size=missing_condition_2021.sum(),
    replace=True,
    p=Probabilities_Condition_2021.Probabilities,
)

## "CYLINDERS"

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['cylinders'].value_counts().sort_values())
print(df_2021['cylinders'].value_counts().sort_values())

In [None]:
print(df_2018['cylinders'].unique())
print(df_2021['cylinders'].unique())

In [None]:
# The column is passed by keyword (x=...): newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
plt.figure(figsize=(16, 5))
sns.countplot(x=df_2018['cylinders'], saturation=5)
plt.title('Number of Cars of Each Cylinder for 2018', y=-0.23)

plt.figure(figsize=(16, 5))
sns.countplot(x=df_2021['cylinders'], saturation=5)
plt.title('Number of Cars of Each Cylinder for 2021', y=-0.23)

In [None]:
# Treat the "other" label as missing so that it is imputed together with the real gaps.
# The result is assigned back to the frame: replace(..., inplace=True) on a selected column is a
# chained in-place update, which does not modify the parent frame under pandas Copy-on-Write.
df_2018['cylinders'] = df_2018['cylinders'].replace("other", np.nan)
df_2021['cylinders'] = df_2021['cylinders'].replace("other", np.nan)

# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018.cylinders.value_counts())
print(df_2021.cylinders.value_counts())

In [None]:
Probabilities_Cylinders_2018 = pd.DataFrame(list(df_2018.cylinders.value_counts(normalize=True).items()),columns = ['Cylinders','Probabilities'])
Probabilities_Cylinders_2018

Probabilities_Cylinders_2021 = pd.DataFrame(list(df_2021.cylinders.value_counts(normalize=True).items()),columns = ['Cylinders','Probabilities'])
Probabilities_Cylinders_2021

In [None]:
df_2018.loc[df_2018.cylinders.isna(), 'cylinders'] = np.random.choice(Probabilities_Cylinders_2018.Cylinders, 
                                                size=df_2018.cylinders.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Cylinders_2018.Probabilities)

df_2021.loc[df_2021.cylinders.isna(), 'cylinders'] = np.random.choice(Probabilities_Cylinders_2021.Cylinders, 
                                                size=df_2021.cylinders.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Cylinders_2021.Probabilities)

In [None]:
# Keep the text before the first space of each label and store it as an integer
# (labels are presumably of the form "<n> cylinders" - TODO confirm).
df_2018['cylinders'] = df_2018['cylinders'].apply(lambda x: x.split(' ')[0]).astype('int')
df_2021['cylinders'] = df_2021['cylinders'].apply(lambda x: x.split(' ')[0]).astype('int')

# A notebook cell only echoes its last expression, so print both dtypes explicitly.
print(df_2018['cylinders'].dtype)
print(df_2021['cylinders'].dtype)

## "FUEL"

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['fuel'].value_counts().sort_values())
print(df_2021['fuel'].value_counts().sort_values())

In [None]:
print(df_2018['fuel'].unique())
print(df_2021['fuel'].unique())

In [None]:
# The column is passed by keyword (x=...): newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
plt.figure(figsize=(16, 5))
sns.countplot(x=df_2018['fuel'], saturation=5)
plt.title('Number of Cars of Each Fuel Type for 2018', y=-0.23)

plt.figure(figsize=(16, 5))
sns.countplot(x=df_2021['fuel'], saturation=5)
plt.title('Number of Cars of Each Fuel Type for 2021', y=-0.23)

In [None]:
Probabilities_Fuel_2018 = pd.DataFrame(list(df_2018.fuel.value_counts(normalize=True).items()),columns = ['Fuel','Probabilities'])
Probabilities_Fuel_2018

In [None]:
Probabilities_Fuel_2021 = pd.DataFrame(list(df_2021.fuel.value_counts(normalize=True).items()),columns = ['Fuel','Probabilities'])
Probabilities_Fuel_2021

In [None]:
df_2018.loc[df_2018.fuel.isna(), 'fuel'] = np.random.choice(Probabilities_Fuel_2018.Fuel, 
                                                size=df_2018.fuel.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Fuel_2018.Probabilities)

In [None]:
df_2021.loc[df_2021.fuel.isna(), 'fuel'] = np.random.choice(Probabilities_Fuel_2021.Fuel, 
                                                size=df_2021.fuel.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Fuel_2021.Probabilities)

## "TITLE_STATUS"

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['title_status'].value_counts().sort_values())
print(df_2021['title_status'].value_counts().sort_values())

In [None]:
print(df_2018['title_status'].unique())
print(df_2021['title_status'].unique())

In [None]:
# The column is passed by keyword (x=...): newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
plt.figure(figsize=(16, 5))
sns.countplot(x=df_2018['title_status'], saturation=5)
plt.title('Number of Cars of Each Status for 2018', y=-0.23)

plt.figure(figsize=(16, 5))
sns.countplot(x=df_2021['title_status'], saturation=5)
plt.title('Number of Cars of Each Status for 2021', y=-0.23)

In [None]:
Probabilities_Title_Status_2018 = pd.DataFrame(list(df_2018.title_status.value_counts(normalize=True).items()),columns = ['Title_Status','Probabilities'])
Probabilities_Title_Status_2018

In [None]:
Probabilities_Title_Status_2021 = pd.DataFrame(list(df_2021.title_status.value_counts(normalize=True).items()),columns = ['Title_Status','Probabilities'])
Probabilities_Title_Status_2021

In [None]:
df_2018.loc[df_2018.title_status.isna(), 'title_status'] = np.random.choice(Probabilities_Title_Status_2018.Title_Status, 
                                                size=df_2018.title_status.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Title_Status_2018.Probabilities)

In [None]:
df_2021.loc[df_2021.title_status.isna(), 'title_status'] = np.random.choice(Probabilities_Title_Status_2021.Title_Status, 
                                                size=df_2021.title_status.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Title_Status_2021.Probabilities)

## "TRANSMISSION"

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['transmission'].value_counts().sort_values())
print(df_2021['transmission'].value_counts().sort_values())

In [None]:
print(df_2018['transmission'].unique())
print(df_2021['transmission'].unique())

In [None]:
# The column is passed by keyword (x=...): newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
plt.figure(figsize=(16, 5))
sns.countplot(x=df_2018['transmission'], saturation=5)
plt.title('Number of Cars of Each Transmission for 2018', y=-0.23)

plt.figure(figsize=(16, 5))
sns.countplot(x=df_2021['transmission'], saturation=5)
plt.title('Number of Cars of Each Transmission for 2021', y=-0.23)

In [None]:
Probabilities_Transmission_2018 = pd.DataFrame(list(df_2018.transmission.value_counts(normalize=True).items()),columns = ['Transmission','Probabilities'])
Probabilities_Transmission_2018

In [None]:
Probabilities_Transmission_2021 = pd.DataFrame(list(df_2021.transmission.value_counts(normalize=True).items()),columns = ['Transmission','Probabilities'])
Probabilities_Transmission_2021

In [None]:
df_2018.loc[df_2018.transmission.isna(), 'transmission'] = np.random.choice(Probabilities_Transmission_2018.Transmission, 
                                                size=df_2018.transmission.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Transmission_2018.Probabilities)

In [None]:
df_2021.loc[df_2021.transmission.isna(), 'transmission'] = np.random.choice(Probabilities_Transmission_2021.Transmission, 
                                                size=df_2021.transmission.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Transmission_2021.Probabilities)

## "DRIVE"

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['drive'].value_counts().sort_values())
print(df_2021['drive'].value_counts().sort_values())

In [None]:
print(df_2018['drive'].unique())
print(df_2021['drive'].unique())

In [None]:
# The column is passed by keyword (x=...): newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
plt.figure(figsize=(16, 5))
sns.countplot(x=df_2018['drive'], saturation=5)
plt.title('Number of Cars of Each Drive for 2018', y=-0.23)

plt.figure(figsize=(16, 5))
sns.countplot(x=df_2021['drive'], saturation=5)
plt.title('Number of Cars of Each Drive for 2021', y=-0.23)

In [None]:
Probabilities_Drive_2018 = pd.DataFrame(list(df_2018.drive.value_counts(normalize=True).items()),columns = ['Drive','Probabilities'])
Probabilities_Drive_2018

In [None]:
Probabilities_Drive_2021 = pd.DataFrame(list(df_2021.drive.value_counts(normalize=True).items()),columns = ['Drive','Probabilities'])
Probabilities_Drive_2021

In [None]:
df_2018.loc[df_2018.drive.isna(), 'drive'] = np.random.choice(Probabilities_Drive_2018.Drive, 
                                                size=df_2018.drive.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Drive_2018.Probabilities)

In [None]:
df_2021.loc[df_2021.drive.isna(), 'drive'] = np.random.choice(Probabilities_Drive_2021.Drive, 
                                                size=df_2021.drive.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Drive_2021.Probabilities)

## "TYPE"

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['type'].value_counts().sort_values())
print(df_2021['type'].value_counts().sort_values())

In [None]:
print(df_2018['type'].unique())
print(df_2021['type'].unique())

In [None]:
# The column is passed by keyword (x=...): newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
plt.figure(figsize=(16, 5))
sns.countplot(x=df_2018['type'], saturation=5)
plt.title('Number of Cars of Each Type for 2018', y=-0.23)

plt.figure(figsize=(16, 5))
sns.countplot(x=df_2021['type'], saturation=5)
plt.title('Number of Cars of Each Type for 2021', y=-0.23)

In [None]:
Probabilities_Type_2018 = pd.DataFrame(list(df_2018.type.value_counts(normalize=True).items()),columns = ['Type','Probabilities'])
Probabilities_Type_2018

In [None]:
Probabilities_Type_2021 = pd.DataFrame(list(df_2021.type.value_counts(normalize=True).items()),columns = ['Type','Probabilities'])
Probabilities_Type_2021

In [None]:
df_2018.loc[df_2018.type.isna(), 'type'] = np.random.choice(Probabilities_Type_2018.Type, 
                                                size=df_2018.type.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Type_2018.Probabilities)

In [None]:
df_2021.loc[df_2021.type.isna(), 'type'] = np.random.choice(Probabilities_Type_2021.Type, 
                                                size=df_2021.type.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Type_2021.Probabilities)

## "PAINT_COLOR"

In [None]:
# A notebook cell only echoes its last expression, so print both tables explicitly.
print(df_2018['paint_color'].value_counts().sort_values())
print(df_2021['paint_color'].value_counts().sort_values())

In [None]:
print(df_2018['paint_color'].unique())
print(df_2021['paint_color'].unique())

In [None]:
# The column is passed by keyword (x=...): newer seaborn releases treat the first positional
# argument as `data`, not as the x variable.
plt.figure(figsize=(16, 5))
sns.countplot(x=df_2018['paint_color'], saturation=5)
plt.title('Number of Cars of Each Color for 2018', y=-0.23)

plt.figure(figsize=(16, 5))
sns.countplot(x=df_2021['paint_color'], saturation=5)
plt.title('Number of Cars of Each Color for 2021', y=-0.23)

In [None]:
Probabilities_Paint_Color_2018 = pd.DataFrame(list(df_2018.paint_color.value_counts(normalize=True).items()),columns = ['Paint_Color','Probabilities'])
Probabilities_Paint_Color_2018

In [None]:
Probabilities_Paint_Color_2021 = pd.DataFrame(list(df_2021.paint_color.value_counts(normalize=True).items()),columns = ['Paint_Color','Probabilities'])
Probabilities_Paint_Color_2021

In [None]:
df_2018.loc[df_2018.paint_color.isna(), 'paint_color'] = np.random.choice(Probabilities_Paint_Color_2018.Paint_Color, 
                                                size=df_2018.paint_color.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Paint_Color_2018.Probabilities)

In [None]:
df_2021.loc[df_2021.paint_color.isna(), 'paint_color'] = np.random.choice(Probabilities_Paint_Color_2021.Paint_Color, 
                                                size=df_2021.paint_color.isna().sum(),
                                                replace=True, 
                                                p=Probabilities_Paint_Color_2021.Probabilities)

In [None]:
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/20 - Imputed Data")

In [None]:
df_2018 = df_2018.reset_index(drop=True)
df_2021 = df_2021.reset_index(drop=True)

In [None]:
df_2018.to_csv("2018.csv", index=False)
df_2021.to_csv("2021.csv", index=False)

# MODEL TRAINING & TESTING

In [None]:
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/20 - Imputed Data")

In [None]:
df_2018 = pd.read_csv("2018.csv")
df_2021 = pd.read_csv("2021.csv")

In [None]:
pip install xgboost

In [None]:
pip install lightgbm

In [None]:
pip install catboost

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

### Relevancy between Columns

In [None]:
# Split each dataset into its numeric columns and its object-typed columns.

numerical_df_2018, numerical_df_2021 = (
    frame.select_dtypes(include=np.number) for frame in (df_2018, df_2021)
)

categorical_df_2018, categorical_df_2021 = (
    frame.select_dtypes(include=object) for frame in (df_2018, df_2021)
)

In [None]:
encode_2018 = OrdinalEncoder()
encode_2021 = OrdinalEncoder()

In [None]:
# Record the categorical column names of each dataset, then fit one ordinal
# encoder per dataset on exactly those columns.

categorical_columns_2018 = list(categorical_df_2018.columns)
categorical_columns_2021 = list(categorical_df_2021.columns)

encode_2018.fit(categorical_df_2018.loc[:, categorical_columns_2018])
encode_2021.fit(categorical_df_2021.loc[:, categorical_columns_2021])

In [None]:
# Encode the categorical columns and wrap each resulting array in a DataFrame
# that carries the original column names.

categorical_encoded_2018 = pd.DataFrame(
    encode_2018.transform(categorical_df_2018[categorical_columns_2018]),
    columns=categorical_columns_2018,
)
categorical_encoded_2021 = pd.DataFrame(
    encode_2021.transform(categorical_df_2021[categorical_columns_2021]),
    columns=categorical_columns_2021,
)

In [None]:
# Give all six frames a fresh 0..n-1 index (the old index is discarded) before
# they are concatenated column-wise.
for frame_to_reindex in (
    categorical_df_2018,
    categorical_df_2021,
    numerical_df_2018,
    numerical_df_2021,
    categorical_encoded_2018,
    categorical_encoded_2021,
):
    frame_to_reindex.reset_index(inplace=True, drop=True)

In [None]:
final_df_2018 = pd.concat([numerical_df_2018,categorical_encoded_2018],axis=1)
final_df_2021 = pd.concat([numerical_df_2021,categorical_encoded_2021],axis=1)

In [None]:
plt.figure(figsize=(16,8))
sns.heatmap(data = final_df_2018.corr(),annot=True)

plt.figure(figsize=(16,8))
sns.heatmap(data = final_df_2021.corr(),annot=True)

### Modelling Preparation

In [None]:
# Work on independent copies. A plain assignment only creates a second name
# for the same DataFrame, so the label encoding written into the "labelled"
# frames would also overwrite the original columns of df_2018 / df_2021.
labelled_df_2018 = df_2018.copy()
labelled_df_2021 = df_2021.copy()

In [None]:
le_manufacturer_2018 = LabelEncoder()
le_manufacturer_2021 = LabelEncoder()

le_model_2018 = LabelEncoder()
le_model_2021 = LabelEncoder()

le_condition_2018 = LabelEncoder()
le_condition_2021 = LabelEncoder()

le_fuel_2018 = LabelEncoder()
le_fuel_2021 = LabelEncoder()

le_title_status_2018 = LabelEncoder()
le_title_status_2021 = LabelEncoder()

le_transmission_2018 = LabelEncoder()
le_transmission_2021 = LabelEncoder()

le_drive_2018 = LabelEncoder()
le_drive_2021 = LabelEncoder()

le_type_2018 = LabelEncoder()
le_type_2021 = LabelEncoder()

le_color_2018 = LabelEncoder()
le_color_2021 = LabelEncoder()

In [None]:
# Replace every categorical column of the labelled frames with integer codes.
# Each column of each year has its own LabelEncoder, fitted on that year's
# values only, so the same category may receive different codes in 2018 and
# in 2021.
labelled_df_2018["manufacturer"] = le_manufacturer_2018.fit_transform(df_2018['manufacturer'])
labelled_df_2021["manufacturer"] = le_manufacturer_2021.fit_transform(df_2021['manufacturer'])

labelled_df_2018["model"] = le_model_2018.fit_transform(df_2018['model'])
labelled_df_2021["model"] = le_model_2021.fit_transform(df_2021['model'])

labelled_df_2018["condition"] = le_condition_2018.fit_transform(df_2018['condition'])
labelled_df_2021["condition"] = le_condition_2021.fit_transform(df_2021['condition'])

labelled_df_2018["fuel"] = le_fuel_2018.fit_transform(df_2018['fuel'])
labelled_df_2021["fuel"] = le_fuel_2021.fit_transform(df_2021['fuel'])

labelled_df_2018["title_status"] = le_title_status_2018.fit_transform(df_2018['title_status'])
labelled_df_2021["title_status"] = le_title_status_2021.fit_transform(df_2021['title_status'])

labelled_df_2018["transmission"] = le_transmission_2018.fit_transform(df_2018['transmission'])
labelled_df_2021["transmission"] = le_transmission_2021.fit_transform(df_2021['transmission'])

labelled_df_2018["drive"] = le_drive_2018.fit_transform(df_2018['drive'])
labelled_df_2021["drive"] = le_drive_2021.fit_transform(df_2021['drive'])

labelled_df_2018["type"] = le_type_2018.fit_transform(df_2018['type'])
labelled_df_2021["type"] = le_type_2021.fit_transform(df_2021['type'])

labelled_df_2018["paint_color"] = le_color_2018.fit_transform(df_2018['paint_color'])
labelled_df_2021["paint_color"] = le_color_2021.fit_transform(df_2021['paint_color'])

In [None]:
# Separate the target column ('price') from the predictor columns.
Y_2018 = labelled_df_2018['price']
X_2018 = labelled_df_2018.drop(columns='price')

Y_2021 = labelled_df_2021['price']
X_2021 = labelled_df_2021.drop(columns='price')

In [None]:
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/21 - Train & Test Data")

In [None]:
# Write the predictors and targets of both years to CSV without the index.
for csv_name, table in (
    ('X_2018.csv', X_2018),
    ('Y_2018.csv', Y_2018),
    ('X_2021.csv', X_2021),
    ('Y_2021.csv', Y_2021),
):
    table.to_csv(csv_name, index=False)

### Modelling

In [None]:
os.chdir("C:/Users/aykut/OneDrive/Masaüstü/04 - Thesis/21 - Train & Test Data")

In [None]:
X_2018 = pd.read_csv("X_2018.csv")
Y_2018 = pd.read_csv("Y_2018.csv")

X_2021 = pd.read_csv("X_2021.csv")
Y_2021 = pd.read_csv("Y_2021.csv")

In [None]:
# Remove a stray 'Unnamed: 0' column when one is present. pandas creates that
# column when a CSV saved together with its index is read back. Files written
# with index=False do not contain it, and errors='ignore' keeps the drop from
# raising KeyError in that case.
for loaded_frame in (Y_2018, Y_2021, X_2018, X_2021):
    loaded_frame.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

In [None]:
X_train_2018, X_test_2018, Y_train_2018, Y_test_2018 = train_test_split(X_2018, Y_2018, test_size = 0.2, random_state = 42)
X_train_2021, X_test_2021, Y_train_2021, Y_test_2021 = train_test_split(X_2021, Y_2021, test_size = 0.2, random_state = 42)

### XGBOOST

#### Baseline Model

In [None]:
model_XGB_2018 = XGBRegressor()
model_XGB_2021 = XGBRegressor()

In [None]:
model_XGB_2018.fit(X_train_2018, Y_train_2018)
model_XGB_2021.fit(X_train_2021, Y_train_2021)

In [None]:
predictions_test_2018 = model_XGB_2018.predict(X_test_2018)
predictions_test_2021 = model_XGB_2021.predict(X_test_2021)

In [None]:
predictions_train_2018 = model_XGB_2018.predict(X_train_2018)
predictions_train_2021 = model_XGB_2021.predict(X_train_2021)

In [None]:
# MSE, MAE and MAPE plus each model's own score() for the baseline XGBoost
# models, on the test split and on the training split of both datasets.
error_metrics = (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error)

MSE_test_2018, MAE_test_2018, MAPE_test_2018 = (
    metric(Y_test_2018, predictions_test_2018) for metric in error_metrics
)
R2_Score_test_2018 = model_XGB_2018.score(X_test_2018, Y_test_2018)

MSE_test_2021, MAE_test_2021, MAPE_test_2021 = (
    metric(Y_test_2021, predictions_test_2021) for metric in error_metrics
)
R2_Score_test_2021 = model_XGB_2021.score(X_test_2021, Y_test_2021)

MSE_train_2018, MAE_train_2018, MAPE_train_2018 = (
    metric(Y_train_2018, predictions_train_2018) for metric in error_metrics
)
R2_Score_train_2018 = model_XGB_2018.score(X_train_2018, Y_train_2018)

MSE_train_2021, MAE_train_2021, MAPE_train_2021 = (
    metric(Y_train_2021, predictions_train_2021) for metric in error_metrics
)
R2_Score_train_2021 = model_XGB_2021.score(X_train_2021, Y_train_2021)

In [None]:
# Print the train and test metrics of both datasets, framed by separator lines.
# The R2 values are rounded to four decimals; the other metrics are printed raw.
rule = ':::::::::::::::::::::::::'
report_lines = (
    rule, rule, '2018 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2018, MAE_train_2018, MAPE_train_2018, round(R2_Score_train_2018, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2018, MAE_test_2018, MAPE_test_2018, round(R2_Score_test_2018, 4),
    rule, rule, '2021 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2021, MAE_train_2021, MAPE_train_2021, round(R2_Score_train_2021, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2021, MAE_test_2021, MAPE_test_2021, round(R2_Score_test_2021, 4),
    rule, rule,
)
for report_line in report_lines:
    print(report_line)

#### Hyperparameter Tuning

In [None]:
import xgboost as xgb

from xgboost.sklearn import XGBRegressor
from sklearn import metrics   
from sklearn.model_selection import GridSearchCV

In [None]:
# First, train and test sets should be converted into D-Matrix.

In [None]:
DMatrix_X_train_2018 = xgb.DMatrix(X_train_2018, label = Y_train_2018)
DMatrix_X_test_2018 = xgb.DMatrix(X_test_2018, label = Y_test_2018)

In [None]:
DMatrix_X_train_2021 = xgb.DMatrix(X_train_2021, label = Y_train_2021)
DMatrix_X_test_2021 = xgb.DMatrix(X_test_2021, label = Y_test_2021)

In [None]:
# Second, initial parameters should be given. Any parameter can be selected.
#
# 'reg:squarederror' is the current name of the squared-error regression
# objective; 'reg:linear' is its deprecated alias.
# Each year gets its own dictionary so that tuning one dataset never alters
# the settings of the other.

parameters_2018 = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': .3,
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
}

parameters_2021 = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': .3,
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
}

# Upper bound on the boosting rounds tried by the xgb.cv calls in the tuning
# cells. Those calls also pass early_stopping_rounds, so a run may end sooner.
# NOTE(review): 5000 is an assumed ceiling, chosen to exceed the n_estimators
# values (2546 and 3930) used when the models are refitted - confirm.
num_boost_round = 5000

In [None]:
# Candidate (max_depth, min_child_weight) pairs for the grid search:
# depths 6-8 combined with child weights 5-7. Both years search the same
# pairs, each through its own list object.

candidate_depths = range(6, 9)
candidate_child_weights = range(5, 8)

gridsearch_parameters_2018 = [
    (depth, child_weight)
    for depth in candidate_depths
    for child_weight in candidate_child_weights
]

gridsearch_parameters_2021 = list(gridsearch_parameters_2018)

In [None]:
# Hyperparameter search for 2018: 5-fold cross-validation over every
# (max_depth, min_child_weight) pair, keeping the pair with the lowest mean
# test MAE. num_boost_round must already be defined when this cell runs.

MAE_Min_2018 = float("Inf")
Best_Parameters_2018 = None

for max_depth, min_child_weight in gridsearch_parameters_2018:
    print(f"CV with max_depth={max_depth}, min_child_weight={min_child_weight}")

    parameters_2018['max_depth'] = max_depth
    parameters_2018['min_child_weight'] = min_child_weight

    cv_results_2018 = xgb.cv(
        parameters_2018,
        DMatrix_X_train_2018,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10,
    )

    MAE_Mean_2018 = cv_results_2018['test-mae-mean'].min()
    # argmin() is the 0-based position of the best round, so the number of
    # rounds needed to reach it is one more.
    Boost_Rounds_2018 = cv_results_2018['test-mae-mean'].argmin() + 1

    print(f"\tMAE {MAE_Mean_2018} for {Boost_Rounds_2018} rounds")
    if MAE_Mean_2018 < MAE_Min_2018:
        MAE_Min_2018 = MAE_Mean_2018
        Best_Parameters_2018 = (max_depth, min_child_weight)

print(f"Best Parameters for 2018: {Best_Parameters_2018[0]}, {Best_Parameters_2018[1]}, MAE for 2018: {MAE_Min_2018}")

In [None]:
# Store the best (max_depth, min_child_weight) pair found by the grid search.

(parameters_2018['max_depth'],
 parameters_2018['min_child_weight']) = Best_Parameters_2018[0], Best_Parameters_2018[1]

In [None]:
# Hyperparameter search for 2021: 5-fold cross-validation over every
# (max_depth, min_child_weight) pair, keeping the pair with the lowest mean
# test MAE. num_boost_round must already be defined when this cell runs.

MAE_Min_2021 = float("Inf")
Best_Parameters_2021 = None

for max_depth, min_child_weight in gridsearch_parameters_2021:
    print(f"CV with max_depth={max_depth}, min_child_weight={min_child_weight}")

    parameters_2021['max_depth'] = max_depth
    parameters_2021['min_child_weight'] = min_child_weight

    cv_results_2021 = xgb.cv(
        parameters_2021,
        DMatrix_X_train_2021,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10,
    )

    MAE_Mean_2021 = cv_results_2021['test-mae-mean'].min()
    # argmin() is the 0-based position of the best round, so the number of
    # rounds needed to reach it is one more.
    Boost_Rounds_2021 = cv_results_2021['test-mae-mean'].argmin() + 1

    print(f"\tMAE {MAE_Mean_2021} for {Boost_Rounds_2021} rounds")
    if MAE_Mean_2021 < MAE_Min_2021:
        MAE_Min_2021 = MAE_Mean_2021
        Best_Parameters_2021 = (max_depth, min_child_weight)

print(f"Best Parameters for 2021: {Best_Parameters_2021[0]}, {Best_Parameters_2021[1]}, MAE for 2021: {MAE_Min_2021}")

In [None]:
# Store the best (max_depth, min_child_weight) pair found by the grid search.

(parameters_2021['max_depth'],
 parameters_2021['min_child_weight']) = Best_Parameters_2021[0], Best_Parameters_2021[1]

In [None]:
# The same cross-validated search is repeated for the learning rate ("eta")
# of the 2018 model. num_boost_round must already be defined when this cell
# runs.

MAE_Min_2018 = float("Inf")
Best_Parameters_2018 = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print(f"CV with eta={eta}")

    parameters_2018['eta'] = eta

    cv_results_2018 = xgb.cv(
        parameters_2018,
        DMatrix_X_train_2018,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10,
    )

    MAE_Mean_2018 = cv_results_2018['test-mae-mean'].min()
    # argmin() is the 0-based position of the best round, so the number of
    # rounds needed to reach it is one more.
    Boost_Rounds_2018 = cv_results_2018['test-mae-mean'].argmin() + 1
    print(f"\tMAE {MAE_Mean_2018} for {Boost_Rounds_2018} rounds\n")
    if MAE_Mean_2018 < MAE_Min_2018:
        MAE_Min_2018 = MAE_Mean_2018
        Best_Parameters_2018 = eta
print(f"Best Parameters for 2018: {Best_Parameters_2018}, MAE for 2018: {MAE_Min_2018}")

# Keep the winning learning rate; otherwise the dictionary is left holding the
# last value tried in the loop rather than the best one.
if Best_Parameters_2018 is not None:
    parameters_2018['eta'] = Best_Parameters_2018

In [None]:
# The same cross-validated search is repeated for the learning rate ("eta")
# of the 2021 model. num_boost_round must already be defined when this cell
# runs.

MAE_Min_2021 = float("Inf")
Best_Parameters_2021 = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print(f"CV with eta={eta}")

    parameters_2021['eta'] = eta

    cv_results_2021 = xgb.cv(
        parameters_2021,
        DMatrix_X_train_2021,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics=['mae'],
        early_stopping_rounds=10,
    )

    MAE_Mean_2021 = cv_results_2021['test-mae-mean'].min()
    # argmin() is the 0-based position of the best round, so the number of
    # rounds needed to reach it is one more.
    Boost_Rounds_2021 = cv_results_2021['test-mae-mean'].argmin() + 1
    print(f"\tMAE {MAE_Mean_2021} for {Boost_Rounds_2021} rounds\n")
    if MAE_Mean_2021 < MAE_Min_2021:
        MAE_Min_2021 = MAE_Mean_2021
        Best_Parameters_2021 = eta
print(f"Best Parameters for 2021: {Best_Parameters_2021}, MAE for 2021: {MAE_Min_2021}")

# Keep the winning learning rate; otherwise the dictionary is left holding the
# last value tried in the loop rather than the best one.
if Best_Parameters_2021 is not None:
    parameters_2021['eta'] = Best_Parameters_2021

In [None]:
# After the hyperparameter tuning, the parameters below were identified.

In [None]:
# Tuned XGBoost settings per dataset.
# 'reg:squarederror' is the current name of the squared-error regression
# objective; 'reg:linear' is its deprecated alias.
parameters_2018 = {
    'max_depth': 7,
    'min_child_weight': 5,
    'eta': .3,
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
}

parameters_2021 = {
    'max_depth': 7,
    'min_child_weight': 6,
    'eta': .3,
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
}

#### Re-Modelling

In [None]:
# After the hyperparameter tuning, the models are rebuilt with the selected
# settings so they can be checked again on the test data.
# `verbosity` is the XGBoost parameter that controls log output; `verbose` is
# not a model parameter. `learning_rate` is the scikit-learn-API name of `eta`.

model_XGB_2018 = XGBRegressor(max_depth=7, min_child_weight=5, learning_rate=0.3, verbosity=0, n_estimators=2546)
model_XGB_2021 = XGBRegressor(max_depth=7, min_child_weight=6, learning_rate=0.3, verbosity=0, n_estimators=3930)

In [None]:
model_XGB_2018.fit(X_train_2018, Y_train_2018)
model_XGB_2021.fit(X_train_2021, Y_train_2021)

In [None]:
predictions_test_2018 = model_XGB_2018.predict(X_test_2018)
predictions_test_2021 = model_XGB_2021.predict(X_test_2021)

In [None]:
predictions_train_2018 = model_XGB_2018.predict(X_train_2018)
predictions_train_2021 = model_XGB_2021.predict(X_train_2021)

In [None]:
# MSE, MAE and MAPE plus each model's own score() for the tuned XGBoost
# models, on the test split and on the training split of both datasets.
error_metrics = (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error)

MSE_test_2018, MAE_test_2018, MAPE_test_2018 = (
    metric(Y_test_2018, predictions_test_2018) for metric in error_metrics
)
R2_Score_test_2018 = model_XGB_2018.score(X_test_2018, Y_test_2018)

MSE_test_2021, MAE_test_2021, MAPE_test_2021 = (
    metric(Y_test_2021, predictions_test_2021) for metric in error_metrics
)
R2_Score_test_2021 = model_XGB_2021.score(X_test_2021, Y_test_2021)

MSE_train_2018, MAE_train_2018, MAPE_train_2018 = (
    metric(Y_train_2018, predictions_train_2018) for metric in error_metrics
)
R2_Score_train_2018 = model_XGB_2018.score(X_train_2018, Y_train_2018)

MSE_train_2021, MAE_train_2021, MAPE_train_2021 = (
    metric(Y_train_2021, predictions_train_2021) for metric in error_metrics
)
R2_Score_train_2021 = model_XGB_2021.score(X_train_2021, Y_train_2021)

In [None]:
# Print the train and test metrics of both datasets, framed by separator lines.
# The R2 values are rounded to four decimals; the other metrics are printed raw.
rule = ':::::::::::::::::::::::::'
report_lines = (
    rule, rule, '2018 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2018, MAE_train_2018, MAPE_train_2018, round(R2_Score_train_2018, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2018, MAE_test_2018, MAPE_test_2018, round(R2_Score_test_2018, 4),
    rule, rule, '2021 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2021, MAE_train_2021, MAPE_train_2021, round(R2_Score_train_2021, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2021, MAE_test_2021, MAPE_test_2021, round(R2_Score_test_2021, 4),
    rule, rule,
)
for report_line in report_lines:
    print(report_line)

#### Feature Importance

In [None]:
pip install shap

In [None]:
import shap

In [None]:
explainer_2018 = shap.TreeExplainer(model_XGB_2018)
shap_values_2018 = explainer_2018.shap_values(X_test_2018)

In [None]:
shap.summary_plot(shap_values_2018, X_test_2018, plot_type="bar")

In [None]:
explainer_2021=shap.TreeExplainer(model_XGB_2021)
shap_values_2021=explainer_2021.shap_values(X_test_2021)

In [None]:
shap.summary_plot(shap_values_2021, X_test_2021, plot_type="bar")

### CATBOOST

#### Baseline Models

In [None]:
model_Cat_2018 = CatBoostRegressor()
model_Cat_2021 = CatBoostRegressor()

In [None]:
model_Cat_2018.fit(X_train_2018, Y_train_2018)
model_Cat_2021.fit(X_train_2021, Y_train_2021)

In [None]:
predictions_test_2018 = model_Cat_2018.predict(X_test_2018)
predictions_test_2021 = model_Cat_2021.predict(X_test_2021)

In [None]:
predictions_train_2018 = model_Cat_2018.predict(X_train_2018)
predictions_train_2021 = model_Cat_2021.predict(X_train_2021)

In [None]:
# MSE, MAE and MAPE plus each model's own score() for the baseline CatBoost
# models, on the test split and on the training split of both datasets.
error_metrics = (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error)

MSE_test_2018, MAE_test_2018, MAPE_test_2018 = (
    metric(Y_test_2018, predictions_test_2018) for metric in error_metrics
)
R2_Score_test_2018 = model_Cat_2018.score(X_test_2018, Y_test_2018)

MSE_test_2021, MAE_test_2021, MAPE_test_2021 = (
    metric(Y_test_2021, predictions_test_2021) for metric in error_metrics
)
R2_Score_test_2021 = model_Cat_2021.score(X_test_2021, Y_test_2021)

MSE_train_2018, MAE_train_2018, MAPE_train_2018 = (
    metric(Y_train_2018, predictions_train_2018) for metric in error_metrics
)
R2_Score_train_2018 = model_Cat_2018.score(X_train_2018, Y_train_2018)

MSE_train_2021, MAE_train_2021, MAPE_train_2021 = (
    metric(Y_train_2021, predictions_train_2021) for metric in error_metrics
)
R2_Score_train_2021 = model_Cat_2021.score(X_train_2021, Y_train_2021)

In [None]:
# Print the train and test metrics of both datasets, framed by separator lines.
# The R2 values are rounded to four decimals; the other metrics are printed raw.
rule = ':::::::::::::::::::::::::'
report_lines = (
    rule, rule, '2018 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2018, MAE_train_2018, MAPE_train_2018, round(R2_Score_train_2018, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2018, MAE_test_2018, MAPE_test_2018, round(R2_Score_test_2018, 4),
    rule, rule, '2021 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2021, MAE_train_2021, MAPE_train_2021, round(R2_Score_train_2021, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2021, MAE_test_2021, MAPE_test_2021, round(R2_Score_test_2021, 4),
    rule, rule,
)
for report_line in report_lines:
    print(report_line)

#### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over tree depth, learning rate and iteration
# count for the 2018 CatBoost model; n_jobs=-1 asks for all available cores.
model_CatBoost_2018 = CatBoostRegressor()

parameters_2018 = dict(
    depth=[8, 10, 12],
    learning_rate=[0.1, 0.3, 0.5],
    iterations=[100, 200, 400, 800],
)

grid_2018 = GridSearchCV(model_CatBoost_2018, parameters_2018, cv=5, n_jobs=-1)

grid_2018.fit(X_train_2018, Y_train_2018)

In [None]:
print("\n The best estimator across ALL searched params:\n", grid_2018.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_2018.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_2018.best_params_)

In [None]:
# Exhaustive 5-fold grid search over tree depth, learning rate and iteration
# count for the 2021 CatBoost model, over the same grid as the 2018 search.
model_CatBoost_2021 = CatBoostRegressor()

parameters_2021 = {
                    'depth'         : [8, 10, 12],
                    # 0.3 was previously typed as "0,3", which produced the
                    # four candidates 0.1, 0, 3 and 0.5.
                    'learning_rate' : [0.1, 0.3, 0.5],
                    'iterations'    : [100, 200, 400, 800]
                  }

grid_2021 = GridSearchCV(estimator = model_CatBoost_2021, param_grid = parameters_2021, cv = 5, n_jobs = -1)

grid_2021.fit(X_train_2021, Y_train_2021)

In [None]:
print("\n The best estimator across ALL searched params:\n", grid_2021.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_2021.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_2021.best_params_)

In [None]:
# After the hyperparameter tuning, the parameters below were identified.

In [None]:
parameters_2018 =   {
                    'depth':12,
                    'eta':.3,
                    'iterations':800,
                    }

parameters_2021 =   {
                    'depth':12,
                    'eta':.5,
                    'iterations':800,
                    }

#### Re-Modelling

In [None]:
# After the hyperparameter tuning, the models once again have been checked on test data.

In [None]:
# Rebuild the CatBoost models with fixed depth, iteration count and learning
# rate for the re-modelling step.
# NOTE(review): the parameter dictionaries recorded after the grid search list
# depth 12, whereas depth=10 is used here - confirm which value is intended.
model_Cat_2018 = CatBoostRegressor(depth=10, iterations = 800, learning_rate = 0.3)
model_Cat_2021 = CatBoostRegressor(depth=10, iterations = 800, learning_rate = 0.5)

In [None]:
model_Cat_2018.fit(X_train_2018, Y_train_2018)
model_Cat_2021.fit(X_train_2021, Y_train_2021)

In [None]:
predictions_test_2018 = model_Cat_2018.predict(X_test_2018)
predictions_test_2021 = model_Cat_2021.predict(X_test_2021)

In [None]:
predictions_train_2018 = model_Cat_2018.predict(X_train_2018)
predictions_train_2021 = model_Cat_2021.predict(X_train_2021)

In [None]:
# MSE, MAE and MAPE plus each model's own score() for the tuned CatBoost
# models, on the test split and on the training split of both datasets.
error_metrics = (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error)

MSE_test_2018, MAE_test_2018, MAPE_test_2018 = (
    metric(Y_test_2018, predictions_test_2018) for metric in error_metrics
)
R2_Score_test_2018 = model_Cat_2018.score(X_test_2018, Y_test_2018)

MSE_test_2021, MAE_test_2021, MAPE_test_2021 = (
    metric(Y_test_2021, predictions_test_2021) for metric in error_metrics
)
R2_Score_test_2021 = model_Cat_2021.score(X_test_2021, Y_test_2021)

MSE_train_2018, MAE_train_2018, MAPE_train_2018 = (
    metric(Y_train_2018, predictions_train_2018) for metric in error_metrics
)
R2_Score_train_2018 = model_Cat_2018.score(X_train_2018, Y_train_2018)

MSE_train_2021, MAE_train_2021, MAPE_train_2021 = (
    metric(Y_train_2021, predictions_train_2021) for metric in error_metrics
)
R2_Score_train_2021 = model_Cat_2021.score(X_train_2021, Y_train_2021)

In [None]:
# Print the train and test metrics of both datasets, framed by separator lines.
# The R2 values are rounded to four decimals; the other metrics are printed raw.
rule = ':::::::::::::::::::::::::'
report_lines = (
    rule, rule, '2018 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2018, MAE_train_2018, MAPE_train_2018, round(R2_Score_train_2018, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2018, MAE_test_2018, MAPE_test_2018, round(R2_Score_test_2018, 4),
    rule, rule, '2021 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2021, MAE_train_2021, MAPE_train_2021, round(R2_Score_train_2021, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2021, MAE_test_2021, MAPE_test_2021, round(R2_Score_test_2021, 4),
    rule, rule,
)
for report_line in report_lines:
    print(report_line)

#### Feature Importance

In [None]:
explainer_2018 = shap.TreeExplainer(model_Cat_2018)
shap_values_2018 = explainer_2018.shap_values(X_test_2018)

In [None]:
shap.summary_plot(shap_values_2018, X_test_2018, plot_type="bar")

In [None]:
explainer_2021=shap.TreeExplainer(model_Cat_2021)
shap_values_2021=explainer_2021.shap_values(X_test_2021)

In [None]:
shap.summary_plot(shap_values_2021, X_test_2021, plot_type="bar")

### LIGHTGBM

#### Baseline Models

In [None]:
model_LGBM_2018 = LGBMRegressor()
model_LGBM_2021 = LGBMRegressor()

In [None]:
model_LGBM_2018.fit(X_train_2018, Y_train_2018)
model_LGBM_2021.fit(X_train_2021, Y_train_2021)

In [None]:
predictions_test_2018 = model_LGBM_2018.predict(X_test_2018)
predictions_test_2021 = model_LGBM_2021.predict(X_test_2021)

In [None]:
predictions_train_2018 = model_LGBM_2018.predict(X_train_2018)
predictions_train_2021 = model_LGBM_2021.predict(X_train_2021)

In [None]:
# MSE, MAE and MAPE plus each model's own score() for the baseline LightGBM
# models, on the test split and on the training split of both datasets.
error_metrics = (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error)

MSE_test_2018, MAE_test_2018, MAPE_test_2018 = (
    metric(Y_test_2018, predictions_test_2018) for metric in error_metrics
)
R2_Score_test_2018 = model_LGBM_2018.score(X_test_2018, Y_test_2018)

MSE_test_2021, MAE_test_2021, MAPE_test_2021 = (
    metric(Y_test_2021, predictions_test_2021) for metric in error_metrics
)
R2_Score_test_2021 = model_LGBM_2021.score(X_test_2021, Y_test_2021)

MSE_train_2018, MAE_train_2018, MAPE_train_2018 = (
    metric(Y_train_2018, predictions_train_2018) for metric in error_metrics
)
R2_Score_train_2018 = model_LGBM_2018.score(X_train_2018, Y_train_2018)

MSE_train_2021, MAE_train_2021, MAPE_train_2021 = (
    metric(Y_train_2021, predictions_train_2021) for metric in error_metrics
)
R2_Score_train_2021 = model_LGBM_2021.score(X_train_2021, Y_train_2021)

In [None]:
# Print the train and test metrics of both datasets, framed by separator lines.
# The R2 values are rounded to four decimals; the other metrics are printed raw.
rule = ':::::::::::::::::::::::::'
report_lines = (
    rule, rule, '2018 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2018, MAE_train_2018, MAPE_train_2018, round(R2_Score_train_2018, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2018, MAE_test_2018, MAPE_test_2018, round(R2_Score_test_2018, 4),
    rule, rule, '2021 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2021, MAE_train_2021, MAPE_train_2021, round(R2_Score_train_2021, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2021, MAE_test_2021, MAPE_test_2021, round(R2_Score_test_2021, 4),
    rule, rule,
)
for report_line in report_lines:
    print(report_line)

#### Hyperparameter Tuning

In [None]:
# 5-fold grid search over tree depth and leaf count for the 2018 LightGBM
# model. The LightGBM parameter is spelled 'num_leaves' in lower case; a
# differently-cased key does not set it, so the leaf counts would not be
# searched.
model_LightGBM_2018 = LGBMRegressor()

parameters_2018 = {
                    'max_depth'  : [6, 8, 10, 12],
                    'num_leaves' : [20, 40, 60, 80, 100, 120]
                  }

grid_2018 = GridSearchCV(estimator = model_LightGBM_2018, param_grid = parameters_2018, cv = 5, n_jobs = -1)

grid_2018.fit(X_train_2018, Y_train_2018)

In [None]:
print("\n The best estimator across ALL searched params:\n", grid_2018.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_2018.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_2018.best_params_)

In [None]:
# 5-fold grid search over tree depth and leaf count for the 2021 LightGBM
# model. The LightGBM parameter is spelled 'num_leaves' in lower case; a
# differently-cased key does not set it, so the leaf counts would not be
# searched.
model_LightGBM_2021 = LGBMRegressor()

parameters_2021 = {
                    'max_depth'  : [6, 8, 10, 12],
                    'num_leaves' : [20, 40, 60, 80, 100, 120]
                  }

grid_2021 = GridSearchCV(estimator = model_LightGBM_2021, param_grid = parameters_2021, cv = 5, n_jobs = -1)

grid_2021.fit(X_train_2021, Y_train_2021)

In [None]:
print("\n The best estimator across ALL searched params:\n", grid_2021.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_2021.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_2021.best_params_)

#### Re-Modelling

In [None]:
# Rebuild the LightGBM models with fixed depth and leaf count for the
# re-modelling step.
# NOTE(review): max_depth=7 is not one of the values searched by the grid
# ([6, 8, 10, 12]) - confirm that 7 is the intended setting.
model_LightGBM_2018 = LGBMRegressor(max_depth = 7, num_leaves = 120)
model_LightGBM_2021 = LGBMRegressor(max_depth = 7, num_leaves = 120)

In [None]:
model_LightGBM_2018.fit(X_train_2018, Y_train_2018)
model_LightGBM_2021.fit(X_train_2021, Y_train_2021)

In [None]:
predictions_test_2018 = model_LightGBM_2018.predict(X_test_2018)
predictions_test_2021 = model_LightGBM_2021.predict(X_test_2021)

In [None]:
predictions_train_2018 = model_LightGBM_2018.predict(X_train_2018)
predictions_train_2021 = model_LightGBM_2021.predict(X_train_2021)

In [None]:
# MSE, MAE and MAPE plus each model's own score() for the tuned LightGBM
# models, on the test split and on the training split of both datasets.
error_metrics = (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error)

MSE_test_2018, MAE_test_2018, MAPE_test_2018 = (
    metric(Y_test_2018, predictions_test_2018) for metric in error_metrics
)
R2_Score_test_2018 = model_LightGBM_2018.score(X_test_2018, Y_test_2018)

MSE_test_2021, MAE_test_2021, MAPE_test_2021 = (
    metric(Y_test_2021, predictions_test_2021) for metric in error_metrics
)
R2_Score_test_2021 = model_LightGBM_2021.score(X_test_2021, Y_test_2021)

MSE_train_2018, MAE_train_2018, MAPE_train_2018 = (
    metric(Y_train_2018, predictions_train_2018) for metric in error_metrics
)
R2_Score_train_2018 = model_LightGBM_2018.score(X_train_2018, Y_train_2018)

MSE_train_2021, MAE_train_2021, MAPE_train_2021 = (
    metric(Y_train_2021, predictions_train_2021) for metric in error_metrics
)
R2_Score_train_2021 = model_LightGBM_2021.score(X_train_2021, Y_train_2021)

In [None]:
# Print the train and test metrics of both datasets, framed by separator lines.
# The R2 values are rounded to four decimals; the other metrics are printed raw.
rule = ':::::::::::::::::::::::::'
report_lines = (
    rule, rule, '2018 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2018, MAE_train_2018, MAPE_train_2018, round(R2_Score_train_2018, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2018, MAE_test_2018, MAPE_test_2018, round(R2_Score_test_2018, 4),
    rule, rule, '2021 Dataset', rule, rule,
    'Train Metrics', rule,
    MSE_train_2021, MAE_train_2021, MAPE_train_2021, round(R2_Score_train_2021, 4),
    rule, 'Test Metrics', rule,
    MSE_test_2021, MAE_test_2021, MAPE_test_2021, round(R2_Score_test_2021, 4),
    rule, rule,
)
for report_line in report_lines:
    print(report_line)

#### Feature Importance

In [None]:
explainer_2018 = shap.TreeExplainer(model_LightGBM_2018)
shap_values_2018 = explainer_2018.shap_values(X_test_2018)

In [None]:
shap.summary_plot(shap_values_2018, X_test_2018, plot_type="bar")

In [None]:
explainer_2021 = shap.TreeExplainer(model_LightGBM_2021)
shap_values_2021 = explainer_2021.shap_values(X_test_2021)

In [None]:
shap.summary_plot(shap_values_2021, X_test_2021, plot_type="bar")

### TABNET

In [None]:
pip install pytorch-tabnet

In [None]:
pip install torchvision

In [None]:
pip install tabnet

In [None]:
import pytorch_tabnet

In [None]:
import torch
import torchvision

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

#### Baseline Models

In [None]:
# Baseline: one TabNet regressor per dataset, created without any constructor
# arguments so both start from the library's default configuration.
model_TabNet_2018, model_TabNet_2021 = TabNetRegressor(), TabNetRegressor()

In [None]:
# Convert the train/test splits of both datasets to NumPy arrays for TabNet.
# Feature frames are converted as they are.
X_train_2018_tabnet, X_train_2021_tabnet = X_train_2018.to_numpy(), X_train_2021.to_numpy()
X_test_2018_tabnet, X_test_2021_tabnet = X_test_2018.to_numpy(), X_test_2021.to_numpy()

# Targets are additionally reshaped with reshape(-1, 1) into a single-column
# 2-D array.
Y_train_2018_tabnet, Y_train_2021_tabnet = (
    Y_train_2018.to_numpy().reshape(-1, 1),
    Y_train_2021.to_numpy().reshape(-1, 1),
)
Y_test_2018_tabnet, Y_test_2021_tabnet = (
    Y_test_2018.to_numpy().reshape(-1, 1),
    Y_test_2021.to_numpy().reshape(-1, 1),
)

In [None]:
# Train the baseline TabNet regressors on the NumPy train arrays of each dataset.
# Only the training data is passed; no eval_set is supplied to fit here.
model_TabNet_2018.fit(X_train_2018_tabnet, Y_train_2018_tabnet)
model_TabNet_2021.fit(X_train_2021_tabnet, Y_train_2021_tabnet)

In [None]:
# Baseline TabNet predictions for the test arrays of both datasets.
predictions_test_2018, predictions_test_2021 = (
    model_TabNet_2018.predict(X_test_2018_tabnet),
    model_TabNet_2021.predict(X_test_2021_tabnet),
)

In [None]:
# Baseline TabNet predictions for the train arrays of both datasets
# (used to compare train against test metrics).
predictions_train_2018, predictions_train_2021 = (
    model_TabNet_2018.predict(X_train_2018_tabnet),
    model_TabNet_2021.predict(X_train_2021_tabnet),
)

In [None]:
# Score the baseline TabNet predictions. The same four metric functions are
# applied to every (targets, predictions) pair, in the order MAE, MSE, MAPE, R2.
tabnet_metric_functions = (
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    r2_score,
)

# Test split, 2018.
(TabNet_test_MAE_2018,
 TabNet_test_MSE_2018,
 TabNet_test_MAPE_2018,
 TabNet_test_R_2_2018) = [
    metric_function(Y_test_2018_tabnet, predictions_test_2018)
    for metric_function in tabnet_metric_functions
]

# Test split, 2021.
(TabNet_test_MAE_2021,
 TabNet_test_MSE_2021,
 TabNet_test_MAPE_2021,
 TabNet_test_R_2_2021) = [
    metric_function(Y_test_2021_tabnet, predictions_test_2021)
    for metric_function in tabnet_metric_functions
]

# Train split, 2018.
(TabNet_train_MAE_2018,
 TabNet_train_MSE_2018,
 TabNet_train_MAPE_2018,
 TabNet_train_R_2_2018) = [
    metric_function(Y_train_2018_tabnet, predictions_train_2018)
    for metric_function in tabnet_metric_functions
]

# Train split, 2021.
(TabNet_train_MAE_2021,
 TabNet_train_MSE_2021,
 TabNet_train_MAPE_2021,
 TabNet_train_R_2_2021) = [
    metric_function(Y_train_2021_tabnet, predictions_train_2021)
    for metric_function in tabnet_metric_functions
]

In [None]:
# Print the baseline TabNet train/test metrics of both datasets.
# Every value is printed together with its metric name so the output cannot be
# misread; within a split the order is MAE, MSE, MAPE, R2.
# The separator layout of the report is kept as it was.
report_separator = ':::::::::::::::::::::::::'
tabnet_report = (
    ('2018 Dataset', (
        ('Train Metrics', TabNet_train_MAE_2018, TabNet_train_MSE_2018,
         TabNet_train_MAPE_2018, TabNet_train_R_2_2018),
        ('Test Metrics', TabNet_test_MAE_2018, TabNet_test_MSE_2018,
         TabNet_test_MAPE_2018, TabNet_test_R_2_2018),
    )),
    ('2021 Dataset', (
        ('Train Metrics', TabNet_train_MAE_2021, TabNet_train_MSE_2021,
         TabNet_train_MAPE_2021, TabNet_train_R_2_2021),
        ('Test Metrics', TabNet_test_MAE_2021, TabNet_test_MSE_2021,
         TabNet_test_MAPE_2021, TabNet_test_R_2_2021),
    )),
)

print(report_separator)
for dataset_title, split_rows in tabnet_report:
    print(dataset_title)
    print(report_separator)
    for split_title, split_mae, split_mse, split_mape, split_r2 in split_rows:
        print(split_title)
        print(report_separator)
        print('MAE :', split_mae)
        print('MSE :', split_mse)
        print('MAPE:', split_mape)
        print('R2  :', split_r2)
        print(report_separator)
print(report_separator)

#### Hyperparameter Tuning

In [None]:
# NumPy versions of the complete (unsplit) feature/target data of each dataset,
# used as input of the K-fold loops. Targets become single-column 2-D arrays.
X_2018_tabnet, Y_2018_tabnet = X_2018.to_numpy(), Y_2018.to_numpy().reshape(-1, 1)
X_2021_tabnet, Y_2021_tabnet = X_2021.to_numpy(), Y_2021.to_numpy().reshape(-1, 1)

In [None]:
# 5-fold cross-validation of a default TabNetRegressor on the full 2018 data.
# For every fold a fresh regressor is fitted on the fold's training part with
# the held-out part passed as eval_set, its best_cost is recorded, and it
# predicts the fixed 2018 test array; the per-fold predictions are averaged.
KF_2018_tabnet = KFold(n_splits = 5, random_state = 42, shuffle = True)
Predictions_2018_tabnet = []
Cross_Validation_Score_2018 = []

for train_index, test_index in KF_2018_tabnet.split(X_2018_tabnet):
    # Fold-local names are used on purpose: binding the fold data to
    # X_train_2018_tabnet / Y_train_2018_tabnet would replace the real train
    # split with rows drawn from the full dataset for every later cell that
    # fits or scores on those names.
    X_fold_train_2018, X_valid_2018_tabnet = X_2018_tabnet[train_index], X_2018_tabnet[test_index]
    Y_fold_train_2018, Y_valid_2018_tabnet = Y_2018_tabnet[train_index], Y_2018_tabnet[test_index]
    Regressor = TabNetRegressor()
    Regressor.fit(X_train = X_fold_train_2018, y_train = Y_fold_train_2018,
                  eval_set=[(X_valid_2018_tabnet, Y_valid_2018_tabnet)])
    Cross_Validation_Score_2018.append(Regressor.best_cost)
    # NOTE(review): expm1 assumes the target was log1p-transformed before
    # modelling; the other TabNet cells score raw predictions - confirm.
    # NOTE(review): the folds are drawn from X_2018_tabnet; if the test split is
    # part of that data, these test predictions are not out-of-sample - confirm.
    Predictions_2018_tabnet.append(np.expm1(Regressor.predict(X_test_2018_tabnet)))

# Element-wise mean over the five per-fold prediction arrays.
tabnet_predictions_2018 = np.mean(Predictions_2018_tabnet,axis=0)

In [None]:
# Average of the per-fold best_cost values collected in the 2018 K-fold loop.
np.mean(Cross_Validation_Score_2018, axis=0)

In [None]:
# Root mean squared logarithmic error of the fold-averaged 2018 predictions.
# NOTE(review): `y_true` is not assigned anywhere in the visible TabNet cells and
# the very same name is scored against the 2021 predictions further down -
# confirm which targets are meant (presumably the 2018 test targets, on the same
# scale as the expm1-ed predictions).
# NOTE(review): the result is stored in the dataset-agnostic name `RMSLE`, which
# the 2021 RMSLE cell assigns again.
RMSLE = np.sqrt( mean_squared_log_error(y_true, tabnet_predictions_2018) )

In [None]:
# Display the hyperparameters of the most recently created `Regressor`
# (after the loop above: the regressor of the last 2018 fold).
# get_params has to be called; the bare attribute is only the bound method.
Regressor.get_params()

In [None]:
# 5-fold cross-validation of a default TabNetRegressor on the full 2021 data.
# For every fold a fresh regressor is fitted on the fold's training part with
# the held-out part passed as eval_set, its best_cost is recorded, and it
# predicts the fixed 2021 test array; the per-fold predictions are averaged.
KF_2021_tabnet = KFold(n_splits = 5, random_state = 42, shuffle = True)
Predictions_2021_tabnet = []
Cross_Validation_Score_2021 = []

for train_index, test_index in KF_2021_tabnet.split(X_2021_tabnet):
    # Fold-local names are used on purpose: binding the fold data to
    # X_train_2021_tabnet / Y_train_2021_tabnet would replace the real train
    # split with rows drawn from the full dataset for every later cell that
    # fits or scores on those names.
    X_fold_train_2021, X_valid_2021_tabnet = X_2021_tabnet[train_index], X_2021_tabnet[test_index]
    Y_fold_train_2021, Y_valid_2021_tabnet = Y_2021_tabnet[train_index], Y_2021_tabnet[test_index]
    Regressor = TabNetRegressor()
    Regressor.fit(X_train = X_fold_train_2021, y_train = Y_fold_train_2021,
                  eval_set=[(X_valid_2021_tabnet, Y_valid_2021_tabnet)])
    Cross_Validation_Score_2021.append(Regressor.best_cost)
    # NOTE(review): expm1 assumes the target was log1p-transformed before
    # modelling; the other TabNet cells score raw predictions - confirm.
    # NOTE(review): the folds are drawn from X_2021_tabnet; if the test split is
    # part of that data, these test predictions are not out-of-sample - confirm.
    Predictions_2021_tabnet.append(np.expm1(Regressor.predict(X_test_2021_tabnet)))

# Element-wise mean over the five per-fold prediction arrays.
tabnet_predictions_2021 = np.mean(Predictions_2021_tabnet,axis=0)

In [None]:
# Average of the per-fold best_cost values collected in the 2021 K-fold loop.
np.mean(Cross_Validation_Score_2021, axis=0)

In [None]:
# Root mean squared logarithmic error of the fold-averaged 2021 predictions.
# NOTE(review): `y_true` is not assigned anywhere in the visible TabNet cells and
# the very same name is scored against the 2018 predictions further up -
# confirm which targets are meant (presumably the 2021 test targets, on the same
# scale as the expm1-ed predictions).
# NOTE(review): this assigns the dataset-agnostic name `RMSLE` again, replacing
# the value computed for 2018.
RMSLE = np.sqrt( mean_squared_log_error(y_true, tabnet_predictions_2021) )

In [None]:
# Display the hyperparameters of the most recently created `Regressor`
# (after the loop above: the regressor of the last 2021 fold).
# get_params has to be called; the bare attribute is only the bound method.
Regressor.get_params()

#### Re-Modelling

In [None]:
# Re-create the TabNet regressors for both datasets with every hyperparameter
# written out explicitly. Both models receive the same configuration.
# optimizer_fn must be the optimizer class itself (torch.optim.Adam); the
# "<class 'torch.optim.adam.Adam'>" text is only how that class is displayed
# and is not valid Python.
# NOTE(review): these values look identical to the library defaults, so this
# configuration presumably matches the baseline models - confirm.
model_TabNet_2018 = TabNetRegressor(
    n_d=8,
    n_a=8,
    n_steps=3,
    gamma=1.3,
    cat_idxs=[],
    cat_dims=[],
    cat_emb_dim=1,
    n_independent=2,
    n_shared=2,
    epsilon=1e-15,
    momentum=0.02,
    lambda_sparse=0.001,
    seed=0,
    clip_value=1,
    verbose=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 0.02},
    scheduler_fn=None,
    scheduler_params={},
    mask_type='sparsemax',
    input_dim=12,
    output_dim=1,
    device_name='auto',
    n_shared_decoder=1,
    n_indep_decoder=1,
)
model_TabNet_2021 = TabNetRegressor(
    n_d=8,
    n_a=8,
    n_steps=3,
    gamma=1.3,
    cat_idxs=[],
    cat_dims=[],
    cat_emb_dim=1,
    n_independent=2,
    n_shared=2,
    epsilon=1e-15,
    momentum=0.02,
    lambda_sparse=0.001,
    seed=0,
    clip_value=1,
    verbose=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={'lr': 0.02},
    scheduler_fn=None,
    scheduler_params={},
    mask_type='sparsemax',
    input_dim=12,
    output_dim=1,
    device_name='auto',
    n_shared_decoder=1,
    n_indep_decoder=1,
)

In [None]:
# Train the re-created TabNet regressors on the X_train_*_tabnet /
# Y_train_*_tabnet arrays of each dataset. No eval_set is supplied to fit here.
model_TabNet_2018.fit(X_train_2018_tabnet, Y_train_2018_tabnet)
model_TabNet_2021.fit(X_train_2021_tabnet, Y_train_2021_tabnet)

In [None]:
# Test-array predictions of the re-created TabNet regressors.
predictions_test_2018, predictions_test_2021 = (
    model_TabNet_2018.predict(X_test_2018_tabnet),
    model_TabNet_2021.predict(X_test_2021_tabnet),
)

In [None]:
# Train-array predictions of the re-created TabNet regressors
# (used to compare train against test metrics).
predictions_train_2018, predictions_train_2021 = (
    model_TabNet_2018.predict(X_train_2018_tabnet),
    model_TabNet_2021.predict(X_train_2021_tabnet),
)

In [None]:
# Score the predictions of the re-created TabNet regressors. The same four
# metric functions are applied to every (targets, predictions) pair, in the
# order MAE, MSE, MAPE, R2.
tabnet_metric_functions = (
    mean_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    r2_score,
)

# Test split, 2018.
(TabNet_test_MAE_2018,
 TabNet_test_MSE_2018,
 TabNet_test_MAPE_2018,
 TabNet_test_R_2_2018) = [
    metric_function(Y_test_2018_tabnet, predictions_test_2018)
    for metric_function in tabnet_metric_functions
]

# Test split, 2021.
(TabNet_test_MAE_2021,
 TabNet_test_MSE_2021,
 TabNet_test_MAPE_2021,
 TabNet_test_R_2_2021) = [
    metric_function(Y_test_2021_tabnet, predictions_test_2021)
    for metric_function in tabnet_metric_functions
]

# Train split, 2018.
(TabNet_train_MAE_2018,
 TabNet_train_MSE_2018,
 TabNet_train_MAPE_2018,
 TabNet_train_R_2_2018) = [
    metric_function(Y_train_2018_tabnet, predictions_train_2018)
    for metric_function in tabnet_metric_functions
]

# Train split, 2021.
(TabNet_train_MAE_2021,
 TabNet_train_MSE_2021,
 TabNet_train_MAPE_2021,
 TabNet_train_R_2_2021) = [
    metric_function(Y_train_2021_tabnet, predictions_train_2021)
    for metric_function in tabnet_metric_functions
]

In [None]:
# Print the train/test metrics of the re-created TabNet regressors.
# Every value is printed together with its metric name so the output cannot be
# misread; within a split the order is MAE, MSE, MAPE, R2.
# The separator layout of the report is kept as it was.
report_separator = ':::::::::::::::::::::::::'
tabnet_report = (
    ('2018 Dataset', (
        ('Train Metrics', TabNet_train_MAE_2018, TabNet_train_MSE_2018,
         TabNet_train_MAPE_2018, TabNet_train_R_2_2018),
        ('Test Metrics', TabNet_test_MAE_2018, TabNet_test_MSE_2018,
         TabNet_test_MAPE_2018, TabNet_test_R_2_2018),
    )),
    ('2021 Dataset', (
        ('Train Metrics', TabNet_train_MAE_2021, TabNet_train_MSE_2021,
         TabNet_train_MAPE_2021, TabNet_train_R_2_2021),
        ('Test Metrics', TabNet_test_MAE_2021, TabNet_test_MSE_2021,
         TabNet_test_MAPE_2021, TabNet_test_R_2_2021),
    )),
)

print(report_separator)
for dataset_title, split_rows in tabnet_report:
    print(dataset_title)
    print(report_separator)
    for split_title, split_mae, split_mse, split_mape, split_r2 in split_rows:
        print(split_title)
        print(report_separator)
        print('MAE :', split_mae)
        print('MSE :', split_mse)
        print('MAPE:', split_mape)
        print('R2  :', split_r2)
        print(report_separator)
print(report_separator)

#### Feature Importance

In [None]:
# Read the feature importances exposed by the 2018 TabNet model and compute
# the index order that sorts them ascending.
# `indices` is a shared name: the 2021 feature-importance cell assigns it again.
Feature_Importance_2018 = model_TabNet_2018.feature_importances_
indices = np.argsort(Feature_Importance_2018)

In [None]:
# Horizontal bar chart of the 2018 TabNet feature importances, sorted ascending
# (smallest importance at the bottom).
# The sort order is computed here instead of reading the shared `indices`
# variable, so the chart stays correct whichever cell was run last.
importance_order_2018 = np.argsort(Feature_Importance_2018)
bar_positions_2018 = range(len(Feature_Importance_2018))

plt.figure()
plt.title("Feature importances")
plt.barh(bar_positions_2018, Feature_Importance_2018[importance_order_2018],
         color="r", align="center")

# Label every bar with its feature name; without tick labels the bars cannot
# be attributed to features.
# NOTE(review): assumes the model was fitted on the columns of X_train_2018 in
# this order - confirm.
plt.yticks(bar_positions_2018, X_train_2018.columns[importance_order_2018])

plt.ylim([-1, len(Feature_Importance_2018)])
plt.show()

In [None]:
# Read the feature importances exposed by the 2021 TabNet model and compute
# the index order that sorts them ascending.
# `indices` is a shared name: it was already assigned by the 2018
# feature-importance cell and is replaced here.
Feature_Importance_2021 = model_TabNet_2021.feature_importances_
indices = np.argsort(Feature_Importance_2021)

In [None]:
# Horizontal bar chart of the 2021 TabNet feature importances, sorted ascending
# (smallest importance at the bottom).
# The sort order is computed here instead of reading the shared `indices`
# variable, so the chart stays correct whichever cell was run last.
importance_order_2021 = np.argsort(Feature_Importance_2021)
bar_positions_2021 = range(len(Feature_Importance_2021))

plt.figure()
plt.title("Feature importances")
plt.barh(bar_positions_2021, Feature_Importance_2021[importance_order_2021],
         color="r", align="center")

# Label every bar with its feature name; without tick labels the bars cannot
# be attributed to features.
# NOTE(review): assumes the model was fitted on the columns of X_train_2021 in
# this order - confirm.
plt.yticks(bar_positions_2021, X_train_2021.columns[importance_order_2021])

plt.ylim([-1, len(Feature_Importance_2021)])
plt.show()