# Automobile Dataset Description

#### This dataset consist of data From 1985 Ward's Automotive Yearbook.

#### This data set consists of three types of entities: 
 - (a) The specification of an auto in terms of various characteristics
 - (b) Its assigned insurance risk rating
 - (c) Its normalized losses in use as compared to other cars. 

#### The second rating corresponds to the degree to which the auto is more risky than its price indicates. Cars are initially assigned a risk factor symbol associated with its price. Then if it is more risky (or less), this symbol is adjusted by moving it up (or down) the scale. Actuarians call this process "symboling". A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.

#### The third factor is the relative averages loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/speciality, etc...), and represents the averages loss per car per year.

## Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
from pyspark.sql import SparkSession
import uuid
import os
import shutil
import pickle
from datetime import datetime
from subprocess import run, Popen, PIPE
from pyspark import SparkContext, SparkConf
from pyspark.sql import DataFrame, SparkSession, Window
from pyspark.sql.functions import col, expr, monotonically_increasing_id, row_number,current_timestamp
from typing import  List

## Loading the Dataset from Azure

In [2]:
automobile = pd.read_csv("https://raw.githubusercontent.com/katonic-dev/Examples/master/data/automobile_data.csv")
automobile.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [3]:
automobile.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [4]:
automobile.keys()

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

##### Shape of the data set

In [5]:
print(automobile.shape)

(205, 26)


##### Describe the stats about the data

In [6]:
automobile.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


##### Checking for the null values

In [7]:
# Number of missing in each column
missing = pd.DataFrame(automobile.isnull().sum()).rename(columns = {0: 'total'})

# Create a percentage missing
missing['percent'] = missing['total'] / len(automobile)

missing.sort_values('percent', ascending = False).head(10)

Unnamed: 0,total,percent
symboling,0,0.0
normalized-losses,0,0.0
highway-mpg,0,0.0
city-mpg,0,0.0
peak-rpm,0,0.0
horsepower,0,0.0
compression-ratio,0,0.0
stroke,0,0.0
bore,0,0.0
fuel-system,0,0.0


In [8]:
automobile.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

##### Find out number of records having '?' value for normalized losses

In [9]:
automobile['normalized-losses'].loc[automobile['normalized-losses'] == '?'].count()

41

##### Setting the missing value to mean of normalized losses and conver the datatype to integer


In [10]:
nl = automobile['normalized-losses'].loc[automobile['normalized-losses'] != '?']
nlmean = nl.astype(str).astype(int).mean()
automobile['normalized-losses'] = automobile['normalized-losses'].replace('?',nlmean).astype(int)
automobile['normalized-losses'].head()

0    122
1    122
2    122
3    164
4    164
Name: normalized-losses, dtype: int64

##### Find out the number of values which are not numeric


In [11]:
automobile['price'].str.isnumeric().value_counts()

True     201
False      4
Name: price, dtype: int64

##### List out the values which are not numeric


In [12]:
automobile['price'].loc[automobile['price'].str.isnumeric() == False]

9      ?
44     ?
45     ?
129    ?
Name: price, dtype: object

##### Setting the missing value to mean of price and convert the datatype to integer


In [13]:
price = automobile['price'].loc[automobile['price'] != '?']
pmean = price.astype(str).astype(int).mean()
automobile['price'] = automobile['price'].replace('?',pmean).astype(int)
automobile['price'].head()

0    13495
1    16500
2    16500
3    13950
4    17450
Name: price, dtype: int64

##### Checking the numberic and replacing with mean value and conver the datatype to integer for horsepower


In [14]:
automobile['horsepower'].str.isnumeric().value_counts()
horsepower = automobile['horsepower'].loc[automobile['horsepower'] != '?']
hpmean = horsepower.astype(str).astype(int).mean()
automobile['horsepower'] = automobile['horsepower'].replace('?',pmean).astype(int)

##### Checking the outlier of horsepower


In [15]:
automobile.loc[automobile['horsepower'] > 10000]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
130,0,122,renault,gas,std,four,wagon,fwd,front,96.1,...,132,mpfi,3.46,3.9,8.7,13207,?,23,31,9295
131,2,122,renault,gas,std,two,hatchback,fwd,front,96.1,...,132,mpfi,3.46,3.9,8.7,13207,?,23,31,9895


##### Excluding the outlier data for horsepower


In [16]:
automobile[np.abs(automobile.horsepower-automobile.horsepower.mean())<=(3*automobile.horsepower.std())]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,122,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,122,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
201,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
202,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
203,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.4,23.0,106,4800,26,27,22470


##### Find out the number of invalid value


In [17]:
automobile['bore'].loc[automobile['bore'] == '?']

55    ?
56    ?
57    ?
58    ?
Name: bore, dtype: object

##### Replace the non-numeric value to null and conver the datatype


In [18]:
automobile['bore'] = pd.to_numeric(automobile['bore'],errors='coerce')
automobile.dtypes

symboling              int64
normalized-losses      int64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke                object
compression-ratio    float64
horsepower             int64
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

##### Replace the non-number value to null and convert the datatype


In [19]:
automobile['stroke'] = pd.to_numeric(automobile['stroke'],errors='coerce')
automobile.dtypes

symboling              int64
normalized-losses      int64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower             int64
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

##### Convert the non-numeric data to null and convert the datatype


In [20]:
automobile['peak-rpm'] = pd.to_numeric(automobile['peak-rpm'],errors='coerce')
automobile.dtypes

symboling              int64
normalized-losses      int64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower             int64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

### Label Encoder
##### In machine learning, we usually deal with datasets which contains multiple labels in one or more than one columns. These labels can be in the form of words or numbers. Label Encoding refers to converting the labels info the labels into numeric form so as to convert it into the machine-readable form.

In [21]:
le = LabelEncoder()
automobile['make'] = le.fit_transform(automobile['make'])
automobile['fuel-type'] = le.fit_transform(automobile['fuel-type'])
automobile['aspiration'] = le.fit_transform(automobile['aspiration'])
automobile['num-of-doors'] = le.fit_transform(automobile['num-of-doors'])
automobile['body-style'] = le.fit_transform(automobile['body-style'])
automobile['drive-wheels'] = le.fit_transform(automobile['drive-wheels'])
automobile['engine-location'] = le.fit_transform(automobile['engine-location'])
automobile['engine-type'] = le.fit_transform(automobile['engine-type'])
automobile['num-of-cylinders'] = le.fit_transform(automobile['num-of-cylinders'])
automobile['fuel-system'] = le.fit_transform(automobile['fuel-system'])
automobile.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,122,0,1,0,2,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111,5000.0,21,27,13495
1,3,122,0,1,0,2,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111,5000.0,21,27,16500
2,1,122,0,1,0,2,2,2,0,94.5,...,152,5,2.68,3.47,9.0,154,5000.0,19,26,16500
3,2,164,1,1,0,1,3,1,0,99.8,...,109,5,3.19,3.4,10.0,102,5500.0,24,30,13950
4,2,164,1,1,0,1,3,0,0,99.4,...,136,5,3.19,3.4,8.0,115,5500.0,18,22,17450


##### Converting the datatype

In [22]:
automobile['wheel-base'] = automobile['wheel-base'].astype(int)
automobile['height'] = automobile['height'].astype(int)
automobile['length'] = automobile['length'].astype(int)
automobile['width'] = automobile['width'].astype(int)
automobile['compression-ratio'] = automobile['compression-ratio'].astype(int)

##### Filling the null value with mean and then change the datatype for stroke

In [23]:
automobile['stroke'] = automobile['stroke'].fillna(automobile['stroke'].mean())
automobile['stroke'] = automobile['stroke'].astype(int)

##### Filling the null value with mean and then change the datatype for peak-rpm

In [24]:
automobile['peak-rpm'] = automobile['peak-rpm'].fillna(automobile['peak-rpm'].mean())
automobile['peak-rpm'] = automobile['peak-rpm'].astype(int)

##### Filling the null value with mean and then change the datatype for bore

In [25]:
automobile['bore'] = automobile['bore'].fillna(automobile['bore'].mean())
automobile['bore'] = automobile['bore'].astype(int)

In [26]:
automobile.dtypes

symboling            int64
normalized-losses    int64
make                 int64
fuel-type            int64
aspiration           int64
num-of-doors         int64
body-style           int64
drive-wheels         int64
engine-location      int64
wheel-base           int64
length               int64
width                int64
height               int64
curb-weight          int64
engine-type          int64
num-of-cylinders     int64
engine-size          int64
fuel-system          int64
bore                 int64
stroke               int64
compression-ratio    int64
horsepower           int64
peak-rpm             int64
city-mpg             int64
highway-mpg          int64
price                int64
dtype: object

##### Remove the records which are having the value '?'


In [27]:
automobile['num-of-doors'].loc[automobile['num-of-doors'] == '?']
automobile = automobile[automobile['num-of-doors'] != '?']
automobile['num-of-doors'].loc[automobile['num-of-doors'] == '?']

Series([], Name: num-of-doors, dtype: int64)

### Correlation Matrix
##### A correlation matrix is a table showing correlation coefficients between variables. Each cell in the table showns the correlation between two variables. The values in the range of -1 to 1. If two variables have high correlations, we can neglect one variable from those two.

In [28]:
automobile.corr()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
symboling,1.0,0.4651899,-0.118794,0.194311,-0.059866,0.645492,-0.596135,-0.041671,0.2124709,-0.533322,...,-0.10579,0.091163,-0.122112,-0.076884,-0.171731,0.01543,0.273678,-0.035823,0.034606,-0.082201
normalized-losses,0.46519,1.0,-0.250975,0.101437,-0.006823,0.336885,-0.250893,0.311032,-2.918301e-17,-0.05961,...,0.110997,0.228035,-0.10955,0.090908,-0.115825,0.006224,0.237748,-0.218749,-0.178221,0.133999
make,-0.118794,-0.2509755,1.0,-0.113191,0.054265,-0.127416,0.089494,-0.004317,0.05460831,0.084751,...,-0.070918,0.146624,0.258004,-0.123254,0.13603,0.058665,-0.218347,0.053642,0.050022,-0.161471
fuel-type,0.194311,0.101437,-0.113191,1.0,-0.401397,0.207665,-0.147853,-0.132257,0.04006951,-0.304407,...,-0.069594,0.041529,-0.064795,-0.103911,-0.982732,0.037675,0.477058,-0.255963,-0.191392,-0.110207
aspiration,-0.059866,-0.0068227,0.054265,-0.401397,1.0,-0.066737,0.063028,0.066465,-0.05719146,0.255692,...,0.108217,0.288086,0.16683,0.07361,0.294949,-0.039211,-0.183626,-0.202362,-0.254416,0.177285
num-of-doors,0.645492,0.3368845,-0.127416,0.207665,-0.066737,1.0,-0.670198,0.112398,0.1365776,-0.415741,...,-0.005701,-0.000323,-0.157658,-0.046524,-0.188059,0.018703,0.233903,-0.000142,0.021628,-0.031934
body-style,-0.596135,-0.2508926,0.089494,-0.147853,0.063028,-0.670198,1.0,-0.155745,-0.2770093,0.40401,...,-0.073352,-0.065079,0.110676,0.059036,0.13731,0.039949,-0.109414,0.031697,-0.00717,-0.072677
drive-wheels,-0.041671,0.3110317,-0.004317,-0.132257,0.066465,0.112398,-0.155745,1.0,0.1478645,0.462127,...,0.524307,0.424686,0.181554,0.082601,0.128359,-0.042627,-0.039719,-0.449581,-0.45222,0.576867
engine-location,0.212471,-2.918301e-17,0.054608,0.04007,-0.057191,0.136578,-0.277009,0.147865,1.0,-0.187241,...,0.196826,0.105971,0.043322,-0.320364,-0.025977,-0.002373,0.198401,-0.153487,-0.102026,0.331013
wheel-base,-0.533322,-0.05960954,0.084751,-0.304407,0.255692,-0.415741,0.40401,0.462127,-0.1872415,1.0,...,0.570816,0.389963,0.32398,0.148477,0.249054,-0.026748,-0.356787,-0.474734,-0.547507,0.585736


## Model Training

In [29]:
X = automobile.drop(columns = ['price'])
Y = automobile['price']

### Logistic Regression

In [30]:
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X, Y, test_size=0.30)

In [31]:
logreg = LogisticRegression(solver='lbfgs', max_iter=110)
lr = logreg.fit(X_train_lr, y_train_lr)
acc_log = round(logreg.score(X_train_lr, y_train_lr) * 100, 2)
acc_log

63.64

### Random Forest

In [32]:
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, Y, test_size=0.30)

In [33]:
random_forest = RandomForestClassifier(n_estimators=100)
rf = random_forest.fit(X_train_rf, y_train_rf)
acc_random_forest = round(random_forest.score(X_train_rf, y_train_rf) * 100, 2)
acc_random_forest

97.9

### Gaussian Naive Bayes

In [34]:
X_train_gnb, X_test_gnb, y_train_gnb, y_test_gnb = train_test_split(X, Y, test_size=0.30)

In [35]:
gaussian = GaussianNB()
gnb = gaussian.fit(X_train_gnb, y_train_gnb)
acc_gaussian = round(gaussian.score(X_train_gnb, y_train_gnb) * 100, 2)
acc_gaussian

96.5

### Support Vector Machine

In [36]:
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X, Y, test_size=0.30)

In [37]:
linear_svc = SVC(gamma='auto')
svm = linear_svc.fit(X_train_svm, y_train_svm)
acc_linear_svc = round(linear_svc.score(X_train_svm, y_train_svm) * 100, 2)
acc_linear_svc

99.3

### Decision Tree

In [38]:
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X, Y, test_size=0.30)

In [39]:
decision_tree = DecisionTreeClassifier()
dt = decision_tree.fit(X_train_dt, y_train_dt)
acc_decision_tree = round(decision_tree.score(X_train_dt, y_train_dt) * 100, 2)
acc_decision_tree

97.2

### Preceptron

In [40]:
X_train_per, X_test_per, y_train_per, y_test_per = train_test_split(X, Y, test_size=0.30)

In [41]:
perceptron = Perceptron()
per = perceptron.fit(X_train_per, y_train_per)
acc_perceptron = round(perceptron.score(X_train_per, y_train_per) * 100, 2)
acc_perceptron

1.4

### K Nearest Neighbour

In [42]:
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X, Y, test_size=0.30)

In [43]:
knearestneighbour = KNeighborsClassifier()
knn = knearestneighbour.fit(X_train_knn, y_train_knn)
acc_knearest = round(knearestneighbour.score(X_train_knn, y_train_knn) * 100, 2)
acc_knearest

20.98

### Stochastic Gradient Descent

In [44]:
X_train_sgd, X_test_sgd, y_train_sgd, y_test_sgd = train_test_split(X, Y, test_size=0.30)

In [45]:
stochasticgrad = SGDClassifier()
sgd = stochasticgrad.fit(X_train_sgd, y_train_sgd)
acc_sgd = round(stochasticgrad.score(X_train_sgd, y_train_sgd) * 100, 2)
acc_sgd

2.1

### Gradient Boosting Classifier

In [46]:
X_train_gbc, X_test_gbc, y_train_gbc, y_test_gbc = train_test_split(X, Y, test_size=0.30)

In [47]:
gradientboostingclassifier = GradientBoostingClassifier()
gbc = gradientboostingclassifier.fit(X_train_gbc, y_train_gbc)
acc_gbc = round(gradientboostingclassifier.score(X_train_gbc, y_train_gbc) * 100, 2)
acc_gbc

97.9

## Evaluating the Models

In [48]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'Gaussian Naive Bayes', 'Support Vector Machine', 'Decision Tree', 'Preceptron', 'KNearest Neighbour', 'Stochastic Gradient Descent', 'Gradient Boosting Classifier'],
    'Score': [acc_log, acc_random_forest, acc_gaussian, acc_linear_svc, acc_decision_tree, acc_perceptron, acc_knearest, acc_sgd, acc_gbc],
    'Model_abb': [lr, rf, gnb, svm, dt, per, knn, sgd, gbc]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df

Unnamed: 0_level_0,Model,Model_abb
Score,Unnamed: 1_level_1,Unnamed: 2_level_1
99.3,Support Vector Machine,SVC(gamma='auto')
97.9,Random Forest,"(DecisionTreeClassifier(max_features='auto', r..."
97.9,Gradient Boosting Classifier,([DecisionTreeRegressor(criterion='friedman_ms...
97.2,Decision Tree,DecisionTreeClassifier()
96.5,Gaussian Naive Bayes,GaussianNB()
63.64,Logistic Regression,LogisticRegression(max_iter=110)
20.98,KNearest Neighbour,KNeighborsClassifier()
2.1,Stochastic Gradient Descent,SGDClassifier()
1.4,Preceptron,Perceptron()


In [49]:
best_model = result_df['Model_abb'].iloc[0]
best_model

SVC(gamma='auto')

In [50]:
joblib.dump(best_model, 'model.joblib')

['model.joblib']

In [51]:
Model_job = joblib.load("model.joblib")
Model_job

SVC(gamma='auto')

In [52]:
type(Model_job)

sklearn.svm._classes.SVC

In [53]:
type(X_test_rf)

pandas.core.frame.DataFrame

In [54]:
X_test_rf

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
96,1,122,12,1,0,1,3,1,0,94,...,2,97,1,3,3,9,69,5200,31,37
45,0,122,6,1,0,1,3,1,0,94,...,2,90,1,3,3,9,70,5400,38,43
5,2,122,1,1,0,2,3,1,0,99,...,1,136,5,3,3,8,110,5500,19,25
16,0,122,2,1,0,2,3,2,0,103,...,3,209,5,3,3,8,182,5400,16,22
134,3,150,17,1,0,2,2,1,0,99,...,2,121,5,2,2,9,110,5250,21,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,-1,93,9,0,1,1,3,2,0,115,...,1,183,3,3,3,21,123,4350,22,25
56,3,150,8,1,0,2,2,2,0,95,...,6,70,2,3,3,9,101,6000,17,23
133,2,104,17,1,0,1,3,1,0,99,...,2,121,5,3,3,9,110,5250,21,28
112,0,161,13,0,1,1,3,2,0,107,...,2,152,3,3,3,21,95,4150,28,33


In [55]:
pred = X_test_rf.values[55].reshape(1, -1)
pred

array([[  -1,   65,   19,    1,    0,    1,    2,    1,    0,  102,  175,
          66,   53, 2458,    3,    2,  122,    5,    3,    3,    8,   92,
        4200,   27,   32]])

In [56]:
Model_job.predict(pred)

array([13207])