## 1. Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

np.random.seed(1)

## 2. Load the data

In [2]:
df=pd.read_csv("Box/MS BAIS Venkatesh/DSP/week3/RidingMowers.csv")
df.head()

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,Owner
1,85.5,16.8,Owner
2,64.8,21.6,Owner
3,61.5,20.8,Owner
4,87.0,23.6,Owner


## 3. Label Encoding

In [3]:
# creating instance of labelencoder
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column
df['Ownership'] = labelencoder.fit_transform(df['Ownership'])
df

Unnamed: 0,Income,Lot_Size,Ownership
0,60.0,18.4,1
1,85.5,16.8,1
2,64.8,21.6,1
3,61.5,20.8,1
4,87.0,23.6,1
5,110.1,19.2,1
6,108.0,17.6,1
7,82.8,22.4,1
8,69.0,20.0,1
9,93.0,20.8,1


## 4. Splitting of Dependent variable and Independent variable

In [4]:
X = df.iloc[:, 0:-1]
y = df.iloc[:, -1]

## 5. Splitting of data into test set and train set

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [6]:
y_test

13    0
18    0
3     1
14    0
20    0
17    0
10    1
4     1
Name: Ownership, dtype: int32

## 6. Model the data

In [7]:
## First, we will create a dataframe to hold all the results of our models.

performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### 6.1 SVM Classification model with Linear Kernel

In [8]:
svm_lin_model = SVC(kernel="linear", probability=True)
_ = svm_lin_model.fit(X_train, np.ravel(y_train))

In [9]:
model_preds = svm_lin_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"svm with linear kernel", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,svm with linear kernel,1.0,1.0,1.0,1.0


### 6.2 SVM Classification model with rbf Kernel

In [10]:
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale', probability=True)
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [11]:
model_preds = svm_rbf_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"svm with rbf kernel", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,svm with linear kernel,1.0,1.0,1.0,1.0
0,svm with rbf kernel,0.75,0.666667,0.666667,0.666667


### 6.3 SVM Classification model with Polynomial Kernel

In [12]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10, probability=True)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [13]:
model_preds = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"svm with polynomial kernel", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,svm with linear kernel,1.0,1.0,1.0,1.0
0,svm with rbf kernel,0.75,0.666667,0.666667,0.666667
0,svm with polynomial kernel,0.875,1.0,0.666667,0.8


## 7. Summary

Sorted by accuracy, the best models are:

In [14]:
performance.sort_values(by=['Accuracy'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,svm with rbf kernel,0.75,0.666667,0.666667,0.666667
0,svm with polynomial kernel,0.875,1.0,0.666667,0.8
0,svm with linear kernel,1.0,1.0,1.0,1.0


Sorted by precision, the best models are:

In [15]:
performance.sort_values(by=['Precision'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,svm with rbf kernel,0.75,0.666667,0.666667,0.666667
0,svm with linear kernel,1.0,1.0,1.0,1.0
0,svm with polynomial kernel,0.875,1.0,0.666667,0.8


Sorted by Recall, the best models are:

In [16]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,svm with rbf kernel,0.75,0.666667,0.666667,0.666667
0,svm with polynomial kernel,0.875,1.0,0.666667,0.8
0,svm with linear kernel,1.0,1.0,1.0,1.0


Sorted by F1, the best models are:

In [17]:
performance.sort_values(by=['F1'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,svm with rbf kernel,0.75,0.666667,0.666667,0.666667
0,svm with polynomial kernel,0.875,1.0,0.666667,0.8
0,svm with linear kernel,1.0,1.0,1.0,1.0


As per the metrics above, we can see that "Linear model" is overfitting for a 30% test split. But, we see that "Polynomial model" is performing the best based on the metrics. So, we consider this to make  lawn mower ownership prediction.

In [18]:
## Saving Best winning model file as Pickle
import pickle

In [19]:
pickle.dump(svm_poly_model,open("C:/Users/vvenk/Box/MS BAIS Venkatesh/DSP/week3/pickle.pkl",'wb'))