# Import Libraries

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, f1_score 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, KFold 
from sklearn.preprocessing import MinMaxScaler

In [6]:
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN
from sklearn.cluster import FeatureAgglomeration
from sklearn.cluster import MeanShift

# Dataset

In [7]:
allen = pd.read_csv("Dataset.csv")

In [8]:
allen.head()

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


# Label Encoding

In [6]:
l1 = preprocessing.LabelEncoder()
f1 = l1.fit_transform(allen['Gender']) 
f1 = pd.DataFrame(data=f1, columns=['Gender'])
allen['Gender'] = f1['Gender']

In [7]:
allen.head()

Unnamed: 0,Gender,Age,Salary,Purchase Iphone
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0


# Train-test-split

In [8]:
x = allen.drop(['Purchase Iphone'],axis=1)

In [9]:
y = allen['Purchase Iphone']

In [57]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.15,random_state=30)

# Random Forest

In [58]:
model1 = RandomForestClassifier()
model1.fit(xtrain,ytrain)
p1 = model1.predict(xtest)

### Confusion Matrix 

In [59]:
print(confusion_matrix(ytest,p1))

[[38  0]
 [ 3 19]]


### Specificity and Sensitivity

In [60]:
c1 = confusion_matrix(ytest,p1)
print('Specificity: ', c1[1,1]/(c1[1,0]+c1[1,1]))
print('Sensitivity: ', c1[0,0]/(c1[0,0]+c1[0,1]))

Specificity:  0.8636363636363636
Sensitivity:  1.0


### Accuracy

In [61]:
c1 = confusion_matrix(ytest,p1)
print('Accuracy: {:.2%}'.format((c1[0,0]+c1[1,1])/sum(sum(c1))))

Accuracy: 95.00%


### F1 Score

In [62]:
f11 = f1_score(ytest,p1)
print('F1 Score: {:.2%}'.format(f11))

F1 Score: 92.68%


# Decision Tree

In [63]:
model2 = DecisionTreeClassifier()
model2.fit(xtrain,ytrain)
p2 = model2.predict(xtest)

### Confusion Matrix 

In [64]:
print(confusion_matrix(ytest,p2))

[[37  1]
 [ 5 17]]


### Specificity and Sensitivity

In [65]:
c2 = confusion_matrix(ytest,p2)
print('Specificity: ', c2[1,1]/(c2[1,0]+c2[1,1]))
print('Sensitivity: ', c2[0,0]/(c2[0,0]+c2[0,1]))

Specificity:  0.7727272727272727
Sensitivity:  0.9736842105263158


### Accuracy

In [66]:
c2 = confusion_matrix(ytest,p2)
print('Accuracy: {:.2%}'.format((c2[0,0]+c2[1,1])/sum(sum(c2))))

Accuracy: 90.00%


### F1 Score

In [67]:
f12 = f1_score(ytest,p2)
print('F1 Score: {:.2%}'.format(f12))

F1 Score: 85.00%


# Naive Bayes

In [68]:
model3 = GaussianNB()
model3.fit(xtrain,ytrain)
p3 = model3.predict(xtest)

### Confusion Matrix

In [69]:
print(confusion_matrix(ytest,p3))

[[38  0]
 [ 6 16]]


### Specificity and Sensitivity

In [70]:
c3 = confusion_matrix(ytest,p3)
print('Specificity: ', c3[1,1]/(c3[1,0]+c3[1,1]))
print('Sensitivity: ', c3[0,0]/(c3[0,0]+c3[0,1]))

Specificity:  0.7272727272727273
Sensitivity:  1.0


### Accuracy

In [71]:
c3 = confusion_matrix(ytest,p3)
print('Accuracy: {:.2%}'.format((c3[0,0]+c3[1,1])/sum(sum(c3))))

Accuracy: 90.00%


### F1 Score

In [72]:
f13 = f1_score(ytest,p3)
print('F1 Score: {:.2%}'.format(f13))

F1 Score: 84.21%


# K-Nearest Neighbour

In [73]:
model4 = KNeighborsClassifier()
model4.fit(xtrain,ytrain)
p4 = model4.predict(xtest)

### Confusion Matrix

In [74]:
print(confusion_matrix(ytest,p4))

[[36  2]
 [10 12]]


### Specificity and Sensitivity

In [75]:
c4 = confusion_matrix(ytest,p4)
print('Specificity: ', c4[1,1]/(c4[1,0]+c4[1,1]))
print('Sensitivity: ', c4[0,0]/(c4[0,0]+c4[0,1]))

Specificity:  0.5454545454545454
Sensitivity:  0.9473684210526315


### Accuracy

In [76]:
c4 = confusion_matrix(ytest,p4)
print('Accuracy: {:.2%}'.format((c4[0,0]+c4[1,1])/sum(sum(c4))))

Accuracy: 80.00%


### F1 Score

In [77]:
f14 = f1_score(ytest,p4)
print('F1 Score: {:.2%}'.format(f14))

F1 Score: 66.67%


# Logistic Regression

In [78]:
model5 = LogisticRegression()
model5.fit(xtrain,ytrain)
p5 = model5.predict(xtest)

### Confusion Matrix

In [79]:
print(confusion_matrix(ytest,p5))

[[38  0]
 [22  0]]


### Specificity and Sensitivity

In [80]:
c5 = confusion_matrix(ytest,p5)
print('Specificity: ', c5[1,1]/(c5[1,0]+c5[1,1]))
print('Sensitivity: ', c5[0,0]/(c5[0,0]+c5[0,1]))

Specificity:  0.0
Sensitivity:  1.0


### Accuracy

In [81]:
c5 = confusion_matrix(ytest,p5)
print('Accuracy: {:.2%}'.format((c5[0,0]+c5[1,1])/sum(sum(c5))))

Accuracy: 63.33%


### F1 Score

In [82]:
f15 = f1_score(ytest,p5)
print('F1 Score: {:.2%}'.format(f15))

F1 Score: 0.00%


# K-Fold

In [172]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

### Random Forest

In [173]:
acc = np.mean(cross_val_score(model1, xtrain, ytrain, scoring='accuracy', cv=kfold))
pr = np.mean(cross_val_score(model1, xtrain, ytrain, scoring='precision', cv=kfold))
f1s = np.mean(cross_val_score(model1, xtrain, ytrain, scoring='f1', cv=kfold))

In [174]:
allen1 = pd.DataFrame({'Model':'Random Forest',
                      'Accuracy':[acc],
                      'Precision':[pr],
                      'F1 Score':[f1s]})

### Decision Tree

In [175]:
acc = np.mean(cross_val_score(model2, xtrain, ytrain, scoring='accuracy', cv=kfold))
pr = np.mean(cross_val_score(model2, xtrain, ytrain, scoring='precision', cv=kfold))
f1s = np.mean(cross_val_score(model2, xtrain, ytrain, scoring='f1', cv=kfold))

In [176]:
allen2 = pd.DataFrame({'Model':'Decision Tree',
                      'Accuracy':[acc],
                      'Precision':[pr],
                      'F1 Score':[f1s]})

### Naive Bayes

In [177]:
acc = np.mean(cross_val_score(model3, xtrain, ytrain, scoring='accuracy', cv=kfold))
pr = np.mean(cross_val_score(model3, xtrain, ytrain, scoring='precision', cv=kfold))
f1s = np.mean(cross_val_score(model3, xtrain, ytrain, scoring='f1', cv=kfold))

In [178]:
allen3 = pd.DataFrame({'Model':'Naive Bayes',
                      'Accuracy':[acc],
                      'Precision':[pr],
                      'F1 Score':[f1s]})

### K-Nearest Neighbour

In [179]:
acc = np.mean(cross_val_score(model4, xtrain, ytrain, scoring='accuracy', cv=kfold))
pr = np.mean(cross_val_score(model4, xtrain, ytrain, scoring='precision', cv=kfold))
f1s = np.mean(cross_val_score(model4, xtrain, ytrain, scoring='f1', cv=kfold))

In [180]:
allen4 = pd.DataFrame({'Model':'K-Nearest Neighbour',
                      'Accuracy':[acc],
                      'Precision':[pr],
                      'F1 Score':[f1s]})

### Logistic Regression

In [181]:
acc = np.mean(cross_val_score(model5, xtrain, ytrain, scoring='accuracy', cv=kfold))
# pr = np.mean(cross_val_score(model5, xtrain, ytrain, scoring='precision', cv=kfold))
f1s = np.mean(cross_val_score(model5, xtrain, ytrain, scoring='f1', cv=kfold))

In [182]:
allen5 = pd.DataFrame({'Model':'Logistic Regression',
                      'Accuracy':[acc],
                      'Precision':'NA',
                      'F1 Score':[f1s]})

# Concat

In [183]:
al = pd.concat([allen1,allen2,allen3,allen4,allen5],axis=0).reset_index()
al = al.drop('index',axis=1)
al

Unnamed: 0,Model,Accuracy,Precision,F1 Score
0,Random Forest,0.882353,0.812629,0.851371
1,Decision Tree,0.876471,0.850546,0.836372
2,Naive Bayes,0.876471,0.857514,0.816352
3,K-Nearest Neighbour,0.805882,0.798139,0.689109
4,Logistic Regression,0.644118,,0.0
