In [612]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import unittest

In [613]:
df = pd.read_csv('train.csv') #Setting data
#df.head() #Making sure df works.

In [614]:
df['price_binary'] = np.where(df['price_range'] < 2, 0, 1) #Adding price binary column making 0-1 = 0 and 2-3 = 1
#df.head() was used to confirm that the column was added

In [615]:
#checking for null values
df.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
price_binary     0
dtype: int64

In [616]:
#checking for duplicates
duplicates = df[df.duplicated()]
print(duplicates)

Empty DataFrame
Columns: [battery_power, blue, clock_speed, dual_sim, fc, four_g, int_memory, m_dep, mobile_wt, n_cores, pc, px_height, px_width, ram, sc_h, sc_w, talk_time, three_g, touch_screen, wifi, price_range, price_binary]
Index: []

[0 rows x 22 columns]


In [617]:
#Setting predictor and response variables
X = df[['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi']]
Y = df['price_binary']

In [618]:
scaler = StandardScaler()
## fit the training data in StandardScaler and then transform
X_train = scaler.fit_transform(X_train)
## transform the test data using StandardScaler
X_test = scaler.transform(X_test)

In [619]:
#corr = df.corr()
#plt.figure(figsize = (17,10))
#sns.heatmap(corr,annot=True, fmt= ".0%" , cmap = "RdYlGn")

In [620]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1400, 20)

(600, 20)

(1400,)

(600,)

In [621]:
#Creating classifier_algorithm class
class classifier_algorithm:
    def __init__(self, name, accuracy):
        self.name = name
        self.accuracy = accuracy
    def algorithm(self):
        return("Name: " + self.name)
    def score(self):
        return(self.accuracy)

In [666]:
#Unit testing class
class test_classifier(unittest.TestCase):
    #Create algorithm
    classifier1 = classifier_algorithm("regression", .9)
    classifier2 = classifier_algorithm("classifier", .7)
    def test_algorithm(self):
        self.assertEqual(self.classifier1.algorithm(), "Name: regression")
    def test_score(self):
        self.assertEqual(self.classifier1.score(), .9)
unittest.main(argv=[''], verbosity = 2, exit=False)
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False) 
#All tests passed

test_algorithm (__main__.test_classifier) ... ok
test_score (__main__.test_classifier) ... ok

----------------------------------------------------------------------
Ran 2 tests in 0.004s

OK
..
----------------------------------------------------------------------
Ran 2 tests in 0.002s

OK


In [664]:
full_model, reduced_model  = [], [] #Creating lists, to be used later to print out data

# Logistic Regression

In [624]:
model = LogisticRegression()
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)



In [625]:
train_acc = accuracy_score(y_train , y_train_pred)*100
test_acc  = accuracy_score(y_test  , y_test_pred)*100

print("Train Set Accuracy:"+str(train_acc))
print("Test Set Accuracy:"+str(test_acc))

Train Set Accuracy:95.71428571428572
Test Set Accuracy:95.0


In [626]:
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test_pred,y_test))
print("\nClassification Report:\n%s"%classification_report(y_test_pred,y_test))


Confusion Matrix:
[[282  15]
 [ 15 288]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       297
           1       0.95      0.95      0.95       303

    accuracy                           0.95       600
   macro avg       0.95      0.95      0.95       600
weighted avg       0.95      0.95      0.95       600



In [627]:
logistic_regression_full = classifier_algorithm("Logistic Regression", test_acc)
full_model.append(logistic_regression_full)
#print(len(full_model)) #testing append

# KNN

In [628]:
model = KNeighborsClassifier(n_neighbors = 15)

model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [629]:
train_acc = accuracy_score(y_train , y_train_pred)*100
test_acc  = accuracy_score(y_test  , y_test_pred)*100

print("Train Set Accuracy:"+str(train_acc))
print("Test Set Accuracy:"+str(test_acc))

Train Set Accuracy:98.21428571428571
Test Set Accuracy:97.33333333333334


In [630]:
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test_pred,y_test))
print("\nClassification Report:\n%s"%classification_report(y_test_pred,y_test))


Confusion Matrix:
[[295  14]
 [  2 289]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       309
           1       0.95      0.99      0.97       291

    accuracy                           0.97       600
   macro avg       0.97      0.97      0.97       600
weighted avg       0.97      0.97      0.97       600



In [631]:
knn_full = classifier_algorithm("k-Nearest Neighbors", test_acc)
full_model.append(knn_full)
#print(full_model[1].name)
#print(full_model[0].name)
#print(len(full_model))

# SVC

In [632]:
model = SVC(kernel = 'linear',C = 2 , random_state = 1 , probability= True)

model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [633]:
train_acc = accuracy_score(y_train , y_train_pred)*100
test_acc  = accuracy_score(y_test  , y_test_pred)*100

print("Train Set Accuracy:"+str(train_acc))
print("Test Set Accuracy:"+str(test_acc))

Train Set Accuracy:99.14285714285714
Test Set Accuracy:99.33333333333333


In [634]:
svm_full = classifier_algorithm("Support Vector Machine", test_acc)
full_model.append(svm_full)

In [635]:
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test_pred,y_test))
print("\nClassification Report:\n%s"%classification_report(y_test_pred,y_test))


Confusion Matrix:
[[297   4]
 [  0 299]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       301
           1       0.99      1.00      0.99       299

    accuracy                           0.99       600
   macro avg       0.99      0.99      0.99       600
weighted avg       0.99      0.99      0.99       600



# RandomForest

In [636]:
model = RandomForestClassifier(max_depth=11)
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)



In [637]:
train_acc = accuracy_score(y_train , y_train_pred)*100
test_acc  = accuracy_score(y_test  , y_test_pred)*100

print("Train Set Accuracy:"+str(train_acc))
print("Test Set Accuracy:"+str(test_acc))

Train Set Accuracy:99.78571428571429
Test Set Accuracy:95.83333333333334


In [638]:
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test_pred,y_test))
print("\nClassification Report:\n%s"%classification_report(y_test_pred,y_test))


Confusion Matrix:
[[290  18]
 [  7 285]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       308
           1       0.94      0.98      0.96       292

    accuracy                           0.96       600
   macro avg       0.96      0.96      0.96       600
weighted avg       0.96      0.96      0.96       600



In [639]:
rf_full = classifier_algorithm("RandomForest Classifier", test_acc)
full_model.append(rf_full)

In [640]:
#Commeting this code out. It doesn't do as good a job as what I ended up doing but f \t strings are cool and I'm
#keeping this code for the sake of my own personal reference.
########
#for i in range(0,len(full_model)):
 #   print (f"{full_model[i].name}\t{full_model[i].accuracy}")

In [641]:
data = []
for i in range(0,len(full_model)):
    datapoint = (full_model[i].name, full_model[i].accuracy)
    data.append(datapoint)
df_full = pd.DataFrame(data, columns = ["Model", "Accuracy"])
print(df_full)

                     Model   Accuracy
0      Logistic Regression  95.000000
1      k-Nearest Neighbors  97.333333
2   Support Vector Machine  99.333333
3  RandomForest Classifier  95.833333


# Reduced Model

In [642]:
#New Xs took out everything with a correlation < 10%
X = df[['battery_power','px_height','px_width', 'ram']]
Y = df['price_binary']

In [643]:
scaler = StandardScaler()
## fit the training data in StandardScaler and then transform
X_train = scaler.fit_transform(X_train)
## transform the test data using StandardScaler
X_test = scaler.transform(X_test)

In [644]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1400, 4)

(600, 4)

(1400,)

(600,)

In [645]:
model = LogisticRegression()
model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)



In [646]:
train_acc = accuracy_score(y_train , y_train_pred)*100
test_acc  = accuracy_score(y_test  , y_test_pred)*100

print("Train Set Accuracy:"+str(train_acc))
print("Test Set Accuracy:"+str(test_acc))

Train Set Accuracy:95.92857142857143
Test Set Accuracy:97.0


In [647]:
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test_pred,y_test))
print("\nClassification Report:\n%s"%classification_report(y_test_pred,y_test))


Confusion Matrix:
[[287   8]
 [ 10 295]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       295
           1       0.97      0.97      0.97       305

    accuracy                           0.97       600
   macro avg       0.97      0.97      0.97       600
weighted avg       0.97      0.97      0.97       600



In [648]:
logistic_regression_reduced = classifier_algorithm("Logistic Regression", test_acc)
reduced_model.append(logistic_regression_full)
#print(len(reduced_model)) #testing append

In [649]:
model = KNeighborsClassifier(n_neighbors = 15)

model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [650]:
train_acc = accuracy_score(y_train , y_train_pred)*100
test_acc  = accuracy_score(y_test  , y_test_pred)*100

print("Train Set Accuracy:"+str(train_acc))
print("Test Set Accuracy:"+str(test_acc))

Train Set Accuracy:98.28571428571429
Test Set Accuracy:97.16666666666667


In [651]:
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test_pred,y_test))
print("\nClassification Report:\n%s"%classification_report(y_test_pred,y_test))


Confusion Matrix:
[[294  14]
 [  3 289]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       308
           1       0.95      0.99      0.97       292

    accuracy                           0.97       600
   macro avg       0.97      0.97      0.97       600
weighted avg       0.97      0.97      0.97       600



In [652]:
knn_reduced = classifier_algorithm("k-Nearest Neighbors", test_acc)
reduced_model.append(knn_reduced)

In [653]:
model = SVC(kernel = 'linear',C = 2 , random_state = 1 , probability= True)

model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [654]:
train_acc = accuracy_score(y_train , y_train_pred)*100
test_acc  = accuracy_score(y_test  , y_test_pred)*100

print("Train Set Accuracy:"+str(train_acc))
print("Test Set Accuracy:"+str(test_acc))

Train Set Accuracy:98.64285714285714
Test Set Accuracy:99.0


In [655]:
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test_pred,y_test))
print("\nClassification Report:\n%s"%classification_report(y_test_pred,y_test))


Confusion Matrix:
[[296   5]
 [  1 298]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       301
           1       0.98      1.00      0.99       299

    accuracy                           0.99       600
   macro avg       0.99      0.99      0.99       600
weighted avg       0.99      0.99      0.99       600



In [656]:
svm_reduced = classifier_algorithm("Support Vector Machine", test_acc)
reduced_model.append(svm_reduced)

In [657]:
model = RandomForestClassifier(max_depth=11)

model.fit(X_train,y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)



In [658]:
train_acc = accuracy_score(y_train , y_train_pred)*100
test_acc  = accuracy_score(y_test  , y_test_pred)*100

print("Train Set Accuracy:"+str(train_acc))
print("Test Set Accuracy:"+str(test_acc))

Train Set Accuracy:99.92857142857143
Test Set Accuracy:96.5


In [659]:
print("\nConfusion Matrix:\n%s"%confusion_matrix(y_test_pred,y_test))
print("\nClassification Report:\n%s"%classification_report(y_test_pred,y_test))


Confusion Matrix:
[[291  15]
 [  6 288]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.97       306
           1       0.95      0.98      0.96       294

    accuracy                           0.96       600
   macro avg       0.97      0.97      0.96       600
weighted avg       0.97      0.96      0.97       600



In [660]:
rf_reduced = classifier_algorithm("RandomForestClassifier", test_acc)
reduced_model.append(rf_reduced)

In [661]:
data = []
for i in range(0,len(reduced_model)):
    datapoint = (reduced_model[i].name, reduced_model[i].accuracy)
    data.append(datapoint)
df_reduced = pd.DataFrame(data, columns = ["Model", "Accuracy"])
print(df_reduced)

                    Model   Accuracy
0     Logistic Regression  95.000000
1     k-Nearest Neighbors  97.166667
2  Support Vector Machine  99.000000
3  RandomForestClassifier  96.500000


In [662]:
print(df_full)
print(df_reduced)

                     Model   Accuracy
0      Logistic Regression  95.000000
1      k-Nearest Neighbors  97.333333
2   Support Vector Machine  99.333333
3  RandomForest Classifier  95.833333
                    Model   Accuracy
0     Logistic Regression  95.000000
1     k-Nearest Neighbors  97.166667
2  Support Vector Machine  99.000000
3  RandomForestClassifier  96.500000


Full Model:


Unnamed: 0,Model,test_accuracy,recall,precision,f1_score
0,Logistic Regression,0.95,0.95,0.95,0.95
1,KNeighborsClassifier,0.973333,0.973333,0.974102,0.973328
2,Support vector machine,0.993333,0.993333,0.993422,0.993333
3,Random Forest Classifier,0.95,0.95,0.950333,0.949998
