In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/Users/cesurdagli/Desktop/Capstone Project/Feb17NoNull_Merged.csv')


In [3]:
df['pass_fail1']=df['pass_fail'].map({'PASS':1, 'FAIL':0})

In [4]:
#replacing empty values with null values
df=df.replace(r'^\s+$', np.nan, regex=True)

In [5]:
# Recoding the categorical values
clean_nums = {'highestEd': {'DD':5, 'MD':4, 'BD':3, 'HS':2, 'AD':1, 'OT':0},
                  'reason': {'review':4, 'learn':3, 'assignment':2, 'curious':1, 'other':0},
                  'level': {'confident':2, 'some':1, 'little':0},
             'testLevel': {'GR':1, 'UG':0}}
df.replace(clean_nums, inplace=True)
df[['highestEd', 'reason', 'level']].head()

Unnamed: 0,highestEd,reason,level
0,2.0,2.0,2.0
1,3.0,2.0,2.0
2,3.0,2.0,2.0
3,4.0,2.0,1.0
4,2.0,2.0,2.0


In [6]:
#Dropping the null values
df=df[df['reason'].notnull() & df['highestEd'].notnull() & df['level'].notnull()]

In [7]:
print(df.shape)
print(df.pass_fail1.value_counts())

(9579, 54)
0    7400
1    2179
Name: pass_fail1, dtype: int64


# Logistic Regression

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Define the features
X=df[['testLevel', 'highestEd', 'reason','level', 'quality_Satisfaction1', 'authenticProblems1',
       'demonstration1', 'application1', 'activation1', 'ALT2', 'ALT3',
       'quality_Satisfaction2', 'demonstration2', 'demonstration3', 'ALT4',
       'authenticProblems2', 'integration1', 'activation2',
       'authenticProblems3', 'integration2', 'application3', 'integration3',
       'activation3', 'application4', 'demonstration5',
       'ALT1Reversed', 'demonstration4Reversed',
       'quality_Satisfaction3Reversed', 'quality_Satisfaction4Reversed']]
print("X: ", type(X), X.shape)

# Define the target
y = df.pass_fail1
print("y: ", type(y), y.shape)
print("df.pass_fail shape: ", df.pass_fail1.shape)

# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(X, y, random_state=5)

X:  <class 'pandas.core.frame.DataFrame'> (9579, 29)
y:  <class 'pandas.core.series.Series'> (9579,)
df.pass_fail shape:  (9579,)


In [9]:
print("\n")
print(type(Xlr), Xlr.shape, len(Xlr)) #TrainX

print("\n")
print(type(Xtestlr), Xtestlr.shape, len(Xtestlr)) #TestX

print("\n")
print(type(ylr), ylr.shape, len(ylr)) #Trainy

print("\n")
print(type(ytestlr), ytestlr.shape, len(ytestlr)) #Testy



<class 'pandas.core.frame.DataFrame'> (7184, 29) 7184


<class 'pandas.core.frame.DataFrame'> (2395, 29) 2395


<class 'pandas.core.series.Series'> (7184,) 7184


<class 'pandas.core.series.Series'> (2395,) 2395


In [15]:
from sklearn.metrics import classification_report
from sklearn import metrics

# Construct the LogisticRegression model
clf = LogisticRegression()

# Fit the model on the training data.
clf.fit(Xlr, ylr) 

# Print the accuracy from the testing data.

y_predict_test = clf.predict(Xtestlr)
print("\n")
print("[Test] Accuracy score (y_predict_test, ytestlr):",accuracy_score(y_predict_test, ytestlr))


# Printout the training score
y_predict_training = clf.predict(Xlr)
print("\n")
print("[Training] Accuracy score: (ylr, y_predict_training)",accuracy_score(ylr, y_predict_training))



print("Accuracy:",metrics.accuracy_score(y_predict_test, ytestlr))
print("Precision:",metrics.precision_score(ytestlr, y_predict_test))
print("Recall:",metrics.recall_score(ylr, y_predict_training))
print("\n")
print('Confusion Matrix:')

cnf_matrix = metrics.confusion_matrix(ytestlr, y_predict_test)
print(cnf_matrix)
print('')
print("[Test Classification Report:]")
print(classification_report(ytestlr, y_predict_test))



[Test] Accuracy score (y_predict_test, ytestlr): 0.7803757828810021


[Training] Accuracy score: (ylr, y_predict_training) 0.7700445434298441
Accuracy: 0.7803757828810021
Precision: 0.5151515151515151
Recall: 0.027239709443099273


Confusion Matrix:
[[1852   16]
 [ 510   17]]

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.78      0.99      0.88      1868
           1       0.52      0.03      0.06       527

    accuracy                           0.78      2395
   macro avg       0.65      0.51      0.47      2395
weighted avg       0.72      0.78      0.70      2395





# Resampling techniques — Undersample majority class
using from imblearn.under_sampling import RandomUnderSampler 

In [18]:
from imblearn.under_sampling import RandomUnderSampler 
from collections import Counter


rus = RandomUnderSampler(random_state=5)
X_rus, y_rus = rus.fit_resample(Xlr, ylr)
print('Resampled dataset shape %s' % Counter(y_rus))

Resampled dataset shape Counter({0: 1652, 1: 1652})


In [20]:
downsampled = LogisticRegression().fit(X_rus, y_rus)
downsampled_pred = downsampled.predict(Xtestlr)    

print("Accuracy:",metrics.accuracy_score(ytestlr, downsampled_pred))
print("Precision:",metrics.precision_score(ytestlr, downsampled_pred))
print("Recall:",metrics.recall_score(ytestlr, downsampled_pred))
print("F1 Score:",metrics.f1_score(ytestlr, downsampled_pred))
print("\n")
print('Confusion Matrix:')
cnf_matrix = metrics.confusion_matrix(ytestlr, downsampled_pred)
print(cnf_matrix)
print("\n")
print("[Test Classification Report:]")
print(classification_report(ytestlr, downsampled_pred))



Accuracy: 0.5883089770354906
Precision: 0.296
Recall: 0.6318785578747628
F1 Score: 0.40314769975786924


Confusion Matrix:
[[1076  792]
 [ 194  333]]


[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.85      0.58      0.69      1868
           1       0.30      0.63      0.40       527

    accuracy                           0.59      2395
   macro avg       0.57      0.60      0.54      2395
weighted avg       0.73      0.59      0.62      2395



# Resampling techniques — Oversample majority class
using from imblearn.under_sampling import RandomUnderSampler

In [21]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=5)
X_res, y_res = ros.fit_resample(Xlr, ylr)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({1: 5532, 0: 5532})


In [22]:
upsampled = LogisticRegression().fit(X_res, y_res)
upsampled_pred = upsampled.predict(Xtestlr)    

print("Accuracy:",metrics.accuracy_score(ytestlr, upsampled_pred))
print("Precision:",metrics.precision_score(ytestlr, upsampled_pred))
print("Recall:",metrics.recall_score(ytestlr, upsampled_pred))
print("F1 Score:",metrics.f1_score(ytestlr, upsampled_pred))
print("\n")
print('Confusion Matrix:')
cnf_matrix = metrics.confusion_matrix(ytestlr, upsampled_pred)
print(cnf_matrix)
print("\n")
print("[Test Classification Report:]")
print(classification_report(ytestlr, upsampled_pred))



Accuracy: 0.6041753653444677
Precision: 0.3056325023084026
Recall: 0.6280834914611005
F1 Score: 0.41118012422360245


Confusion Matrix:
[[1116  752]
 [ 196  331]]


[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.85      0.60      0.70      1868
           1       0.31      0.63      0.41       527

    accuracy                           0.60      2395
   macro avg       0.58      0.61      0.56      2395
weighted avg       0.73      0.60      0.64      2395



# SMOTE Generate synthetic samples

In [24]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=5, ratio=1.0)
SX_train, Sy_train = sm.fit_sample(Xlr, ylr)

In [27]:
Counter(Sy_train)

Counter({1: 5532, 0: 5532})

In [32]:
smote = LogisticRegression().fit(SX_train, Sy_train)

smote_pred = smote.predict(Xtestlr)
    
print("Accuracy:",metrics.accuracy_score(ytestlr, smote_pred))
print("Precision:",metrics.precision_score(ytestlr, smote_pred))
print("Recall:",metrics.recall_score(ytestlr, smote_pred))
print("F1 Score:",metrics.f1_score(ytestlr, smote_pred))
print("\n")
print("Confusion Matrix")
cnf_matrix = metrics.confusion_matrix(ytestlr, smote_pred)
print(cnf_matrix)
print("\n")
print("[Test Classification Report:]")
print(classification_report(ytestlr, smote_pred))



Accuracy: 0.5962421711899791
Precision: 0.3010849909584087
Recall: 0.6318785578747628
F1 Score: 0.4078383343539498


Confusion Matrix
[[1095  773]
 [ 194  333]]


[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.85      0.59      0.69      1868
           1       0.30      0.63      0.41       527

    accuracy                           0.60      2395
   macro avg       0.58      0.61      0.55      2395
weighted avg       0.73      0.60      0.63      2395



# ADASYN

In [33]:
from imblearn.over_sampling import ADASYN

sm = SMOTE(random_state=5, ratio=1.0)
AX_train, Ay_train = sm.fit_sample(Xlr, ylr)

In [34]:
np.bincount(Ay_train)

array([5532, 5532])

In [36]:
adasyn = LogisticRegression().fit(AX_train, Ay_train)

adasyn_pred = adasyn.predict(Xtestlr)
    
print("Accuracy:",metrics.accuracy_score(ytestlr, adasyn_pred))
print("Precision:",metrics.precision_score(ytestlr, adasyn_pred))
print("Recall:",metrics.recall_score(ytestlr, adasyn_pred))
print("F1 Score:",metrics.f1_score(ytestlr, adasyn_pred))
print("\n")
print("Confusion Matrix")
cnf_matrix = metrics.confusion_matrix(ytestlr, adasyn_pred)
print(cnf_matrix)
print("\n")
print("[Test Classification Report:]")
print(classification_report(ytestlr, adasyn_pred))

Accuracy: 0.5962421711899791
Precision: 0.3010849909584087
Recall: 0.6318785578747628
F1 Score: 0.4078383343539498


Confusion Matrix
[[1095  773]
 [ 194  333]]


[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.85      0.59      0.69      1868
           1       0.30      0.63      0.41       527

    accuracy                           0.60      2395
   macro avg       0.58      0.61      0.55      2395
weighted avg       0.73      0.60      0.63      2395





# Random Forest Classifier

In [43]:
from sklearn.ensemble import RandomForestClassifier

# train model
rfc = RandomForestClassifier(n_estimators=300).fit(Xlr, ylr)

# Predict on test set
rfc_pred = rfc.predict(Xtestlr)

print("Accuracy:",metrics.accuracy_score(ytestlr, rfc_pred))
print("Precision:",metrics.precision_score(ytestlr, rfc_pred))
print("Recall:",metrics.recall_score(ytestlr, rfc_pred))
print("F1 Score:",metrics.f1_score(ytestlr, rfc_pred))

cnf_matrix = metrics.confusion_matrix(ytestlr, rfc_pred)
print("\n")
print("Confusion Matrix")
cnf_matrix = metrics.confusion_matrix(ytestlr, rfc_pred)
print(cnf_matrix)
print("\n")
print("[Test Classification Report:]")
print(classification_report(ytestlr, rfc_pred))

Accuracy: 0.7657620041753653
Precision: 0.3068181818181818
Recall: 0.051233396584440226
F1 Score: 0.0878048780487805


Confusion Matrix
[[1807   61]
 [ 500   27]]


[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.78      0.97      0.87      1868
           1       0.31      0.05      0.09       527

    accuracy                           0.77      2395
   macro avg       0.55      0.51      0.48      2395
weighted avg       0.68      0.77      0.69      2395

