# Tech Challenge - Machine Learning Part 2

Using random over and under sampling techniques, we are looking for the best model that represents the behaviors of riders using the rideshare service.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn import datasets
from sklearn import metrics
from collections import Counter

from sklearn.model_selection import (KFold, 
                                     cross_val_score, 
                                     GridSearchCV, 
                                     train_test_split)
from sklearn.metrics import (classification_report,
                             confusion_matrix,
                             average_precision_score,
                             precision_recall_curve,
                             recall_score,
                             f1_score)

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import imblearn
from imblearn.under_sampling import (RandomUnderSampler,
                                     EditedNearestNeighbours,
                                     NeighbourhoodCleaningRule,
                                     NearMiss)
from imblearn.over_sampling import (RandomOverSampler,
                                    SMOTE,
                                    ADASYN)

  import pandas.util.testing as tm


In [2]:
data = pd.read_csv('rider_ml.csv', index_col=0) # import data
rider = data.copy() # save a copy of data as tree

In [3]:
rider.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38325 entries, 0 to 38324
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   active                  38325 non-null  object 
 1   Astapor                 38325 non-null  int64  
 2   King's Landing          38325 non-null  int64  
 3   Winterfell              38325 non-null  int64  
 4   Unknown                 38325 non-null  int64  
 5   iPhone                  38325 non-null  int64  
 6   trips_in_first_30_days  38325 non-null  int64  
 7   avg_rating_of_driver    38325 non-null  float64
 8   avg_surge               38325 non-null  float64
 9   ultimate_black_user     38325 non-null  int64  
 10  weekday_pct             38325 non-null  float64
 11  avg_dist                38325 non-null  float64
 12  avg_rating_by_driver    38325 non-null  float64
dtypes: float64(5), int64(7), object(1)
memory usage: 4.1+ MB


In [4]:
rider.head()

Unnamed: 0,active,Astapor,King's Landing,Winterfell,Unknown,iPhone,trips_in_first_30_days,avg_rating_of_driver,avg_surge,ultimate_black_user,weekday_pct,avg_dist,avg_rating_by_driver
0,No,0,1,0,0,1,4,4.7,1.1,1,46.2,3.67,5.0
1,No,1,0,0,0,0,0,5.0,1.0,0,50.0,8.26,5.0
2,No,1,0,0,0,1,3,4.3,1.0,0,100.0,0.77,5.0
3,No,0,1,0,0,1,9,4.6,1.14,1,80.0,2.36,4.9
4,No,0,0,1,0,0,14,4.4,1.19,0,82.4,3.13,4.9


## Algorithm Functions

In [5]:
# logistic regression

def logreg(X_train, X_test, y_train, y_test):
    logreg = LogisticRegression(random_state=42).fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    
    print('Logistic Regression \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set: ', logreg.score(X_train, y_train))
    print('Accuracy Score, Test Set: ', logreg.score(X_test, y_test))
    print()
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [6]:
# decision tree classifier

def decision_tree(X_train, X_test, y_train, y_test):
    decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
    y_pred = decision_tree.predict(X_test)
    
    print('Decision Tree Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', decision_tree.score(X_train, y_train))
    print('Accuracy Score, Test Set:', decision_tree.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [7]:
# random forest classifier

def random_forest(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    
    print('Random Forest Classifier \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', rf.score(X_train, y_train))
    print('Accuracy Score, Test Set:', rf.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

In [8]:
# Gaussian naive bayes

def gaussian(X_train, X_test, y_train, y_test):
    gaussian = GaussianNB().fit(X_train, y_train)
    y_pred = gaussian.predict(X_test)
    
    print('Gaussian Naive Bayes \n')
    
    # accuracy scores
    print('Accuracy Score, Training Set:', gaussian.score(X_train, y_train))
    print('Accuracy Score, Test Set:', gaussian.score(X_test, y_test))
    
    # confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    
    print('Confusion Matrix: \n', cm)
    print()
    
    # classification report
    print('Classification Report \n')
    print(classification_report(y_test, y_pred))

# Target and Response Variable, Train_Test_Split

In [9]:
np.random.seed(42)

In [10]:
# separate target variable 
y = rider['active'].values
X = rider.drop('active', axis=1).values

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(28743, 12) (28743,)
(9582, 12) (9582,)


In [11]:
## Baseline - DummyClassifier

# Under Sampling Methods

## Random Under Sampler

In [12]:
random_under = RandomUnderSampler(random_state=42)
X_rs, y_rs = random_under.fit_sample(X, y)

print('Random undersampling {}'.format(Counter(y_rs)))

# train test split
X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X_rs, y_rs, test_size=0.25, random_state=42)

print(X_train_rs.shape, y_train_rs.shape)
print(X_test_rs.shape, y_test_rs.shape)

Random undersampling Counter({'No': 78, 'Yes': 78})
(117, 12) (117,)
(39, 12) (39,)


In [13]:
# logistic regression
logreg(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Logistic Regression 

Accuracy Score, Training Set:  0.7435897435897436
Accuracy Score, Test Set:  0.6666666666666666

Confusion Matrix: 
 [[12  6]
 [ 7 14]]

Classification Report 

              precision    recall  f1-score   support

          No       0.63      0.67      0.65        18
         Yes       0.70      0.67      0.68        21

    accuracy                           0.67        39
   macro avg       0.67      0.67      0.67        39
weighted avg       0.67      0.67      0.67        39



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [14]:
# decision tree classifier
decision_tree(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.5897435897435898
Confusion Matrix: 
 [[12  6]
 [10 11]]

Classification Report 

              precision    recall  f1-score   support

          No       0.55      0.67      0.60        18
         Yes       0.65      0.52      0.58        21

    accuracy                           0.59        39
   macro avg       0.60      0.60      0.59        39
weighted avg       0.60      0.59      0.59        39



In [15]:
# random forest classifier
random_forest(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.5897435897435898
Confusion Matrix: 
 [[12  6]
 [10 11]]

Classification Report 

              precision    recall  f1-score   support

          No       0.55      0.67      0.60        18
         Yes       0.65      0.52      0.58        21

    accuracy                           0.59        39
   macro avg       0.60      0.60      0.59        39
weighted avg       0.60      0.59      0.59        39



In [16]:
# Gaussian Naive Bayes
gaussian(X_train_rs, X_test_rs, y_train_rs, y_test_rs)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.7264957264957265
Accuracy Score, Test Set: 0.5897435897435898
Confusion Matrix: 
 [[12  6]
 [10 11]]

Classification Report 

              precision    recall  f1-score   support

          No       0.55      0.67      0.60        18
         Yes       0.65      0.52      0.58        21

    accuracy                           0.59        39
   macro avg       0.60      0.60      0.59        39
weighted avg       0.60      0.59      0.59        39



## Edited Nearest Neighbors

In [17]:
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_enn)))

# train test split
X_train_enn, X_test_enn, y_train_enn, y_test_enn = train_test_split(X_enn, y_enn, test_size=0.25, random_state=42)

print(X_train_enn.shape, y_train_enn.shape)
print(X_test_enn.shape, y_test_enn.shape)

Resampled dataset shape: Counter({'No': 38002, 'Yes': 78})
(28560, 12) (28560,)
(9520, 12) (9520,)


In [18]:
# logistic regression
logreg(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.9979341736694678
Accuracy Score, Test Set:  0.9980042016806723

Confusion Matrix: 
 [[9501    0]
 [  19    0]]

Classification Report 



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9501
         Yes       0.00      0.00      0.00        19

    accuracy                           1.00      9520
   macro avg       0.50      0.50      0.50      9520
weighted avg       1.00      1.00      1.00      9520



In [19]:
# decision tree classifier
decision_tree(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.9934873949579832
Confusion Matrix: 
 [[9458   43]
 [  19    0]]

Classification Report 

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9501
         Yes       0.00      0.00      0.00        19

    accuracy                           0.99      9520
   macro avg       0.50      0.50      0.50      9520
weighted avg       1.00      0.99      0.99      9520



In [20]:
# random forest classifier
random_forest(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.9980042016806723
Confusion Matrix: 
 [[9501    0]
 [  19    0]]

Classification Report 



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9501
         Yes       0.00      0.00      0.00        19

    accuracy                           1.00      9520
   macro avg       0.50      0.50      0.50      9520
weighted avg       1.00      1.00      1.00      9520



In [21]:
# Gaussian Naive Bayes
gaussian(X_train_enn, X_test_enn, y_train_enn, y_test_enn)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.9807422969187675
Accuracy Score, Test Set: 0.9786764705882353
Confusion Matrix: 
 [[9316  185]
 [  18    1]]

Classification Report 

              precision    recall  f1-score   support

          No       1.00      0.98      0.99      9501
         Yes       0.01      0.05      0.01        19

    accuracy                           0.98      9520
   macro avg       0.50      0.52      0.50      9520
weighted avg       1.00      0.98      0.99      9520



## Near Miss

In [22]:
nm = NearMiss()
X_nm, y_nm = nm.fit_resample(X, y)
print('Resampled dataset shape: {}'.format(Counter(y_nm)))

# train test split
X_train_nm, X_test_nm, y_train_nm, y_test_nm = train_test_split(X_nm, y_nm, test_size=0.25, random_state=42)

print(X_train_nm.shape, y_train_nm.shape)
print(X_test_nm.shape, y_test_nm.shape)

Resampled dataset shape: Counter({'No': 78, 'Yes': 78})
(117, 12) (117,)
(39, 12) (39,)


In [23]:
# logistic regression
logreg(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Logistic Regression 

Accuracy Score, Training Set:  0.7777777777777778
Accuracy Score, Test Set:  0.7435897435897436

Confusion Matrix: 
 [[15  3]
 [ 7 14]]

Classification Report 

              precision    recall  f1-score   support

          No       0.68      0.83      0.75        18
         Yes       0.82      0.67      0.74        21

    accuracy                           0.74        39
   macro avg       0.75      0.75      0.74        39
weighted avg       0.76      0.74      0.74        39



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [24]:
# decision tree classifier
decision_tree(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.8205128205128205
Confusion Matrix: 
 [[13  5]
 [ 2 19]]

Classification Report 

              precision    recall  f1-score   support

          No       0.87      0.72      0.79        18
         Yes       0.79      0.90      0.84        21

    accuracy                           0.82        39
   macro avg       0.83      0.81      0.82        39
weighted avg       0.83      0.82      0.82        39



In [25]:
# random forest classifier
random_forest(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.8717948717948718
Confusion Matrix: 
 [[16  2]
 [ 3 18]]

Classification Report 

              precision    recall  f1-score   support

          No       0.84      0.89      0.86        18
         Yes       0.90      0.86      0.88        21

    accuracy                           0.87        39
   macro avg       0.87      0.87      0.87        39
weighted avg       0.87      0.87      0.87        39



In [26]:
# Gaussian Naive Bayes
gaussian(X_train_nm, X_test_nm, y_train_nm, y_test_nm)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.7606837606837606
Accuracy Score, Test Set: 0.6153846153846154
Confusion Matrix: 
 [[17  1]
 [14  7]]

Classification Report 

              precision    recall  f1-score   support

          No       0.55      0.94      0.69        18
         Yes       0.88      0.33      0.48        21

    accuracy                           0.62        39
   macro avg       0.71      0.64      0.59        39
weighted avg       0.72      0.62      0.58        39



## Neighborhood Cleaning Rule

In [27]:
ncr = NeighbourhoodCleaningRule()
X_ncr, y_ncr = nm.fit_resample(X, y)
print('Resampled dataset shape: {}'.format(Counter(y_ncr)))

# train test split
X_train_ncr, X_test_ncr, y_train_ncr, y_test_ncr = train_test_split(X_ncr, y_ncr, test_size=0.25, random_state=42)

print(X_train_ncr.shape, y_train_ncr.shape)
print(X_test_ncr.shape, y_test_ncr.shape)

Resampled dataset shape: Counter({'No': 78, 'Yes': 78})
(117, 12) (117,)
(39, 12) (39,)


In [28]:
# logistic regression
logreg(X_train_ncr, X_test_ncr, y_train_ncr, y_test_ncr)

Logistic Regression 

Accuracy Score, Training Set:  0.7777777777777778
Accuracy Score, Test Set:  0.7435897435897436

Confusion Matrix: 
 [[15  3]
 [ 7 14]]

Classification Report 

              precision    recall  f1-score   support

          No       0.68      0.83      0.75        18
         Yes       0.82      0.67      0.74        21

    accuracy                           0.74        39
   macro avg       0.75      0.75      0.74        39
weighted avg       0.76      0.74      0.74        39



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [29]:
# decision tree classifier
decision_tree(X_train_ncr, X_test_ncr, y_train_ncr, y_test_ncr)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.8205128205128205
Confusion Matrix: 
 [[13  5]
 [ 2 19]]

Classification Report 

              precision    recall  f1-score   support

          No       0.87      0.72      0.79        18
         Yes       0.79      0.90      0.84        21

    accuracy                           0.82        39
   macro avg       0.83      0.81      0.82        39
weighted avg       0.83      0.82      0.82        39



In [30]:
# random forest classifier
random_forest(X_train_ncr, X_test_ncr, y_train_ncr, y_test_ncr)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.8717948717948718
Confusion Matrix: 
 [[16  2]
 [ 3 18]]

Classification Report 

              precision    recall  f1-score   support

          No       0.84      0.89      0.86        18
         Yes       0.90      0.86      0.88        21

    accuracy                           0.87        39
   macro avg       0.87      0.87      0.87        39
weighted avg       0.87      0.87      0.87        39



In [31]:
# Gaussian Naive Bayes
gaussian(X_train_ncr, X_test_ncr, y_train_ncr, y_test_ncr)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.7606837606837606
Accuracy Score, Test Set: 0.6153846153846154
Confusion Matrix: 
 [[17  1]
 [14  7]]

Classification Report 

              precision    recall  f1-score   support

          No       0.55      0.94      0.69        18
         Yes       0.88      0.33      0.48        21

    accuracy                           0.62        39
   macro avg       0.71      0.64      0.59        39
weighted avg       0.72      0.62      0.58        39



# Over Sampling Methods

## Random Over Sampler

In [32]:
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_ros)))

# train test split
X_train_ros, X_test_ros, y_train_ros, y_test_ros = train_test_split(X_ros, y_ros, test_size=0.25, random_state=42)

print(X_train_ros.shape, y_train_ros.shape)
print(X_test_ros.shape, y_test_ros.shape)

Resampled dataset shape: Counter({'No': 38247, 'Yes': 38247})
(57370, 12) (57370,)
(19124, 12) (19124,)


In [33]:
logreg(X_train_ros, X_test_ros, y_train_ros, y_test_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.646696879902388
Accuracy Score, Test Set:  0.6489751098096632

Confusion Matrix: 
 [[5922 3669]
 [3044 6489]]

Classification Report 

              precision    recall  f1-score   support

          No       0.66      0.62      0.64      9591
         Yes       0.64      0.68      0.66      9533

    accuracy                           0.65     19124
   macro avg       0.65      0.65      0.65     19124
weighted avg       0.65      0.65      0.65     19124



In [34]:
decision_tree(X_train_ros, X_test_ros, y_train_ros, y_test_ros)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.9991633549466639
Confusion Matrix: 
 [[9575   16]
 [   0 9533]]

Classification Report 

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9591
         Yes       1.00      1.00      1.00      9533

    accuracy                           1.00     19124
   macro avg       1.00      1.00      1.00     19124
weighted avg       1.00      1.00      1.00     19124



In [35]:
random_forest(X_train_ros, X_test_ros, y_train_ros, y_test_ros)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 1.0
Confusion Matrix: 
 [[9591    0]
 [   0 9533]]

Classification Report 

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9591
         Yes       1.00      1.00      1.00      9533

    accuracy                           1.00     19124
   macro avg       1.00      1.00      1.00     19124
weighted avg       1.00      1.00      1.00     19124



In [36]:
gaussian(X_train_ros, X_test_ros, y_train_ros, y_test_ros)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.691075474986927
Accuracy Score, Test Set: 0.6963501359548212
Confusion Matrix: 
 [[6445 3146]
 [2661 6872]]

Classification Report 

              precision    recall  f1-score   support

          No       0.71      0.67      0.69      9591
         Yes       0.69      0.72      0.70      9533

    accuracy                           0.70     19124
   macro avg       0.70      0.70      0.70     19124
weighted avg       0.70      0.70      0.70     19124



## SMOTE

In [37]:
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)
print('Resampled dataset shape: {}'.format(Counter(y_sm)))

# train test split
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_sm, y_sm, test_size=0.25, random_state=42)

print(X_train_sm.shape, y_train_sm.shape)
print(X_test_sm.shape, y_test_sm.shape)

Resampled dataset shape: Counter({'No': 38247, 'Yes': 38247})
(57370, 12) (57370,)
(19124, 12) (19124,)


In [38]:
logreg(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.6759456161757016
Accuracy Score, Test Set:  0.6771595900439239

Confusion Matrix: 
 [[6130 3461]
 [2713 6820]]

Classification Report 

              precision    recall  f1-score   support

          No       0.69      0.64      0.67      9591
         Yes       0.66      0.72      0.69      9533

    accuracy                           0.68     19124
   macro avg       0.68      0.68      0.68     19124
weighted avg       0.68      0.68      0.68     19124



In [39]:
decision_tree(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.9958690650491528
Confusion Matrix: 
 [[9548   43]
 [  36 9497]]

Classification Report 

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9591
         Yes       1.00      1.00      1.00      9533

    accuracy                           1.00     19124
   macro avg       1.00      1.00      1.00     19124
weighted avg       1.00      1.00      1.00     19124



In [40]:
random_forest(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.9979606776824932
Confusion Matrix: 
 [[9587    4]
 [  35 9498]]

Classification Report 

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9591
         Yes       1.00      1.00      1.00      9533

    accuracy                           1.00     19124
   macro avg       1.00      1.00      1.00     19124
weighted avg       1.00      1.00      1.00     19124



In [41]:
gaussian(X_train_sm, X_test_sm, y_train_sm, y_test_sm)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.7518215094997386
Accuracy Score, Test Set: 0.7575820957958586
Confusion Matrix: 
 [[6713 2878]
 [1758 7775]]

Classification Report 

              precision    recall  f1-score   support

          No       0.79      0.70      0.74      9591
         Yes       0.73      0.82      0.77      9533

    accuracy                           0.76     19124
   macro avg       0.76      0.76      0.76     19124
weighted avg       0.76      0.76      0.76     19124



## ADASYN

In [42]:
ada = ADASYN(random_state=42)
X_ada, y_ada = ada.fit_resample(X, y)

print('Resampled dataset shape: {}'.format(Counter(y_ada)))

# train test split
X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(X_ada, y_ada, test_size=0.25, random_state=42)

print(X_train_sm.shape, y_train_sm.shape)
print(X_test_sm.shape, y_test_sm.shape)

Resampled dataset shape: Counter({'No': 38247, 'Yes': 38218})
(57370, 12) (57370,)
(19124, 12) (19124,)


In [43]:
logreg(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression 

Accuracy Score, Training Set:  0.6751586803375881
Accuracy Score, Test Set:  0.6726996913741696

Confusion Matrix: 
 [[6134 3400]
 [2857 6726]]

Classification Report 

              precision    recall  f1-score   support

          No       0.68      0.64      0.66      9534
         Yes       0.66      0.70      0.68      9583

    accuracy                           0.67     19117
   macro avg       0.67      0.67      0.67     19117
weighted avg       0.67      0.67      0.67     19117



In [44]:
decision_tree(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Decision Tree Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.9967045038447455
Confusion Matrix: 
 [[9500   34]
 [  29 9554]]

Classification Report 

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9534
         Yes       1.00      1.00      1.00      9583

    accuracy                           1.00     19117
   macro avg       1.00      1.00      1.00     19117
weighted avg       1.00      1.00      1.00     19117



In [45]:
random_forest(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Random Forest Classifier 

Accuracy Score, Training Set: 1.0
Accuracy Score, Test Set: 0.9983784066537637
Confusion Matrix: 
 [[9532    2]
 [  29 9554]]

Classification Report 

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      9534
         Yes       1.00      1.00      1.00      9583

    accuracy                           1.00     19117
   macro avg       1.00      1.00      1.00     19117
weighted avg       1.00      1.00      1.00     19117



In [46]:
gaussian(X_train_ada, X_test_ada, y_train_ada, y_test_ada)

Gaussian Naive Bayes 

Accuracy Score, Training Set: 0.7514298667782661
Accuracy Score, Test Set: 0.7538839776115499
Confusion Matrix: 
 [[6725 2809]
 [1896 7687]]

Classification Report 

              precision    recall  f1-score   support

          No       0.78      0.71      0.74      9534
         Yes       0.73      0.80      0.77      9583

    accuracy                           0.75     19117
   macro avg       0.76      0.75      0.75     19117
weighted avg       0.76      0.75      0.75     19117

