In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

### Read in the Data

In [3]:
# Import data
cv_df = pd.read_csv('../data/count_vec.csv')
tfidf_df = pd.read_csv('../data/tfidf.csv')

In [4]:
cv_df.head()

Unnamed: 0,is_serious,sent_compound,sent_neg,sent_neu,sent_pos,char_count,00,000,001,00pm,...,zealand,zen,zeppelin,zero,zip,zoloft,zombie,zone,zoned,zoo
0,0,0.9997,0.066,0.661,0.273,4862,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.9999,0.057,0.649,0.293,6225,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.9195,0.068,0.82,0.112,1105,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,-0.9943,0.221,0.709,0.071,1425,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0.9988,0.08,0.636,0.284,2104,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
cv_df.shape

(1694, 11251)

In [6]:
tfidf_df.head()

Unnamed: 0,is_serious,sent_compound,sent_neg,sent_neu,sent_pos,char_count,00,000,01,02,...,yr,yt,yummy,yup,zealand,zero,zip,zombie,zone,zoo
0,0,0.9997,0.066,0.661,0.273,4862,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.9999,0.057,0.649,0.293,6225,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.9195,0.068,0.82,0.112,1105,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,-0.9943,0.221,0.709,0.071,1425,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085366,0.0
4,0,0.9988,0.08,0.636,0.284,2104,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
tfidf_df.shape

(1694, 7926)

### Evaluate Various Models

#### Function: Confusion Matrix Results

In [8]:
def confusion_matrix_results(y_test, predictions):
    # Create confustion matrix and conver to dataframe
    cm = confusion_matrix(y_test, predictions)
    print('-------------CONFUSION MATRIX---------------')
    print(pd.DataFrame(cm, 
                       columns=['pred neg', 'pred pos'], 
                       index = ['actual neg', 'actual pos']))
    
    # Results
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print("\nTrue Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    print('------------------METRICS-------------------')
    print(f'Sensativity: {round(tp / (tp + fn),4)}')
    print(f'Specificity: {round(tn / (tn + fp),4)}\n\n')
    # Other metrics?? missclassification? 

#### LOGISTIC REGRESSION

In [9]:
def logisitic_reg(df):
    print('*********** LOGISTIC REGRESSION MODEL ************\n')
    features = [column for column in df.columns if column != 'is_serious']
    features.remove('sent_compound')
    features.remove('sent_neu')
    features.remove('char_count') # giving better scores
    X = df[features]
    y = df['is_serious']
    
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)
    
    # Instantiate Logistic Regression model
    lr = LogisticRegression(solver='lbfgs')
    
    # Fit the model
    lr.fit(X_train, y_train)
    
    # Score the model
    print('--------------MODEL EVALUTATION---------------')
    print('Cross Val Score: ', cross_val_score(lr, X_train, y_train, cv = 5).mean())
    print('Train Score:     ', lr.score(X_train, y_train))
    print('Test Score:      ', lr.score(X_test, y_test))
    
    # Get predictions
    predictions = lr.predict(X_test)
    
    # Confusion matrix and measures
    confusion_matrix_results(y_test, predictions)

In [10]:
# Tf-Idf
logisitic_reg(tfidf_df)

*********** LOGISTIC REGRESSION MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.8504352952160914
Train Score:      0.9409448818897638
Test Score:       0.8136792452830188
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       163        36
actual pos        43       182

True Negatives: 163
False Positives: 36
False Negatives: 43
True Positives: 182
------------------METRICS-------------------
Sensativity: 0.8089
Specificity: 0.8191




In [11]:
# Count Vec
logisitic_reg(cv_df)

*********** LOGISTIC REGRESSION MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.8291595984819498
Train Score:      1.0
Test Score:       0.7806603773584906
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       161        38
actual pos        55       170

True Negatives: 161
False Positives: 38
False Negatives: 55
True Positives: 170
------------------METRICS-------------------
Sensativity: 0.7556
Specificity: 0.809




Observations:
- Best Cross Val Score: 0.85 -- Logistic Regression with Tf-Idf
- Using Tf-Idf leads to a better score (higher cross val, higher sensativity and higher sepcificity) than Count Vectorize
- The Tf-Idf Logistic Regression model is still overfit (train = 0.94, Test = 0.813)

#### KNN

In [12]:
def knn(df):
    print('*********** KNN MODEL ************\n')
    features = [column for column in df.columns if column != 'is_serious']
    features.remove('sent_compound')
    features.remove('sent_neu')
    features.remove('char_count')
    
    X = df[features]
    y = df['is_serious']
    
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

    # Instantiate KNN Model
    knn = KNeighborsClassifier()
    
    # Fit the model
    knn.fit(X_train, y_train)
    
    # Score the model
    print('--------------MODEL EVALUTATION---------------')
    print('Cross Val Score: ', cross_val_score(knn, X_train, y_train, cv = 5).mean())
    print('Train Score:     ', knn.score(X_train, y_train))
    print('Test Score:      ', knn.score(X_test, y_test))
    
    # Get predictions
    predictions = knn.predict(X_test)
    
    # Confusion matrix and measures
    confusion_matrix_results(y_test, predictions)

In [13]:
# Tf-Idf
knn(tfidf_df)

*********** KNN MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.7386413829171145
Train Score:      0.8047244094488188
Test Score:       0.7216981132075472
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       106        93
actual pos        25       200

True Negatives: 106
False Positives: 93
False Negatives: 25
True Positives: 200
------------------METRICS-------------------
Sensativity: 0.8889
Specificity: 0.5327




In [14]:
# Count Vec
knn(cv_df)

*********** KNN MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.6834420854333455
Train Score:      0.8078740157480315
Test Score:       0.660377358490566
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       189        10
actual pos       134        91

True Negatives: 189
False Positives: 10
False Negatives: 134
True Positives: 91
------------------METRICS-------------------
Sensativity: 0.4044
Specificity: 0.9497




Observations:
- Best Cross Val Score: 0.73 -- KNN with Tf-Idf
- Tf-Idf performs better thant Count Vec (based on cross val score)
- There is a large difference in the Sensativity metric...tf-idf: 0.889, cross vec: 0.404
    - Tf-Idf really helps reduce false negatives (134 to 25)
- There is a large difference in the Specificity metric...tf-idf: 0.532, cross vec: 0.949
    - Cross Vec really helps reduce false positive (93 to 10)
- The Tf-Idk model us less overfit (still overfit though) than the Cross Vec model

#### DECISION TREE

In [15]:
def decision_tree(df):
    print('*********** DECISION TREE MODEL ************\n')
    features = [column for column in df.columns if column != 'is_serious']
    features.remove('sent_compound')
    features.remove('sent_neu')
    features.remove('char_count')
    X = df[features]
    y = df['is_serious']
    
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)
    
    # Instantiate DecisionTree Model
    dt = DecisionTreeClassifier(random_state=45,
                                max_depth = 7, 
                                min_samples_split = 15, 
                                min_samples_leaf = 6)
    # Fit the model
    dt.fit(X_train, y_train)
    
    # Score the model/Evaluate Model
    print('--------------MODEL EVALUTATION---------------')
    print('Cross Val Score: ', cross_val_score(dt, X_train, y_train, cv = 5).mean())
    print('Train Score:     ', dt.score(X_train, y_train))
    print('Test Score:      ', dt.score(X_test, y_test))
 
    # Get predictions
    predictions = dt.predict(X_test)
    
    # Confusion matrix and measures
    confusion_matrix_results(y_test, predictions)

In [16]:
# Tf-Idf
decision_tree(tfidf_df)

*********** DECISION TREE MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.6984245743985559
Train Score:      0.8653543307086614
Test Score:       0.6981132075471698
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       138        61
actual pos        67       158

True Negatives: 138
False Positives: 61
False Negatives: 67
True Positives: 158
------------------METRICS-------------------
Sensativity: 0.7022
Specificity: 0.6935




In [17]:
# Count Vec
decision_tree(cv_df)

*********** DECISION TREE MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.7173316222010262
Train Score:      0.8551181102362204
Test Score:       0.6816037735849056
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       160        39
actual pos        96       129

True Negatives: 160
False Positives: 39
False Negatives: 96
True Positives: 129
------------------METRICS-------------------
Sensativity: 0.5733
Specificity: 0.804




Observations:
- Best Cross Val Score: 0.71 -- Decision Tree with Cross Vec
- Cross performs better thant Tf-Idf (based on cross val score)...unlike the previous two models
- Tf-Idf improves sensativity while Count Vec improves sepcificity (like above)
- Tf-Idf has a much more evenly distributed number of false positive and false negatives than Cross Vec

#### RANDOM FOREST

In [18]:
def random_forest(df):
    print('*********** RANDOM FOREST MODEL ************\n')
    features = [column for column in df.columns if column != 'is_serious']
    features.remove('sent_compound')
    features.remove('sent_neu')
    features.remove('char_count')
    X = df[features]
    y = df['is_serious']
    
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)
    
    # Instantiate KNN Model
    rf = RandomForestClassifier(n_estimators=10, 
                                max_depth = None, 
                                max_features = 'auto')
    
    # Fit the model
    rf.fit(X_train, y_train)
    
    # Score the model
    print('--------------MODEL EVALUTATION---------------')
    print('Cross Val Score: ', cross_val_score(rf, X_train, y_train, cv = 5).mean())
    print('Train Score:     ', rf.score(X_train, y_train))
    print('Test Score:      ', rf.score(X_test, y_test))
    
    # Get predictions
    predictions = rf.predict(X_test)
    
    # Confusion matrix and measures
    confusion_matrix_results(y_test, predictions)

In [19]:
# Tf-Idf
random_forest(tfidf_df)

*********** RANDOM FOREST MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.7614733312951086
Train Score:      0.989763779527559
Test Score:       0.7264150943396226
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       159        40
actual pos        76       149

True Negatives: 159
False Positives: 40
False Negatives: 76
True Positives: 149
------------------METRICS-------------------
Sensativity: 0.6622
Specificity: 0.799




In [20]:
# Count Vec
random_forest(cv_df)

*********** RANDOM FOREST MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.7662442049428778
Train Score:      0.9929133858267717
Test Score:       0.7311320754716981
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       159        40
actual pos        74       151

True Negatives: 159
False Positives: 40
False Negatives: 74
True Positives: 151
------------------METRICS-------------------
Sensativity: 0.6711
Specificity: 0.799




Observations:
- Best Cross Val Score: 0.76 -- Random Forest with Cross Vec
- Cross performs better thant Tf-Idf (based on cross val score)
- Both models have a specificity metric around 82-83%
- The Corss Vec model has a slighly better senstitiviy metric (less false negatives)

#### EXTRA TREE

In [21]:
def extra_trees(df):
    print('*********** EXTRA TREES MODEL ************\n')
    features = [column for column in df.columns if column != 'is_serious']
    features.remove('sent_compound')
    features.remove('sent_neu')
    features.remove('char_count')
    X = df[features]
    y = df['is_serious']
    
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)
    
    # Instantiate KNN Model
    rf = RandomForestClassifier(n_estimators=10, 
                                max_depth = None, 
                                max_features = 'auto')
    
    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)
    
    # Instantiate KNN Model
    et = ExtraTreesClassifier(n_estimators=10)
    
    # Fit the model
    et.fit(X_train, y_train)
    
    # Score the model
    print('--------------MODEL EVALUTATION---------------')
    print('Cross Val Score: ', cross_val_score(et, X_train, y_train, cv = 5).mean())
    print('Train Score:     ', et.score(X_train, y_train))
    print('Test Score:      ', et.score(X_test, y_test))
    
    # Get predictions
    predictions = et.predict(X_test)
    
    # Confusion matrix and measures
    confusion_matrix_results(y_test, predictions)

In [22]:
# Tf-idf
extra_trees(tfidf_df)

*********** EXTRA TREES MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.7763968337949851
Train Score:      1.0
Test Score:       0.7452830188679245
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       154        45
actual pos        63       162

True Negatives: 154
False Positives: 45
False Negatives: 63
True Positives: 162
------------------METRICS-------------------
Sensativity: 0.72
Specificity: 0.7739




In [23]:
# Count Vec
extra_trees(cv_df)

*********** EXTRA TREES MODEL ************

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.7629922602385699
Train Score:      1.0
Test Score:       0.7523584905660378
-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       168        31
actual pos        74       151

True Negatives: 168
False Positives: 31
False Negatives: 74
True Positives: 151
------------------METRICS-------------------
Sensativity: 0.6711
Specificity: 0.8442




Observations:
- Best Cross Val Score: 0.759 -- Extra Tress with Tf-Idf
- Both models have very similar cross val scores (0.75)
- The Tf-Idf does a better job predicting less false positives

### Top 3 Models
1. Logistic Regression uing Tf-Idf (cv: 0.8504352952160914)
2. Logistic Regression uing Cross Vectorization (cv: 0.8291595984819498)
3. Random Forest using Cross Vectorization (cv: 0.7622948456716102)

### Grid Search

#### Logistic Regression using Tf-Idf

In [24]:
# Set X and y
features = [column for column in tfidf_df.columns if column != 'is_serious']
features.remove('sent_compound')
features.remove('sent_neu')
features.remove('char_count')
X = tfidf_df[features]
y = tfidf_df['is_serious']

In [25]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [26]:
# Gridsearch for Logistic Regression
lr = LogisticRegression()
lr_params = {
    'solver': ['lbfgs', 'liblinear'],
    'C' : [0.1,1,10,100],
    'max_iter': [150]
}
gs = GridSearchCV(lr, param_grid=lr_params, cv=3)

gs.fit(X_train, y_train)

# Scores
print('Best Cross Val: ', gs.best_score_)
print('Best Params: ', gs.best_params_)
print('Train w/ Params: ', gs.score(X_train, y_train))
print('Test w/ Params: ', gs.score(X_test, y_test))

Best Cross Val:  0.8637795275590551
Best Params:  {'C': 10, 'max_iter': 150, 'solver': 'liblinear'}
Train w/ Params:  0.9992125984251968
Test w/ Params:  0.8301886792452831


Obsevation: 0.1 improvement from CV before gridsearch, still overfit

#### Random Forest using Cross Vectorization

In [27]:
# Set X and y
features = [column for column in cv_df.columns if column != 'is_serious']
features.remove('sent_compound')
features.remove('sent_neu')
features.remove('char_count')
X = cv_df[features]
y = cv_df['is_serious']

In [28]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [29]:
# Gridsearch for Random Forest
rf = RandomForestClassifier()
rf_params = {
    'n_estimators': [50, 100, 150],
    'max_depth' : [10, 20, 30],
    'max_features': ['auto', 1000, 2000]
}
gs = GridSearchCV(rf, param_grid=rf_params, cv=3) 

gs.fit(X_train, y_train)

# Scores
print('Best Cross Val: ', gs.best_score_)
print('Best Params: ', gs.best_params_)
print('Train w/ Params: ', gs.score(X_train, y_train))
print('Test w/ Params: ', gs.score(X_test, y_test))

Best Cross Val:  0.8125984251968504
Best Params:  {'max_depth': 20, 'max_features': 2000, 'n_estimators': 150}
Train w/ Params:  1.0
Test w/ Params:  0.7688679245283019


Observations: Improved CV from 0.76 to 0.81 (but still equally as overfit)

#### Best Model: Logistic Regression with Tf-Idf and Grid Search parameters

### AdaBoost

In [30]:
# Import AdaBoost
from sklearn.ensemble import AdaBoostClassifier

In [31]:
# Set X and y
features = [column for column in tfidf_df.columns if column != 'is_serious']
features.remove('sent_compound')
features.remove('sent_neu')
features.remove('char_count')
X = tfidf_df[features]
y = tfidf_df['is_serious']

In [32]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [33]:
# Logistic Regression with Tf-Idf and Grid Search Paramters
ada = AdaBoostClassifier(base_estimator=LogisticRegression(C = 10,
                                                           max_iter = 150,
                                                           solver = 'liblinear'))
ada_params = {'n_estimators': [60, 70, 80]}
                         
gs = GridSearchCV(ada, param_grid=ada_params, cv=3)
                         
gs.fit(X_train, y_train);

                         # Scores
print('Best Cross Val: ', gs.best_score_)
print('Best Params: ', gs.best_params_)
print('Train w/ Params: ', gs.score(X_train, y_train))
print('Test w/ Params: ', gs.score(X_test, y_test))

Best Cross Val:  0.8582677165354331
Best Params:  {'n_estimators': 80}
Train w/ Params:  0.9165354330708662
Test w/ Params:  0.8089622641509434


Observation: AdaBoost did NOT improve my cross val score

### SVD

In [34]:
# Import SVD
from sklearn.decomposition import TruncatedSVD

In [35]:
# Get feature matrix
features = [column for column in tfidf_df.columns if column != 'is_serious']
features.remove('sent_compound')
features.remove('sent_neu')
features.remove('char_count')
X = tfidf_df[features]

# Target
y = tfidf_df['is_serious']

In [36]:
# Instantiate the SVD
svd = TruncatedSVD(n_components=500)

# Fit and transform
svd_matrix = svd.fit_transform(X)

In [37]:
print('SVD Matrix Shape:    ', svd_matrix.shape)
print('Before SVD Shape:    ', cv_df.shape)
print('SVD Components Shape:', svd.components_.shape)

SVD Matrix Shape:     (1694, 500)
Before SVD Shape:     (1694, 11251)
SVD Components Shape: (500, 7922)


In [38]:
# How many components do we need to represent at least %70 of the variance in the data?
np.where(np.cumsum(svd.explained_variance_ratio_) < .8)[0][-1]+1

500

In [91]:
# Coonvert to dataframe
component_names = ["component_"+str(i+1) for i in range(100)]
svd_df = pd.DataFrame(svd_matrix,
                      columns=component_names)

svd_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_91,component_92,component_93,component_94,component_95,component_96,component_97,component_98,component_99,component_100
0,0.346687,0.182102,-0.011003,-0.020251,0.022137,-0.056799,-0.081593,-0.004827,-0.009376,-0.056244,...,-0.002169,-0.034651,-0.030947,-0.005019,-0.040476,0.000873,0.049683,-0.016431,-0.043592,-0.003841
1,0.382058,0.175217,-0.033306,0.030562,0.05514,-0.054848,0.029664,-0.03141,-0.087515,-0.084616,...,-0.039127,0.036918,-0.071137,-0.005108,0.04986,-0.01698,-0.0079,0.061943,0.066681,-0.056741
2,0.167164,0.08779,-0.079841,0.032055,0.150531,-0.041592,-0.055154,0.099348,0.046849,0.390909,...,-0.012268,-0.00576,-0.018693,-0.020313,0.00285,-0.014996,-0.004327,0.018752,0.014165,-0.018451
3,0.224264,0.004795,-0.088161,-0.08369,0.075946,-0.046175,-0.034828,-0.148332,-0.117848,-0.033487,...,0.040306,-0.008616,-0.000884,-0.002848,0.048423,-0.016204,-0.003813,0.010586,0.040345,-0.018296
4,0.310452,0.095704,-0.018119,0.011167,0.077147,0.010361,-0.050329,0.001458,-0.098694,-0.065023,...,-0.046591,-0.043783,-0.026082,0.034914,-0.030457,0.002014,-0.0609,0.002374,-0.011548,-0.032331


In [95]:
# Try on the Logistic Regression model
X_train, X_test, y_train, y_test = train_test_split(svd_df, y, stratify=y, random_state=42)

lr = LogisticRegression(C = 10, max_iter = 150, solver = 'liblinear')
lr.fit(X_train, y_train)

print('Cross Val Score: ', cross_val_score(lr, X_train, y_train, cv = 5).mean())
print('Train Score:     ', lr.score(X_train, y_train))
print('Test Score:      ', lr.score(X_test, y_test))

Cross Val Score:  0.857506470142755
Train Score:      0.8842519685039371
Test Score:       0.8089622641509434


Observations: The cross val score is very similar but our training and test score are much closer than previously (Train: 0.9992125984251968, Test: 0.8301886792452831) which means that the model is less overfit

### Final Model: Logistic Regression with Tf-Idf and SVD

In [98]:
# Try on the Logistic Regression model
X_train, X_test, y_train, y_test = train_test_split(svd_df, y, stratify=y, random_state=42)

lr = LogisticRegression(C = 10, max_iter = 150, solver = 'liblinear')
lr.fit(X_train, y_train)

# Score the model/Evaluate Model
print('--------------MODEL EVALUTATION---------------')
print('Cross Val Score: ', cross_val_score(lr, X_train, y_train, cv = 5).mean())
print('Train Score:     ', lr.score(X_train, y_train))
print('Test Score:      ', lr.score(X_test, y_test))
print()
 
# Get predictions
predictions = lr.predict(X_test)
    
# Confusion matrix and measures
confusion_matrix_results(y_test, predictions)

--------------MODEL EVALUTATION---------------
Cross Val Score:  0.857506470142755
Train Score:      0.8842519685039371
Test Score:       0.8089622641509434

-------------CONFUSION MATRIX---------------
            pred neg  pred pos
actual neg       158        41
actual pos        40       185

True Negatives: 158
False Positives: 41
False Negatives: 40
True Positives: 185
------------------METRICS-------------------
Sensativity: 0.8222
Specificity: 0.794




Observation: 
- The model has false predictions pretty evenly
- Almost the same number of false positives and negatives
- Sensativity and specificty are pretty close