# Project 3: Naive Bayes Model

In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [14]:
# Read in Count Vectorize Data
cv_df = pd.read_csv('../data/count_vec.csv')
tfidf_df = pd.read_csv('../data/tfidf.csv')

In [3]:
cv_df.head()

Unnamed: 0,is_serious,sent_compound,sent_neg,sent_neu,sent_pos,char_count,00,000,001,00pm,...,zealand,zen,zeppelin,zero,zip,zoloft,zombie,zone,zoned,zoo
0,0,0.9997,0.066,0.661,0.273,4862,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.9999,0.057,0.649,0.293,6225,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.9195,0.068,0.82,0.112,1105,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,-0.9943,0.221,0.709,0.071,1425,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0.9988,0.08,0.636,0.284,2104,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
tfidf_df.head()

Unnamed: 0,is_serious,sent_compound,sent_neg,sent_neu,sent_pos,00,000,01,02,03,...,yr,yt,yummy,yup,zealand,zero,zip,zombie,zone,zoo
0,0,0.9997,0.066,0.661,0.273,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.9999,0.057,0.649,0.293,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.9195,0.068,0.82,0.112,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,-0.9943,0.221,0.709,0.071,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085366,0.0
4,0,0.9988,0.08,0.636,0.284,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Baseline Score

In [5]:
# Check for unbalanced classes
cv_df['is_serious'].value_counts(normalize=True)
# Do not need to worry about unbalanced classes given the percentages

1    0.529516
0    0.470484
Name: is_serious, dtype: float64

### Naive Bayes Model (Count Vectorize)

In [6]:
# Set X and y
features = [column for column in cv_df.columns if column != 'is_serious']
features.remove('sent_compound')
features.remove('sent_neu')
X = cv_df[features]
y = cv_df['is_serious']

In [7]:
X.head()

Unnamed: 0,sent_neg,sent_pos,char_count,00,000,001,00pm,01,02,03,...,zealand,zen,zeppelin,zero,zip,zoloft,zombie,zone,zoned,zoo
0,0.066,0.273,4862,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.057,0.293,6225,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.068,0.112,1105,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.221,0.071,1425,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0.08,0.284,2104,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Train, test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    stratify=y)

In [9]:
# Instantiate the model
nb = MultinomialNB()

# Fit the model
nb.fit(X_train, y_train)

# Score the model
# Score the model
print('--------------MODEL EVALUTATION---------------\n')
print('Accuracy:     ', cross_val_score(nb, X_train, y_train, cv = 5).mean())
print('Train Score:  ', nb.score(X_train, y_train))
print('Test Score:   ', nb.score(X_test, y_test))
    
# Get predictions
predictions = nb.predict(X_test)
    
# Confusion Matrixx
cm = confusion_matrix(y_test, predictions)
print('\n-------------CONFUSION MATRIX---------------\n')
print(pd.DataFrame(cm, 
                   columns=['pred neg', 'pred pos'], 
                   index = ['actual neg', 'actual pos']))
    

# Results
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("\nTrue Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print('\n------------------METRICS-------------------\n')
print(f'Sensativity: {round(tp / (tp + fn),4)}')
print(f'Specificity: {round(tn / (tn + fp),4)}')

--------------MODEL EVALUTATION---------------

Accuracy:      0.8386519890082328
Train Score:   0.9251968503937008
Test Score:    0.8089622641509434

-------------CONFUSION MATRIX---------------

            pred neg  pred pos
actual neg       156        43
actual pos        38       187

True Negatives: 156
False Positives: 43
False Negatives: 38
True Positives: 187

------------------METRICS-------------------

Sensativity: 0.8311
Specificity: 0.7839


Observations:
- This model looks pretty good...showing almost as good of metrics as our Logistic Regression model
- Good sensativity and specificity (do not necessarily need to optimize for one over the other since a false postiive is no worse than a false negative)
    - Although...in the case you were using this to flag posts where users are mentioning serious topics such as suicide you would not want to incorrectly mark a serious post as a casual post (false negatives)

### Naive Bayes Model (TF-IDF)

In [15]:
# Set X and y
features = [column for column in tfidf_df.columns if column != 'is_serious']
features.remove('sent_compound')
features.remove('sent_neu')
X = cv_df[features]
y = cv_df['is_serious']

In [16]:
# Train, test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=42,
                                                    stratify=y)

In [17]:
# Instantiate the model
nb = MultinomialNB()

# Fit the model
nb.fit(X_train, y_train)

# Score the model
# Score the model
print('--------------MODEL EVALUTATION---------------\n')
print('Accuracy:     ', cross_val_score(nb, X_train, y_train, cv = 5).mean())
print('Train Score:  ', nb.score(X_train, y_train))
print('Test Score:   ', nb.score(X_test, y_test))
    
# Get predictions
predictions = nb.predict(X_test)
    
# Confusion Matrixx
cm = confusion_matrix(y_test, predictions)
print('\n-------------CONFUSION MATRIX---------------\n')
print(pd.DataFrame(cm, 
                   columns=['pred neg', 'pred pos'], 
                   index = ['actual neg', 'actual pos']))
    

# Results
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("\nTrue Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print('\n------------------METRICS-------------------\n')
print(f'Sensativity: {round(tp / (tp + fn),4)}')
print(f'Specificity: {round(tn / (tn + fp),4)}')

--------------MODEL EVALUTATION---------------

Accuracy:      0.8386550768575457
Train Score:   0.9102362204724409
Test Score:    0.8066037735849056

-------------CONFUSION MATRIX---------------

            pred neg  pred pos
actual neg       158        41
actual pos        41       184

True Negatives: 158
False Positives: 41
False Negatives: 41
True Positives: 184

------------------METRICS-------------------

Sensativity: 0.8178
Specificity: 0.794


Observations: this model did only slightly worse than the model with count vectorize. I will stick with count vectorize

### GridSearch (Naive Bayes w/ Count Vectorize)
- With gridsearch this model performs better than TF-IDF

In [23]:
# Set X and y
features = [column for column in cv_df.columns if column != 'is_serious']
features.remove('sent_compound')
features.remove('sent_neu')
X = cv_df[features]
y = cv_df['is_serious']

In [24]:
# Train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [25]:
# Instantiate Gridsearch.
grid = GridSearchCV(estimator = MultinomialNB(),
                    param_grid = {
                        'alpha': [0.4, 0.5, 0.6, 0.65, 0.7]},
                    cv = 5)

grid.fit(X_train, y_train)

# Best parameters & score
print('Best Params: ', grid.best_params_)

# What is the best score?
print('Best Score: ', grid.best_score_)

Best Params:  {'alpha': 0.4}
Best Score:  0.8448818897637795


In [29]:
# Test model with new parameters

# Instantiate the model
nb = MultinomialNB(alpha = 0.4)

# Fit the model
nb.fit(X_train, y_train)

# Score the model
# Score the model
print('--------------MODEL EVALUTATION---------------\n')
print('Accuracy:     ', cross_val_score(nb, X_train, y_train, cv = 5).mean())
print('Train Score:  ', nb.score(X_train, y_train))
print('Test Score:   ', nb.score(X_test, y_test))
    
# Get predictions
predictions = nb.predict(X_test)
    
# Confusion Matrixx
cm = confusion_matrix(y_test, predictions)
print('\n-------------CONFUSION MATRIX---------------\n')
print(pd.DataFrame(cm, 
                   columns=['pred neg', 'pred pos'], 
                   index = ['actual neg', 'actual pos']))
    

# Results
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print("\nTrue Negatives: %s" % tn)
print("False Positives: %s" % fp)
print("False Negatives: %s" % fn)
print("True Positives: %s" % tp)
print('\n------------------METRICS-------------------\n')
print(f'Sensativity: {round(tp / (tp + fn),4)}')
print(f'Specificity: {round(tn / (tn + fp),4)}')

--------------MODEL EVALUTATION---------------

Accuracy:      0.8449605872039768
Train Score:   0.937007874015748
Test Score:    0.8183962264150944

-------------CONFUSION MATRIX---------------

            pred neg  pred pos
actual neg       160        39
actual pos        38       187

True Negatives: 160
False Positives: 39
False Negatives: 38
True Positives: 187

------------------METRICS-------------------

Sensativity: 0.8311
Specificity: 0.804
