# Naive Bayes Practice with Sentiment Classifier 
## Cross Validation and Accuracy Addition

In [1]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

% matplotlib inline

In [2]:
# Add some formatting to make output bold
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))


In [3]:
# Set up some sentiment keywords
# Positive and negative ones are treated the same in the model, but this ordering
# effects how the heatmap displays them
pos_keywords = ['great', 'love', 'awesome', 'amazing', 'fav', 'is',
                'excellent', 'good', 'best', 'yummy', 'pleasure', 'wow', 
                'back', 'very', 'really', 'friendly', 'time', 'fantastic', 
                'fresh', 'over', 'recommend', 'definitely', 'quite', 'deal', 'happy',
                'delicious', 'spicy', 'do', 'much', 'nice', 'wonderful', 'ambiance',
                'enjoy', 'price']

neg_keywords = ['not', 'angry', 'didn\'t', 'disgusted', 'slow', 'nasty', 
                'dirty', 'avoid', 'disappointed', 'never', 'disappointing', 
                'poor', 'bad', 'wasted', 'wasn\'t', 'rude', 'bland', 'mediocre']
    
keywords = pos_keywords + neg_keywords

In [4]:
def check_sentiment(data_path, description):
    review_raw = pd.read_csv(data_path, delimiter='\t', header=None)
    review_raw.columns = ['review', 'sentiment']
    
    # Exclamation marks more commonly found in positive reviews
    review_raw['exclamation'] = review_raw.review.str.contains('!') 

    # Yelp model actually gets moderately worse removing these punctuations
    punctuation = [',', '\.', '!', '?']
    for mark in punctuation:
        review_raw['review'] = review_raw['review'].str.replace(mark, '')
    
    # Check for spaces before word to not get partial matches
    for key in keywords:
        review_raw[str(key)] = review_raw.review.str.contains(
            ' ' + str(key), 
            case=False
        )

    #Thought maybe length of review would be related, but this didn't help
    review_raw['long'] = review_raw.review.str.len()
    review_raw['long'] = [0 if x < 80 else 1 for x in review_raw['long']]
    
    review_raw['upper'] = review_raw.review.apply(check_upper)

    #fig, ax = plt.subplots(figsize=(15,15))
    #sns.heatmap(review_raw.corr(), vmax=1, vmin=-1, cmap='coolwarm', ax=ax)
    #plt.show()
    
    # specifying variables and outcomes for SKLearn
    data = review_raw[pos_keywords + neg_keywords + ['exclamation'] + ['long'] + ['upper']]
    target = review_raw['sentiment']
    
    printmd('**' + 'Dataset Information' + '**')
    print('There are {} positive and {} negative reviews in this dataset\n'
          .format(target.sum(), len(target) - target.sum()))
    
    return review_raw

In [5]:
# This function looks to see if at least three words in the review are all uppercase
# This tends to indicate either yelling or excitement
def check_upper(review):
    words = review.split(' ')
    upper_bool = False
    upper_count = 0
    for word in words:
        # I is normally capitalized, don't count it
        if (word.isupper()) and (word != 'I'):
            upper_count += 1
    # Using less than 3 seems to make the model worse instead of better
    if upper_count >= 3:
        upper_bool = True
    return upper_bool

In [12]:
def run_models_and_print(description, data, target):   
    
    # Create model and fit all data
    bnb = BernoulliNB()
    bnb.fit(data, target)
    y_pred = bnb.predict(data)
    
    # Display our results, not split
    print('Bernoulli Naive Bayes.')
    print('Number of mislabeled points out of a total {} points : {}\n'.format(
        data.shape[0], 
        (target != y_pred).sum()
    ))
    print('Score for no holdouts: '+ str(bnb.score(data, target)))
    print('Confusion Matrix no holdouts:' + '\n' + str(confusion_matrix(target, y_pred)))
    
    # Split into test and training datasets and train again
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
    bnb.fit(X_train, y_train)
    y_pred_test = bnb.predict(X_test)
    
    print('\nScore with 20% Holdout: ' + str(bnb.score(X_test, y_test)))
    print('Confusion Matrix')
    print(confusion_matrix(y_test, y_pred_test))
    
    print('\nCross Validation Scores:')
    print(cross_val_score(bnb, data, target, cv=10))

## Write up

#### Features selected:
I used a list of key words, exclamation marks, length of the review, and whether or not there were three or more all caps words in the review as my features.  I added key words to the list until adding additional key words made no more improvement to the model. I checked this by iterating through all the words (this would be trickier to do in a larger data set) used in at least 5 different reviews.  Something that was interesting to me was that many of the most frequent words are very neutral words and not used overly frequently.

Removing punctuation made the model perform slightly worse. 

Many of the words that helped the yelp reviews sentiment model were not found in the Amazon reviews.  However, improving the Yelp model did also improve the Amazon model.  

It is likely that this model is somewhat overfit to the dataset since I did not split the data set into training and test datasets.  Future projects should be split into these subsets before fitting the model. 

# Model Variations

In [7]:
data_path = 'sentiment labelled sentences/yelp_labelled.txt'

reviews = check_sentiment(data_path, 'Yelp')

printmd('**' + 'Model 1 - Original Model' + '**')
data1 = reviews[pos_keywords + neg_keywords + ['exclamation'] + ['long'] + ['upper']]
target = reviews['sentiment']

run_models_and_print('Yelp', data1, target)

**Dataset Information**

There are 500 positive and 500 negative reviews in this dataset



**Model 1 - Original Model**

Bernoulli Naive Bayes.
Number of mislabeled points out of a total 1000 points : 179

Score for no holdouts: 0.821
Confusion Matrix no holdouts:
[[447  53]
 [126 374]]

Score with 20% Holdout: 0.775
Confusion Matrix
[[86 20]
 [25 69]]

Cross Validation Score:
[0.78 0.87 0.78 0.84 0.8  0.73 0.86 0.81 0.68 0.86]


This model leans towards predicting negatives.

In [8]:
printmd('**' + 'Model 2 - Exclude Exclamations' + '**')

data2 = reviews[pos_keywords + neg_keywords + ['long'] + ['upper']]

run_models_and_print('Yelp', data2, target)

**Model 2 - Exclude Exclamations**

Bernoulli Naive Bayes.
Number of mislabeled points out of a total 1000 points : 200

Score for no holdouts: 0.8
Confusion Matrix no holdouts:
[[463  37]
 [163 337]]

Score with 20% Holdout: 0.815
Confusion Matrix
[[93  5]
 [32 70]]

Cross Validation Score:
[0.73 0.86 0.8  0.79 0.67 0.74 0.8  0.79 0.7  0.86]


This model is leaning towards predicting negatives.

In [9]:
printmd('**' + 'Model 3 - Exclude Uppercase' + '**')

data3 = reviews[pos_keywords + neg_keywords + ['exclamation'] + ['long']]

run_models_and_print('Yelp', data3, target)

**Model 3 - Exclude Uppercase**

Bernoulli Naive Bayes.
Number of mislabeled points out of a total 1000 points : 182

Score for no holdouts: 0.818
Confusion Matrix no holdouts:
[[445  55]
 [127 373]]

Score with 20% Holdout: 0.795
Confusion Matrix
[[88  6]
 [35 71]]

Cross Validation Score:
[0.77 0.87 0.78 0.84 0.79 0.73 0.86 0.81 0.69 0.85]


In [10]:
printmd('**' + 'Model 4 - Exclude neg keywords' + '**')

data4 = reviews[pos_keywords + ['exclamation'] + ['long'] + ['upper']]

run_models_and_print('Yelp', data4, target)

**Model 4 - Exclude neg keywords**

Bernoulli Naive Bayes.
Number of mislabeled points out of a total 1000 points : 255

Score for no holdouts: 0.745
Confusion Matrix no holdouts:
[[412  88]
 [167 333]]

Score with 20% Holdout: 0.67
Confusion Matrix
[[70 33]
 [33 64]]

Cross Validation Score:
[0.7  0.77 0.71 0.76 0.71 0.61 0.73 0.75 0.67 0.75]


This model over-predicted negative reviews

In [11]:
printmd('**' + 'Model 5 - No positive keywords' + '**')

data5 = reviews[neg_keywords + ['exclamation'] + ['long'] + ['upper']]

run_models_and_print('Yelp', data5, target)

**Model 5 - No positive keywords**

Bernoulli Naive Bayes.
Number of mislabeled points out of a total 1000 points : 330

Score for no holdouts: 0.67
Confusion Matrix no holdouts:
[[272 228]
 [102 398]]

Score with 20% Holdout: 0.63
Confusion Matrix
[[61 47]
 [27 65]]

Cross Validation Score:
[0.7  0.64 0.72 0.6  0.6  0.65 0.71 0.66 0.59 0.66]


The model without positive keywords predicted more positive reviews.  This is at the expense of correctly predicting negative reviews. The cross validation on this varies a fair amount. 