In [76]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import string
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Vaibhav_Beohar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [77]:
# Load our dataset
df_yelp = pd.read_table('../data/external/yelp_labelled.txt')
df_imdb = pd.read_table('../data/external/imdb_labelled.txt')
df_amz = pd.read_table('../data/external/amazon_cells_labelled.txt')

# utilizing code from https://monstott.github.io/sentiment_analysis_and_classification_of_amazon_imdb_and_yelp_reviews
# and https://www.section.io/engineering-education/sentiment-analysis-with-spacy-and-scikit-learn/

In [78]:
# Concatenate our Datasets
frames = [df_yelp,df_imdb,df_amz]

In [79]:
# Renaming Column Headers
for colname in frames:
    colname.columns = ["Message","Target"]

In [80]:
# # Column names
# for colname in frames:
#     print(colname.columns)

In [81]:
# # Assign a Key to Make it Easier
# keys = ['Yelp','IMDB','Amazon']

In [84]:
# Merge or Concat our Datasets
# df = pd.concat(frames,keys=keys)
df = pd.concat(frames)

In [85]:
df.head()

Unnamed: 0,Message,Target
0,Crust is not good.,0
1,Not tasty and the texture was just nasty.,0
2,Stopped by during the late May bank holiday of...,1
3,The selection on the menu was great and so wer...,1
4,Now I am getting angry and I want my damn pho.,0


In [86]:
# split dataset
from sklearn import model_selection
from sklearn import preprocessing

# train-test split
x_train, x_test, y_train, y_test = model_selection.train_test_split(df.Message, df.Target) 

# label encode the target 
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [87]:
# count vector
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}') 
count_vect.fit(df.Message) # regexp selects tokens of 1 or more alphanumeric characters

xall_count = count_vect.transform(df.Message)
xtrain_count = count_vect.transform(x_train)
xtest_count = count_vect.transform(x_test)

In [98]:
# xtrain_count
x_train

549    I really do recommend this place, you can go w...
951    The ambiance here did not feel like a buffet s...
580             None of them are engaging or exciting.  
627    I asked multiple times for the wine list and a...
421    This is a masterful piece of film-making, with...
                             ...                        
826    For that price I can think of a few place I wo...
916                  Leopard Print is wonderfully wild!.
409            And the accents are absolutely abysmal!  
484                   A good film by a great director!  
573    Our server was super nice and checked on us ma...
Name: Message, Length: 2058, dtype: object

In [89]:
# # tf-idf
# from sklearn.feature_extraction.text import TfidfVectorizer

# # word-level tf-idf
# tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
# tfidf_vect.fit(df.Message)
# xtrain_tfidf = tfidf_vect.transform(x_train)
# xtest_tfidf = tfidf_vect.transform(x_test)

# # ngram-level tf-idf 
# tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2, 3), max_features=5000)
# tfidf_vect_ngram.fit(df.Message) # measures bi-grams and tri-grams
# xtrain_tfidf_ngram = tfidf_vect_ngram.transform(x_train)
# xtest_tfidf_ngram = tfidf_vect_ngram.transform(x_test)

In [90]:
# model wrapper function
from sklearn import metrics

def train_model(classifier, train_features, label, test_features):
    # fit the training data on classifier
    classifier.fit(train_features, label)
    
    # predict testing data labels
    predictions = classifier.predict(test_features)
    
    return metrics.accuracy_score(predictions, y_test)

In [91]:
# Naive Bayes
from sklearn import naive_bayes

# Count Vectors
nb_cv = train_model(naive_bayes.MultinomialNB(), xtrain_count, y_train, xtest_count)
print("[Naive Bayes] Count Vectors Accuracy:", round(nb_cv, 3))

# Word-Level TF-IDF Vectors
nb_wl = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, y_train, xtest_tfidf)
print("[Naive Bayes] Word-Level TF-IDF Accuracy:", round(nb_wl, 3))

# Ngram-Level TF-IDF Vectors
nb_nl = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
print("[Naive Bayes] N-Gram-Level TF-IDF Accuracy:", round(nb_nl, 3))

# > [Naive Bayes] Count Vectors Accuracy: 0.819
# > [Naive Bayes] Word-Level TF-IDF Accuracy: 0.809
# > [Naive Bayes] N-Gram-Level TF-IDF Accuracy: 0.663

[Naive Bayes] Count Vectors Accuracy: 0.833
[Naive Bayes] Word-Level TF-IDF Accuracy: 0.491
[Naive Bayes] N-Gram-Level TF-IDF Accuracy: 0.466


In [92]:
# Logistic Regression
from sklearn import linear_model

# Count Vectors
lr_cv = train_model(linear_model.LogisticRegression(), xtrain_count, y_train, xtest_count)
print("[Logistic Regression] Count Vectors Accuracy:", round(lr_cv, 3))

# Word-Level TF-IDF Vectors
lr_wl = train_model(linear_model.LogisticRegression(), xtrain_tfidf, y_train, xtest_tfidf)
print("[Logistic Regression] Word-Level TF-IDF Accuracy:", round(lr_wl, 3))

# Ngram-Level TF-IDF Vectors
lr_nl = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
print("[Logistic Regression] N-Gram TF-IDF Accuracy:", round(lr_nl, 3))

[Logistic Regression] Count Vectors Accuracy: 0.841
[Logistic Regression] Word-Level TF-IDF Accuracy: 0.483
[Logistic Regression] N-Gram TF-IDF Accuracy: 0.451


In [93]:
# Support Vector Machines
from sklearn import svm

# Count Vectors
svm_cv = train_model(svm.SVC(), xtrain_count, y_train, xtest_count)
print("[Support Vector Machines] Count Vectors Accuracy:", round(svm_cv, 3))

# Word-Level TF-IDF Vectors
svm_wl = train_model(svm.SVC(), xtrain_tfidf, y_train, xtest_tfidf)
print("[Support Vector Machines] Word-Level TF-IDF Accuracy:", round(svm_wl, 3))

# Ngram-Level TF-IDF Vectors
svm_nl = train_model(svm.SVC(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
print("[Support Vector Machines] N-Gram TF-IDF Accuracy:", round(svm_nl, 3))

[Support Vector Machines] Count Vectors Accuracy: 0.801
[Support Vector Machines] Word-Level TF-IDF Accuracy: 0.479
[Support Vector Machines] N-Gram TF-IDF Accuracy: 0.472


In [94]:
# Random Forest
from sklearn import ensemble

# Count Vectors
rf_cv = train_model(ensemble.RandomForestClassifier(), xtrain_count, y_train, xtest_count)
print("[Random Forest] Count Vectors Accuracy:", round(rf_cv, 3))

# Word-Level TF-IDF Vectors
rf_wl = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, y_train, xtest_tfidf)
print("[Random Forest] Word-Level TF-IDF Accuracy:", round(rf_wl, 3))

# Ngram-Level TF-IDF Vectors
rf_nl = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
print("[Random Forest] N-Gram TF-IDF Accuracy:", round(rf_nl, 3))

[Random Forest] Count Vectors Accuracy: 0.803
[Random Forest] Word-Level TF-IDF Accuracy: 0.463
[Random Forest] N-Gram TF-IDF Accuracy: 0.483


In [95]:
# Extreme Gradient Boosting
import xgboost

# Count Vectors
xgb_cv = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), y_train, xtest_count.tocsc())
print("[Xtreme Gradient Boosting] Count Vectors Accuracy:", round(xgb_cv, 3))

# Word-Level TF-IDF Vectors
xgb_wl = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), y_train, xtest_tfidf.tocsc())
print("[Xtreme Gradient Boosting] Word-Level TF-IDF: ", round(xgb_wl, 3))

# Ngram-Level TF-IDF Vectors
xgb_nl = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram, y_train, xtest_tfidf_ngram)
print("[Xtreme Gradient Boosting] N-Gram TF-IDF Accuracy:", round(xgb_nl, 3))


[Xtreme Gradient Boosting] Count Vectors Accuracy: 0.776
[Xtreme Gradient Boosting] Word-Level TF-IDF:  0.485
[Xtreme Gradient Boosting] N-Gram TF-IDF Accuracy: 0.496


In [120]:
# model performance table
pd.DataFrame([[nb_cv, nb_wl, nb_nl],
              [lr_cv, lr_wl, lr_nl],
              [svm_cv, svm_wl, svm_nl],
              [rf_cv, rf_wl, rf_nl],
              [xgb_cv, xgb_wl, xgb_nl]], 
columns=['Count Vector', 'Word TF-IDF', 'n-Gram TF-IDF'], 
index=['Naive Bayes', 'Logistic Regression', 'Support Vector Machines', 'Random Forest', 'Xtreme Gradient Boosting']).round(3)

Unnamed: 0,Count Vector,Word TF-IDF,n-Gram TF-IDF
Naive Bayes,0.833,0.491,0.466
Logistic Regression,0.841,0.483,0.451
Support Vector Machines,0.801,0.479,0.472
Random Forest,0.803,0.463,0.483
Xtreme Gradient Boosting,0.776,0.485,0.496


### Testing for an example

In [128]:
nb_model = naive_bayes.MultinomialNB()
nb_model.fit(xtrain_count, y_train)
print(nb_model.predict(count_vect.transform(pd.Series('POG: indicate excitement, an epic moment'))))
print(nb_model.predict(count_vect.transform(pd.Series('I feel horrible'))))
print(nb_model.predict(count_vect.transform(pd.Series('KEKW: it suggests laughter'))))

[1]
[0]
[1]
