# YouTube Comments Sentiment Analysis 
Spring 2018


## 1 Set Up

### 1.1 Import Basic Modules

In [1]:
# Basics
import pandas as pd; import os
import csv; import numpy as np
import re; import warnings
warnings.filterwarnings('ignore')

### 1.2 Read In Data

In [2]:
#os.chdir('/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/') # change directory
os.chdir('/home/sxi/myProjects/dataScience/YoutubeAnalysis')

In [3]:
df = pd.read_csv('data/data.csv', delimiter=",", skiprows=2, encoding='utf-8', engine='python') 

#### Pre-Labeled YouTube Data

In [4]:
okgo = pd.read_csv('data/OKGO.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data
trump = pd.read_csv('data/trump.csv', delimiter=",", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python') 
swift = pd.read_csv('data/TaylorSwift.csv', delimiter=",", skiprows=2, nrows=180, encoding='utf-8', engine='python') 
royal = pd.read_csv('data/RoyalWedding.csv', delimiter=",", skiprows=2, nrows=61, encoding='utf-8', engine='python')
paul = pd.read_csv('data/LoganPaul.csv', delimiter=",", skiprows=2, nrows=200, encoding='utf-8', engine='python') 

#### Non-YouTube Data

In [5]:
blogs = pd.read_csv('data/Kagel.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data
tweets = pd.read_csv('data/twitter.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data
tweets2 = pd.read_csv('data/twitter2.csv', names=['label','tweet_id','date','query','user','comment'], encoding='latin-1')

### 1.3 Clean Data Columns

In [6]:
tweets = tweets.drop(['Topic', 'TweetId', "TweetDate"], axis = 1).dropna()
tweets.columns = ["label", "comment"]
tweets.label = tweets.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
tweets['label'] = pd.to_numeric(tweets['label'], errors='coerce')

In [7]:
tweets2 = tweets2.drop(['tweet_id', 'date', 'query','user'], axis = 1).dropna()
tweets2['label'] = tweets2['label'].map({0:-1, 2:0, 4:1})
tweets2['label'] = pd.to_numeric(tweets2['label'], errors='coerce')
for i in range(len(tweets2)):
    tweets2.comment[i] = re.sub(r'@.*?(?=\s)|([^0-9A-Za-z \t])', '', tweets2.comment[i])

In [8]:
blogs.columns = ["label", "comment"]
blogs['label'] = pd.to_numeric(blogs['label'], errors='coerce')
okgo = okgo.drop(okgo.columns[[2, 3]], axis =1).dropna()
paul = paul.drop(["Unnamed: 2"], axis =1).dropna()

In [9]:
def fix_cols(DF):
    DF.columns = ["label", "comment"]

In [10]:
fix_cols(okgo)
fix_cols(trump)
fix_cols(swift)
fix_cols(royal)
fix_cols(paul)
fix_cols(df)

### 1.3.b Create Datasets

In [11]:
videos = pd.concat([okgo, trump, swift, royal, paul], ignore_index=True)
full = pd.concat([okgo, trump, swift, royal, paul, blogs, tweets, tweets2], ignore_index=False)
videos_not_okgo = pd.concat([trump, swift, royal, paul], ignore_index=False)
videos_not_royal = pd.concat([okgo, trump, swift, paul], ignore_index=False)

DataList = [videos, full, videos_not_royal, videos_not_okgo]
excluded = [okgo, royal]

In [12]:
data = videos.copy()
data.sample(5)

Unnamed: 0,label,comment
138,1.0,Based on the comments I see here and on some o...
2501,0.0,Why are we still here...
348,-1.0,Keep killing trees for your awsome video mothe...
1693,0.0,"Subscribe to the channel ""I'MTISHA 24"""
992,0.0,I want to believe itÕs a green screen because ...


### 1.4 Remove Non-Alphabetic Characters (including numbers)

In [13]:
def AsStr(DF):
    DF["comment"]= DF["comment"].astype(str) 

for i in range(0, len(DataList)): 
    AsStr(DataList[i])

In [14]:
for i in range(0, len(excluded)): 
    AsStr(excluded[i])
    
AsStr(df)

In [15]:
def cleanerFn(b):
    for row in range(len(b)):
        line = b.loc[row, "comment"]
        b.loc[row,"comment"] = re.sub("[^a-zA-Z]", " ", line)
        
def cleanerFn2(b):
    for row in range(len(b)):
        line = b.iloc[row, 1]
        b.iloc[row,1] = re.sub("[^a-zA-Z]", " ", line)

In [16]:
cleanerFn(df)
cleanerFn2(data)

for i in range(0, len(DataList)): 
    cleanerFn2(DataList[i])

In [17]:
for i in range(0, len(excluded)): 
    cleanerFn2(excluded[i])

## 2 Natural Language Processing

### 2.1 Import Packages

In [61]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
sw = stopwords.words('english')
nltk.download('stopwords')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
tfidf = TfidfVectorizer()

[nltk_data] Downloading package stopwords to /home/sxi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2.2 Tokenize Words

In [20]:
df['com_token']=df['comment'].str.lower().str.split()

### 2.3 Remove Stop Words, Lemmatization, Stemming

In [21]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag_sents

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sxi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [22]:
def nlpFunction(DF):
    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(lambda x: [y for y in x if y not in sw])
    DF["com_lemma"] = DF['com_remv'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x]) # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(lambda x : [ps.stem(y) for y in x]) # stemming
    DF["com_full"] = DF["com_stem"].apply(' '.join)
    DF["com_tagged"] = DF['comment'].apply(lambda x : [nltk.pos_tag(y) for y in x]) #word tagging
    DF["com_stem_str"] = DF["com_stem"].apply(', '.join)
    return DF

In [23]:
# need to count bigrams and POS tags

### POS Tagging 

In [154]:
df["com_tagged"] = df['comment'].apply(lambda x : [nltk.pos_tag(y) for y in x]) #word tagging

In [155]:
from nltk import word_tokenize, pos_tag, pos_tag_sents
nltk.download('punkt')

texts = df['comment'].tolist()
tagged_texts = pos_tag_sents(map(word_tokenize, texts))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [156]:
# tagged_texts
df["tagged"] = tagged_texts

In [26]:
df.head()

Unnamed: 0,label,comment,com_token
0,omgidk y but i just cant stop laughinglmao,,[nan]
1,There is literally not one thing about Krystal...,,[nan]
2,,,[nan]
3,Try not to laugh challenge right there,,[nan]
4,httpswwwyoutubecomwatchvKtI40e1cEN4,,[nan]


In [158]:
df = nlpFunction(df)
df.head(5)

Unnamed: 0,label,comment,com_token,com_tagged,tagged,com_remv,com_lemma,com_stem,com_full,com_stem_str
0,omgidk y but i just cant stop laughinglmao,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,
1,There is literally not one thing about Krystal...,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,
2,,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,
3,Try not to laugh challenge right there,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,
4,httpswwwyoutubecomwatchvKtI40e1cEN4,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,


In [32]:
for i in range(0, len(DataList)): 
    nlpFunction(DataList[i])

In [172]:
for i in range(0, len(excluded)): 
    nlpFunction(excluded[i])

In [161]:
df = nlpFunction(df)
data = nlpFunction(data)
trump = nlpFunction(trump)

In [164]:
videos.head(5)

Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_full,com_tagged,com_stem_str
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...",everyon know brand paper one know welfar emplo...,"[[(E, NN)], [(v, NN)], [(e, NN)], [(r, NN)], [...","everyon, know, brand, paper, one, know, welfar..."
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]",paper cut balanc,"[[( , NN)], [(Y, NN)], [(o, NN)], [(u, NN)], [...","paper, cut, balanc"
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"[oh, shit, when, i, saw, this, on, my, front, ...","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]",oh shit saw front page love song,"[[(O, NN)], [(H, NN)], [( , NN)], [(S, NN)], [...","oh, shit, saw, front, page, love, song"
3,1.0,Blowing my mind yet again,"[blowing, my, mind, yet, again]","[blowing, mind, yet]","[blowing, mind, yet]","[blow, mind, yet]",blow mind yet,"[[(B, NN)], [(l, NN)], [(o, NN)], [(w, NN)], [...","blow, mind, yet"
4,0.0,Should have gone with Dunder Mifflin,"[should, have, gone, with, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]",gone dunder mifflin,"[[(S, NN)], [(h, NN)], [(o, NN)], [(u, NN)], [...","gone, dunder, mifflin"


In [58]:
'''import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
def bigram_word_feats(DF, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(DF[""])
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
 
evaluate_classifier(bigram_word_feats)'''

'import itertools\nfrom nltk.collocations import BigramCollocationFinder\nfrom nltk.metrics import BigramAssocMeasures\n \ndef bigram_word_feats(DF, score_fn=BigramAssocMeasures.chi_sq, n=200):\n    bigram_finder = BigramCollocationFinder.from_words(DF[""])\n    bigrams = bigram_finder.nbest(score_fn, n)\n    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])\n \nevaluate_classifier(bigram_word_feats)'

### NER (Named Entry Recognition) Tagging

In [166]:
'''from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/usr/share/stanford-ner/stanford-ner.jar',
                       encoding='utf-8')

text = df["comments"]

tok_text = word_tokenize(text)
st_text = st.tag(tokenized_text)

print(st_text)'''

'from nltk.tag import StanfordNERTagger\nfrom nltk.tokenize import word_tokenize\n\nst = StanfordNERTagger(\'/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz\',\n                       \'/usr/share/stanford-ner/stanford-ner.jar\',\n                       encoding=\'utf-8\')\n\ntext = df["comments"]\n\ntok_text = word_tokenize(text)\nst_text = st.tag(tokenized_text)\n\nprint(st_text)'

## 3 Data Transformations

### 3.1 Split into Training and Test Data

In [37]:
import sklearn # machine learning
from sklearn.model_selection import train_test_split # splitting up data

In [None]:
df.head()

In [45]:
X_train = data["com_stem_str"]
X_test = trump["com_stem_str"]
Y_train = data["label"]
Y_test = trump["label"]
X_user = df["com_stem_str"]

KeyError: 'com_stem_str'

In [35]:
videos = videos.dropna(axis=0)
videos_not_okgo = videos_not_okgo.dropna(axis=0)
videos_not_royal = videos_not_royal.dropna(axis=0)
royal = royal.dropna(axis=0)
okgo = okgo.dropna(axis=0)
full = full.dropna(axis=0)

In [230]:
x_train_videos, x_test_videos, y_train_videos, y_test_videos = train_test_split(
    videos["com_stem_str"], videos["label"], test_size=0.33, random_state=42)

x_train_full, x_test_full, y_train_full, y_test_full = train_test_split(
    full["com_stem_str"], full["label"], test_size=0.33, random_state=42)

####
x_train_not_okgo = videos_not_okgo["com_stem_str"]
x_test_okgo = okgo["com_stem_str"]

y_train_not_okgo = videos_not_okgo["label"]
y_test_okgo = okgo["label"]

####
x_train_not_royal = videos_not_royal["com_stem_str"]
x_test_royal = royal["com_stem_str"]

y_train_not_royal = videos_not_royal["label"]
y_test_royal = royal["label"]

### 3.2 Check for missing values

In [231]:
print('lengths training variables: ', len(X_train),",", len(Y_train))
print('lengths testing variables: ', len(X_test),",", len(Y_test), '\n')

print('Are there any missing values?', 
      '\n * Training:', pd.isnull(X_train).values.any(), ',', pd.isnull(Y_train).values.any(), 
      '\n * Testing: ', pd.isnull(X_test).values.any(), ",", pd.isnull(Y_test).values.any())


lengths training variables:  2634 , 2634
lengths testing variables:  199 , 199 

Are there any missing values? 
 * Training: False , True 
 * Testing:  False , False


### 3.3 Transform Data to Counts 

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [233]:
tfidf = TfidfVectorizer()

xtrain = tfidf.fit_transform(X_train) # transform and fit training data
xtest = tfidf.transform(X_test) # transform trump test data from fitted transformer
xuser = tfidf.transform(X_user) # transform user selected comments to predict on

data_trans= tfidf.transform(data["com_stem_str"]) # same as X_train...transform entire dataset for cross validation
df_trans = tfidf.transform(df["com_stem_str"]) # same as X_user

In [234]:
x_train_videos = tfidf.fit_transform(x_train_videos)
x_test_videos = tfidf.transform(x_test_videos)

x_train_full = tfidf.fit_transform(x_train_full)
x_test_full = tfidf.transform(x_test_full)

In [235]:
x_train_not_okgo = tfidf.fit_transform(x_train_not_okgo)
x_test_okgo = tfidf.transform(x_test_okgo)

####
x_train_not_royal = tfidf.fit_transform(x_train_not_royal)
x_test_royal = tfidf.transform(x_test_royal)

In [223]:
#df_tagged = tfidf.transform(df["tagged"])

## 4 Machine Learning Models

In [226]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn import svm # support vector machine
from sklearn import metrics # for accuracy/ precision
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent

In [135]:
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [238]:
mnb = MultinomialNB()
lr = LogisticRegression(solver='sag', max_iter=100, random_state=42, multi_class="multinomial")
#svm = svm.SVC()
knn = KNeighborsClassifier()
xgb = XGBClassifier()
rf = RandomForestClassifier(n_estimators=10, random_state=10)

In [239]:
'''x_train_videos = tfidf.fit_transform(x_train_videos)
x_test_videos = tfidf.transform(x_test_videos)

x_train_full = tfidf.fit_transform(x_train_full)
x_test_full = tfidf.transform(x_test_full)

####
x_train_not_okgo = tfidf.fit_transform(x_train_not_okgo)
x_test_not_okgo = tfidf.transform(x_test_not_okgo)

####
x_train_not_royal = tfidf.fit_transform(x_train_not_royal)
x_test_not_royal = tfidf.transform(x_test_not_royal)
'''

'x_train_videos = tfidf.fit_transform(x_train_videos)\nx_test_videos = tfidf.transform(x_test_videos)\n\nx_train_full = tfidf.fit_transform(x_train_full)\nx_test_full = tfidf.transform(x_test_full)\n\n####\nx_train_not_okgo = tfidf.fit_transform(x_train_not_okgo)\nx_test_not_okgo = tfidf.transform(x_test_not_okgo)\n\n####\nx_train_not_royal = tfidf.fit_transform(x_train_not_royal)\nx_test_not_royal = tfidf.transform(x_test_not_royal)\n'

In [240]:
def MLpipeline(model, xtrain, xtest, ytrain, ytest):
    model.fit(xtrain, ytrain)
    model_predict = model.predict(xtest)
    model_acc = metrics.accuracy_score(ytest, model_predict)
    print('We obtained ', round(model_acc, 6), '% accuracy for the model')

In [242]:
MLpipeline(mnb, x_train_videos, x_test_videos, y_train_videos, y_test_videos)
MLpipeline(lr, x_train_videos, x_test_videos, y_train_videos, y_test_videos)
MLpipeline(knn, x_train_videos, x_test_videos, y_train_videos, y_test_videos)
MLpipeline(xgb, x_train_videos, x_test_videos, y_train_videos, y_test_videos)
MLpipeline(rf, x_train_videos, x_test_videos, y_train_videos, y_test_videos)

We obtained  0.629459 % accuracy for the model
We obtained  0.63061 % accuracy for the model
We obtained  0.551208 % accuracy for the model
We obtained  0.594937 % accuracy for the model
We obtained  0.614499 % accuracy for the model


In [243]:
a,b,c,d = x_train_not_okgo, x_test_okgo, y_train_not_okgo, y_test_okgo

MLpipeline(mnb, a, b, c, d)
MLpipeline(lr, a, b, c, d)
MLpipeline(knn, a, b, c, d)
MLpipeline(xgb, a, b, c, d)
MLpipeline(rf, a, b, c, d)

We obtained  0.500502 % accuracy for the model
We obtained  0.512036 % accuracy for the model
We obtained  0.461886 % accuracy for the model
We obtained  0.519559 % accuracy for the model
We obtained  0.535105 % accuracy for the model


In [244]:
a,b,c,d = x_train_full, x_test_full, y_train_full, y_test_full

MLpipeline(mnb, a, b, c, d)
MLpipeline(lr, a, b, c, d)
MLpipeline(knn, a, b, c, d)
MLpipeline(xgb, a, b, c, d)
MLpipeline(rf, a, b, c, d)

We obtained  0.837965 % accuracy for the model
We obtained  0.870045 % accuracy for the model
We obtained  0.820597 % accuracy for the model
We obtained  0.847568 % accuracy for the model
We obtained  0.861872 % accuracy for the model


In [246]:
a,b,c,d = x_train_not_royal, x_test_royal, y_train_not_royal, y_test_royal

MLpipeline(mnb, a, b, c, d)
MLpipeline(lr, a, b, c, d)
MLpipeline(knn, a, b, c, d)
MLpipeline(xgb, a, b, c, d)
MLpipeline(rf, a, b, c, d)

We obtained  0.655738 % accuracy for the model
We obtained  0.639344 % accuracy for the model
We obtained  0.540984 % accuracy for the model
We obtained  0.639344 % accuracy for the model
We obtained  0.655738 % accuracy for the model


# Train test split

In [211]:
from sklearn.feature_extraction.text import CountVectorizer # frequency counts matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint, expon, uniform
import scipy.stats

In [54]:
xtrain, xtest, Y_train, Y_test = train_test_split(
    videos["comment"], videos["label"], test_size=0.33, random_state=42)

### 4.1 Multinomial Naive Bayes Model

**Fitting the Model:**

In [243]:
%%time
mnb_param_distribs = {
    'clf__alpha': [1,0.5,0.3,0.1,0.01,0.001,0],
    }

mnb = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', MultinomialNB())
               ])

mnb_search = GridSearchCV(mnb,
                        param_grid=mnb_param_distribs,
                        n_jobs=50,
                        cv=5,
                        #scoring='roc_auc',
                        )


mnb_search.fit(xtrain, Y_train) # fit the model on the training data word counts and training data lables

CPU times: user 696 ms, sys: 752 ms, total: 1.45 s
Wall time: 6.88 s


**Model Predictions:** 

In [244]:
mnb_predict = mnb_search.predict(xtest) # make our y predictions (labels) on the comment test data
mnb_acc = metrics.accuracy_score(Y_test, mnb_predict)
print('We obtained ', round(mnb_acc, 6), '% accuracy for the model')

We obtained  0.660529 % accuracy for the model


**Classification Report**

In [189]:
print(metrics.classification_report(Y_test, mnb_predict))

             precision    recall  f1-score   support

       -1.0       0.71      0.39      0.50       200
        0.0       0.63      0.79      0.70       386
        1.0       0.70      0.67      0.69       283

avg / total       0.67      0.66      0.65       869



**Confusion Matrix**

In [190]:
metrics.confusion_matrix(Y_test, mnb_predict)

array([[ 78,  95,  27],
       [ 27, 305,  54],
       [  5,  87, 191]])

**Cross Validation of Accuracy:**

In [85]:
scores = cross_val_score(mnb, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.59 (+/- 0.07)


### 4.2 Logistic Regression

**Fitting the Model:**

In [192]:
%%time
lr_param_distribs = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    }

lr = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', LogisticRegression(solver='sag', 
                                           max_iter=100, 
                                           random_state=42, 
                                           multi_class="multinomial")) # set multinomial setting for multiclass data
               ])

lr_search = GridSearchCV(lr,
                        param_grid=lr_param_distribs,
                        n_jobs=50,
                        cv=5,
                        #scoring='roc_auc',
                        )

lr_search.fit(xtrain, Y_train)

CPU times: user 632 ms, sys: 716 ms, total: 1.35 s
Wall time: 4.74 s


**Model Predictions:**

In [91]:
lr_predict = lr_search.predict(xtest)
lr_acc = metrics.accuracy_score(Y_test, lr_predict)
print('We obtained ', round(lr_acc, 6), '% accuracy for the logistic regression model')

We obtained  0.660529 % accuracy for the logistic regression model


**Classification Report:**

In [92]:
print(metrics.classification_report(Y_test, lr_predict))

             precision    recall  f1-score   support

       -1.0       0.66      0.41      0.50       200
        0.0       0.60      0.83      0.70       386
        1.0       0.80      0.60      0.69       283

avg / total       0.68      0.66      0.65       869



**Confusion Matrix:**

In [93]:
metrics.confusion_matrix(Y_test, lr_predict)

array([[ 82, 104,  14],
       [ 37, 321,  28],
       [  6, 106, 171]])

**Cross Validation:**

In [94]:
scores = cross_val_score(lr, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.61 (+/- 0.06)


### 4.3 Linear Support Vector Machine

**Fitting the Model & Predictions:**

In [232]:
from sklearn.svm import SVC

In [235]:
%%time
svm_param_distribs = {
    'clf__C': expon(scale=100), 
    'clf__gamma': expon(scale=.1),
    'clf__class_weight':['balanced', None]
    }

svm = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', SVC())
               ])

svm_search = RandomizedSearchCV(svm,
                        param_distributions=svm_param_distribs,
                        n_iter=50,
                        cv=5,
                        #scoring='roc_auc',
                        random_state=42)

svm_search.fit(xtrain, Y_train)



CPU times: user 3min 20s, sys: 0 ns, total: 3min 20s
Wall time: 3min 20s


In [246]:
svm_predict = svm_search.predict(xtest)
svm_acc = metrics.accuracy_score(Y_test, svm_predict)
print('We obtained ', round(svm_acc, 6), '% accuracy for the SVM model')

We obtained  0.653625 % accuracy for the SVM model


**Classification Report:**

In [100]:
print(metrics.classification_report(Y_test, mnb_predict))

             precision    recall  f1-score   support

       -1.0       0.82      0.21      0.33       200
        0.0       0.57      0.89      0.69       386
        1.0       0.79      0.59      0.68       283

avg / total       0.70      0.64      0.61       869



**Confusion Matrix:**

In [101]:
metrics.confusion_matrix(Y_test, lr_predict)

array([[ 82, 104,  14],
       [ 37, 321,  28],
       [  6, 106, 171]])

**Cross Validation:**

In [102]:
scores = cross_val_score(svm, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.44 (+/- 0.00)


### 4.4 K-Nearest Neighbor

**Fitting Model & Predictions:**

In [240]:
%%time
from sklearn.neighbors import KNeighborsClassifier # k-NN ensemble method

knn_param_distribs = {
    'clf__n_neighbors': randint(low=1,high=100), 
    }

knn = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', KNeighborsClassifier())
               ])

knn_search = RandomizedSearchCV(knn,
                        param_distributions=knn_param_distribs,
                        n_iter=50,
                        cv=5,
                        #scoring='roc_auc',
                        random_state=42)



CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.02 ms


In [241]:
knn_search.fit(xtrain, Y_train)

knn_predict = knn_search.predict(xtest)
knn_acc = metrics.accuracy_score(Y_test, knn_predict)
print('We obtained ', round(knn_acc, 6), '% accuracy for the KNN Bagging model')

We obtained  0.524741 % accuracy for the KNN Bagging model


**Cross Validation:**

In [104]:
scores = cross_val_score(knn, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.48 (+/- 0.02)


### 4.5 Random Forest

**Fitting Model & Predictions:**

In [44]:
from sklearn.ensemble import RandomForestClassifier # random forest ensemble method

ranfor = RandomForestClassifier(n_estimators=10, random_state=10)
ranfor = ranfor.fit(xtrain, Y_train)

rf_predict = ranfor.predict(xtest)
rf_acc = metrics.accuracy_score(Y_test, rf_predict)
print('We obtained ', round(rf_acc, 6), '% accuracy for the Random Forest model')

We obtained  0.537688 % accuracy for the Random Forest model


**Cross Validation:**

In [45]:
scores = cross_val_score(ranfor, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.45 (+/- 0.19)


#### RF Pipeline - transformations, grid search, cv

In [253]:
%%time

rf_param_distribs = {
    'clf__n_estimators': randint(low=1, high=700),
    'clf__max_features': randint(low=1, high=20),
    'clf__max_depth': randint(low=1, high=30),
    'clf__min_samples_split': randint(low=2, high=150)
    }

rf = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', RandomForestClassifier(random_state=42,
                                              n_jobs=-1,
                                              class_weight='balanced'))
               ])

rf_search = RandomizedSearchCV(rf,
                        param_distributions=rf_param_distribs,
                        n_iter=50,
                        cv=5,
                        #scoring='roc_auc',
                        random_state=42)

rf_search.fit(xtrain, Y_train)

CPU times: user 4min 56s, sys: 44.5 s, total: 5min 40s
Wall time: 4min 52s


In [254]:
rf_predict = rf_search.predict(xtest)
rf_acc = metrics.accuracy_score(Y_test, rf_predict)
print('We obtained ', round(rf_acc, 6), '% accuracy for the Random Forest model')

We obtained  0.623705 % accuracy for the Random Forest model


In [118]:
rf_predict = rf_search.predict(xtest)
rf_acc = metrics.accuracy_score(Y_test, rf_predict)
print('We obtained ', round(rf_acc, 6), '% accuracy for the Random Forest model')

We obtained  0.637514 % accuracy for the Random Forest model


In [46]:
import warnings
warnings.filterwarnings('ignore')

### 4.6 Extreme Gradient Boosting

In [47]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(xtrain, Y_train)
xgb_pred = xgb.predict(xtest)
xgb_acc = metrics.accuracy_score(Y_test, xgb_pred)
print('We obtained ', round(xgb_acc, 6), '% accuracy for the XGB Bagging model')

We obtained  0.482412 % accuracy for the XGB Bagging model


In [48]:
scores = cross_val_score(xgb, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.57 (+/- 0.15)


In [None]:
# Pipeline
%%time

gb_param_distribs = {
    'clf__max_depth': randint(3,10),
    'clf__min_child_weight': randint(1,12),
    'clf__max_depth': randint(low=2, high=8),
    'clf__gamma': [i/10.0 for i in range(0,5)],
    'clf__subsample':[i/10.0 for i in range(6,10)],
    'clf__colsample_bytree':[i/10.0 for i in range(6,10)],
    'clf__reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
    }

gb = Pipeline([('tfidf', TfidfVectorizer()),
               ('clf', XGBClassifier(random_state=42)
               ])

gb_search = RandomizedSearchCV(gb,
                        param_distributions=gb_param_distribs,
                        n_iter=50,
                        cv=5,
                        #scoring='roc_auc',
                        random_state=42)

gb_search.fit(xtrain, Y_train)

## Save models as pkl files

In [251]:
from sklearn.externals import joblib

In [252]:
joblib.dump(mnb_search, 'mnb.pkl')
joblib.dump(lr_search, 'lr.pkl')
joblib.dump(svm_search, 'svm.pkl')
joblib.dump(knn_search, 'knn.pkl')
joblib.dump(rf_search, 'rf.pkl')
#joblib.dump(gb_search, 'gb.pkl')


['rf.pkl']

In [None]:
SavedData = open("pickled_material/data.pickle","wb")
pickle.dump(documents, SavedData)
SavedData.close()

SavedFeatures = open("pickled_material/SavedFeatures.pickle","wb")
pickle.dump(word_features, SavedFeatures)
SavedFeatures.close()

SavedModels= open("pickled_material/SavedModels.pickle","wb")
pickle.dump(models, SavedModels)
SavedModels.close()

## 5 Data Visualizations

### 5.1 Table of Model Results

In [49]:
myTable = pd.DataFrame(columns=['Naive Bayes','Support Vect Machine','Logistic Regression', 'K-NN', 'Random Forest'],
                   index=["Accuracy"])
myTable['Naive Bayes']=mnb_acc; myTable['Support Vect Machine']=svm_acc; myTable['Logistic Regression']=lr_acc
myTable['K-NN']= knn_acc; myTable['Random Forest']= rf_acc
myTable

Unnamed: 0,Naive Bayes,Support Vect Machine,Logistic Regression,K-NN,Random Forest
Accuracy,0.477387,0.437186,0.507538,0.432161,0.537688
