# YouTube Comments Sentiment Analysis 
Spring 2018


## 1 Set Up

### 1.1 Import Basic Modules

In [2]:
# Basics
import pandas as pd; import os
import csv; import numpy as np
import re; import warnings
warnings.filterwarnings('ignore')

### 1.2 Read In Data

In [3]:
os.chdir('/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/') # change directory

In [4]:
df = pd.read_csv('data/data.csv', delimiter=",", skiprows=2, encoding='utf-8', engine='python') 

#### Pre-Labeled YouTube Data

In [5]:
okgo = pd.read_csv('data/OKGO.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data
trump = pd.read_csv('data/trump.csv', delimiter=",", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python') 
swift = pd.read_csv('data/TaylorSwift.csv', delimiter=",", skiprows=2, nrows=180, encoding='utf-8', engine='python') 
royal = pd.read_csv('data/RoyalWedding.csv', delimiter=",", skiprows=2, nrows=61, encoding='utf-8', engine='python')
paul = pd.read_csv('data/LoganPaul.csv', delimiter=",", skiprows=2, nrows=200, encoding='utf-8', engine='python') 

#### Non-YouTube Data

In [6]:
blogs = pd.read_csv('data/Kagel.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data
tweets = pd.read_csv('data/twitter.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data

### 1.3 Clean Data Columns

In [7]:
tweets = tweets.drop(['Topic', 'TweetId', "TweetDate"], axis = 1).dropna()
tweets.columns = ["label", "comment"]
tweets.label = tweets.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
tweets['label'] = pd.to_numeric(tweets['label'], errors='coerce')

In [8]:
blogs.columns = ["label", "comment"]
blogs['label'] = pd.to_numeric(blogs['label'], errors='coerce')
okgo = okgo.drop(okgo.columns[[2, 3]], axis =1).dropna()
paul = paul.drop(["Unnamed: 2"], axis =1).dropna()

In [9]:
def fix_cols(DF):
    DF.columns = ["label", "comment"]

In [10]:
fix_cols(okgo)
fix_cols(trump)
fix_cols(swift)
fix_cols(royal)
fix_cols(paul)
fix_cols(df)

### 1.3.b Create Datasets

In [11]:
videos = pd.concat([okgo, trump, swift, royal, paul], ignore_index=True)
full = pd.concat([okgo, trump, swift, royal, paul, blogs, tweets], ignore_index=False)
videos_not_okgo = pd.concat([trump, swift, royal, paul], ignore_index=False)
videos_not_royal = pd.concat([okgo, trump, swift, paul], ignore_index=False)

DataList = [videos, full, videos_not_royal, videos_not_okgo]
excluded = [okgo, royal]

In [12]:
data = videos.copy()
data.sample(5)

Unnamed: 0,label,comment
857,0.0,"waiting for some music,still waiting...."
2236,0.0,Mayo 2018
929,1.0,As always awesome song _ and visually pleasing...
67,1.0,"Lol ""Phuket"" Thailand XD"
430,0.0,Try not to get satisfied video right here. I m...


### 1.4 Remove Non-Alphabetic Characters (including numbers)

In [13]:
def AsStr(DF):
    DF["comment"]= DF["comment"].astype(str) 

for i in range(0, len(DataList)): 
    AsStr(DataList[i])

In [14]:
for i in range(0, len(excluded)): 
    AsStr(excluded[i])
    
AsStr(df)

In [15]:
def cleanerFn(b):
    for row in range(len(b)):
        line = b.loc[row, "comment"]
        b.loc[row,"comment"] = re.sub("[^a-zA-Z]", " ", line)
        
def cleanerFn2(b):
    for row in range(len(b)):
        line = b.iloc[row, 1]
        b.iloc[row,1] = re.sub("[^a-zA-Z]", " ", line)

In [16]:
cleanerFn(df)
cleanerFn2(data)

for i in range(0, len(DataList)): 
    cleanerFn2(DataList[i])

In [17]:
for i in range(0, len(excluded)): 
    cleanerFn2(excluded[i])

## 2 Natural Language Processing

### 2.1 Import Packages

In [18]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
sw = stopwords.words('english')
nltk.download('stopwords')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
tfidf = TfidfVectorizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 2.2 Tokenize Words

In [20]:
df['com_token']=df['comment'].str.lower().str.split()

### 2.3 Remove Stop Words, Lemmatization, Stemming

In [21]:
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag_sents

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [22]:
def nlpFunction(DF):
    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(lambda x: [y for y in x if y not in sw])
    DF["com_lemma"] = DF['com_remv'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x]) # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(lambda x : [ps.stem(y) for y in x]) # stemming
    DF["com_full"] = DF["com_stem"].apply(' '.join)
    DF["com_tagged"] = DF['comment'].apply(lambda x : [nltk.pos_tag(y) for y in x]) #word tagging
    DF["com_stem_str"] = DF["com_stem"].apply(', '.join)
    return DF

In [23]:
# need to count bigrams and POS tags

### POS Tagging 

In [24]:
df["com_tagged"] = df['comment'].apply(lambda x : [nltk.pos_tag(y) for y in x]) #word tagging

In [25]:
from nltk import word_tokenize, pos_tag, pos_tag_sents
nltk.download('punkt')

texts = df['comment'].tolist()
tagged_texts = pos_tag_sents(map(word_tokenize, texts))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
# tagged_texts
df["tagged"] = tagged_texts

In [27]:
df.head(5)

Unnamed: 0,label,comment,com_token,com_tagged,tagged
0,omgidk y but i just cant stop laughinglmao,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]"
1,There is literally not one thing about Krystal...,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]"
2,,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]"
3,Try not to laugh challenge right there,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]"
4,httpswwwyoutubecomwatchvKtI40e1cEN4,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]"


In [28]:
df = nlpFunction(df)
df.head(5)

Unnamed: 0,label,comment,com_token,com_tagged,tagged,com_remv,com_lemma,com_stem,com_full,com_stem_str
0,omgidk y but i just cant stop laughinglmao,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,
1,There is literally not one thing about Krystal...,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,
2,,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,
3,Try not to laugh challenge right there,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,
4,httpswwwyoutubecomwatchvKtI40e1cEN4,,[nan],"[[(n, NN)], [(a, DT)], [(n, NN)]]","[(nan, NN)]",[nan],[nan],[nan],,


In [29]:
for i in range(0, len(DataList)): 
    nlpFunction(DataList[i])

In [30]:
for i in range(0, len(excluded)): 
    nlpFunction(excluded[i])

In [31]:
df = nlpFunction(df)
data = nlpFunction(data)
trump = nlpFunction(trump)

In [32]:
videos.head(5)

Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_full,com_tagged,com_stem_str
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...",everyon know brand paper one know welfar emplo...,"[[(E, NN)], [(v, NN)], [(e, NN)], [(r, NN)], [...","everyon, know, brand, paper, one, know, welfar..."
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]",paper cut balanc,"[[( , NN)], [(Y, NN)], [(o, NN)], [(u, NN)], [...","paper, cut, balanc"
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"[oh, shit, when, i, saw, this, on, my, front, ...","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]",oh shit saw front page love song,"[[(O, NN)], [(H, NN)], [( , NN)], [(S, NN)], [...","oh, shit, saw, front, page, love, song"
3,1.0,Blowing my mind yet again,"[blowing, my, mind, yet, again]","[blowing, mind, yet]","[blowing, mind, yet]","[blow, mind, yet]",blow mind yet,"[[(B, NN)], [(l, NN)], [(o, NN)], [(w, NN)], [...","blow, mind, yet"
4,0.0,Should have gone with Dunder Mifflin,"[should, have, gone, with, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]","[gone, dunder, mifflin]",gone dunder mifflin,"[[(S, NN)], [(h, NN)], [(o, NN)], [(u, NN)], [...","gone, dunder, mifflin"


In [33]:
'''import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
 
def bigram_word_feats(DF, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(DF[""])
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
 
evaluate_classifier(bigram_word_feats)'''

'import itertools\nfrom nltk.collocations import BigramCollocationFinder\nfrom nltk.metrics import BigramAssocMeasures\n \ndef bigram_word_feats(DF, score_fn=BigramAssocMeasures.chi_sq, n=200):\n    bigram_finder = BigramCollocationFinder.from_words(DF[""])\n    bigrams = bigram_finder.nbest(score_fn, n)\n    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])\n \nevaluate_classifier(bigram_word_feats)'

### NER (Named Entry Recognition) Tagging

In [34]:
'''from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '/usr/share/stanford-ner/stanford-ner.jar',
                       encoding='utf-8')

text = df["comments"]

tok_text = word_tokenize(text)
st_text = st.tag(tokenized_text)

print(st_text)'''

'from nltk.tag import StanfordNERTagger\nfrom nltk.tokenize import word_tokenize\n\nst = StanfordNERTagger(\'/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz\',\n                       \'/usr/share/stanford-ner/stanford-ner.jar\',\n                       encoding=\'utf-8\')\n\ntext = df["comments"]\n\ntok_text = word_tokenize(text)\nst_text = st.tag(tokenized_text)\n\nprint(st_text)'

In [38]:
import sklearn.feature_extraction.text
from nltk import everygrams
import pandas as pd
from itertools import chain
from nltk import everygrams, word_tokenize

In [None]:
df['String'].apply(lambda x: [' '.join(ng) for ng in everygrams(word_tokenize(x), 1, 3)])

## Bigrams and Trigrams
https://stackoverflow.com/questions/49147128/generate-ngrams-from-a-pandas-dataframe-column
https://stackoverflow.com/questions/32252075/nltk-sklearn-unigram-bigram

In [43]:
'''i = videos.comment\
      .str.lower()\
      .str.replace('[^a-z\s]', '')\
      .str.split(expand=True)\
      .stack()

# generate bigrams by concatenating unigram columns
j = i + ' ' + i.shift(-1)
# generate trigrams by concatenating unigram and bigram columns
k = j + ' ' + i.shift(-2)

# concatenate all series vertically, and remove NaNs
pd.concat([i, j, k]).dropna().reset_index(drop=True)'''

"i = videos.comment      .str.lower()      .str.replace('[^a-z\\s]', '')      .str.split(expand=True)      .stack()\n\n# generate bigrams by concatenating unigram columns\nj = i + ' ' + i.shift(-1)\n# generate trigrams by concatenating unigram and bigram columns\nk = j + ' ' + i.shift(-2)\n\n# concatenate all series vertically, and remove NaNs\npd.concat([i, j, k]).dropna().reset_index(drop=True)"

In [42]:
#videos['comment'].apply(lambda x: [' '.join(ng) for ng in everygrams(word_tokenize(x), 1, 3)])

In [44]:
'''ngram_size = 2
vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size))

vectorizer.fit(videos["com_full"]) # build ngram dictionary
ngram = vectorizer.transform(videos["com_full"]) # get ngram
print('ngram: {0}\n'.format(ngram))
print('ngram.shape: {0}'.format(ngram.shape))
print('vectorizer.vocabulary_: {0}'.format(vectorizer.vocabulary_))'''

'ngram_size = 2\nvectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size))\n\nvectorizer.fit(videos["com_full"]) # build ngram dictionary\nngram = vectorizer.transform(videos["com_full"]) # get ngram\nprint(\'ngram: {0}\n\'.format(ngram))\nprint(\'ngram.shape: {0}\'.format(ngram.shape))\nprint(\'vectorizer.vocabulary_: {0}\'.format(vectorizer.vocabulary_))'

In [45]:
def find_bigrams(input_list):
  bigram_list = []
  for i in range(len(input_list)-1):
      bigram_list.append((input_list[i], input_list[i+1]))
  return bigram_list

In [48]:
videos['bigrams'] = videos['com_stem'].apply(lambda x: find_bigrams(x))

In [49]:
videos.head(2)

Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_full,com_tagged,com_stem_str,bigrams
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...",everyon know brand paper one know welfar emplo...,"[[(E, NN)], [(v, NN)], [(e, NN)], [(r, NN)], [...","everyon, know, brand, paper, one, know, welfar...","[(everyon, know), (know, brand), (brand, paper..."
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]",paper cut balanc,"[[( , NN)], [(Y, NN)], [(o, NN)], [(u, NN)], [...","paper, cut, balanc","[(paper, cut), (cut, balanc)]"


#### Penalized Regressions
https://stackoverflow.com/questions/12247768/unigrams-bigrams-tf-idf-less-accurate-than-just-unigrams-ff-idf

In [None]:
sklearn.linear_model.Lasso (regression)
sklearn.linear_model.ElasticNet (regression)
sklearn.linear_model.SGDRegressor (regression) with penalty == 'elastic_net' or 'l1'
sklearn.linear_model.SGDClassifier (classification) with penalty == 'elastic_net' or 'l1'

### N-Grams

In [None]:
#df['String'].apply(lambda x: [' '.join(ng) for ng in everygrams(word_tokenize(x), 1, 3)])

In [None]:
'''ngram_size = 1
train_set = ['Cristiano plays football', 'Ronaldo like football too']

vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(ngram_size,ngram_size))
vectorizer.fit(train_set) # build ngram dictionary
ngram = vectorizer.transform(train_set) # get ngram'''

## 3 Data Transformations

### 3.1 Split into Training and Test Data

In [51]:
import sklearn # machine learning
from sklearn.model_selection import train_test_split # splitting up 

In [228]:
X_train = data["com_stem_str"]
X_test = trump["com_stem_str"]
Y_train = data["label"]
Y_test = trump["label"]
X_user = df["com_stem_str"]

In [229]:
videos = videos.dropna(axis=0)
videos_not_okgo = videos_not_okgo.dropna(axis=0)
videos_not_royal = videos_not_royal.dropna(axis=0)
royal = royal.dropna(axis=0)
okgo = okgo.dropna(axis=0)
full = full.dropna(axis=0)

In [230]:
x_train_videos, x_test_videos, y_train_videos, y_test_videos = train_test_split(
    videos["com_stem_str"], videos["label"], test_size=0.33, random_state=42)

x_train_full, x_test_full, y_train_full, y_test_full = train_test_split(
    full["com_stem_str"], full["label"], test_size=0.33, random_state=42)

####
x_train_not_okgo = videos_not_okgo["com_stem_str"]
x_test_okgo = okgo["com_stem_str"]

y_train_not_okgo = videos_not_okgo["label"]
y_test_okgo = okgo["label"]

####
x_train_not_royal = videos_not_royal["com_stem_str"]
x_test_royal = royal["com_stem_str"]

y_train_not_royal = videos_not_royal["label"]
y_test_royal = royal["label"]

In [66]:
videos.head(3)

Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_full,com_tagged,com_stem_str,bigrams
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...",everyon know brand paper one know welfar emplo...,"[[(E, NN)], [(v, NN)], [(e, NN)], [(r, NN)], [...","everyon, know, brand, paper, one, know, welfar...","[(everyon, know), (know, brand), (brand, paper..."
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]",paper cut balanc,"[[( , NN)], [(Y, NN)], [(o, NN)], [(u, NN)], [...","paper, cut, balanc","[(paper, cut), (cut, balanc)]"
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"[oh, shit, when, i, saw, this, on, my, front, ...","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]",oh shit saw front page love song,"[[(O, NN)], [(H, NN)], [( , NN)], [(S, NN)], [...","oh, shit, saw, front, page, love, song","[(oh, shit), (shit, saw), (saw, front), (front..."


In [68]:
#videos["bigrams_str"] = videos["bigrams"].str

In [73]:
#videos["bigrams_str"]  = videos["bigrams"].apply(lambda x: ', '.join(x))

In [86]:
videos = videos.dropna()

In [92]:
x_train_videos_bi_tk, x_test_videos_bi_tk, y_train_videos_bi, y_test_videos_bi = train_test_split(
    videos["com_full"], videos["label"], test_size=0.33, random_state=42)

In [104]:
from sklearn.feature_extraction.text import CountVectorizer
bi_CV = CountVectorizer(ngram_range=(1, 2))
x_train_videos_bi = bi_CV.fit_transform(x_train_videos_bi_tk) # transform and fit training data
x_test_videos_bi = bi_CV.transform(x_test_videos_bi_tk)

### 3.2 Check for missing values

In [105]:
def Miss(X_train, Y_train, X_test, Y_test):
    print('lengths training variables: ', len(X_train),",", len(Y_train))
    print('lengths testing variables: ', len(X_test),",", len(Y_test), '\n')

    print('Are there any missing values?', 
          '\n * Training:', pd.isnull(X_train).values.any(), ',', pd.isnull(Y_train).values.any(), 
          '\n * Testing: ', pd.isnull(X_test).values.any(), ",", pd.isnull(Y_test).values.any())


In [106]:
Miss(x_train_videos_bi_tk, y_train_videos_bi, x_test_videos_bi_tk, y_test_videos_bi)

lengths training variables:  1763 , 1763
lengths testing variables:  869 , 869 

Are there any missing values? 
 * Training: False , False 
 * Testing:  False , False


### 3.3 Transform Data to Counts 

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [55]:
videos.head(3)

Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_full,com_tagged,com_stem_str,bigrams
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...",everyon know brand paper one know welfar emplo...,"[[(E, NN)], [(v, NN)], [(e, NN)], [(r, NN)], [...","everyon, know, brand, paper, one, know, welfar...","[(everyon, know), (know, brand), (brand, paper..."
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]",paper cut balanc,"[[( , NN)], [(Y, NN)], [(o, NN)], [(u, NN)], [...","paper, cut, balanc","[(paper, cut), (cut, balanc)]"
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"[oh, shit, when, i, saw, this, on, my, front, ...","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]",oh shit saw front page love song,"[[(O, NN)], [(H, NN)], [( , NN)], [(S, NN)], [...","oh, shit, saw, front, page, love, song","[(oh, shit), (shit, saw), (saw, front), (front..."


In [233]:
tfidf = TfidfVectorizer()

xtrain = tfidf.fit_transform(X_train) # transform and fit training data
xtest = tfidf.transform(X_test) # transform trump test data from fitted transformer
xuser = tfidf.transform(X_user) # transform user selected comments to predict on

data_trans= tfidf.transform(data["com_stem_str"]) # same as X_train...transform entire dataset for cross validation
df_trans = tfidf.transform(df["com_stem_str"]) # same as X_user

In [234]:
x_train_videos = tfidf.fit_transform(x_train_videos)
x_test_videos = tfidf.transform(x_test_videos)

x_train_full = tfidf.fit_transform(x_train_full)
x_test_full = tfidf.transform(x_test_full)

In [235]:
x_train_not_okgo = tfidf.fit_transform(x_train_not_okgo)
x_test_okgo = tfidf.transform(x_test_okgo)

####
x_train_not_royal = tfidf.fit_transform(x_train_not_royal)
x_test_royal = tfidf.transform(x_test_royal)

In [223]:
#df_tagged = tfidf.transform(df["tagged"])

## 4 Machine Learning Models

In [107]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn import svm # support vector machine
from sklearn import metrics # for accuracy/ precision
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent

In [108]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [109]:
mnb = MultinomialNB()
lr = LogisticRegression(solver='sag', max_iter=100, random_state=42, multi_class="multinomial")
#svm = svm.SVC()
knn = KNeighborsClassifier()
xgb = XGBClassifier()
rf = RandomForestClassifier(n_estimators=10, random_state=10)

In [110]:
'''x_train_videos = tfidf.fit_transform(x_train_videos)
x_test_videos = tfidf.transform(x_test_videos)

x_train_full = tfidf.fit_transform(x_train_full)
x_test_full = tfidf.transform(x_test_full)

####
x_train_not_okgo = tfidf.fit_transform(x_train_not_okgo)
x_test_not_okgo = tfidf.transform(x_test_not_okgo)

####
x_train_not_royal = tfidf.fit_transform(x_train_not_royal)
x_test_not_royal = tfidf.transform(x_test_not_royal)
'''

'x_train_videos = tfidf.fit_transform(x_train_videos)\nx_test_videos = tfidf.transform(x_test_videos)\n\nx_train_full = tfidf.fit_transform(x_train_full)\nx_test_full = tfidf.transform(x_test_full)\n\n####\nx_train_not_okgo = tfidf.fit_transform(x_train_not_okgo)\nx_test_not_okgo = tfidf.transform(x_test_not_okgo)\n\n####\nx_train_not_royal = tfidf.fit_transform(x_train_not_royal)\nx_test_not_royal = tfidf.transform(x_test_not_royal)\n'

In [111]:
def MLpipeline(model, xtrain, xtest, ytrain, ytest):
    model.fit(xtrain, ytrain)
    model_predict = model.predict(xtest)
    model_acc = metrics.accuracy_score(ytest, model_predict)
    print('We obtained ', round(model_acc, 6), '% accuracy for the model')

In [112]:
MLpipeline(mnb, x_train_videos_bi, x_test_videos_bi, y_train_videos_bi, y_test_videos_bi)
MLpipeline(lr, x_train_videos_bi, x_test_videos_bi, y_train_videos_bi, y_test_videos_bi)
MLpipeline(knn, x_train_videos_bi, x_test_videos_bi, y_train_videos_bi, y_test_videos_bi)
MLpipeline(xgb, x_train_videos_bi, x_test_videos_bi, y_train_videos_bi, y_test_videos_bi)
MLpipeline(rf, x_train_videos_bi, x_test_videos_bi, y_train_videos_bi, y_test_videos_bi)

We obtained  0.621404 % accuracy for the model
We obtained  0.623705 % accuracy for the model
We obtained  0.56962 % accuracy for the model
We obtained  0.593786 % accuracy for the model
We obtained  0.621404 % accuracy for the model


In [242]:
MLpipeline(mnb, x_train_videos, x_test_videos, y_train_videos, y_test_videos)
MLpipeline(lr, x_train_videos, x_test_videos, y_train_videos, y_test_videos)
MLpipeline(knn, x_train_videos, x_test_videos, y_train_videos, y_test_videos)
MLpipeline(xgb, x_train_videos, x_test_videos, y_train_videos, y_test_videos)
MLpipeline(rf, x_train_videos, x_test_videos, y_train_videos, y_test_videos)

We obtained  0.629459 % accuracy for the model
We obtained  0.63061 % accuracy for the model
We obtained  0.551208 % accuracy for the model
We obtained  0.594937 % accuracy for the model
We obtained  0.614499 % accuracy for the model


In [243]:
a,b,c,d = x_train_not_okgo, x_test_okgo, y_train_not_okgo, y_test_okgo

MLpipeline(mnb, a, b, c, d)
MLpipeline(lr, a, b, c, d)
MLpipeline(knn, a, b, c, d)
MLpipeline(xgb, a, b, c, d)
MLpipeline(rf, a, b, c, d)

We obtained  0.500502 % accuracy for the model
We obtained  0.512036 % accuracy for the model
We obtained  0.461886 % accuracy for the model
We obtained  0.519559 % accuracy for the model
We obtained  0.535105 % accuracy for the model


In [244]:
a,b,c,d = x_train_full, x_test_full, y_train_full, y_test_full

MLpipeline(mnb, a, b, c, d)
MLpipeline(lr, a, b, c, d)
MLpipeline(knn, a, b, c, d)
MLpipeline(xgb, a, b, c, d)
MLpipeline(rf, a, b, c, d)

We obtained  0.837965 % accuracy for the model
We obtained  0.870045 % accuracy for the model
We obtained  0.820597 % accuracy for the model
We obtained  0.847568 % accuracy for the model
We obtained  0.861872 % accuracy for the model


In [246]:
a,b,c,d = x_train_not_royal, x_test_royal, y_train_not_royal, y_test_royal

MLpipeline(mnb, a, b, c, d)
MLpipeline(lr, a, b, c, d)
MLpipeline(knn, a, b, c, d)
MLpipeline(xgb, a, b, c, d)
MLpipeline(rf, a, b, c, d)

We obtained  0.655738 % accuracy for the model
We obtained  0.639344 % accuracy for the model
We obtained  0.540984 % accuracy for the model
We obtained  0.639344 % accuracy for the model
We obtained  0.655738 % accuracy for the model


### 4.1 Multinomial Naive Bayes Model

**Fitting the Model:**

In [27]:
mnb = MultinomialNB()
mnb.fit(xtrain, Y_train) # fit the model on the training data word counts and training data lables

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

**Model Predictions:** 

In [28]:
mnb_predict = mnb.predict(xtest) # make our y predictions (labels) on the comment test data
mnb_acc = metrics.accuracy_score(Y_test, mnb_predict)
print('We obtained ', round(mnb_acc, 6), '% accuracy for the model')

We obtained  0.477387 % accuracy for the model


**Classification Report**

In [29]:
print(metrics.classification_report(Y_test, mnb_predict))

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        56
          0       0.49      0.89      0.63        87
          1       0.44      0.32      0.37        56

avg / total       0.34      0.48      0.38       199



**Confusion Matrix**

In [30]:
metrics.confusion_matrix(Y_test, mnb_predict)

array([[ 0, 43, 13],
       [ 0, 77, 10],
       [ 0, 38, 18]])

**Cross Validation of Accuracy:**

In [31]:
scores = cross_val_score(mnb, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.59 (+/- 0.11)


### 4.2 Logistic Regression

**Fitting the Model:**

In [32]:
lr = LogisticRegression(solver='sag', max_iter=100, random_state=42, multi_class="multinomial") # set multinomial setting for multiclass data

**Model Predictions:**

In [33]:
lr.fit(xtrain, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
lr_predict = lr.predict(xtest)
lr_acc = metrics.accuracy_score(Y_test, lr_predict)
print('We obtained ', round(lr_acc, 6), '% accuracy for the logistic regression model')

We obtained  0.507538 % accuracy for the logistic regression model


**Classification Report:**

In [35]:
print(metrics.classification_report(Y_test, lr_predict))

             precision    recall  f1-score   support

         -1       0.75      0.05      0.10        56
          0       0.48      0.92      0.63        87
          1       0.64      0.32      0.43        56

avg / total       0.60      0.51      0.42       199



**Confusion Matrix:**

In [36]:
metrics.confusion_matrix(Y_test, lr_predict)

array([[ 3, 49,  4],
       [ 1, 80,  6],
       [ 0, 38, 18]])

**Cross Validation:**

In [37]:
scores = cross_val_score(lr, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.59 (+/- 0.16)


### 4.3 Linear Support Vector Machine

**Fitting the Model & Predictions:**

In [38]:
svm = svm.SVC()
svm.fit(xtrain, Y_train)
svm_predict = svm.predict(xtest)
svm_acc = metrics.accuracy_score(Y_test, svm_predict)
print('We obtained ', round(svm_acc, 6), '% accuracy for the SVM model')

We obtained  0.437186 % accuracy for the SVM model


**Classification Report:**

In [39]:
print(metrics.classification_report(Y_test, mnb_predict))

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00        56
          0       0.49      0.89      0.63        87
          1       0.44      0.32      0.37        56

avg / total       0.34      0.48      0.38       199



**Confusion Matrix:**

In [40]:
metrics.confusion_matrix(Y_test, lr_predict)

array([[ 3, 49,  4],
       [ 1, 80,  6],
       [ 0, 38, 18]])

**Cross Validation:**

In [41]:
scores = cross_val_score(svm, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.44 (+/- 0.01)


### 4.4 K-Nearest Neighbor

**Fitting Model & Predictions:**

In [42]:
from sklearn.neighbors import KNeighborsClassifier # k-NN ensemble method

knn = KNeighborsClassifier()
knn.fit(xtrain, Y_train)

knn_predict = knn.predict(xtest)
knn_acc = metrics.accuracy_score(Y_test, knn_predict)
print('We obtained ', round(knn_acc, 6), '% accuracy for the KNN Bagging model')

We obtained  0.432161 % accuracy for the KNN Bagging model


**Cross Validation:**

In [43]:
scores = cross_val_score(knn, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.38 (+/- 0.11)


### 4.5 Random Forest

**Fitting Model & Predictions:**

In [44]:
from sklearn.ensemble import RandomForestClassifier # random forest ensemble method

ranfor = RandomForestClassifier(n_estimators=10, random_state=10)
ranfor = ranfor.fit(xtrain, Y_train)

rf_predict = ranfor.predict(xtest)
rf_acc = metrics.accuracy_score(Y_test, rf_predict)
print('We obtained ', round(rf_acc, 6), '% accuracy for the Random Forest model')

We obtained  0.537688 % accuracy for the Random Forest model


**Cross Validation:**

In [45]:
scores = cross_val_score(ranfor, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.45 (+/- 0.19)


In [46]:
import warnings
warnings.filterwarnings('ignore')

### 4.6 Extreme Gradient Boosting

In [47]:
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(xtrain, Y_train)
xgb_pred = xgb.predict(xtest)
xgb_acc = metrics.accuracy_score(Y_test, xgb_pred)
print('We obtained ', round(xgb_acc, 6), '% accuracy for the XGB Bagging model')

We obtained  0.482412 % accuracy for the XGB Bagging model


In [48]:
scores = cross_val_score(xgb, xtest, Y_test, cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.57 (+/- 0.15)


In [None]:
SavedData = open("pickled_material/data.pickle","wb")
pickle.dump(documents, SavedData)
SavedData.close()

SavedFeatures = open("pickled_material/SavedFeatures.pickle","wb")
pickle.dump(word_features, SavedFeatures)
SavedFeatures.close()

SavedModels= open("pickled_material/SavedModels.pickle","wb")
pickle.dump(models, SavedModels)
SavedModels.close()

## 5 Data Visualizations

### 5.1 Table of Model Results

In [49]:
myTable = pd.DataFrame(columns=['Naive Bayes','Support Vect Machine','Logistic Regression', 'K-NN', 'Random Forest'],
                   index=["Accuracy"])
myTable['Naive Bayes']=mnb_acc; myTable['Support Vect Machine']=svm_acc; myTable['Logistic Regression']=lr_acc
myTable['K-NN']= knn_acc; myTable['Random Forest']= rf_acc
myTable

Unnamed: 0,Naive Bayes,Support Vect Machine,Logistic Regression,K-NN,Random Forest
Accuracy,0.477387,0.437186,0.507538,0.432161,0.537688
