# YouTube Comments Sentiment Analysis 
Andie's Version <br>
Spring 2018


## 1 Set Up

### 1.1 Import Basic Modules

In [97]:
# Basics
import pandas as pd
import os
import csv
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

### 1.2 Read In Data

In [98]:
os.chdir('/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/') # change directory

In [99]:
okgo = pd.read_csv('data/OKGOcomments.csv', delimiter=";", skiprows=2, encoding='latin-1', engine='python') # read in the data
blogs = pd.read_csv('data/Kagel_social_media_blogs.csv', delimiter="@@@", skiprows=2, encoding='latin-1', engine='python') # read in the data
tweets = pd.read_csv('data/full-corpus.csv', delimiter=",", skiprows=2, encoding='latin-1', engine='python') # read in the data

# test data: 
trump = pd.read_csv('data/trump.csv', delimiter="@@@", skiprows=2, encoding='utf-8', error_bad_lines=False, engine='python') 

# combine training dataframes
df = pd.read_csv('data/data.csv', delimiter="@@@", skiprows=2, encoding='utf-8', engine='python') 

In [100]:
df.head(3)

Unnamed: 0,"Deutschland, Deutschland ueber alles!",Unnamed: 1
0,Roses are Red,
1,Violets are Blue,
2,I was so happy,


### 1.3 Clean Data Columns

In [101]:
tweets = tweets.drop(['Topic', 'TweetId', "TweetDate"], axis = 1).dropna()
tweets.columns = ["label", "comment"]
tweets.label = tweets.label.replace({'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'}, regex=True)
tweets['label'] = pd.to_numeric(tweets['label'], errors='coerce')

In [102]:
blogs.columns = ["label", "comment"]
blogs['label'] = pd.to_numeric(blogs['label'], errors='coerce')

okgo.columns = [
  'label','comment','a','b']
okgo = okgo.drop(['a', 'b'], axis = 1).dropna() # drop columns 3 and 4 and missing values

In [103]:
data = pd.concat([okgo, blogs, tweets], ignore_index=False)

In [104]:
df.columns = ["comment", "label"]
trump.columns = ["label", "comment"]

In [105]:
data.sample(10)

Unnamed: 0,label,comment
3540,1.0,dudeee i LOVED brokeback mountain!!!
709,0.0,Go big or go home @Apple users - here are some...
4224,0.0,Combining the opinion / review from Gary and ...
3387,1.0,Brokeback Mountain was so awesome
6539,0.0,Oh and Brokeback Mountain is a TERRIBLE movie..
6391,0.0,So Brokeback Mountain was really depressing
1242,1.0,I'm most excited about Android beam & face det...
3224,0.0,"RT @NeowinFeed: Rumor: New screen sizes, specs..."
494,1.0,I love The Da Vinci Code..
3724,1.0,I love Brokeback Mountain


### 1.4 Remove Non-Alphabetic Characters (including numbers)

In [106]:
df["comment"]= df["comment"].astype(str) 
trump["comment"]= trump["comment"].astype(str) 

In [107]:
def cleanerFn(b):
    for row in range(len(b)):
        line = b.loc[row, "comment"]
        b.loc[row,"comment"] = re.sub("[^a-zA-Z]", " ", line)
        
def cleanerFn2(b):
    for row in range(len(b)):
        line = b.iloc[row, 1]
        b.iloc[row,1] = re.sub("[^a-zA-Z]", " ", line)

In [108]:
cleanerFn(df)
cleanerFn2(data)
cleanerFn2(trump)

## 2 Natural Language Processing

### 2.1 Import Packages

In [109]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [110]:
sw = stopwords.words('english')
ps = PorterStemmer()
lemmatizer = nltk.stem.WordNetLemmatizer()
tfidf = TfidfVectorizer()

### 2.2 Tokenize Words

In [111]:
df['com_token']=df['comment'].str.lower().str.split()

### 2.3 Remove Stop Words, Lemmatization, Stemming

In [112]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
sw = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [113]:
def nlpFunction(DF):
    DF['com_token'] = DF['comment'].str.lower().str.split()
    DF['com_remv'] = DF['com_token'].apply(lambda x: [y for y in x if y not in sw])
    DF["com_lemma"] = DF['com_remv'].apply(lambda x : [lemmatizer.lemmatize(y) for y in x]) # lemmatization
    DF['com_stem'] = DF['com_lemma'].apply(lambda x : [ps.stem(y) for y in x]) # stemming
    DF["com_stem_str"] = DF["com_stem"].apply(', '.join)
    return DF

In [114]:
df = nlpFunction(df)
data = nlpFunction(data)
trump = nlpFunction(trump)

## 3 Data Transformations

### 3.1 Split into Training and Test Data

In [115]:
import sklearn # machine learning
from sklearn.model_selection import train_test_split # splitting up data

In [116]:
'''X_train, X_test, Y_train, Y_test = train_test_split(
                                    df["com_stem_str"], df["label"], 
                                    test_size=0.25, 
                                    random_state=42)'''

'X_train, X_test, Y_train, Y_test = train_test_split(\n                                    df["com_stem_str"], df["label"], \n                                    test_size=0.25, \n                                    random_state=42)'

In [117]:
X_train = data["com_stem_str"]
X_test = trump["com_stem_str"]
Y_train = data["label"]
Y_test = trump["label"]
X_user = df["com_stem_str"]

### 3.2 Check for missing values

In [118]:
print('lengths training variables: ', len(X_train),",", len(Y_train))
print('lengths testing variables: ', len(X_test),",", len(Y_test), '\n')

print('Are there any missing values?', 
      '\n * Training:', pd.isnull(X_train).values.any(), ',', pd.isnull(Y_train).values.any(), 
      '\n * Testing: ', pd.isnull(X_test).values.any(), ",", pd.isnull(Y_test).values.any())


lengths training variables:  14192 , 14192
lengths testing variables:  201 , 201 

Are there any missing values? 
 * Training: False , False 
 * Testing:  False , False


### 3.3 Transform Data to Counts 

In [119]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [120]:
tfidf = TfidfVectorizer()

X_train_ft = tfidf.fit_transform(X_train) # transform and fit training data
X_test_ft = tfidf.transform(X_test) # transform trump test data from fitted transformer
X_user_ft = tfidf.transform(X_user) # transform user selected comments to predict on

data_trans= tfidf.transform(data["com_stem_str"]) # same as X_train...transform entire dataset for cross validation
df_trans = tfidf.transform(df["com_stem_str"]) # same as X_user

## 4 Machine Learning Models

In [121]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn import svm # support vector machine
from sklearn import metrics # for accuracy/ precision
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent

### 4.1 Multinomial Naive Bayes Model

**Fitting the Model:**

In [122]:
mnb = MultinomialNB()
mnb.fit(X_train_ft, Y_train) # fit the model on the training data word counts and training data lables

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

**Model Predictions:** 

In [123]:
mnb_predict = mnb.predict(X_test_ft) # make our y predictions (labels) on the comment test data
mnb_acc = metrics.accuracy_score(Y_test, mnb_predict)
print('We obtained ', round(mnb_acc, 6), '% accuracy for the model')

We obtained  0.0 % accuracy for the model


**Classification Report**

In [59]:
print(metrics.classification_report(Y_test, mnb_predict))

ValueError: Mix of label input types (string and number)

**Confusion Matrix**

In [88]:
metrics.confusion_matrix(Y_test, mnb_predict)

array([[   2,  254,   16],
       [   0, 1929,   64],
       [   0,  201, 1082]])

**Cross Validation of Accuracy:**

In [89]:
scores = cross_val_score(mnb, data_transformed, df["label"], cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.72 (+/- 0.18)


### 4.2 Logistic Regression

**Fitting the Model:**

In [67]:
lr = LogisticRegression(solver='sag', max_iter=100, random_state=42, multi_class="multinomial") # set multinomial setting for multiclass data

**Model Predictions:**

In [70]:
lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=42, solver='sag',
          tol=0.0001, verbose=0, warm_start=False)

In [71]:
lr_predict = lr.predict(X_test)
lr_acc = metrics.accuracy_score(Y_test, lr_predict)
print('We obtained ', round(lr_acc, 6), '% accuracy for the logistic regression model')

We obtained  0.0 % accuracy for the logistic regression model


**Classification Report:**

In [93]:
print(metrics.classification_report(Y_test, lr_predict))

             precision    recall  f1-score   support

       -1.0       0.63      0.26      0.37       272
        0.0       0.85      0.96      0.90      1993
        1.0       0.95      0.88      0.91      1283

avg / total       0.87      0.88      0.86      3548



**Confusion Matrix:**

In [94]:
metrics.confusion_matrix(Y_test, lr_predict)

array([[  71,  191,   10],
       [  30, 1909,   54],
       [  11,  146, 1126]])

**Cross Validation:**

In [95]:
scores = cross_val_score(lr, data_transformed, df["label"], cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.75 (+/- 0.17)


### 4.3 Linear Support Vector Machine

**Fitting the Model & Predictions:**

In [96]:
svm = svm.SVC()
svm.fit(X_train, Y_train)
svm_predict = svm.predict(X_test)
svm_acc = metrics.accuracy_score(Y_test, svm_predict)
print('We obtained ', round(svm_acc, 6), '% accuracy for the SVM model')

We obtained  0.561725 % accuracy for the SVM model


**Classification Report:**

In [97]:
print(metrics.classification_report(Y_test, mnb_predict))

             precision    recall  f1-score   support

       -1.0       1.00      0.01      0.01       272
        0.0       0.81      0.97      0.88      1993
        1.0       0.93      0.84      0.89      1283

avg / total       0.87      0.85      0.82      3548



**Confusion Matrix:**

In [98]:
metrics.confusion_matrix(Y_test, lr_predict)

array([[  71,  191,   10],
       [  30, 1909,   54],
       [  11,  146, 1126]])

**Cross Validation:**

In [99]:
scores = cross_val_score(svm, data_transformed, df["label"], cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.57 (+/- 0.00)


### 5.1 K-Nearest Neighbor

**Fitting Model & Predictions:**

In [100]:
from sklearn.neighbors import KNeighborsClassifier # k-NN ensemble method

knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

knn_predict = knn.predict(xtest)
knn_acc = metrics.accuracy_score(Y_test, knn_predict)
print('We obtained ', round(knn_acc, 6), '% accuracy for the KNN Bagging model')

We obtained  0.8292 % accuracy for the KNN Bagging model


**Cross Validation:**

In [101]:
scores = cross_val_score(knn, data_transformed, df["label"], cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.77 (+/- 0.19)


### 5.3 Random Forest

**Fitting Model & Predictions:**

In [102]:
from sklearn.ensemble import RandomForestClassifier # random forest ensemble method

ranfor = RandomForestClassifier(n_estimators=10, random_state=10)
ranfor = ranfor.fit(xtrain, Y_train)

rf_predict = ranfor.predict(xtest)
rf_acc = metrics.accuracy_score(Y_test, rf_predict)
print('We obtained ', round(rf_acc, 6), '% accuracy for the Random Forest model')

We obtained  0.866967 % accuracy for the Random Forest model


**Cross Validation:**

In [103]:
scores = cross_val_score(ranfor, data_transformed, df["label"], cv=5) # 5 fold cross validation
print("Confidence Interval for Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Confidence Interval for Accuracy: 0.66 (+/- 0.27)


## 6 Data Visualizations

### 6.1 Table of Model Results

In [104]:
myTable = pd.DataFrame(columns=['Naive Bayes','Support Vect Machine','Logistic Regression', 'K-NN', 'Random Forest'],
                   index=["Accuracy"])
myTable['Naive Bayes']=mnb_acc; myTable['Support Vect Machine']=svm_acc; myTable['Logistic Regression']=lr_acc
myTable['K-NN']= knn_acc; myTable['Random Forest']= rf_acc
myTable

Unnamed: 0,Naive Bayes,Support Vect Machine,Logistic Regression,K-NN,Random Forest
Accuracy,0.849211,0.561725,0.875423,0.8292,0.866967


In [105]:
labels = list(myTable.columns.values)
values = myTable.iloc[0].values
ypos = np.arange(len(labels))

In [106]:
myTable.iloc[0].values

array([0.84921082, 0.56172492, 0.87542277, 0.82919955, 0.86696731])

In [111]:
import matplotlib.pyplot as plt

In [113]:
'''
plt.plot(ypos, values, align='center', alpha=0.5, type='bar')
plt.xticks(ypos, labels); plt.ylabel('% Accuracy')
plt.xlabel('Model Type'); plt.title('Accuracy of Classification Models')
plt.xticks(rotation=90); plt.show()
'''

"\nplt.plot(ypos, values, align='center', alpha=0.5, type='bar')\nplt.xticks(ypos, labels); plt.ylabel('% Accuracy')\nplt.xlabel('Model Type'); plt.title('Accuracy of Classification Models')\nplt.xticks(rotation=90); plt.show()\n"

## 7 Final Remarks

From the five different machine learning models attempted, Linear Support Vector Machine gave the lowest accuracy (45.3%) while Multinomial Logistic Regression gave the highest (65.5%). Upon first glance, 65% accuracy seems fairly low--however, considering that this was a multinomial classification task (Positive, Neutral or Negative), 65% is significantly higher than a random distribution. Furthermore, because the data was from YouTube comments rather than from a formal body of text, the corpus contained a higher frequency of spelling errors, slang, emojis, names, and foreign languages than would otherwise be expected, all of which added noise to the models. Future advancements may focus on better recognizing and processing these attributes common to social media textual data. 