In [4]:
## import necessary modules
import pandas as pd
import os
import csv
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn import svm # support vector machine
from sklearn import metrics # for accuracy/ precision
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier # Stochastic Gradient Descent
from sklearn.neighbors import KNeighborsClassifier # k-NN ensemble method
from sklearn.ensemble import RandomForestClassifier 

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
import sklearn # machine learning
from sklearn.model_selection import train_test_split # splitting up data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Project root: every relative 'data/...' path below is resolved against it.
path = '/Users/andiedonovan/myProjects/Youtube_Python_Project/AndiesBranch/'
os.chdir(path)

# ---- load in data ----
# All files are read with the python parsing engine and their first two lines skipped.

# training data
okgo = pd.read_csv(
    'data/OKGOcomments.csv',
    delimiter=";",
    skiprows=2,
    encoding='latin-1',
    engine='python',
)
blogs = pd.read_csv(
    'data/Kagel_social_media_blogs.csv',
    delimiter="@@@",
    skiprows=2,
    encoding='latin-1',
    engine='python',
)
tweets = pd.read_csv(
    'data/full-corpus.csv',
    delimiter=",",
    skiprows=2,
    encoding='latin-1',
    engine='python',
)

# test data
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines`; confirm the pandas version this notebook targets.
trump = pd.read_csv(
    'data/trump.csv',
    delimiter="@@@",
    skiprows=2,
    encoding='utf-8',
    error_bad_lines=False,
    engine='python',
)

# comments to be labelled by the fitted models (becomes X_user further down)
df = pd.read_csv(
    'data/data.csv',
    delimiter="@@@",
    skiprows=2,
    encoding='utf-8',
    engine='python',
)

# ---- clean dataframes ----

# tweets: drop the metadata columns and incomplete rows, then turn the
# sentiment words into numeric scores (anything unparseable becomes NaN).
tweets = tweets.drop(columns=['Topic', 'TweetId', "TweetDate"]).dropna()
tweets.columns = ["label", "comment"]
tweets['label'] = pd.to_numeric(
    tweets['label'].replace(
        {'positive': '1.0', 'negative':'-1.0', 'neutral': '0.0', 'irrelevant': '0.0'},
        regex=True,
    ),
    errors='coerce',
)

# blogs: two columns, label coerced to a number
blogs.columns = ["label", "comment"]
blogs['label'] = pd.to_numeric(blogs['label'], errors='coerce')

# okgo: name the four columns, keep the first two, drop rows with missing values
okgo.columns = ['label', 'comment', 'a', 'b']
okgo = okgo.drop(columns=['a', 'b']).dropna()

# Stack the three training sources. ignore_index=False keeps each source's own
# row labels, so labels can repeat in `data`.
data = pd.concat([okgo, blogs, tweets], ignore_index=False)

# Note the column order differs between the two remaining frames.
df.columns = ["comment", "label"]
trump.columns = ["label", "comment"]

# clean up textual data: make sure every comment is a str before the regex pass
df["comment"] = df["comment"].astype(str)
trump["comment"] = trump["comment"].astype(str)

def cleanerFn(b):
    """Replace every non-letter character in ``b["comment"]`` with a space.

    The frame is modified in place and nothing is returned. The column is
    rewritten in a single assignment, so the result does not depend on the
    frame's index labels being 0..n-1.

    Parameters
    ----------
    b : pandas.DataFrame
        Frame with a ``"comment"`` column whose values are strings.

    Raises
    ------
    TypeError
        If a comment is not a string (raised by the regex substitution).
    """
    non_letters = re.compile("[^a-zA-Z]")
    b["comment"] = [non_letters.sub(" ", line) for line in b["comment"]]
        
def cleanerFn2(b):
    """Blank out non-letter characters in the second column of ``b``.

    Rows and the column are addressed by position (column position 1), so
    the frame's index labels and column names are never consulted. Each
    cell is rewritten in place; nothing is returned.
    """
    text_col = 1
    for pos in range(b.shape[0]):
        raw = b.iloc[pos, text_col]
        b.iloc[pos, text_col] = re.sub("[^a-zA-Z]", " ", raw)

# Shared NLP helpers; sw, lemmatizer and ps are read by nlpFunction.
sw = stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
ps = PorterStemmer()
tfidf = TfidfVectorizer()

# Strip non-letter characters from the comments of all three frames.
# Both helpers edit the frame they are given in place.
cleanerFn(df)       # addresses the "comment" column by name
cleanerFn2(data)    # addresses the second column by position
cleanerFn2(trump)

def nlpFunction(a):
    """Derive token, stop-word-filtered, lemma and stem columns from ``a["comment"]``.

    The columns are added to ``a`` itself and the same frame is returned:

    * ``com_token``    - lower-cased comment split on whitespace
    * ``com_remv``     - tokens that are not in the module-level ``sw`` list
    * ``com_lemma``    - ``lemmatizer.lemmatize`` applied to each kept token
    * ``com_stem``     - ``ps.stem`` applied to each lemma
    * ``com_stem_str`` - the stems joined with ``', '``

    Parameters
    ----------
    a : pandas.DataFrame
        Frame with a string ``"comment"`` column.

    Returns
    -------
    pandas.DataFrame
        ``a``, with the five columns above added.
    """
    # Build the lookup once: membership in a set avoids scanning the whole
    # stop-word list for every token.
    stop_words = set(sw)

    a['com_token'] = a['comment'].str.lower().str.split()
    a['com_remv'] = a['com_token'].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_words]
    )
    a['com_lemma'] = a['com_remv'].apply(
        lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens]
    )  # lemmatization
    a['com_stem'] = a['com_lemma'].apply(
        lambda tokens: [ps.stem(tok) for tok in tokens]
    )  # stemming
    a['com_stem_str'] = a['com_stem'].apply(', '.join)
    return a

# Add the token / lemma / stem columns to all three frames.
df, data, trump = nlpFunction(df), nlpFunction(data), nlpFunction(trump)

# Features are the joined stems; targets are the numeric labels.
X_train, Y_train = data["com_stem_str"], data["label"]
X_test, Y_test = trump["com_stem_str"], trump["label"]
X_user = df["com_stem_str"]

# The vocabulary is learned from the training comments only; every other
# set is projected onto that fitted vocabulary.
tfidf = TfidfVectorizer()
xtrain = tfidf.fit_transform(X_train)
xtest = tfidf.transform(X_test)
xuser = tfidf.transform(X_user)
data_trans = tfidf.transform(data["com_stem_str"])  # entire training set, for cross validation
df_trans = tfidf.transform(df["com_stem_str"])

# running models
from sklearn.svm import SVC

rs = 10  # random_state shared by the estimators that accept one
lr = LogisticRegression(solver='sag', max_iter=100, random_state=rs, multi_class="multinomial")
mnb = MultinomialNB()
# Instantiate through the SVC import above rather than `svm.SVC()`: the
# variable `svm` rebinds the name of the imported `svm` module, so reading the
# class off that name fails once the name refers to an estimator.
svm = SVC()
rf = RandomForestClassifier(n_estimators=10, random_state=rs)
knn = KNeighborsClassifier()

# Short model names drive the column names ('label_<name>') and the keys of
# the prediction dictionary ('<name>_predict').
models = ['lr', 'mnb', 'svm', 'rf', 'knn']
labels = ['label_' + name for name in models]
predictions = [name + "_predict" for name in models]
initModels = [lr, mnb, svm, rf, knn]

# Fit every model on the training matrix and store its predictions for the
# user comments under the matching '<name>_predict' key.
d = {}
for prediction_key, estimator in zip(predictions, initModels):
    estimator.fit(xtrain, Y_train)
    d[prediction_key] = estimator.predict(xuser)

# Create table holding each model's predicted label for every user comment
Table = pd.DataFrame(
    columns=['comment', 'label_lr', 'label_mnb', 'label_svm', 'label_rf', 'label_knn']
)
# One column of predicted labels per model, taken from the prediction dict.
for label_col, prediction_key in zip(labels, predictions):
    Table[label_col] = d[prediction_key]
Table["comment"] = df["comment"]

# Empty 3-row frame (one column per model) to be filled with sentiment percentages
Ratios = pd.DataFrame(
    columns=['label_lr', 'label_mnb', 'label_svm', 'label_rf', 'label_knn'],
    index=range(3),
)
def RatioFinder(model, table=None):
    """Percentage of rows one model labelled positive, neutral and negative.

    Parameters
    ----------
    model : str
        Name of the column holding the model's predictions (e.g. ``'label_lr'``).
    table : pandas.DataFrame, optional
        Frame to read ``model`` from. Defaults to the module-level ``Table``.

    Returns
    -------
    list of float
        ``[positive, neutral, negative]`` as whole-number percentages.
        Only rows labelled 1.0, 0.0 or -1.0 count towards the total; if there
        are none, ``[0.0, 0.0, 0.0]`` is returned instead of dividing by zero.
    """
    if table is None:
        table = Table
    column = table[model]

    # Counts in the order the result is reported: positive, neutral, negative.
    counts = [int((column == value).sum()) for value in (1.0, 0.0, -1.0)]
    total = sum(counts)
    if total == 0:
        return [0.0, 0.0, 0.0]

    # Scale to percent before rounding; rounding the fraction and multiplying
    # by 100 afterwards leaves float noise such as 14.000000000000002.
    return [float(round(100.0 * count / total)) for count in counts]

# Fill one column of Ratios per model. RatioFinder returns
# [positive, neutral, negative], which land in rows 0, 1 and 2; it is
# evaluated once per model rather than once per cell.
for j, label_col in enumerate(labels):
    for i, ratio in enumerate(RatioFinder(label_col)):
        Ratios.iloc[i, j] = ratio

# Row-wise mean of the per-model percentages
all_models = pd.DataFrame(columns=['average'], index=range(0,3))
all_models["average"] = Ratios.mean(axis=1)

# set the prediction to the mode of the row (majority vote across the models).
# DataFrame.mode(axis=1) returns one column per tied mode, so only the first
# column is kept: ties resolve to the smallest tied label and the result is
# always a single value per row.
vote_cols = ['label_lr','label_mnb','label_svm','label_rf','label_knn']
Table["Prediction"] = Table[vote_cols].mode(axis=1).iloc[:, 0]
df["label"] = Table["Prediction"]

# extracting comments for each label: flatten each token list into one
# space-separated string with commas removed. Values that are already strings
# are left alone so running this step again does not split them into characters.
df["com_remv"] = df["com_remv"].apply(
    lambda tokens: tokens if isinstance(tokens, str)
    else ' '.join(tok.replace(",", "") for tok in tokens)
).astype(str)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andiedonovan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Keep only the predicted label and the stop-word-filtered text, then split
# the rows into one frame per sentiment value.
df_words = df.loc[:, ["label", "com_remv"]]
positive, neutral, negative = (
    df_words[df_words["label"] == score] for score in (1.0, 0.0, -1.0)
)


In [12]:
# com_remv text of the comments labelled positive (a Series, not a frame)
positive = df.loc[df["label"] == 1, "com_remv"]
# Pool all words, count them, and keep the ten most common.
most_freq_pos = pd.Series(' '.join(positive).lower().split()).value_counts().iloc[:10]

In [13]:
# Display the ten most frequent words among the comments labelled positive.
most_freq_pos

germany     8
brazil      6
love        5
best        4
amazing     3
great       3
team        3
football    3
day         3
good        2
dtype: int64

In [None]:
# most frequent words in each label
def top_words(comments, n=10):
    """Return the ``n`` most frequent lower-cased words in ``comments``.

    ``comments`` may be a Series of strings or a frame with a ``com_remv``
    column. Joining a frame directly iterates its column names, not its
    rows, so the text column is selected first.
    """
    if isinstance(comments, pd.DataFrame):
        comments = comments["com_remv"]
    return pd.Series(' '.join(comments).lower().split()).value_counts()[:n]

most_freq_pos = top_words(positive)
most_freq_neg = top_words(negative)
most_freq_neu = top_words(neutral)

In [5]:
# Preview the first three rows: one predicted label per model plus the row-wise mode.
Table.head(3)

Unnamed: 0,comment,label_lr,label_mnb,label_svm,label_rf,label_knn,Prediction
0,Roses are Red,0.0,0.0,0.0,0.0,0.0,0.0
1,Violets are Blue,0.0,0.0,0.0,0.0,0.0,0.0
2,I was so happy,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Per-model percentages; RatioFinder returns [positive, neutral, negative],
# which fill rows 0, 1 and 2 respectively.
Ratios.head(3)

Unnamed: 0,label_lr,label_mnb,label_svm,label_rf,label_knn
0,14,25,0,14,4
1,85,75,100,84,94
2,1,0,0,3,2


In [7]:
# NOTE(review): the saved output under this cell lists 'com_remv' and 'label'
# with a count of 1 each, i.e. column names rather than comment words. That
# suggests `positive` was still a two-column frame when this ran; confirm by
# re-running after the cell that selects the com_remv column.
most_freq_pos

com_remv    1
label       1
dtype: int64

In [14]:
# Preview the training frame with the token / lemma / stem columns added by nlpFunction.
data.head(3)

Unnamed: 0,label,comment,com_token,com_remv,com_lemma,com_stem,com_stem_str
0,-1.0,Everyone knows brand s papers from But No on...,"[everyone, knows, brand, s, papers, from, but,...","[everyone, knows, brand, papers, one, knows, w...","[everyone, know, brand, paper, one, know, welf...","[everyon, know, brand, paper, one, know, welfa...","everyon, know, brand, paper, one, know, welfar..."
1,0.0,Your paper cut balance is,"[your, paper, cut, balance, is]","[paper, cut, balance]","[paper, cut, balance]","[paper, cut, balanc]","paper, cut, balanc"
2,1.0,OH SHIT WHEN I SAW THIS ON MY FRONT PAGE ...,"[oh, shit, when, i, saw, this, on, my, front, ...","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","[oh, shit, saw, front, page, love, song]","oh, shit, saw, front, page, love, song"


In [15]:
# Preview the user-comment frame; com_remv is a plain string here because it
# was joined into text after the predictions were assigned.
df.head(3)

Unnamed: 0,comment,label,com_token,com_remv,com_lemma,com_stem,com_stem_str
0,Roses are Red,0.0,"[roses, are, red]",roses red,"[rose, red]","[rose, red]","rose, red"
1,Violets are Blue,0.0,"[violets, are, blue]",violets blue,"[violet, blue]","[violet, blue]","violet, blue"
2,I was so happy,0.0,"[i, was, so, happy]",happy,[happy],[happi],happi


In [18]:
# Name of the model column to inspect; the result is that model's
# [positive, neutral, negative] percentages as a plain list.
Manager = "label_lr"
list(Ratios.loc[:, str(Manager)])

[14.000000000000002, 85.0, 1.0]