In [324]:
!pip install twython



In [0]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import twython

In [326]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [327]:
%ls

Final_Salty.csv  [0m[01;34msample_data[0m/


In [0]:
#Using the Hackernews dataset

df = pd.read_csv('/content/Final_Salty.csv')

In [329]:
#SaltyScore created using Vader, highest salt on top

df.head()

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore
0,1118,23334754,Fuck em,rStar,23331287,False,0.778
1,120,23338465,"Not great, not terrible",laretluval,23334339,False,0.731
2,1353,23320362,Risk aversion and capital?,eximius,23319848,False,0.714
3,1265,23335878,AMP is a terrifying solution to an awful problem.,tobyhinloopen,23322730,False,0.599
4,2595,23329336,launch aborted :(,tosh,23322948,False,0.592


In [0]:
#Use vader to create a column for classification

def analyze_sentiment_vader_lexicon(review, threshold=0.1,
                                    verbose=False):
  # analyze the sentiment for review
  analyzer = SentimentIntensityAnalyzer()
  scores = analyzer.polarity_scores(review)
  # get aggregate scores and final sentiment
  agg_score = scores['compound']
  if agg_score >= threshold:
    final_sentiment = 'positive'
  elif agg_score <= -threshold:
    final_sentiment = 'negative'
  else:
    final_sentiment = 'neutral'

  if verbose:
    # display detailed sentiment statistics
    positive = str(round(scores['pos'], 2)*100)+'%'
    final = round(agg_score, 2)
    negative = str(round(scores['neg'], 2)*100)+'%'
    neutral = str(round(scores['neu'], 2)*100)+'%'
    sentiment_frame = pd.DataFrame([[final_sentiment, final, positive,
                                      negative, neutral]], columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'],['Predicted Sentiment', 'Polarity Score','Positive', 'Negative', 'Neutral']],codes=[[0,0,0,0,0],[0,1,2,3,4]]))
    print(sentiment_frame)
  return final_sentiment

In [0]:
df['sentiment'] = df['Comment'].apply(analyze_sentiment_vader_lexicon)

In [332]:
df.head()

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore,sentiment
0,1118,23334754,Fuck em,rStar,23331287,False,0.778,negative
1,120,23338465,"Not great, not terrible",laretluval,23334339,False,0.731,negative
2,1353,23320362,Risk aversion and capital?,eximius,23319848,False,0.714,negative
3,1265,23335878,AMP is a terrifying solution to an awful problem.,tobyhinloopen,23322730,False,0.599,negative
4,2595,23329336,launch aborted :(,tosh,23322948,False,0.592,negative


In [333]:
#may need to adjust threshold

df['sentiment'].value_counts()

positive    1765
negative     729
neutral      568
Name: sentiment, dtype: int64

In [0]:
counter = Counter(df['sentiment'].tolist())
sent_rank = {i[0]: idx for idx, i in enumerate(counter.most_common(3))}
df = df[df['sentiment'].map(lambda x: x in sent_rank)]

In [335]:
sent_rank

{'negative': 1, 'neutral': 2, 'positive': 0}

In [0]:
comment_list = df['Comment'].tolist()
sent_list = [sent_rank[i] for i in df['sentiment'].tolist()]
sent_list = np.array(sent_list)

In [337]:
sent_list

array([1, 1, 1, ..., 0, 0, 0])

In [0]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(comment_list)

In [339]:
x_train_counts.shape

(3062, 15376)

In [0]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [341]:
x_train_tfidf.shape

(3062, 15376)

In [342]:
x_train_tfidf.shape

(3062, 15376)

In [343]:
sent_list.shape, x_train_tfidf.shape

((3062,), (3062, 15376))

In [0]:
#x_train_tfidf = x_train_tfidf.reshape(x_train_tfidf.shape[1:])

In [0]:
X_train, X_test, y_train, y_test = train_test_split(x_train_tfidf, sent_list, test_size=0.3, random_state=0)

In [346]:
df.shape

(3062, 8)

In [0]:
clf = MultinomialNB().fit(X_train, y_train)

In [0]:
y_score = clf.predict(X_test)

In [0]:
n_right = 0
for i in range(len(y_score)):
  if y_score[i] == y_test[i]:
    n_right += 1

In [0]:
#Threshold .1 is the best accuracy so far.
#Classes are imbalanced

#print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

In [0]:
df_neg = df[df['sentiment']=='negative'].sample(500)


In [0]:
df_pos = df[df['sentiment']=='positive'].sample(500)

In [0]:
df_neu = df[df['sentiment']=='neutral'].sample(500)

In [0]:
frames = [df_pos, df_neg, df_neu]

df_samp = pd.concat(frames)

In [355]:
#balanced classes
df_samp

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore,sentiment
1471,443,23338486,The bigger issue right now is the total crater...,gexla,23337857,False,0.033,positive
1334,2703,23334549,Any virtual environment will take precedence o...,joshvm,23331698,False,0.044,positive
1658,434,23338897,"&gt; Now, you can either hire someone from San...",bachmeier,23337857,False,0.017,positive
1524,1560,23333515,One approach I like for this sort of thing is ...,jakear,23331499,False,0.029,positive
2501,649,23338306,Text editors as display engines in not entirel...,acomjean,23334898,False,0.000,positive
...,...,...,...,...,...,...,...,...
629,1091,23336682,Posts critical of Microsft usually draw a set ...,pwdisswordfish2,23331287,False,0.102,neutral
1147,1432,23324501,I have had so many bad experiences with instac...,monadic2,23324147,False,0.055,neutral
1733,2390,23332105,"Twitter must have automation for this, since t...",Animats,23322112,False,0.000,neutral
1863,3275,23330595,"Comments moved to <a href=""https:&#x2F;&#x2F;n...",dang,23329515,False,0.000,neutral


In [0]:
#analyze new df
df_samp['sentiment'] = df_samp['Comment'].apply(analyze_sentiment_vader_lexicon)

In [0]:
counter = Counter(df_samp['sentiment'].tolist())
sent_rank = {i[0]: idx for idx, i in enumerate(counter.most_common(3))}
df_samp = df_samp[df_samp['sentiment'].map(lambda x: x in sent_rank)]

In [0]:
comment_list = df_samp['Comment'].tolist()
sent_list = [sent_rank[i] for i in df_samp['sentiment'].tolist()]
sent_list = np.array(sent_list)

In [0]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(comment_list)

In [0]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(x_train_tfidf, sent_list, test_size=0.3, random_state=0)

In [0]:
clf = MultinomialNB().fit(X_train, y_train)

In [0]:
y_score = clf.predict(X_test)

In [0]:
n_right = 0
for i in range(len(y_score)):
  if y_score[i] == y_test[i]:
    n_right += 1

In [365]:
#That didn't help at all

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 48.44%


In [0]:
#Convert Salty to int for classification
df['SaltyScore'] = df['SaltyScore'] *100
df['SaltyScore'] = df['SaltyScore'].astype(int)

In [0]:
#going to bin salty score values

cut_labels = ['pos', 'neu', 'neg']
cut_bins = [0, 33, 67, 100]
df['bin'] = pd.cut(df['SaltyScore'], bins=cut_bins, labels=cut_labels)

In [368]:
df

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore,sentiment,bin
0,1118,23334754,Fuck em,rStar,23331287,False,77,negative,neg
1,120,23338465,"Not great, not terrible",laretluval,23334339,False,73,negative,neg
2,1353,23320362,Risk aversion and capital?,eximius,23319848,False,71,negative,neg
3,1265,23335878,AMP is a terrifying solution to an awful problem.,tobyhinloopen,23322730,False,59,negative,neu
4,2595,23329336,launch aborted :(,tosh,23322948,False,59,negative,neu
...,...,...,...,...,...,...,...,...,...
3057,1722,23338836,I’m strongly thinking of just migrating to Ubu...,samgranieri,23336255,False,0,positive,
3058,1720,23337471,Proper ARM SBCL with threads yay,ivan4th,23336255,False,0,positive,
3059,1719,23336865,Finally! Does this mean ArchLinuxARM (ALARM) w...,qalmakka,23336255,False,0,positive,
3060,1718,23338715,Is there a noticeable performance benefit with...,mgamache,23336255,False,0,positive,


In [0]:
df['bin'] = df['bin'].fillna('neg')

In [0]:
df['bin'] = df['Comment'].apply(analyze_sentiment_vader_lexicon)

In [0]:
#df = pd.DataFrame.drop(df, columns=['sentiment','SaltyScore','Deleted', 'StoryId', 'Unnamed: 0', 'Comment_ID'])
df = pd.DataFrame.drop(df, columns=['SaltyScore','Deleted', 'StoryId', 'Unnamed: 0', 'Comment_ID'])

In [372]:
df

Unnamed: 0,Comment,UserName,sentiment,bin
0,Fuck em,rStar,negative,negative
1,"Not great, not terrible",laretluval,negative,negative
2,Risk aversion and capital?,eximius,negative,negative
3,AMP is a terrifying solution to an awful problem.,tobyhinloopen,negative,negative
4,launch aborted :(,tosh,negative,negative
...,...,...,...,...
3057,I’m strongly thinking of just migrating to Ubu...,samgranieri,positive,positive
3058,Proper ARM SBCL with threads yay,ivan4th,positive,positive
3059,Finally! Does this mean ArchLinuxARM (ALARM) w...,qalmakka,positive,positive
3060,Is there a noticeable performance benefit with...,mgamache,positive,positive


In [0]:
counter = Counter(df['bin'].tolist())
salt_rank = {i[0]: idx for idx, i in enumerate(counter.most_common(3))}
df = df[df['bin'].map(lambda x: x in salt_rank)]

In [0]:
comment_list = df['bin'].tolist()
salt_list = [salt_rank[i] for i in df['bin'].tolist()]
salt_list = np.array(salt_list)

In [0]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(comment_list)

In [0]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(x_train_tfidf, salt_list, test_size=0.3, random_state=0)

In [0]:
clf = MultinomialNB().fit(X_train, y_train)

In [0]:
y_score = clf.predict(X_test)

In [0]:
n_right = 0
for i in range(len(y_score)):
  if y_score[i] == y_test[i]:
    n_right += 1

In [381]:
#probably overfit

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 100.00%


In [0]:
#Experiment with deep learning model

from nltk import word_tokenize
from collections import defaultdict

In [383]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def count_top_x_words(corpus, top_x, skip_top_n):
  count = defaultdict(lambda: 0)
  for c in corpus:
    for w in word_tokenize(c):
      count[w] += 1
  count_tuples = sorted([(w,c) for w, c in count.items()], key=lambda x: x[1], reverse=True)
  return [i[0] for i in count_tuples[skip_top_n: skip_top_n + top_x]]

In [0]:
def replace_top_x_words_with_vectors(corpus, top_x):
  topx_dict = {top_x[i]: i for i in range(len(top_x))}
  return [
          [topx_dict[w] for w in word_tokenize(s) if w in topx_dict]
          for s in corpus
  ], topx_dict

In [0]:
def filter_to_top_x(corpus, n_top, skip_top_n=0):
  top_x = count_top_x_words(corpus, n_top, skip_top_n)
  return replace_top_x_words_with_vectors(corpus, top_x)

In [387]:
!pip install lib



In [0]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
import pandas as pd
from collections import Counter
import lib

In [0]:
#Using the Hackernews dataset

df = pd.read_csv('/content/Final_Salty.csv')

In [390]:
df

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore
0,1118,23334754,Fuck em,rStar,23331287,False,0.778
1,120,23338465,"Not great, not terrible",laretluval,23334339,False,0.731
2,1353,23320362,Risk aversion and capital?,eximius,23319848,False,0.714
3,1265,23335878,AMP is a terrifying solution to an awful problem.,tobyhinloopen,23322730,False,0.599
4,2595,23329336,launch aborted :(,tosh,23322948,False,0.592
...,...,...,...,...,...,...,...
3057,1722,23338836,I’m strongly thinking of just migrating to Ubu...,samgranieri,23336255,False,0.000
3058,1720,23337471,Proper ARM SBCL with threads yay,ivan4th,23336255,False,0.000
3059,1719,23336865,Finally! Does this mean ArchLinuxARM (ALARM) w...,qalmakka,23336255,False,0.000
3060,1718,23338715,Is there a noticeable performance benefit with...,mgamache,23336255,False,0.000


In [0]:
df['sentiment'] = df['Comment'].apply(analyze_sentiment_vader_lexicon)

In [0]:
counter = Counter(df['sentiment'].tolist())
top_sent = {i[0]: idx for idx, i in enumerate(counter.most_common(3))}
df = df[df['sentiment'].map(lambda x: x in top_sent)]

In [0]:
comment_list = df['Comment'].tolist()
mapped_list, word_list = filter_to_top_x(comment_list, 2500, 10)
comment_list_o = [top_sent[i] for i in df['sentiment'].tolist()]
comment_list = to_categorical(comment_list_o)

max_comment_length = 150

mapped_list = sequence.pad_sequences(mapped_list, maxlen=max_comment_length)
train_x, test_x, train_y, test_y = train_test_split(mapped_list, comment_list, test_size=0.3)

In [0]:
max_review_length = 150

embedding_vector_length = 64
model = Sequential()

In [395]:
model.add(Embedding(2500, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(50, 5))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(max(comment_list_o) + 1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, epochs=5, batch_size=64)

y_score = model.predict(test_x)
y_score = [[1 if i == max(sc) else 0 for i in sc] for sc in y_score]
n_right = 0
for i in range(len(y_score)):
    if all(y_score[i][j] == test_y[i][j] for j in range(len(y_score[i]))):
        n_right += 1
        

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
