In [1]:
!pip install twython

Collecting twython
  Downloading https://files.pythonhosted.org/packages/24/80/579b96dfaa9b536efde883d4f0df7ea2598a6f3117a6dd572787f4a2bcfb/twython-3.8.2-py3-none-any.whl
Installing collected packages: twython
Successfully installed twython-3.8.2


In [2]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import twython

In [3]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
%ls

[0m[01;34msample_data[0m/


In [7]:
#Using the Hackernews dataset

df = pd.read_csv('/content/Final_Salty.csv')

In [8]:
#SaltyScore created using Vader, highest salt on top

df.head()

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore
0,1118,23334754,Fuck em,rStar,23331287,False,0.778
1,120,23338465,"Not great, not terrible",laretluval,23334339,False,0.731
2,1353,23320362,Risk aversion and capital?,eximius,23319848,False,0.714
3,1265,23335878,AMP is a terrifying solution to an awful problem.,tobyhinloopen,23322730,False,0.599
4,2595,23329336,launch aborted :(,tosh,23322948,False,0.592


In [9]:
#Use vader to create a column for classification

def analyze_sentiment_vader_lexicon(review, threshold=0.1,
                                    verbose=False):
  # analyze the sentiment for review
  analyzer = SentimentIntensityAnalyzer()
  scores = analyzer.polarity_scores(review)
  # get aggregate scores and final sentiment
  agg_score = scores['compound']
  if agg_score >= threshold:
    final_sentiment = 'positive'
  elif agg_score <= -threshold:
    final_sentiment = 'negative'
  else:
    final_sentiment = 'neutral'

  if verbose:
    # display detailed sentiment statistics
    positive = str(round(scores['pos'], 2)*100)+'%'
    final = round(agg_score, 2)
    negative = str(round(scores['neg'], 2)*100)+'%'
    neutral = str(round(scores['neu'], 2)*100)+'%'
    sentiment_frame = pd.DataFrame([[final_sentiment, final, positive,
                                      negative, neutral]], columns=pd.MultiIndex(levels=[['SENTIMENT STATS:'],['Predicted Sentiment', 'Polarity Score','Positive', 'Negative', 'Neutral']],codes=[[0,0,0,0,0],[0,1,2,3,4]]))
    print(sentiment_frame)
  return final_sentiment

In [10]:
df['sentiment'] = df['Comment'].apply(analyze_sentiment_vader_lexicon)

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore,sentiment
0,1118,23334754,Fuck em,rStar,23331287,False,0.778,negative
1,120,23338465,"Not great, not terrible",laretluval,23334339,False,0.731,negative
2,1353,23320362,Risk aversion and capital?,eximius,23319848,False,0.714,negative
3,1265,23335878,AMP is a terrifying solution to an awful problem.,tobyhinloopen,23322730,False,0.599,negative
4,2595,23329336,launch aborted :(,tosh,23322948,False,0.592,negative


In [12]:
#may need to adjust threshold

df['sentiment'].value_counts()

positive    1765
negative     729
neutral      568
Name: sentiment, dtype: int64

In [13]:
counter = Counter(df['sentiment'].tolist())
sent_rank = {i[0]: idx for idx, i in enumerate(counter.most_common(3))}
df = df[df['sentiment'].map(lambda x: x in sent_rank)]

In [14]:
sent_rank

{'negative': 1, 'neutral': 2, 'positive': 0}

In [15]:
comment_list = df['Comment'].tolist()
sent_list = [sent_rank[i] for i in df['sentiment'].tolist()]
sent_list = np.array(sent_list)

In [16]:
sent_list

array([1, 1, 1, ..., 0, 0, 0])

In [17]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(comment_list)

In [18]:
x_train_counts.shape

(3062, 15376)

In [19]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [20]:
x_train_tfidf.shape

(3062, 15376)

In [21]:
x_train_tfidf.shape

(3062, 15376)

In [22]:
sent_list.shape, x_train_tfidf.shape

((3062,), (3062, 15376))

In [23]:
#x_train_tfidf = x_train_tfidf.reshape(x_train_tfidf.shape[1:])

In [24]:
X_train, X_test, y_train, y_test = train_test_split(x_train_tfidf, sent_list, test_size=0.3, random_state=0)

In [25]:
df.shape

(3062, 8)

In [26]:
clf = MultinomialNB().fit(X_train, y_train)

In [27]:
y_score = clf.predict(X_test)

In [28]:
n_right = 0
for i in range(len(y_score)):
  if y_score[i] == y_test[i]:
    n_right += 1

In [29]:
#Threshold .1 is the best accuracy so far.
#Classes are imbalanced

#print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

In [30]:
df_neg = df[df['sentiment']=='negative'].sample(500)


In [31]:
df_pos = df[df['sentiment']=='positive'].sample(500)

In [32]:
df_neu = df[df['sentiment']=='neutral'].sample(500)

In [33]:
frames = [df_pos, df_neg, df_neu]

df_samp = pd.concat(frames)

In [34]:
#balanced classes
df_samp

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore,sentiment
748,2551,23319606,What are the chances that there is no rational...,KKKKkkkk1,23318131,False,0.089,positive
1909,2824,23321785,Slightly off-topic but it&#x27;s interesting h...,t0astbread,23321448,False,0.000,positive
1785,2764,23336307,Woah this works really well! I love these hand...,gitgud,23328081,False,0.000,positive
1692,468,23338831,Anyone can already take your job.<p>Since ther...,ravenstine,23337857,False,0.012,positive
2906,1517,23325424,There are already some novel applications of t...,cl0rkster,23322321,False,0.000,positive
...,...,...,...,...,...,...,...,...
2798,269,23338475,"<a href=""http:&#x2F;&#x2F;www.usbmadesimple.co...",Koshkin,23335072,False,0.000,neutral
492,2124,23325872,Of course. Analogous to how your closet is a l...,rdiddly,23324225,False,0.116,neutral
298,2673,23324208,This is essentially the Andrew Wakefield fiasc...,pjc50,23322658,False,0.153,neutral
2912,1508,23328617,Much harder to discover new foods via a comput...,inamberclad,23324147,False,0.000,neutral


In [35]:
#analyze new df
df_samp['sentiment'] = df_samp['Comment'].apply(analyze_sentiment_vader_lexicon)

In [36]:
counter = Counter(df_samp['sentiment'].tolist())
sent_rank = {i[0]: idx for idx, i in enumerate(counter.most_common(3))}
df_samp = df_samp[df_samp['sentiment'].map(lambda x: x in sent_rank)]

In [37]:
comment_list = df_samp['Comment'].tolist()
sent_list = [sent_rank[i] for i in df_samp['sentiment'].tolist()]
sent_list = np.array(sent_list)

In [38]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(comment_list)

In [39]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(x_train_tfidf, sent_list, test_size=0.3, random_state=0)

In [41]:
clf = MultinomialNB().fit(X_train, y_train)

In [42]:
y_score = clf.predict(X_test)

In [43]:
n_right = 0
for i in range(len(y_score)):
  if y_score[i] == y_test[i]:
    n_right += 1

In [44]:
#That didn't help at all

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 54.67%


In [45]:
#Convert Salty to int for classification
df['SaltyScore'] = df['SaltyScore'] *100
df['SaltyScore'] = df['SaltyScore'].astype(int)

In [46]:
#going to bin salty score values

cut_labels = ['pos', 'neu', 'neg']
cut_bins = [0, 33, 67, 100]
df['bin'] = pd.cut(df['SaltyScore'], bins=cut_bins, labels=cut_labels)

In [47]:
df

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore,sentiment,bin
0,1118,23334754,Fuck em,rStar,23331287,False,77,negative,neg
1,120,23338465,"Not great, not terrible",laretluval,23334339,False,73,negative,neg
2,1353,23320362,Risk aversion and capital?,eximius,23319848,False,71,negative,neg
3,1265,23335878,AMP is a terrifying solution to an awful problem.,tobyhinloopen,23322730,False,59,negative,neu
4,2595,23329336,launch aborted :(,tosh,23322948,False,59,negative,neu
...,...,...,...,...,...,...,...,...,...
3057,1722,23338836,I’m strongly thinking of just migrating to Ubu...,samgranieri,23336255,False,0,positive,
3058,1720,23337471,Proper ARM SBCL with threads yay,ivan4th,23336255,False,0,positive,
3059,1719,23336865,Finally! Does this mean ArchLinuxARM (ALARM) w...,qalmakka,23336255,False,0,positive,
3060,1718,23338715,Is there a noticeable performance benefit with...,mgamache,23336255,False,0,positive,


In [48]:
df['bin'] = df['bin'].fillna('neg')

In [49]:
df['bin'] = df['Comment'].apply(analyze_sentiment_vader_lexicon)

In [50]:
#df = pd.DataFrame.drop(df, columns=['sentiment','SaltyScore','Deleted', 'StoryId', 'Unnamed: 0', 'Comment_ID'])
df = pd.DataFrame.drop(df, columns=['SaltyScore','Deleted', 'StoryId', 'Unnamed: 0', 'Comment_ID'])

In [51]:
df

Unnamed: 0,Comment,UserName,sentiment,bin
0,Fuck em,rStar,negative,negative
1,"Not great, not terrible",laretluval,negative,negative
2,Risk aversion and capital?,eximius,negative,negative
3,AMP is a terrifying solution to an awful problem.,tobyhinloopen,negative,negative
4,launch aborted :(,tosh,negative,negative
...,...,...,...,...
3057,I’m strongly thinking of just migrating to Ubu...,samgranieri,positive,positive
3058,Proper ARM SBCL with threads yay,ivan4th,positive,positive
3059,Finally! Does this mean ArchLinuxARM (ALARM) w...,qalmakka,positive,positive
3060,Is there a noticeable performance benefit with...,mgamache,positive,positive


In [52]:
counter = Counter(df['bin'].tolist())
salt_rank = {i[0]: idx for idx, i in enumerate(counter.most_common(3))}
df = df[df['bin'].map(lambda x: x in salt_rank)]

In [53]:
comment_list = df['bin'].tolist()
salt_list = [salt_rank[i] for i in df['bin'].tolist()]
salt_list = np.array(salt_list)

In [54]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(comment_list)

In [55]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(x_train_tfidf, salt_list, test_size=0.3, random_state=0)

In [57]:
clf = MultinomialNB().fit(X_train, y_train)

In [58]:
y_score = clf.predict(X_test)

In [59]:
n_right = 0
for i in range(len(y_score)):
  if y_score[i] == y_test[i]:
    n_right += 1

In [60]:
#probably overfit

print("Accuracy: %.2f%%" % ((n_right/float(len(y_test)) * 100)))

Accuracy: 100.00%


In [61]:
#Experiment with deep learning model

from nltk import word_tokenize
from collections import defaultdict

In [62]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [63]:
def count_top_x_words(corpus, top_x, skip_top_n):
  count = defaultdict(lambda: 0)
  for c in corpus:
    for w in word_tokenize(c):
      count[w] += 1
  count_tuples = sorted([(w,c) for w, c in count.items()], key=lambda x: x[1], reverse=True)
  return [i[0] for i in count_tuples[skip_top_n: skip_top_n + top_x]]

In [64]:
def replace_top_x_words_with_vectors(corpus, top_x):
  topx_dict = {top_x[i]: i for i in range(len(top_x))}
  return [
          [topx_dict[w] for w in word_tokenize(s) if w in topx_dict]
          for s in corpus
  ], topx_dict

In [65]:
def filter_to_top_x(corpus, n_top, skip_top_n=0):
  top_x = count_top_x_words(corpus, n_top, skip_top_n)
  return replace_top_x_words_with_vectors(corpus, top_x)

In [66]:
!pip install lib

Collecting lib
  Downloading https://files.pythonhosted.org/packages/5c/15/f8ba504146fbe6f1bcb79786fd9b9cb45e5e3ff4c4536d1418d83fdd143c/lib-3.0.0.tar.gz
Building wheels for collected packages: lib
  Building wheel for lib (setup.py) ... [?25l[?25hdone
  Created wheel for lib: filename=lib-3.0.0-cp36-none-any.whl size=3213 sha256=0c94c75cd4670cb3e81bcef20a197cd498263590139fef95ea3d0544238b559c
  Stored in directory: /root/.cache/pip/wheels/52/de/da/0ef2064c4b4a213d27518228ebd79a2349e1b81341b05f7196
Successfully built lib
Installing collected packages: lib
Successfully installed lib-3.0.0


In [67]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import to_categorical
import pandas as pd
from collections import Counter
import lib

In [68]:
#Using the Hackernews dataset

df = pd.read_csv('/content/Final_Salty.csv')

In [69]:
df

Unnamed: 0.1,Unnamed: 0,Comment_ID,Comment,UserName,StoryId,Deleted,SaltyScore
0,1118,23334754,Fuck em,rStar,23331287,False,0.778
1,120,23338465,"Not great, not terrible",laretluval,23334339,False,0.731
2,1353,23320362,Risk aversion and capital?,eximius,23319848,False,0.714
3,1265,23335878,AMP is a terrifying solution to an awful problem.,tobyhinloopen,23322730,False,0.599
4,2595,23329336,launch aborted :(,tosh,23322948,False,0.592
...,...,...,...,...,...,...,...
3057,1722,23338836,I’m strongly thinking of just migrating to Ubu...,samgranieri,23336255,False,0.000
3058,1720,23337471,Proper ARM SBCL with threads yay,ivan4th,23336255,False,0.000
3059,1719,23336865,Finally! Does this mean ArchLinuxARM (ALARM) w...,qalmakka,23336255,False,0.000
3060,1718,23338715,Is there a noticeable performance benefit with...,mgamache,23336255,False,0.000


In [70]:
df['sentiment'] = df['Comment'].apply(analyze_sentiment_vader_lexicon)

In [71]:
counter = Counter(df['sentiment'].tolist())
top_sent = {i[0]: idx for idx, i in enumerate(counter.most_common(3))}
df = df[df['sentiment'].map(lambda x: x in top_sent)]

In [72]:
comment_list = df['Comment'].tolist()
mapped_list, word_list = filter_to_top_x(comment_list, 2500, 10)
comment_list_o = [top_sent[i] for i in df['sentiment'].tolist()]
comment_list = to_categorical(comment_list_o)

max_comment_length = 150

mapped_list = sequence.pad_sequences(mapped_list, maxlen=max_comment_length)
train_x, test_x, train_y, test_y = train_test_split(mapped_list, comment_list, test_size=0.3)

In [73]:
max_review_length = 150

embedding_vector_length = 64
model = Sequential()

In [74]:
model.add(Embedding(2500, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(50, 5))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(max(comment_list_o) + 1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, epochs=25, batch_size=64)

y_score = model.predict(test_x)
y_score = [[1 if i == max(sc) else 0 for i in sc] for sc in y_score]
n_right = 0
for i in range(len(y_score)):
    if all(y_score[i][j] == test_y[i][j] for j in range(len(y_score[i]))):
        n_right += 1
        

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
