In [1]:
import pandas as pd

In [2]:
# if in google colab, run this instead of the one below. 

"""
%%capture
!python -m spacy download en_core_web_lg
import spacy
from spacy.tokenizer import Tokenizer
!pip install vaderSentiment
"""

'\n%%capture\n!python -m spacy download en_core_web_lg\nimport spacy\nfrom spacy.tokenizer import Tokenizer\n!pip install vaderSentiment\n'

In [3]:
%%capture
import spacy
!python -m spacy download en_core_web_lg
from spacy.tokenizer import Tokenizer
!pip install vaderSentiment

**Loading the data**

In [4]:
news = pd.read_csv('https://github.com/EvidenceN/Hacker_News_Trolls/blob/master/top_hacker_authors_dataset/top_hacker_authors.csv?raw=true')
news.head()

Unnamed: 0,author,text,ranking,Record Count
0,jrockway,"Honestly, I would just like to get 3 sizes wit...",20,1
1,maxerickson,I would think the allusion is rooted in huntin...,1,1
2,tim333,"I thought I&#x27;d stick this thing up, partly...",0,1
3,nawitus,"By the way, the e-voting system has multiple s...",2,1
4,waps,"Did you ever know any manager, never mind a hi...",1,1


In [5]:
comment = news['text']
comment[:5]

0    Honestly, I would just like to get 3 sizes wit...
1    I would think the allusion is rooted in huntin...
2    I thought I&#x27;d stick this thing up, partly...
3    By the way, the e-voting system has multiple s...
4    Did you ever know any manager, never mind a hi...
Name: text, dtype: object

In [6]:
author = news['author']
author[:5]

0       jrockway
1    maxerickson
2         tim333
3        nawitus
4           waps
Name: author, dtype: object

**Tokenizing the data**

In [7]:
nlp = spacy.load("en_core_web_lg")

In [8]:
tokenizer = Tokenizer(nlp.vocab)

**Using Vader Sentiment Analysis**

In [12]:
import vaderSentiment

In [13]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [14]:
import vaderSentiment.vaderSentiment as vv

In [15]:
sample = comment[0]
sample

"Honestly, I would just like to get 3 sizes with each shipment, try them all on, and pick the best... but without having to advance them 3x the cost of a pair of shoes.<p>I don't find returning to be a big deal.  You put the shoes back in the box they came in, print a label, and drop it off at the post office.   Still easier than going to a store, and once you know your size for a given brand, you don't have to do this anymore.<p>I've also found that shoe sizes don't really vary all that much, in my experience.  I wear size 11EE boots, 11EE running shoes, 11EE sneakers, and 11EE dress shoes.  I think the problem that a lot of people have is that they order standard-width shoes even though they don't have standard-width feet.  Then the fit comes down to how tight the laces are, etc.<p>But I digress.  Zappos will probably not go out of business if they don't buy this company."

In [16]:
score = SentimentIntensityAnalyzer()

In [17]:
help(score)

Help on SentimentIntensityAnalyzer in module vaderSentiment.vaderSentiment object:

class SentimentIntensityAnalyzer(builtins.object)
 |  SentimentIntensityAnalyzer(lexicon_file='vader_lexicon.txt', emoji_lexicon='emoji_utf8_lexicon.txt')
 |  
 |  Give a sentiment intensity score to sentences.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, lexicon_file='vader_lexicon.txt', emoji_lexicon='emoji_utf8_lexicon.txt')
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  make_emoji_dict(self)
 |      Convert emoji lexicon file to a dictionary
 |  
 |  make_lex_dict(self)
 |      Convert lexicon file to a dictionary
 |  
 |  polarity_scores(self, text)
 |      Return a float for sentiment strength based on the input text.
 |      Positive values are positive valence, negative value are negative
 |      valence.
 |  
 |  score_valence(self, sentiments, text)
 |  
 |  sentiment_valence(self, valence, sentitext, item, i, sentiments)
 |  
 |  --------------------

Rober Notebook https://github.com/BrokenShell/SaltyHacker/blob/master/nlp.py

Vader Documentation

https://pypi.org/project/vaderSentiment/

In [18]:
score.polarity_scores(comment[4])

{'neg': 0.143, 'neu': 0.802, 'pos': 0.055, 'compound': -0.5423}

In [19]:
comment[4]

'Did you ever know any manager, never mind a higher-up manager, that did even 3 of those things ?<p>Management is about money and favours, and about those things alone, and that&#x27;s why work sucks. The only way to get ahead is with threats and horsetrading.'

In [20]:
score.polarity_scores(comment[10])

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [21]:
comment[10]

"Here in South Australia, it's often said (Wikipedia's source page from news.com.au is a 404 now though) that a particular brand of iced coffee out-sells Coke."

In [22]:
help(vv)

Help on module vaderSentiment.vaderSentiment in vaderSentiment:

NAME
    vaderSentiment.vaderSentiment

DESCRIPTION
    If you use the VADER sentiment analysis tools, please cite:
    Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
    Sentiment Analysis of Social Media Text. Eighth International Conference on
    Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

CLASSES
    builtins.object
        SentiText
        SentimentIntensityAnalyzer
    
    class SentiText(builtins.object)
     |  SentiText(text)
     |  
     |  Identify sentiment-relevant string-level properties of input text.
     |  
     |  Methods defined here:
     |  
     |  __init__(self, text)
     |      Initialize self.  See help(type(self)) for accurate signature.
     |  
     |  ----------------------------------------------------------------------
     |  Data descriptors defined here:
     |  
     |  __dict__
     |      dictionary for instance variables (if de

In [23]:
cleaned = pd.read_csv("https://raw.githubusercontent.com/buildweek-saltiest-hacker/data-engineering-api/master/hacker-comments.csv")

cleaned.head()

Unnamed: 0.1,Unnamed: 0,hacker_name,hacker_comment,comment_saltiness,hacker_score
0,0,DanBC,Drug use has clear links with violence althoug...,9996,247.11
1,1,waps,You failed to explain why the Sana'a manuscrip...,9996,-684.35
2,2,AnthonyMouse,How do you stop another oklahoma cityI think t...,9995,507.01
3,3,michaelochurch,Can you think of any constructive ways forward...,9994,-932.92
4,4,paulhauggis,These sorts of articles do nothing to help the...,9994,-832.11


In [24]:
text = cleaned['hacker_comment']

sample_text = text[:5]
sample_text

0    Drug use has clear links with violence althoug...
1    You failed to explain why the Sana'a manuscrip...
2    How do you stop another oklahoma cityI think t...
3    Can you think of any constructive ways forward...
4    These sorts of articles do nothing to help the...
Name: hacker_comment, dtype: object

In [25]:
sample_list = []
for i in sample_text:
  a = score.polarity_scores(i)
  b = a['compound']
  c = round(b*10, 2)
  sample_list.append(c)

sample_list

[-10.0, -10.0, -10.0, -9.99, -9.99]

Creating a the ranking for each comment.

In [26]:
# creating a new dataframe that just has the information needed

text = cleaned['hacker_comment']
name = cleaned['hacker_name']

salty_hackers = pd.DataFrame({
    'Name':name,
    'Comment': text
})

salty_hackers.head()

Unnamed: 0,Name,Comment
0,DanBC,Drug use has clear links with violence althoug...
1,waps,You failed to explain why the Sana'a manuscrip...
2,AnthonyMouse,How do you stop another oklahoma cityI think t...
3,michaelochurch,Can you think of any constructive ways forward...
4,paulhauggis,These sorts of articles do nothing to help the...


In [27]:
comment = salty_hackers['Comment']

ranking = []

for i in comment:
  scores = score.polarity_scores(i)
  final_score = scores['compound']
  rounded_score = round(final_score*10, 2)
  ranking.append(rounded_score)

In [28]:
salty_hackers['comment_ranking'] = ranking

In [29]:
salty_hackers.head()

Unnamed: 0,Name,Comment,comment_ranking
0,DanBC,Drug use has clear links with violence althoug...,-10.0
1,waps,You failed to explain why the Sana'a manuscrip...,-10.0
2,AnthonyMouse,How do you stop another oklahoma cityI think t...,-10.0
3,michaelochurch,Can you think of any constructive ways forward...,-9.99
4,paulhauggis,These sorts of articles do nothing to help the...,-9.99


In [30]:
sample_data = salty_hackers.iloc[:10]
sample_data

Unnamed: 0,Name,Comment,comment_ranking
0,DanBC,Drug use has clear links with violence althoug...,-10.0
1,waps,You failed to explain why the Sana'a manuscrip...,-10.0
2,AnthonyMouse,How do you stop another oklahoma cityI think t...,-10.0
3,michaelochurch,Can you think of any constructive ways forward...,-9.99
4,paulhauggis,These sorts of articles do nothing to help the...,-9.99
5,grey-area,Let's consider a specific example:http://www.n...,-9.99
6,danso,I don't know what it's like to be raped or sex...,-9.99
7,pdonis,"Are you actually serious? Not joking?No, I'm n...",-9.99
8,lionhearted,The person who stole it clearly needed it more...,-9.99
9,pyre,"Regarding producing oil from livestock, you'll...",-9.99


In [31]:
salty_hackers['comment_ranking'].describe()

count    230703.000000
mean          1.862038
std           5.451271
min         -10.000000
25%          -1.780000
50%           2.230000
75%           6.700000
max          10.000000
Name: comment_ranking, dtype: float64

In [32]:
average = salty_hackers.groupby(by='Name').mean()
average

Unnamed: 0_level_0,comment_ranking
Name,Unnamed: 1_level_1
001sky,1.589093
0x0,1.337746
10ren,4.157703
3pt14159,2.552662
6ren,3.444013
...,...
zdw,1.959259
zem,3.512906
zo1,2.449098
zobzu,2.499911


In [33]:
average[:10]

Unnamed: 0_level_0,comment_ranking
Name,Unnamed: 1_level_1
001sky,1.589093
0x0,1.337746
10ren,4.157703
3pt14159,2.552662
6ren,3.444013
Alex3917,1.733708
Alupis,2.168095
AndrewDucker,2.395503
AnimalMuppet,1.426772
Animats,0.879325


In [34]:
average['comment_ranking']

Name
001sky      1.589093
0x0         1.337746
10ren       4.157703
3pt14159    2.552662
6ren        3.444013
              ...   
zdw         1.959259
zem         3.512906
zo1         2.449098
zobzu       2.499911
zokier      1.504020
Name: comment_ranking, Length: 1000, dtype: float64

In [35]:
average_dict = average['comment_ranking'].to_dict()
average_dict

{'001sky': 1.5890934844192632,
 '0x0': 1.3377456647398849,
 '10ren': 4.157702702702702,
 '3pt14159': 2.5526623376623383,
 '6ren': 3.4440131578947377,
 'Alex3917': 1.7337075718015653,
 'Alupis': 2.1680952380952383,
 'AndrewDucker': 2.395503355704698,
 'AnimalMuppet': 1.4267721518987346,
 'Animats': 0.8793248945147676,
 'AnthonyMouse': -0.507068062827225,
 'Apocryphon': 0.8412173913043476,
 'Argorak': 2.6918018018018013,
 'AznHisoka': 2.368947368421053,
 'BrandonM': 3.724186046511628,
 'CWuestefeld': 1.637821782178218,
 'CamperBob': 1.605909090909092,
 'CamperBob2': 1.3114220183486252,
 'ChuckMcM': 3.668179723502303,
 'CmonDev': 1.5765467625899283,
 'ColinWright': 2.152369477911647,
 'ComputerGuru': 1.832722222222222,
 'Confusion': 0.8523394495412849,
 'Crito': 0.19444852941176508,
 'Cushman': 2.694,
 'DanBC': -0.24710134128167272,
 'DaniFong': 3.5959259259259277,
 'DanielBMarkham': 3.6512701252236135,
 'DanielStraight': 2.1638392857142854,
 'Daniel_Newby': -0.4157758620689647,
 'DannoHu

In [36]:
all_users = average_dict.keys()
all_users

dict_keys(['001sky', '0x0', '10ren', '3pt14159', '6ren', 'Alex3917', 'Alupis', 'AndrewDucker', 'AnimalMuppet', 'Animats', 'AnthonyMouse', 'Apocryphon', 'Argorak', 'AznHisoka', 'BrandonM', 'CWuestefeld', 'CamperBob', 'CamperBob2', 'ChuckMcM', 'CmonDev', 'ColinWright', 'ComputerGuru', 'Confusion', 'Crito', 'Cushman', 'DanBC', 'DaniFong', 'DanielBMarkham', 'DanielStraight', 'Daniel_Newby', 'DannoHung', 'DannyBee', 'DenisM', 'DennisP', 'Dewie', 'DigitalSea', 'DrJokepu', 'Dylan16807', 'EGreg', 'EliRivers', 'ErrantX', 'Estragon', 'Evbn', 'FireBeyond', 'Florin_Andrei', 'FooBarWidget', 'ForHackernews', 'GFK_of_xmaspast', 'GFischer', 'GHFigs', 'GhotiFish', 'Goladus', 'Groxx', 'GuiA', 'Hexstream', 'HeyLaughingBoy', 'Houshalter', 'IgorPartola', 'InclinedPlane', 'JabavuAdams', 'Jach', 'JacobAldridge', 'JadeNB', 'JoachimSchipper', 'JoeAltmaier', 'JonnieCache', 'JoshTriplett', 'JulianMorrison', 'JumpCrisscross', 'Jun8', 'Kalium', 'Karunamon', 'Keyframe', 'KirinDave', 'Kiro', 'Lawtonfogle', 'Locke168

In [37]:
user_list = list(all_users)

sample_users = user_list[:10]
sample_users

['001sky',
 '0x0',
 '10ren',
 '3pt14159',
 '6ren',
 'Alex3917',
 'Alupis',
 'AndrewDucker',
 'AnimalMuppet',
 'Animats']

In [38]:
for user in sample_users:
  sample_rank = average_dict[user]
  print(sample_rank)

1.5890934844192632
1.3377456647398849
4.157702702702702
2.5526623376623383
3.4440131578947377
1.7337075718015653
2.1680952380952383
2.395503355704698
1.4267721518987346
0.8793248945147676


In [39]:
users = salty_hackers['Name']
user_ranking = []
for user in users:
  user_rank = average_dict[user]
  round_user_rank = round(user_rank, 2)
  user_ranking.append(round_user_rank)

In [40]:
user_ranking[:10]

[-0.25, 0.68, -0.51, 0.93, 0.83, 3.94, 2.77, 1.89, 4.23, 0.58]

In [41]:
salty_hackers['user_ranking'] = user_ranking

In [42]:
salty_hackers.head()

Unnamed: 0,Name,Comment,comment_ranking,user_ranking
0,DanBC,Drug use has clear links with violence althoug...,-10.0,-0.25
1,waps,You failed to explain why the Sana'a manuscrip...,-10.0,0.68
2,AnthonyMouse,How do you stop another oklahoma cityI think t...,-10.0,-0.51
3,michaelochurch,Can you think of any constructive ways forward...,-9.99,0.93
4,paulhauggis,These sorts of articles do nothing to help the...,-9.99,0.83


In [43]:
salty_hackers['user_ranking'].describe()

count    230703.000000
mean          1.862013
std           0.943609
min          -0.800000
25%           1.190000
50%           1.810000
75%           2.440000
max           5.890000
Name: user_ranking, dtype: float64

## Exporting Final Data Set

In [44]:
compression_opts = dict(method='zip',archive_name='salty_hackers.csv')  

salty_hackers.to_csv('salty_hackers.zip', index=False, compression=compression_opts)  