In [5]:
import psycopg2
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
import re
from __future__ import division
from nltk.tag import StanfordNERTagger

In [6]:
conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-52-27-114-159.us-west-2.compute.amazonaws.com' port=9000 password ='secret'")
df = pd.read_sql_query("SELECT * FROM articles limit 5", conn)

In [7]:
df

Unnamed: 0,site,title,author,published_on,accessed_on,url,body,newspaper_keywords,newspaper_summary,id
0,Breitbart,"Trump: ‘We’re Doing Very Well in Iraq,’ U.S. T...",John Hayward,2017-03-29,2017-03-31 08:20:09.478760,http://www.breitbart.com/national-security/201...,"SIGN UP FOR OUR NEWSLETTER On Tuesday, Presid...","{war,told,secretary,soldiers,taking,iraq,presi...","SIGN UP FOR OUR NEWSLETTEROn Tuesday, Presiden...",981
1,Breitbart,Top U.S. General: ‘We Have Not Relaxed the Rul...,Edwin Mora,2017-03-29,2017-03-31 08:20:14.741313,http://www.breitbart.com/national-security/201...,"SIGN UP FOR OUR NEWSLETTER WASHINGTON, D.C. —...","{gen,rules,responsibility,engagement,military,...",We have not relaxed the rules of engagement.\n...,982
2,Breitbart,Protests in Paris Continue for Third Night Aft...,Oliver Jj Lane,2017-03-30,2017-03-31 08:20:18.922523,http://www.breitbart.com/london/2017/03/30/pro...,SIGN UP FOR OUR NEWSLETTER Hundreds of “Asian...,"{night,protest,paris,killed,france,chinese,sub...",SIGN UP FOR OUR NEWSLETTERHundreds of “Asians”...,983
3,Breitbart,Rep. Jim Jordan: Working with Dems on Health C...,Dan Riehl,2017-03-30,2017-03-31 08:20:25.141342,http://www.breitbart.com/radio/2017/03/30/rep-...,SIGN UP FOR OUR NEWSLETTER Rep. Jim Jordan (R...,"{come,jim,healthcare,reform,werent,working,mis...",SIGN UP FOR OUR NEWSLETTERRep. Jim Jordan (R-O...,984
4,Breitbart,John McCain in Last-Minute Attempt to Avert Go...,Ian Mason,2017-03-30,2017-03-31 08:20:29.304708,http://www.breitbart.com/big-government/2017/0...,SIGN UP FOR OUR NEWSLETTER Sen. John McCain (...,"{constitutional,john,supreme,court,attempt,mcc...",SIGN UP FOR OUR NEWSLETTERSen. John McCain (R-...,985


#### Tokenize the article body

In [8]:
tokenized_body = []
for body in df['body']:
    body = body.decode('utf-8')
    tokens = nltk.word_tokenize(body)
    tokenized_body.append(tokens)

In [9]:
se = pd.Series(tokenized_body)
df['tokenized_body'] = se.values

#### Simple word count

In [10]:
word_count = []
for body in df['tokenized_body']:
    word_count.append(len(body))

In [11]:
se = pd.Series(word_count)
df['word_count'] = se.values

#### Stopword Removal

In [12]:
stop_words = stopwords.words('english')
stop_words = stop_words + [',', '.', '!', '?', '"','\'', '/', '\\', '-', '--', '—', '(', ')', '[', ']', '\'s', '\'t', '\'ve', '\'d', '\'ll', '\'re']
stop_words = set(stop_words) # making this a set increases performance for large documents

In [13]:
stopworded_body = []
for body in df['tokenized_body']:
    stopworded_body.append([w.lower() for w in body if w not in stop_words])

In [14]:
se = pd.Series(stopworded_body)
df['stopworded_body'] = se.values

#### Lemmatization: Get the root words for the tokenized and stopworded body text


In [15]:
wnl = nltk.WordNetLemmatizer()
lemmatized_words = []
lemmatized_body = []
for body in df['stopworded_body']:
    # We need to tag words with their parts of speech before the WordNet lemmatizer will work properly
    pos_tagged_body = nltk.pos_tag(body)
    lemmatized_words = []
    for word, tag in pos_tagged_body:
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = wnl.lemmatize(word, wntag)
        lemmatized_words.append(lemma)
    lemmatized_body.append(lemmatized_words)

In [16]:
se = pd.Series(lemmatized_body)
df['lemmatized_body'] = se.values

#### Bag of Words/Frequency Distribution: Get word count from lemmatized text

In [17]:
word_bag = []
for body in df['lemmatized_body']:
    fdist = FreqDist(body)
    # FreqDist returns a special nltk.probability.FreqDist type
    # This is a list of tuples
    # Here is an example of how to access the elements for future reference
#     print(fdist.most_common())
    # Access an individual tuple
#     print(fdist.most_common()[0])
    # Access the word from the tuple
#     print(fdist.most_common()[0][0])
    # Access the count from the tuple
#     print(fdist.most_common()[0][1])
    # Append to list as ordered frequency distribution
    word_bag.append(fdist.most_common())


In [18]:
se = pd.Series(word_bag)
df['word_bag'] = se.values

#### Named Entity Extraction using StanfordNLP Classification Model

###### Stanford NLP named entity extractor requires that you download the jar from https://nlp.stanford.edu/software/CRF-NER.shtml#Download, unzip and extract english.all.3class.distsim.crf.ser.gz and stanford-ner.jar, then provide their file paths to StanfordNERTagger below and you may need to install java8 on ubuntu: https://tecadmin.net/install-oracle-java-8-ubuntu-via-ppa/

In [19]:
st = StanfordNERTagger('/media/justin/Data/Google Drive/Assignments and Projects/Machine Learning/NLP/english.all.3class.distsim.crf.ser.gz',
					   '/media/justin/Data/Google Drive/Assignments and Projects/Machine Learning/NLP/stanford-ner.jar',
					   encoding='utf-8')

In [20]:
classified_texts = []
for body in df['tokenized_body']:
    classified_texts.append(st.tag(body))

# print(classified_text)

#### Now, if we want to parse the list of tuples returned by the standford classifier into a more easily usable list form, we can take that output, convert it to the standard IOB tag format with stanfordNE2BIO, then parse that into a tree, and traverse the tree to rearrange into a list

In [21]:
from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag

    return bio_tagged_sent

#### Now convert the IOB tagged tuples into a tree (this can be called with the original stanfordNERTagger output, skipping the explicit call to convert to IOB format)

In [22]:
def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree


In [23]:
ne_trees = []
for text in classified_texts:
    ne_trees.append(stanfordNE2tree(text))

#### Finally, join the leaves into a formated list of tuples

In [24]:
ne_in_sent = []
ne_in_sents = []
for tree in ne_trees:
    ne_in_sent = []
    for subtree in tree:
        if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    ne_in_sents.append(ne_in_sent)

In [25]:
se = pd.Series(ne_in_sents)
df['named_entities'] = se.values

#### Lexical diversity is a measure of the complexity, or sophistication, of a text. A higher number means the text has a richer vocabulary and less repetition of words. If the calculation returns 65.23, for example, that means 65.23% of the total words are distinct.

In [26]:
def lexical_diversity(text):
    return len(set(text)) / len(text) * 100

In [27]:
lex_div = []
for body in df['stopworded_body']:
    lex_div.append(lexical_diversity(body))
    print("lexical diversity: " + str(lexical_diversity(body)))

lexical diversity: 69.4610778443
lexical diversity: 69.7228144989
lexical diversity: 76.8166089965
lexical diversity: 73.2558139535
lexical diversity: 75.0


In [28]:
se = pd.Series(lex_div)
df['lexical_diversity'] = se.values

In [29]:
df

Unnamed: 0,site,title,author,published_on,accessed_on,url,body,newspaper_keywords,newspaper_summary,id,tokenized_body,word_count,stopworded_body,lemmatized_body,word_bag,named_entities,lexical_diversity
0,Breitbart,"Trump: ‘We’re Doing Very Well in Iraq,’ U.S. T...",John Hayward,2017-03-29,2017-03-31 08:20:09.478760,http://www.breitbart.com/national-security/201...,"SIGN UP FOR OUR NEWSLETTER On Tuesday, Presid...","{war,told,secretary,soldiers,taking,iraq,presi...","SIGN UP FOR OUR NEWSLETTEROn Tuesday, Presiden...",981,"[SIGN, UP, FOR, OUR, NEWSLETTER, On, Tuesday, ...",595,"[sign, up, for, our, newsletter, on, tuesday, ...","[sign, up, for, our, newsletter, on, tuesday, ...","[(iraq, 8), (troop, 6), (fight, 5), (trump, 5)...","[(Trump, PERSON), (White House, ORGANIZATION),...",69.461078
1,Breitbart,Top U.S. General: ‘We Have Not Relaxed the Rul...,Edwin Mora,2017-03-29,2017-03-31 08:20:14.741313,http://www.breitbart.com/national-security/201...,"SIGN UP FOR OUR NEWSLETTER WASHINGTON, D.C. —...","{gen,rules,responsibility,engagement,military,...",We have not relaxed the rules of engagement.\n...,982,"[SIGN, UP, FOR, OUR, NEWSLETTER, WASHINGTON, ,...",831,"[sign, up, for, our, newsletter, washington, d...","[sign, up, for, our, newsletter, washington, d...","[(mosul, 10), (civilian, 9), (gen., 8), (u.s.,...","[(WASHINGTON, LOCATION), (D.C., LOCATION), (U....",69.722814
2,Breitbart,Protests in Paris Continue for Third Night Aft...,Oliver Jj Lane,2017-03-30,2017-03-31 08:20:18.922523,http://www.breitbart.com/london/2017/03/30/pro...,SIGN UP FOR OUR NEWSLETTER Hundreds of “Asian...,"{night,protest,paris,killed,france,chinese,sub...",SIGN UP FOR OUR NEWSLETTERHundreds of “Asians”...,983,"[SIGN, UP, FOR, OUR, NEWSLETTER, Hundreds, of,...",528,"[sign, up, for, our, newsletter, hundreds, “as...","[sign, up, for, our, newsletter, hundred, “asi...","[(police, 13), (protest, 8), (chinese, 6), (pa...","[(Paris, LOCATION), (Paris, LOCATION), (Shaoyo...",76.816609
3,Breitbart,Rep. Jim Jordan: Working with Dems on Health C...,Dan Riehl,2017-03-30,2017-03-31 08:20:25.141342,http://www.breitbart.com/radio/2017/03/30/rep-...,SIGN UP FOR OUR NEWSLETTER Rep. Jim Jordan (R...,"{come,jim,healthcare,reform,werent,working,mis...",SIGN UP FOR OUR NEWSLETTERRep. Jim Jordan (R-O...,984,"[SIGN, UP, FOR, OUR, NEWSLETTER, Rep., Jim, Jo...",451,"[sign, up, for, our, newsletter, rep., jim, jo...","[sign, up, for, our, newsletter, rep., jim, jo...","[(jordan, 8), (get, 4), (come, 4), (repeal, 4)...","[(Jim Jordan, PERSON), (Breitbart News Daily S...",73.255814
4,Breitbart,John McCain in Last-Minute Attempt to Avert Go...,Ian Mason,2017-03-30,2017-03-31 08:20:29.304708,http://www.breitbart.com/big-government/2017/0...,SIGN UP FOR OUR NEWSLETTER Sen. John McCain (...,"{constitutional,john,supreme,court,attempt,mcc...",SIGN UP FOR OUR NEWSLETTERSen. John McCain (R-...,985,"[SIGN, UP, FOR, OUR, NEWSLETTER, Sen., John, M...",405,"[sign, up, for, our, newsletter, sen., john, m...","[sign, up, for, our, newsletter, sen., john, m...","[(sen., 5), (senate, 5), (thursday, 4), (gorsu...","[(John McCain, PERSON), (Judge Neil Gorsuch’s ...",75.0


In [34]:
for article in df['url']:
    print(article)

http://www.breitbart.com/national-security/2017/03/29/trump-were-doing-very-well-in-iraq-u-s-troops-fighting-like-never-before/#disqus_thread
http://www.breitbart.com/national-security/2017/03/29/top-u-s-gen-we-have-not-relaxed-the-rules-of-engagement-for-avoiding-civilian-casualties/#disqus_thread
http://www.breitbart.com/london/2017/03/30/protests-paris-chinese-citizen-killed-police/#disqus_thread
http://www.breitbart.com/radio/2017/03/30/rep-jordan-working-dems-health-care-would-be-big-mistake/#disqus_thread
http://www.breitbart.com/big-government/2017/03/30/john-mccain-in-last-minute-attempt-to-avert-gorsuch-constitutional-option/#disqus_thread


#### Sentiment Analysis

#### We'll need to break the articles apart by sentence to do almost any form of sentiment analysis.

In [30]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer



In [38]:
for x in range(5):
    print(df['title'][x])
    sentence=df['body'][x]
    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(sentence)
    print(ss)
    print(df['url'][x])

Trump: ‘We’re Doing Very Well in Iraq,’ U.S. Troops ‘Fighting Like Never Before’
{'neg': 0.082, 'neu': 0.82, 'pos': 0.098, 'compound': 0.8501}
http://www.breitbart.com/national-security/2017/03/29/trump-were-doing-very-well-in-iraq-u-s-troops-fighting-like-never-before/#disqus_thread
Top U.S. General: ‘We Have Not Relaxed the Rules of Engagement’ in Mosul
{'neg': 0.094, 'neu': 0.821, 'pos': 0.086, 'compound': -0.9335}
http://www.breitbart.com/national-security/2017/03/29/top-u-s-gen-we-have-not-relaxed-the-rules-of-engagement-for-avoiding-civilian-casualties/#disqus_thread
Protests in Paris Continue for Third Night After Chinese Citizen Killed by Police
{'neg': 0.152, 'neu': 0.816, 'pos': 0.032, 'compound': -0.9954}
http://www.breitbart.com/london/2017/03/30/protests-paris-chinese-citizen-killed-police/#disqus_thread
Rep. Jim Jordan: Working with Dems on Health Care Would Be ‘a Big Mistake’
{'neg': 0.035, 'neu': 0.92, 'pos': 0.045, 'compound': 0.3384}
http://www.breitbart.com/radio/201