In [123]:
from __future__ import division
import json
import re
import string
import nltk
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [50]:
with open("timelines_golden_globes_tv_short.json") as f:
    tv_short = json.load(f)

In [51]:
tweets_list = [[tweet["text"] for tweet in tweets] for tweets in tv_short]

In [52]:
tweets_list[0][0]

"@Dior men's showing in Paris was so edgy and cool! @KRISVANASSCHE is a true artist. https://t.co/gDkZ4haiTT"

# tokenize

In [10]:
def processTweet(tweet):
    # process the tweets

    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

In [53]:
tweets_list = [[processTweet(tweet) for tweet in tweets] for tweets in tweets_list]
# remove retweets
tweets_list = [[tweet for tweet in tweets if tweet.split()[0] != 'rt'] for tweets in tweets_list]

In [54]:
tweets_list[0][0]

"AT_USER men's showing in paris was so edgy and cool! AT_USER is a true artist. URL"

In [55]:
def replaceTwoOrMore(s):
    #look for 2 or more repetitions of character and replace with the character itself
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)

In [56]:
tweets_list = [[replaceTwoOrMore(tweet) for tweet in tweets] for tweets in tweets_list]

In [57]:
tweets_list[0][0]

"AT_USER men's showing in paris was so edgy and cool! AT_USER is a true artist. URL"

In [58]:
def remove_punct(word):
    punc = set(string.punctuation)
    punc.remove('-')
    return "".join([a for a in word if a not in punc])

In [59]:
tweets_list = [[remove_punct(tweet) for tweet in tweets] for tweets in tweets_list]

In [107]:
with open("stopwords.txt") as f:
    stopwords = f.read().split('\n')
stopwords.append('ATUSER')
stopwords.append('URL')

In [64]:
def remove_stopwords(tweet, stopwords):
    return " ".join([word for word in tweet.split(" ") if word not in stopwords])

In [68]:
tweets_list = [[remove_stopwords(tweet, stopwords) for tweet in tweets] for tweets in tweets_list]

In [77]:
tweets_list = [[tweet.split() for tweet in tweets] for tweets in tweets_list]

# count positive negative words

In [None]:
with open("negative-words.txt") as f:
    negwords = f.read().split('\n')
    set_neg = set(negwords)
with open("positive-words.txt") as f:
    poswords = f.read().split('\n')
    set_pos = set(poswords)

In [78]:
tweets_list[0][0]

['mens', 'paris', 'edgy', 'cool', 'true', 'artist']

In [115]:
pos_neg_count = [[(sum([word in set_pos for word in tweet]), sum([word in set_neg for word in tweet]))
                  for tweet in tweets] 
                 for tweets in tweets_list]

In [163]:
def pos_neg(stat):
    if stat[0] > stat[1]:
        return "pos"
    elif stat[1] > stat[0]:
        return "neg"
    else:
        return "neu"

In [165]:
pos_neg_res = [[pos_neg(stat) for stat in people] for people in pos_neg_count]

In [168]:
pos_neg_resFreqDist(pos_neg_res[1])

Counter({'neg': 46, 'neu': 194, 'pos': 109})

# word stat and word cloud

In [130]:
# tweets = [[" ".join([word for word in tweet])
#                    for tweet in people] for people in tweets_list]
# tweets = [" ".join([tweet for tweet in people]) for people in tweets]

In [159]:
tweets = [[item for sublist in people for item in sublist] for people in tweets_list]
tweets_all = [item for sublist in tweets for item in sublist]

In [160]:
def sortFreqDict(freqdict):
    aux = [(freqdict[key], key) for key in freqdict]
    aux.sort()
    aux.reverse()
    return aux

In [161]:
sortFreqDict(FreqDist(tweets_all))

[(1777, 'love'),
 (1772, 'thank'),
 (1432, 'thanks'),
 (1035, 'amp'),
 (1016, 'im'),
 (780, 'happy'),
 (777, '“ATUSER'),
 (701, 'day'),
 (639, 'time'),
 (578, 'night'),
 (576, 'yes'),
 (575, '2'),
 (537, 'am'),
 (506, '💋💋'),
 (465, 'tonight'),
 (454, 'dont'),
 (442, 'rt'),
 (431, 'watching'),
 (412, 'cant'),
 (367, 'oitnb'),
 (345, 'crazyexgirlfriend'),
 (345, 'amazing'),
 (340, 'people'),
 (340, 'fun'),
 (324, 'hope'),
 (321, 'watch'),
 (303, 'americancrime'),
 (301, 'life'),
 (300, 'thats'),
 (299, 'wait'),
 (294, 'look'),
 (286, 'beautiful'),
 (283, 'birthday'),
 (266, 'live'),
 (258, 'episode'),
 (254, 'oh'),
 (253, 'season'),
 (253, 'hey'),
 (253, 'check'),
 (246, '4'),
 (239, 'support'),
 (239, 'janethevirgin'),
 (238, 'youre'),
 (235, 'miss'),
 (235, 'friend'),
 (227, 'please'),
 (226, 'via'),
 (225, 'excited'),
 (222, 'proud'),
 (214, 'help'),
 (213, 'world'),
 (211, 'lovely'),
 (210, 'ill'),
 (208, 'family'),
 (205, 'guys'),
 (205, 'book'),
 (203, 'tomorrow'),
 (203, 'doing'),

# sentiment analysis using naive bayes

In [None]:
#start getfeatureVector
def getFeatureVector(tweet):
    featureVector = []
    #split tweet into words
    words = tweet.split()
    for w in words:
        #replace two or more with two occurrences
        w = replaceTwoOrMore(w)
        #strip punctuation
        w = w.strip('\'"?,.')
        #check if the word stats with an alphabet
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        #ignore if it is a stop word
        if(w in stopWords or val is None):
            continue
        else:
            featureVector.append(w.lower())
    return featureVector
#end

In [None]:
#Read the tweets one by one and process it
fp = open('data/sampleTweets.txt', 'r')
line = fp.readline()

st = open('data/feature_list/stopwords.txt', 'r')
stopWords = getStopWordList('data/feature_list/stopwords.txt')

while line:
    processedTweet = processTweet(line)
    featureVector = getFeatureVector(processedTweet)
    print featureVector
    line = fp.readline()
#end loop
fp.close()

In [82]:
punc = set(string.punctuation)
punc.remove('-')
punc

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~'}