# Sentiment Analysis of Twitter Feeds to Find Your Favorite Airline

In [13]:
import json

import re
from pattern.en import parse
from pattern.en import pprint
from pattern.vector import stem, PORTER, LEMMA
from sklearn.feature_extraction import text
from gensim import corpora 

In [3]:
import findspark
# Must run before the `import pyspark` in the next cell: per findspark's
# documentation, init() is what makes the pyspark package importable.
findspark.init()
# Print the Spark installation path that findspark located.
print findspark.find()

/usr/local/opt/apache-spark/libexec


In [4]:
import pyspark

In [14]:
# adapted from HW5
def get_parts(thetext, punctuation):
    """Extract per-sentence nouns and adjectives from a piece of text.

    Runs of two or more periods or dashes are replaced by a space, the text is
    tagged with ``parse(..., tokenize=True, lemmata=True)`` and every token
    whose tag (``token[1]``) is an adjective tag (JJ, JJR, JJS) or a noun tag
    (NN, NNS) contributes its ``token[4]`` value (the lemma slot requested via
    ``lemmata=True``).

    A value is dropped when it is empty, is an English stop word, is a single
    character, or starts or ends with a character found in ``punctuation``.

    Parameters
    ----------
    thetext : str
        Raw text to analyse.
    punctuation : container of str
        Characters that disqualify a value when they appear as its first or
        last character; only membership tests (``in``) are performed on it.

    Returns
    -------
    (list of list of str, list of list of str)
        ``(nouns, descriptives)``. Entry ``k`` of both lists comes from the
        same sentence; sentences lacking either a noun or an adjective are
        omitted from both lists.
    """
    stopwords = text.ENGLISH_STOP_WORDS
    adjective_tags = ('JJ', 'JJR', 'JJS')
    noun_tags = ('NN', 'NNS')

    def is_usable(lemma):
        # Same rejection rules for nouns and adjectives.
        return not (lemma in stopwords
                    or lemma[0] in punctuation
                    or lemma[-1] in punctuation
                    or len(lemma) == 1)

    # Collapse ellipses first, then runs of dashes, into single spaces.
    without_ellipses = re.sub(r"\.{2,}", ' ', thetext)
    cleaned = re.sub(r"\-{2,}", ' ', without_ellipses)

    kept_nouns = []
    kept_descriptives = []
    for sentence in parse(cleaned, tokenize=True, lemmata=True).split():
        sentence_nouns = []
        sentence_descriptives = []
        for token in sentence:
            lemma = token[4]
            if not lemma:
                continue
            tag = token[1]
            if tag in adjective_tags:
                if is_usable(lemma):
                    sentence_descriptives.append(lemma)
            elif tag in noun_tags:
                if is_usable(lemma):
                    sentence_nouns.append(lemma)
        # Keep a sentence only when it yielded both kinds of words.
        if sentence_nouns and sentence_descriptives:
            kept_nouns.append(sentence_nouns)
            kept_descriptives.append(sentence_descriptives)
    return kept_nouns, kept_descriptives

In [6]:
# Initialize the Spark context.
# getOrCreate() reuses a context that is already running, so re-running this
# cell in a live kernel does not fail with "Cannot run multiple SparkContexts
# at once". NOTE: when a context already exists, `conf` is not applied to it.
conf = pyspark.SparkConf().setAppName("Twitter_Airline").setMaster("local[*]")
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Get tweets from a text file holding one JSON document per line
# (using the sample tweets for now).
text_lines = sc.textFile('sample_tweets.json')
# Skip blank lines so json.loads is never handed an empty string.
tweets = text_lines.filter(lambda line: line.strip()).map(json.loads)
# Only records that are JSON objects with a 'text' field are kept here;
# indexing t['text'] on anything else would raise inside the Spark job.
tweets_text = (tweets
               .filter(lambda t: isinstance(t, dict) and 'text' in t)
               .map(lambda t: t['text']))



### Sentiment of a sentence based on log probs in a word list

Function to read the word list file:

In [7]:
import numpy as np
# read the word list
def readSentimentList(file_name):
    """Read a CSV word list into two word -> log-probability dictionaries.

    The first line of the file is treated as a title row and skipped. Every
    following non-blank line is split on commas: column 0 is the word,
    column 1 is stored in the "happy" mapping and column 2 in the "sad"
    mapping, both converted with ``float()``.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.

    Returns
    -------
    (dict, dict)
        ``(happy_log_probs, sad_log_probs)``, both keyed by word.

    Raises
    ------
    IOError
        If the file cannot be opened.
    IndexError
        If a non-blank data line has fewer than three comma-separated fields.
    ValueError
        If column 1 or 2 of a data line is not a valid float.
    """
    happy_log_probs = {}
    sad_log_probs = {}
    # The context manager closes the file even if a line fails to parse.
    with open(file_name, 'r') as ifile:
        ifile.readline()  # Ignore title row
        for line in ifile:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would eat a digit of the final value when the
            # file does not end with a newline.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing empty line)
            tokens = line.split(',')
            happy_log_probs[tokens[0]] = float(tokens[1])
            sad_log_probs[tokens[0]] = float(tokens[2])

    return happy_log_probs, sad_log_probs

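Classify a tweet with the naive Bayes rule: the log-probabilities of its words are summed under each sentiment and, with no prior term (i.e. equal priors), $P(\text{happy} \mid \text{tweet}) = 1 / \left(1 + e^{\log P(\text{tweet} \mid \text{sad}) - \log P(\text{tweet} \mid \text{happy})}\right)$: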

In [8]:
def classifySentiment(words, happy_log_probs, sad_log_probs):
    """Estimate how likely a tokenized tweet is happy versus sad.

    Parameters
    ----------
    words : sequence of str
        Tokens of the tweet. It is traversed once per sentiment table.
    happy_log_probs, sad_log_probs : dict
        Word -> log-probability under the respective sentiment. A word that is
        missing from a table contributes nothing to that table's total.

    Returns
    -------
    (float, float)
        ``(prob_happy, prob_sad)``; the two values sum to 1. With no known
        words both totals are 0 and the result is ``(0.5, 0.5)``.
    """
    def total_log_prob(table):
        # Log-probability of the whole tweet = sum over its known words.
        return np.sum([table[token] for token in words if token in table])

    happy_total = total_log_prob(happy_log_probs)
    sad_total = total_log_prob(sad_log_probs)

    # 1 / (1 + exp(sad - happy)) turns the two log-probabilities into the
    # probability of the "happy" class.
    sad_minus_happy = sad_total - happy_total
    prob_happy = np.reciprocal(np.exp(sad_minus_happy) + 1)

    return prob_happy, 1 - prob_happy

Load the word list:

In [9]:
# Load the word list once at notebook level. Both dictionaries map a word to
# its log-probability and are passed to classifySentiment() in later cells.
# The path is relative to the notebook's working directory.
happy_log_probs, sad_log_probs = readSentimentList('wordlist.csv')

Classify a hand-tokenized example tweet:

In [10]:
# Example tweet, already split into lowercase tokens by hand
tweet1 = ['my', 'hate', 'southwest']

# Calculate the happy/sad probabilities; the two values sum to 1
tweet1_happy_prob, tweet1_sad_prob = classifySentiment(tweet1, happy_log_probs, sad_log_probs)

# Show the tokens and both probabilities
print tweet1 
print "happy probability: " , tweet1_happy_prob 
print "sad probability:", tweet1_sad_prob

['my', 'hate', 'southwest']
happy probability:  0.280105168408
sad probability: 0.719894831592


In [11]:
# get words out for sentiment analysis
# `punc` stays a list of single characters; get_parts() is handed this list
# and only performs membership tests on it.
# NOTE(review): the two adjacent quotes in the literal end one string and
# start another, so the apostrophe itself is NOT in this list - confirm
# whether it was meant to be.
punc = list('.,;:!?()[]{}`''\"@#$^&*+-|=~_')
# str.strip() only accepts a string of characters (passing the list raises
# TypeError once a Spark action runs), so build the string form once.
punc_chars = ''.join(punc)
# Strip punctuation from both ends of every whitespace-separated token so that
# e.g. "hate!" can match "hate" in the word list; tokens that consist only of
# punctuation become empty and are dropped.
sentiment_words = tweets_text.map(
    lambda t: [w for w in (tok.strip(punc_chars) for tok in t.split()) if w])

# classify sentiment of tweet: each element is a (prob_happy, prob_sad) pair,
# so keys() yields the happy probabilities and values() the sad ones
tweets_probs = sentiment_words.map(lambda ws: classifySentiment(ws, happy_log_probs, sad_log_probs))
happy_probs = tweets_probs.keys()
sad_probs = tweets_probs.values()

### LDA on nouns for topic analysis

In [15]:
# Parse tweets to nouns & adjectives. get_parts() returns a
# (nouns, descriptives) pair per tweet, so keys() selects the per-sentence
# noun lists and values() the per-sentence adjective lists.
tweets_n_a = tweets_text.map(lambda t: get_parts(t, punc))
tweets_nouns = tweets_n_a.keys()
# Flatten "list of sentences per tweet" into one stream of per-sentence noun
# lists and pull it to the driver. NOTE: toLocalIterator() yields a one-shot
# iterator, so all_nouns is exhausted after the Dictionary() call below.
all_nouns = tweets_nouns.flatMap(lambda l: l).toLocalIterator()
tweets_adjs = tweets_n_a.values()

# Feed nouns into gensim: each per-sentence noun list is treated as one document
dictionary = corpora.Dictionary(all_nouns)