**General Imports**

In [None]:
from operator import add
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from requests_oauthlib import OAuth1Session
from time import gmtime, strftime

import ast
import json
import nltk
import requests
import requests_oauthlib
import string
import time

**NLTK Imports**

In [None]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

import nltk

In [None]:
# Batch interval handed to StreamingContext below (seconds, per the PySpark API).
BATCH_INTERVAL = 5

In [None]:
# `sc` is not defined in this file; presumably the SparkContext supplied by
# the notebook / pyspark shell — confirm before running standalone.
ssc = StreamingContext(sc, BATCH_INTERVAL)

### Training the Sentiment Analysis Classifier

An essential part of creating a sentiment analysis algorithm (or any data mining algorithm) is to use a comprehensive dataset, or "corpus", for learning, as well as a set of test data to ensure that the accuracy of your algorithm meets the standards you expect.

It also lets you tune your algorithm by identifying more precisely which natural-language features extracted from the text contribute to the sentiment classification, rather than relying on a generic approach.

We will use the training dataset provided by the University of Michigan for Kaggle competitions: https://inclass.kaggle.com/c/si650winter11

This dataset contains 1,578,627 classified tweets and each row is marked as:

- 1 for positive feeling
- 0 for negative feeling

In [None]:
# Load the labelled tweets CSV as an RDD of raw text lines.
file = sc.textFile('aux/datasets/sentiment-analysis.csv')

In [None]:
# The first line of the file is treated as the column header.
header = file.take(1)[0]

In [None]:
# Keep only the lines that differ from the header, i.e. the data rows.
dataset = file.filter(lambda line: line != header)

In [None]:
# Peek at the first three data rows.
dataset.take(3)

In [None]:
def clearLine(line):
    """Turn one raw CSV row into a ``(tokens, sentiment)`` training pair.

    The row must have at least four comma-separated columns: the sentiment
    label is read from the second column and the tweet text from the fourth.

    Args:
        line: One data row of the CSV file, as a string.

    Returns:
        A tuple ``(tokens, sentiment)`` where ``tokens`` is the tweet text
        with punctuation removed, split on single spaces and lower-cased,
        and ``sentiment`` is the label column as a string.

    Raises:
        IndexError: If the row has fewer than four columns.
    """
    # Split on the first three commas only, so commas inside the tweet text
    # no longer truncate it.
    # NOTE(review): assumes the tweet text is the last column — confirm
    # against the dataset header.
    columns = line.split(',', 3)
    translator = str.maketrans('', '', string.punctuation)

    sentiment = columns[1]
    tweet = columns[3].strip().translate(translator)

    # Splitting on a single space is kept so training tokens match the
    # tokens produced for live tweets.
    tweetLower = [word.lower() for word in tweet.split(' ')]

    return (tweetLower, sentiment)

In [None]:
# Parse every data row into a (tokens, sentiment) pair via clearLine.
datasetTraining = dataset.map(lambda line: clearLine(line))

In [None]:
# Peek at the first three parsed rows.
datasetTraining.take(3)

In [None]:
# Download the NLTK 'stopwords' corpus used to build the stop-word list below.
nltk.download('stopwords')

In [None]:
# Every English stop word followed by its '_NEG'-suffixed form, so a stop word
# is filtered out whether or not it carries the negation suffix.
# Each word contributes exactly one pair; iterating over the characters of the
# word as well would only repeat the same pair len(word) times.
stopWords = [
    token
    for word in stopwords.words('english')
    for token in (word, word + '_NEG')
]

In [None]:
# NLTK helper used below to collect words, extract features and train.
sentimentAnalyzer = SentimentAnalyzer()

In [None]:
# Only the first 10,000 parsed rows are taken and used for training.
datasetTrainingSample = datasetTraining.take(10000)

In [None]:
# Each sample item is a whole (tokens, sentiment) document as produced by
# clearLine, so mark_negation is applied per document, not per word.
wordsNegative = sentimentAnalyzer.all_words(
    [mark_negation(document) for document in datasetTrainingSample]
)

# Build the lookup set once: membership tests against the stop-word list
# would otherwise scan the whole list for every token.
stopWordSet = frozenset(stopWords)
wordsNegative = [word for word in wordsNegative if word not in stopWordSet]

In [None]:
# Select unigram features from the filtered word list, limited to 200 (top_n).
unigramFeatures = sentimentAnalyzer.unigram_word_feats(wordsNegative, top_n = 200)

In [None]:
# Register the unigram feature extractor, parameterised with the selected unigrams.
sentimentAnalyzer.add_feat_extractor(extract_unigram_feats, unigrams = unigramFeatures)

In [None]:
# Apply the registered extractor(s) to the sampled (tokens, sentiment) documents.
trainingSet = sentimentAnalyzer.apply_features(datasetTrainingSample)

### Model Training

In [None]:
# The training function itself (not called here); it is passed to the analyzer below.
trainer = NaiveBayesClassifier.train

In [None]:
# Train on the feature sets built from the training sample.
classifier = sentimentAnalyzer.train(trainer, trainingSet)

In [None]:
# Hand-written sample documents in the same (tokens, label) shape as the
# training data; the label slot is left as an empty string.
testSentence01 = [(['this', 'program', 'is', 'bad'], '')]
testSentence02 = [(['tough', 'day', 'at', 'work', 'today'], '')]
testSentence03 = [(['that', 'place', 'is', 'awesome'], '')]

In [None]:
# Feature sets for the sample documents.
# NOTE(review): nothing in the visible code classifies these — confirm
# whether a classification cell is missing.
testSet01 = sentimentAnalyzer.apply_features(testSentence01)
testSet02 = sentimentAnalyzer.apply_features(testSentence02)
testSet03 = sentimentAnalyzer.apply_features(testSentence03)

**Twitter Authentication**

In [None]:
# OAuth1 credentials; left blank here and must be filled in before streaming.
apiKey = ''
apiKeySecret = ''
accessToken = ''
accessTokenSecret = ''

In [None]:
# Term appended to the filter URL's `track` query parameter below.
searchTerm = 'Trump'

In [None]:
# Sample-stream endpoint; not referenced by any other cell in this file.
urlSample = 'https://stream.twitter.com/1.1/statuses/sample.json'

In [None]:
# NOTE(review): searchTerm is concatenated without URL-encoding — terms with
# spaces, '#' or '&' would presumably need quoting; confirm before changing it.
urlFilter = 'https://stream.twitter.com/1.1/statuses/filter.json?track=' + searchTerm

In [None]:
# Request signer passed as `auth=` to requests.get in twitterDataStream.
auth = requests_oauthlib.OAuth1(apiKey, apiKeySecret, accessToken, accessTokenSecret)

**Stream Configuration**

In [None]:
# Single-element RDD used as the default batch of the queue stream below.
rdd = ssc.sparkContext.parallelize([0])

In [None]:
# The queue is empty, so batches come from the default RDD.
stream = ssc.queueStream([], default = rdd)

In [None]:
# Limit on tweets read per call of twitterDataStream.
NUM_TWEETS = 500

In [None]:
def twitterDataStream():
    """Yield tweet texts read from the Twitter filter stream.

    Opens a streaming GET request to ``urlFilter`` (signed with ``auth``)
    and yields at most ``NUM_TWEETS`` items. Each item is the ``str()`` of a
    one-element list holding the payload's ``text`` field.

    Lines that cannot be decoded or parsed as JSON (for example empty
    lines), and payloads that have no ``text`` entry, are skipped.

    Yields:
        str: ``str([text])`` for each usable line of the stream.
    """
    response = requests.get(urlFilter, auth = auth, stream = True)
    print(urlFilter, response)

    count = 0

    try:
        for line in response.iter_lines():
            # Stop once exactly NUM_TWEETS items have been produced.
            if count >= NUM_TWEETS:
                break

            # Only the parsing is guarded, and only against the errors it can
            # raise. The yield stays outside the try so GeneratorExit (raised
            # at the yield when the consumer closes the generator) is never
            # swallowed.
            try:
                post = json.loads(line.decode('utf-8'))
                contents = [post['text']]
            except (ValueError, KeyError, TypeError):
                continue

            count += 1
            yield str(contents)
    finally:
        # Release the HTTP connection whether the stream ended, the limit
        # was reached, or the consumer stopped early.
        response.close()

In [None]:
def mapData(t, rdd):
    """Replace every element of a batch RDD with tweets read from Twitter.

    Args:
        t: First argument supplied by ``transform`` (presumably the batch
            time); unused.
        rdd: RDD for the current batch.

    Returns:
        The result of flat-mapping each element of ``rdd`` to the items
        yielded by ``twitterDataStream()``.
    """
    def fetchTweets(_element):
        # The element's value is ignored; each element triggers one stream read.
        return twitterDataStream()

    return rdd.flatMap(fetchTweets)

In [None]:
# Swap each batch for the tweets fetched by mapData / twitterDataStream.
stream = stream.transform(mapData)

In [None]:
# twitterDataStream yields str([text]); literal_eval turns each item back
# into the one-element list.
streamCoord = stream.map(lambda line: ast.literal_eval(line))

In [None]:
def tweetClassifier(tweet):
    """Classify one tokenised tweet with the trained classifier.

    Args:
        tweet: The tweet as a list of word tokens.

    Returns:
        tuple: ``(tweet, label)`` where ``label`` is the value returned by
        ``classifier.classify`` for the tweet's features.
    """
    # Wrapped as a one-document list with an empty label, the same
    # (tokens, label) shape the analyzer was given for training.
    sentence = [(tweet, '')]
    testSet = sentimentAnalyzer.apply_features(sentence)

    # Classify once and reuse the label for both the log line and the result.
    label = classifier.classify(testSet[0][0])

    print(tweet, label)

    return (tweet, label)

In [None]:
def getTweetText(rdd):
    """Tokenise and classify the first text found in ``rdd``.

    ``rdd`` is simply iterated; only its first item is processed and any
    further items are ignored.

    Args:
        rdd: Iterable of tweet text strings.

    Returns:
        The result of ``tweetClassifier`` for the first item, or ``None``
        when ``rdd`` yields nothing.
    """
    # A private sentinel distinguishes "no items" from an item that is None.
    nothing = object()
    firstText = next(iter(rdd), nothing)
    if firstText is nothing:
        return None

    stripped = firstText.strip()
    punctuationTable = str.maketrans(dict.fromkeys(string.punctuation))
    tokens = [token.lower() for token in stripped.translate(punctuationTable).split(' ')]

    return tweetClassifier(tokens)

In [None]:
# Accumulator filled by rddOutput: one [timestamp, counts] entry per processed batch.
results = []

In [None]:
def rddOutput(rdd):
    """Count the predicted labels of one batch and record the tally.

    Every element of ``rdd`` is classified through ``getTweetText``; the
    labels are counted with ``reduceByKey`` and collected. The entry
    ``[timestamp, counts]`` is appended to the module-level ``results``
    list and printed.

    Args:
        rdd: Batch RDD whose elements are accepted by ``getTweetText``.
    """
    global results

    labelCounts = (
        rdd.map(lambda item: (getTweetText(item)[1], 1))
           .reduceByKey(add)
           .collect()
    )

    # The timestamp is taken after the counts are collected.
    entry = [time.strftime('%I:%M:%S'), list(labelCounts)]
    results.append(entry)

    print(entry)

In [None]:
# Run rddOutput on every batch; the first callback argument is ignored.
streamCoord.foreachRDD(lambda t, rdd: rddOutput(rdd))

### Start Streaming

In [None]:
# Start the streaming computation defined above.
ssc.start()

In [None]:
# Flag that keeps the wait loop in the next cell running.
count = True

In [None]:
# Wait until more than five batch entries have been recorded in `results`.
# Sleeping between checks avoids spinning a CPU core while the streaming
# callbacks fill `results`.
while count:
    if len(results) > 5:
        count = False
    else:
        time.sleep(1)

In [None]:
# Output path named after the current time.
# NOTE(review): '%I' is the 12-hour clock, so names can repeat between AM
# and PM — confirm whether that matters here.
outputFile = 'aux/' + time.strftime('%I%M%S')

In [None]:
# Turn the collected [timestamp, counts] entries into an RDD.
rddResults = sc.parallelize(results)

In [None]:
# Write the entries out as text under outputFile.
rddResults.saveAsTextFile(outputFile)

In [None]:
# Display the saved entries.
rddResults.collect()

In [None]:
# Stop the streaming context.
# NOTE(review): presumably this also stops the underlying SparkContext by
# default — confirm against the PySpark docs.
ssc.stop()