**General Imports**

In [15]:
from operator import add
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from requests_oauthlib import OAuth1Session
from time import gmtime, strftime

import ast
import json
import nltk
import requests
import requests_oauthlib
import string
import time

**NLTK Imports**

In [16]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

import nltk

In [17]:
# Batch duration handed to the StreamingContext created below
# (PySpark interprets this value in seconds).
BATCH_INTERVAL = 5

In [18]:
# Bind `sc` explicitly instead of relying on a kernel that happens to predefine it:
# getOrCreate() returns the already-running SparkContext when there is one and only
# creates a new context on a fresh kernel, so this cell survives Restart & Run All.
sc = SparkContext.getOrCreate()

# Streaming context that cuts the incoming stream into BATCH_INTERVAL-sized micro-batches.
ssc = StreamingContext(sc, BATCH_INTERVAL)

### Training the Sentiment Analysis Classifier

An essential part of building a sentiment analysis algorithm (or any data mining algorithm) is having a comprehensive dataset, or "corpus", to learn from, as well as a separate set of test data to verify that the accuracy of your algorithm meets the standard you expect.

Having both also lets you tune the algorithm: instead of relying on a generic approach, you can identify which natural-language features extracted from the text actually contribute to the sentiment classification and use them to predict more accurately.

We will use the training dataset provided by the University of Michigan for Kaggle competitions: https://inclass.kaggle.com/c/si650winter11

This dataset contains 1,578,627 classified tweets and each row is marked as:

- 1 for positive feeling
- 0 for negative feeling

In [19]:
# Load the labelled-tweet CSV as an RDD with one element per text line.
# The path is relative to the directory the notebook is run from.
file = sc.textFile('aux/datasets/sentiment-analysis.csv')

In [20]:
# The first line of the file is the CSV header; remember it so it can be filtered out.
# first() returns that line directly and raises a descriptive "RDD is empty" error on an
# empty file, instead of the bare IndexError that take(1)[0] would produce.
header = file.first()

In [21]:
# Drop the header row: keep only the lines whose text differs from the header.
dataset = file.filter(lambda line: line != header)

In [22]:
# Peek at the first three data rows to check the column layout.
dataset.take(3)

['1,0,Sentiment140,                     is so sad for my APL friend.............',
 '2,0,Sentiment140,                   I missed the New Moon trailer...',
 '3,1,Sentiment140,              omg its already 7:30 :O']

In [23]:
def clearLine(line):
    """Turn one raw CSV row into a ``(tokens, sentiment)`` pair.

    Rows look like ``'3,1,Sentiment140,   omg its already 7:30 :O'``: the
    second field is the sentiment label and the fourth is the tweet text.

    Args:
        line: One data row of the CSV file (not the header).

    Returns:
        A tuple ``(tokens, sentiment)`` where ``tokens`` is the list of
        lower-cased words of the tweet with ASCII punctuation removed, and
        ``sentiment`` is the label exactly as it appears in the row (a string).

    Raises:
        IndexError: If the row has fewer than four comma-separated fields.
    """
    # Split on the first three commas only: the tweet text is the last field
    # and may itself contain commas, which must not truncate it.
    columns = line.split(',', 3)
    sentiment = columns[1]

    # Deletion table: every ASCII punctuation character is removed by translate().
    translator = str.maketrans('', '', string.punctuation)
    tweet = columns[3].translate(translator)

    # split() without an argument trims the ends and collapses runs of whitespace,
    # so removing punctuation never leaves empty-string tokens behind.
    tweetLower = [word.lower() for word in tweet.split()]

    return (tweetLower, sentiment)

In [24]:
# Parse every raw CSV row into a (tokens, sentiment) pair.
datasetTraining = dataset.map(clearLine)

In [95]:
# Inspect three parsed (tokens, sentiment) pairs.
datasetTraining.take(3)

[(['is', 'so', 'sad', 'for', 'my', 'apl', 'friend'], '0'),
 (['i', 'missed', 'the', 'new', 'moon', 'trailer'], '0'),
 (['omg', 'its', 'already', '730', 'o'], '1')]

In [96]:
# Make sure the NLTK stop-word corpus read by stopwords.words() below is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/caio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [97]:
# English stop words, each followed by its '_NEG' variant, so that stop words tagged by
# mark_negation() are filtered out as well. Each word contributes exactly one pair; the
# previous version also looped over the word's characters and repeated every pair
# len(word) times.
stopWords = [
    variant
    for word in stopwords.words('english')
    for variant in (word, word + '_NEG')
]

In [98]:
# Analyzer object that collects the vocabulary, holds the feature extractor and drives
# training in the cells below.
sentimentAnalyzer = SentimentAnalyzer()

In [99]:
# Bring the first 10,000 parsed tweets to the driver as a plain Python list; the
# vocabulary and the classifier below are built from this sample only.
# NOTE(review): take() returns the leading rows, not a random sample — confirm the file
# is not ordered by sentiment.
datasetTrainingSample = datasetTraining.take(10000)

In [100]:
# Tag negated tokens in every (tokens, sentiment) document, then flatten all documents
# into a single list of words.
wordsNegative = sentimentAnalyzer.all_words(
    [mark_negation(document) for document in datasetTrainingSample]
)

# Drop stop words. The membership test runs once per token, so look words up in a
# frozenset instead of scanning the stopWords list every time.
stopWordSet = frozenset(stopWords)
wordsNegative = [word for word in wordsNegative if word not in stopWordSet]

In [101]:
# Choose the unigram vocabulary from the filtered word list, capped at 200 entries.
unigramFeatures = sentimentAnalyzer.unigram_word_feats(wordsNegative, top_n = 200)

In [102]:
# Register extract_unigram_feats with the chosen vocabulary as the analyzer's feature extractor.
# NOTE(review): re-running this cell presumably registers the same extractor a second
# time — confirm, or recreate sentimentAnalyzer before re-running.
sentimentAnalyzer.add_feat_extractor(extract_unigram_feats, unigrams = unigramFeatures)

In [103]:
# Run the registered feature extractor over the sampled documents to build the training set.
trainingSet = sentimentAnalyzer.apply_features(datasetTrainingSample)

### Model Training

In [106]:
# Training callable handed to sentimentAnalyzer.train() in the next cell.
trainer = NaiveBayesClassifier.train

In [107]:
# Fit the Naive Bayes classifier on the training set; the call prints "Training classifier".
classifier = sentimentAnalyzer.train(trainer, trainingSet)

Training classifier


In [111]:
# Hand-written, already tokenised sentences in the same (tokens, label) shape as the
# training documents; the label is left empty, presumably because it is what we want
# the classifier to predict.
testSentence01 = [(['this', 'program', 'is', 'bad'], '')]
testSentence02 = [(['tough', 'day', 'at', 'work', 'today'], '')]
testSentence03 = [(['that', 'place', 'is', 'awesome'], '')]

In [114]:
# Convert each test sentence with the same feature extractor used for training.
testSet01, testSet02, testSet03 = (
    sentimentAnalyzer.apply_features(sentence)
    for sentence in (testSentence01, testSentence02, testSentence03)
)