##### Used Source: https://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/

## Import Modules & Libraries

In [None]:
import sys
import nltk
import re
# http://tweepy.readthedocs.io/en/v3.5.0/index.html
import tweepy
from tweepy import OAuthHandler
# https://pandasguide.readthedocs.io/en/latest/
import pandas as pd
# https://numpy.readthedocs.io/en/latest/
import numpy as np
# https://api.mongodb.com/python/current/
import pymongo
# Helps to save trained classifier to a file and load again
import pickle
# Work with csv files
import csv
csv.field_size_limit(sys.maxsize)
import pymongo
from wordcloud import WordCloud,STOPWORDS

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Prepare Train Data
### Open Data Source and format it properly for further use

In [None]:
training_data = []

# Replace the numeric labels of the CSV by readable class names:
# '0' => 'negative', anything else => 'positive'.
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields reach the reader untouched.
with open('./files/tweets/train_data_32k_tweets.csv', 'rt', newline='') as csv_data:
    reader = csv.reader(csv_data, delimiter=';')
    for labeled_tweet in reader:
        # csv.reader yields [] for blank lines; rows without both a tweet
        # (index 0) and a label (index 1) would raise an IndexError below.
        if len(labeled_tweet) < 2:
            continue
        labeled_tweet[1] = 'negative' if labeled_tweet[1] == '0' else 'positive'
        training_data.append(labeled_tweet)

### Define Helper Functions for Data Preparation

In [None]:
# Clean tweet and remove links and other unwanted information.
# The pattern is a raw string so that the regex escapes are not parsed as
# (invalid) string escapes, and it is compiled once because it is applied to
# every single tweet.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def clean_tweet(tweet):
    """Replace unwanted parts of a tweet by single spaces.

    Each of the following is substituted by one space: an ``@`` followed by
    ASCII letters/digits, a ``scheme://...`` link up to the next whitespace,
    and any character that is not an ASCII letter, a digit, a space or a tab.

    Args:
        tweet: the raw tweet text.

    Returns:
        The cleaned text. Runs of spaces are not collapsed.
    """
    return _TWEET_NOISE_RE.sub(" ", tweet)


# Break one tweet down into its individual tokens
def tokenize_tweet(tweet):
    """Return the tokens NLTK's ``word_tokenize`` produces for ``tweet``."""
    tokens = nltk.tokenize.word_tokenize(tweet)
    return tokens


# remove common english stopwords from a tweet (i.e. 'and', 'this', 'or', 'i')
# Cache for the default stopword set so the corpus is read only once instead
# of once per tweet.
_ENGLISH_STOPWORDS = None


def remove_stopwords(tweet, stopwords=None):
    """Remove stopwords from a tokenized tweet in place.

    The previous implementation called ``tweet.remove`` while iterating over
    ``tweet``; that shifts the remaining items and makes the loop skip the
    element after every removal, so consecutive stopwords survived.

    Args:
        tweet: list of tokens; it is modified in place.
        stopwords: optional container of words to drop (anything supporting
            ``in``). Defaults to NLTK's English stopword list.

    Returns:
        The same list object that was passed in, without the stopwords.
    """
    global _ENGLISH_STOPWORDS
    if stopwords is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
        stopwords = _ENGLISH_STOPWORDS
    # Slice assignment keeps the caller's list object, which the notebook
    # relies on (it ignores the return value and reads the mutated list).
    tweet[:] = [word for word in tweet if word not in stopwords]
    return tweet

### Clean, Tokenize and Remove Stopwords from Tweets

In [None]:
# Clean every tweet: strip unwanted characters such as commas, links and
# @-mentions. The result is a list of [tweet_text, label] lists with the text
# at index 0 and the label ('negative' or 'positive') at index 1.
cleaned_tweets = [[clean_tweet(labeled_tweet[0]), labeled_tweet[1]] for labeled_tweet in training_data]

# Tokenize every tweet; each entry becomes a (tokens, label) tuple.
# The text is lower-cased first: NLTK's English stopword list is lower-case
# and the real-world tweets further down are lower-cased before they are
# classified, so without this 'Love' and 'love' would be different features
# and capitalised stopwords ('I', 'The') would slip through the filter.
# NOTE: a classifier pickled before this change was trained on mixed-case
# features and should be retrained.
tokenized_clnd_tweets = [
    (tokenize_tweet(text.lower()), label.lower())
    for text, label in cleaned_tweets
]

# removing stopwords (remove_stopwords edits each token list in place)
for (tweet, _) in tokenized_clnd_tweets:
    remove_stopwords(tweet)

formatted_train_data = tokenized_clnd_tweets
# printing an example to show what the data looks like now
print(formatted_train_data[0])

### Define Feature Extractor Methods

In [None]:
# Flatten the word lists of all tweets into one single list of words
def get_words_in_tweets(tweets):
    """Return every word of every ``(words, sentiment)`` pair, in order."""
    return [word for words, _sentiment in tweets for word in words]

# Use NLTK to collect the features (i.e. the distinct words) of all tweets.
# The classifier relies on these features to classify tweets.
def get_word_features(wordlist):
    """Return the keys of an ``nltk.FreqDist`` built from ``wordlist``."""
    return nltk.FreqDist(wordlist).keys()

# This method is used by the classifier to extract the features from every tweet and use them for training.
def extract_features(document, features=None):
    """Build the feature dictionary of one tokenized tweet.

    Args:
        document: iterable of the tweet's tokens.
        features: optional iterable of vocabulary words to test for. When
            omitted, the module-level ``word_features`` is used, so existing
            one-argument calls (including the one made through
            ``nltk.classify.apply_features``) behave exactly as before.

    Returns:
        A dict with one ``'contains(<word>)'`` key per vocabulary word,
        mapped to ``True`` when the word occurs in ``document`` and to
        ``False`` otherwise.
    """
    if features is None:
        features = word_features
    document_words = set(document)
    return {'contains(%s)' % word: (word in document_words) for word in features}

#### Extract Word Features 

In [None]:
# Collect the words of all training tweets and derive the word features from them.
# The result is stored in the module-level name `word_features`, which
# extract_features() reads when it builds the feature dictionary of a tweet.
word_features = get_word_features(get_words_in_tweets(formatted_train_data))

#### Define the Training Data

In [None]:
# Show how many prepared training tweets exist, i.e. the upper bound for the `size` chosen in the next cell
print("Maximum 'size' allowed is: %s" % len(formatted_train_data))

In [None]:
# Number of training tweets handed to the classifier.
# Be warned: a bigger dataset makes training take longer but trains the
# classifier far better. A smaller dataset finishes training earlier, but the
# resulting classifier will not be as good.
size = 500

# Take the first `size` prepared tweets and let NLTK apply extract_features
# to them, which prepares the data the classifier is trained on.
training_subset = formatted_train_data[:size]
training_set = nltk.classify.apply_features(extract_features, training_subset)

## Train Classifier

In [None]:
# ETT (Estimated Time of Training) = depends on the size of the training set, the Docker config as well as the hardware
# Skip if a trained classifier is available as a saved file => go to 'Load classifier from file'
# Trains an NLTK Naive Bayes classifier on `training_set`
classifier = nltk.NaiveBayesClassifier.train(training_set)

### Save the trained Classifier

In [None]:
# Persist the trained classifier with pickle. The `with` block closes the file
# even when pickle.dump raises, so the handle can no longer leak.
# NOTE(review): the load cell further down reads
# './files/sentiment_classifier.pickle', i.e. a different file name — confirm
# whether that is intended.
with open('./files/sentiment_clf.pickle', 'wb') as classifier_file:
    pickle.dump(classifier, classifier_file)

### Load saved Classifier from file (skips training if trained classifier already saved as a file)

In [None]:
# Load a previously trained classifier from file into the `classifier` variable.
# The `with` block closes the file even when unpickling fails.
# NOTE(review): this reads 'sentiment_classifier.pickle', whereas the save cell
# above writes 'sentiment_clf.pickle' — confirm which file is meant to be loaded.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with open('./files/sentiment_classifier.pickle', 'rb') as classifier_file:
    classifier = pickle.load(classifier_file)

## Test Classifier

In [None]:
tweet_student1 = 'I hate FH JOANNEUM'
tweet_student2 = 'FH JOANNEUM is awesome'

# Classify two hand-written example tweets; they are split on whitespace only.
print("Student1's sentiment:", classifier.classify(extract_features(tweet_student1.split())).upper())
print("Student2's sentiment:", classifier.classify(extract_features(tweet_student2.split())).upper())
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
classifier.show_most_informative_features(10)

## Show WordCloud of most informative features

In [None]:
number_of_words = 50

# Each informative feature is named 'contains(<word>)'; keep only the <word> part
feature_name_pattern = re.compile(r'^.*contains\((.*)\).*$', re.M)
wordcloud_data = [
    feature_name_pattern.match(feature[0]).group(1)
    for feature in classifier.most_informative_features(number_of_words)
]

def wordcloud_draw(data, color = 'white'):
    """Show the words in ``data`` as a word cloud on a ``color`` background.

    ``data`` is an iterable of strings; they are joined with single spaces
    and handed to ``WordCloud.generate``. The image is displayed with
    matplotlib and nothing is returned.
    """
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color=color,
        width=2500,
        height=2000,
    )
    image = cloud.generate(' '.join(data))

    plt.figure(1, figsize=(13, 13))
    plt.imshow(image)
    plt.axis('off')
    plt.show()
    
# Draw the extracted feature words as a word cloud on a white background
wordcloud_draw(wordcloud_data,'white')

## Test Classifier with real world Tweets
### Set-up MongoDB Connection

In [None]:
# Connection string that load_from_mongo() passes to pymongo.MongoClient
MONGO_URL = 'mongodb://twitter-mongodb:27017/'

In [None]:
def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False, criteria=None, projection=None):
    """Read documents from a MongoDB collection.

    Optionally, use criteria and projection to limit the data that is
    returned - http://docs.mongodb.org/manual/reference/method/db.collection.find/

    Args:
        mongo_db: name of the database on the server at ``MONGO_URL``.
        mongo_db_coll: name of the collection inside that database.
        return_cursor: when true, return the pymongo cursor instead of a
            list. Recommended for large amounts of data.
        criteria: query filter passed to ``find``; ``None`` matches everything.
        projection: optional projection passed to ``find``.

    Returns:
        A list with all matching documents, or the cursor itself when
        ``return_cursor`` is true.
    """
    # Connect to the MongoDB server configured in MONGO_URL
    client = pymongo.MongoClient(MONGO_URL)
    # Reference the requested collection in the requested database
    coll = client[mongo_db][mongo_db_coll]
    # find() treats projection=None the same as no projection at all
    cursor = coll.find({} if criteria is None else criteria, projection)

    if return_cursor:
        # The cursor fetches its data through `client`, so the connection
        # has to stay open for the caller.
        return cursor

    try:
        return list(cursor)
    finally:
        # All documents are in memory now; release the connection instead of
        # leaking one client per call.
        client.close()

In [None]:
# Name of the MongoDB database the tweets are read from.
# Use either 'trump' (Trump's tweets) or 'trends' (trend tweets).
database = 'trump'

# Fetch all documents of the 'tweets' collection and keep only their text
data = load_from_mongo(database, 'tweets')
tweets_from_mongo = [document['text'] for document in data]

### Remove unwanted characters from Tweets

In [None]:
# Run every fetched tweet text through clean_tweet
tweets = list(map(clean_tweet, tweets_from_mongo))

### Tokenize Tweets and remove stopwords

In [None]:
# Lower-case and tokenize every cleaned tweet, then drop its stopwords.
# remove_stopwords() edits the token list it receives and returns that same
# list, so it can be applied directly while the list is being built.
tokenized_tweets = [
    remove_stopwords(nltk.tokenize.word_tokenize(text.lower()))
    for text in tweets
]

# print the number of available tweets to know the valid range for the next code snippet
print("Available numbers of tweets: %s" % len(tokenized_tweets))

### Test the Classifier with one of the fetched Tweets

In [None]:
# Change the tweet number to analyse a different one of the fetched tweets
tweet_nr = 99

trump_tweet = tokenized_tweets[tweet_nr]
# Show the cleaned text first, then the label the classifier assigns to its tokens
print(tweets[tweet_nr])
trump_sentiment = classifier.classify(extract_features(trump_tweet))
print("Trump's Sentiment about the Tweet:", trump_sentiment.upper())