In [337]:
# encoding: utf-8
import fileinput
import json
import copy
import nltk
import pprint
from nltk.stem import WordNetLemmatizer
import re
import string
from collections import Counter
import urllib
from nltk.corpus import sentiwordnet as swn
from contractions import CONTRACTION_MAP
import numpy as np
import pandas as pd
from classifier import csv2listdict
from nltk.corpus import stopwords
from random import shuffle
from preprocess import preprocess
from clean_text import get_text_sanitized
from sklearn.feature_extraction.text import CountVectorizer


In [338]:
tweets_emotion_file = './data/text_emotion.csv'
features = ['tweet_id', 'sentiment', 'text']
tweets_emotions = pd.read_csv(tweets_emotion_file)

In [339]:
stopword_list = nltk.corpus.stopwords.words("english")

def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'[?|$|&|*|%|@|(|)|~]' # add other characters here to remove them
        filtered_sentence = re.sub(PATTERN, r'', sentence).lower()
    else:
        PATTERN = r'[^a-zA-Z0-9 ]' # only extract alpha-numeric characters
        filtered_sentence = re.sub(PATTERN, r'', sentence).lower()
    return filtered_sentence


def sanitize_content(text):
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
    return text


def nostop_content(text):
    text = ' '.join([word for word in text.split() if word not in stopword_list])
    return text.lower()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word, 'n')
    return lemma


def lemmatize_content(text):
    tmp = []
    for word in text.split():
        tmp.append(lemmatize(word))
    return ' '.join(tmp)


# for t in tweets:
#     not_a_stopword = []
#     for word in t['text_tokenized']:
#         word = lemmatize(word)
#         if word not in stopword_list:
#             not_a_stopword.append(word)
#     t['text_tokens'] = not_a_stopword

lemmatizer = WordNetLemmatizer()   

content_sanitizer = lambda x: sanitize_content(x)
content_stopwordsremover = lambda x: nostop_content(x)
content_lemmatizer = lambda x: lemmatize_content(x)

tweets_emotions['sanitized_content'] = tweets_emotions['content'].apply(content_sanitizer)
tweets_emotions['sanitized_content'] = tweets_emotions['sanitized_content'].apply(content_stopwordsremover)
tweets_emotions['lemmatized_content'] = tweets_emotions['sanitized_content'].apply(content_lemmatizer)

tweets_emotions['sentiment_num'] = tweets_emotions.sentiment.map({'neutral':0, 
                                                                  'worry':1, 
                                                                  'happiness':2,
                                                                  'sadness':3,
                                                                  'love':4,
                                                                  'surprise':5,
                                                                  'fun':6,
                                                                  'relief':7,
                                                                  'hate':8,
                                                                  'empty':9,
                                                                  'enthusiasm':10,
                                                                  'boredom':11,
                                                                  'anger':12
                                                                 })

In [340]:
from sklearn.model_selection import train_test_split

X = tweets_emotions.lemmatized_content
y = tweets_emotions.sentiment_num

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(32000,)
(8000,)
(32000,)
(8000,)


In [350]:
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit(X_train)
X_train_bow = vectorizer.transform(X_train)
X_test_bow = vectorizer.transform(X_test)


In [356]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_bow, y_train)

# make class predictions for X_test_bow
y_predict_class = nb.predict(X_test_bow)

# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_predict_class)

0.31962499999999999

In [357]:
print(y_test.value_counts())

0     1740
1     1666
3     1046
2     1028
4      762
5      425
7      352
6      338
8      268
10     163
9      162
11      31
12      19
Name: sentiment_num, dtype: int64


In [298]:
def read_tweets(filenames):
    # https://stackoverflow.com/questions/24754861/unicode-file-with-python-and-fileinput
    # fileinput.input(filename, openhook=fileinput.hook_encoded("utf-8")).
    # raw = url.read().decode('windows-1252')
    tweets = []
    with fileinput.input((filenames)) as f:
        for line in f:
            tweet = json.loads(line)
            tweet['text_tokenized'] = preprocess(tweet['text'])
            tweets.append(tweet)
        return tweets
    


In [299]:
filenames = ['data/MelbourneTweets0.txt', 
             'data/MelbourneTweets2.txt']
tweets = read_tweets(filenames)

In [300]:
# # Source code from https://github.com/dipanjanS/text-analytics-with-python
# def expand_contractions(sentence, contraction_mapping):
#     contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL)

#     def expand_match(contraction):
#         match = contraction.group(0)
#         first_char = match[0]
#         expanded_contraction = contraction_mapping.get(match)\
#                                 if contraction_mapping.get(match)\
#                                 else contraction_mapping.get(match.lower())
#         expanded_contraction = first_char+expanded_contraction[1:]
#         return expanded_contraction

#     expanded_sentence = contractions_pattern.sub(expand_match, sentence)
#     return expanded_sentence

# for t in tweets:
#     # https://stackoverflow.com/questions/7395789/replacing-a-weird-single-quote-with-blank-string-in-python
#     t['text_expanded'] = expand_contractions(t['text'], CONTRACTION_MAP)
#     t['text_removed'] = re.sub(r'http\S+', '', t['text_expanded'])
#     t['text_removed'] = re.sub(r'\n', '', t['text_removed'])
#     t['text_removed'] = re.sub(r'RT', '', t['text_removed'])

# # this isn't working due to encoding issue
# # more on this later
# print(expand_contractions("next time someone asks why i'm moving i'll reply", CONTRACTION_MAP))
# print(tweets[3]['text'])
# expand_contractions(tweets[3]['text'], CONTRACTION_MAP)


In [301]:
# # handles @mention, make lowercase
# tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case=False, 
#                                             strip_handles=True, 
#                                             reduce_len=True)

# for t in tweets:
#     t['text_tokenized'] = tknzr.tokenize(t['text_removed'])
# # tweets_tokenized = [tknzr.tokenize(tweet) for tweet in tweets_minus_escape]

In [302]:

# tweets_minus_stop = []
stopword_list = nltk.corpus.stopwords.words('english')

for t in tweets:
    not_a_stopword = []
    for word in t['text_tokenized']:
        word = lemmatize(word)
        if word not in stopword_list:
            not_a_stopword.append(word)
    t['text_tokens'] = not_a_stopword


In [303]:
token_counter = Counter()
for t in tweets:
    for x in set(t['text_tokens']):
        token_counter[x] += 1

In [304]:
# remove punctuations
# have to get encoding right to resolve this issue
# tweets_final = []
# for tweet in tweets_minus_stop:
#     tweet = [''.join(c for c in s if c not in string.punctuation) for s in tweet]
#     tweet = [t for t in tweet if t]
#     tweets_final.append(tweet)

In [195]:
regex_hashtag = re.compile(r'(?:\A|\s)#([a-z]{1,})(?:\Z|\s)')
print(tweets_emotion[32495]['text'])

# not understanding why only one hashtag is deletd
def remove_hashtag(tweets, regex):
    for t in tweets:
        t['text'] = re.sub(regex_hashtag, '', t['text'])
#         re.sub(regex_hashtag, '', t['text'])
    return tweets

remove_hashtag(tweets_emotion, regex_hashtag)
print(tweets_emotion[32495]['text'])


@heatedskates That may be, I still don't like hearing his name so much.   #blackhawks #canucks
@heatedskates That may be, I still don't like hearing his name so much.  #canucks


In [196]:
# handles @mention, make lowercase
tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case=False, 
                                            strip_handles=True, 
                                            reduce_len=True)
for t in tweets_emotion:
    t['text_tokenized'] = tknzr.tokenize(t['text'])


In [292]:


# print(tweets_emotion[32495]['text_tokenized'])

# for tweet in tweets_emotion:
#     for token in tweet['text_tokenized']:
#         token = lemmatize(token)

# print(tweets_emotion[32495]['text_tokenized'])

In [198]:
stops = set(stopwords.words('english'))

for t in tweets_emotion:
    not_a_stopword = []
    for word in t['text_tokenized']:
        word = lemmatize(word)
        if word not in stops:
            not_a_stopword.append(word)
    t['text_tokens'] = not_a_stopword

In [210]:
tweets_emotion[3].keys()

dict_keys(['tweet_id', 'sentiment', 'text', 'text_tokenized', 'text_tokens'])