In [66]:
# encoding: utf-8
import fileinput
import json
import pprint

import re
import string

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from collections import Counter
from contractions import CONTRACTION_MAP

import numpy as np
import pandas as pd

from preprocess import preprocess
from clean_text import get_text_sanitized


In [67]:
tweets_emotion_file = './data/text_emotion.csv'
features = ['tweet_id', 'sentiment', 'text']
tweets_emotions = pd.read_csv(tweets_emotion_file)

In [68]:
stopword_list = nltk.corpus.stopwords.words("english")

def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'[?|$|&|*|%|@|(|)|~]' # add other characters here to remove them
        filtered_sentence = re.sub(PATTERN, r'', sentence).lower()
    else:
        PATTERN = r'[^a-zA-Z0-9 ]' # only extract alpha-numeric characters
        filtered_sentence = re.sub(PATTERN, r'', sentence).lower()
    return filtered_sentence


def sanitize_content(text):
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
    return text


def nostop_content(text):
    text = ' '.join([word for word in text.split() if word not in stopword_list])
    return text.lower()


def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word, 'n')
    return lemma


def lemmatize_content(text):
    tmp = []
    for word in text.split():
        tmp.append(lemmatize(word))
    return ' '.join(tmp)

def int64toint32(x):
    x = x.astype(np.int32)
    return x

lemmatizer = WordNetLemmatizer()   

content_sanitizer = lambda x: sanitize_content(x)
content_stopwordsremover = lambda x: nostop_content(x)
content_lemmatizer = lambda x: lemmatize_content(x)
type_converter = lambda x: int64toint32(x)
         
tweets_emotions['sanitized_content'] = tweets_emotions['content'].apply(content_sanitizer)
tweets_emotions['sanitized_content'] = tweets_emotions['sanitized_content'].apply(content_stopwordsremover)
tweets_emotions['lemmatized_content'] = tweets_emotions['sanitized_content'].apply(content_lemmatizer)

# tweets_emotions['sentiment_num'] = tweets_emotions.sentiment.map({'neutral':0, 
#                                                                   'worry':1, 
#                                                                   'happiness':2,
#                                                                   'sadness':3,
#                                                                   'love':4,
#                                                                   'surprise':5,
#                                                                   'fun':6,
#                                                                   'relief':7,
#                                                                   'hate':8,
#                                                                   'empty':9,
#                                                                   'enthusiasm':10,
#                                                                   'boredom':11,
#                                                                   'anger':12
#                                                                  })

In [69]:
from sklearn.model_selection import train_test_split

X = tweets_emotions.lemmatized_content
y = tweets_emotions.sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(32000,)
(8000,)
(32000,)
(8000,)


In [70]:
count_vect = CountVectorizer()
X_train_bow = count_vect.fit(X_train)
X_train_bow = count_vect.transform(X_train)
X_test_bow = count_vect.transform(X_test)

tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_bow)
X_train_tfidf = tfidf_transformer.transform(X_train_bow)
X_train_tfidf.shape

clf = MultinomialNB().fit(X_train_tfidf, y_train)
y_predict_class = clf.predict(X_test_bow)

from sklearn import metrics
metrics.accuracy_score(y_test, y_predict_class)

0.31337500000000001

In [71]:
print(y_test.value_counts())

neutral       1740
worry         1666
sadness       1046
happiness     1028
love           762
surprise       425
relief         352
fun            338
hate           268
enthusiasm     163
empty          162
boredom         31
anger           19
Name: sentiment, dtype: int64


In [72]:
def read_tweets(filenames):
    # https://stackoverflow.com/questions/24754861/unicode-file-with-python-and-fileinput
    # fileinput.input(filename, openhook=fileinput.hook_encoded("utf-8")).
    # raw = url.read().decode('windows-1252')
    tweets = []
    with fileinput.input((filenames)) as f:
        for line in f:
            tweet = json.loads(line)
            tweet['text_tokenized'] = preprocess(tweet['text'])
            tweets.append(tweet)
        return tweets
    


In [73]:
filenames = ['data/MelbourneTweets0.txt', 
             'data/MelbourneTweets2.txt']
tweets = read_tweets(filenames)

In [74]:
# handles @mention, make lowercase
tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case=False, 
                                            strip_handles=True, 
                                            reduce_len=True)
regexp_hashtag = re.compile(r'(?:\A|\s)#([a-z]{1,})(?:\Z|\s)')
regexp_url = re.compile(r"http\S+")


for t in tweets:
    tokenized = tknzr.tokenize(t['text'])
    not_a_stopword = []
    for word in tokenized:
        word = lemmatize(word)
        if word not in stopword_list:
            not_a_stopword.append(word)
    t['text_tokens'] = ' '.join(not_a_stopword)
    predict_new = clf.predict(count_vect.transform([t['text_tokens']]))
    t['text_sentiment'] = predict_new.tolist()[0]


In [75]:
# regex_hashtag = re.compile(r'(?:\A|\s)#([a-z]{1,})(?:\Z|\s)')
# print(tweets_emotion[32495]['text'])

# # not understanding why only one hashtag is deletd
# def remove_hashtag(tweets, regex):
#     for t in tweets:
#         t['text'] = re.sub(regex_hashtag, '', t['text'])
# #         re.sub(regex_hashtag, '', t['text'])
#     return tweets

# remove_hashtag(tweets_emotion, regex_hashtag)
# print(tweets_emotion[32495]['text'])


In [76]:
# remove punctuations
# have to get encoding right to resolve this issue
# tweets_final = []
# for tweet in tweets_minus_stop:
#     tweet = [''.join(c for c in s if c not in string.punctuation) for s in tweet]
#     tweet = [t for t in tweet if t]
#     tweets_final.append(tweet)

# print(tweets_emotion[32495]['text_tokenized'])

# for tweet in tweets_emotion:
#     for token in tweet['text_tokenized']:
#         token = lemmatize(token)

# print(tweets_emotion[32495]['text_tokenized'])

In [82]:
dict1 = tweets[3298]
# dict1 = { 'name' : 'song', 'age' : 10 }

# print("dict1 = {0}".format(dict1))
# print("dict1 type = {0}".format(type(dict1)))
# print("================")


# CONVERT dictionary to json using json.dump

json_val = json.dumps(dict1)
tweets_json = json.dumps(tweets)

with open('output.json', 'w') as outfile:
    json.dump(json_val, outfile)
    
    
json = json.dumps(dict)
f = open("dict.json","w")
f.write(json)
f.close()
