In [271]:
# encoding: utf-8
import fileinput
import json
import copy
import nltk
import pprint
from nltk.stem import WordNetLemmatizer
import re
import string
from collections import Counter
import urllib
from nltk.corpus import sentiwordnet as swn
from contractions import CONTRACTION_MAP
import numpy as np
import pandas as pd
from classifier import csv2listdict
from nltk.corpus import stopwords
from random import shuffle
from preprocess import preprocess
from clean_text import get_text_sanitized

In [243]:
def read_tweets(filenames):
    # https://stackoverflow.com/questions/24754861/unicode-file-with-python-and-fileinput
    # fileinput.input(filename, openhook=fileinput.hook_encoded("utf-8")).
    # raw = url.read().decode('windows-1252')
    tweets = []
    with fileinput.input((filenames)) as f:
        for line in f:
            tweet = json.loads(line)
            tweet['text_tokenized'] = preprocess(tweet['text'])
            tweets.append(tweet)
        return tweets
    


In [244]:
filenames = ['data/MelbourneTweets0.txt', 
             'data/MelbourneTweets2.txt']
tweets = read_tweets(filenames)

In [245]:
# # Source code from https://github.com/dipanjanS/text-analytics-with-python
# def expand_contractions(sentence, contraction_mapping):
#     contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL)

#     def expand_match(contraction):
#         match = contraction.group(0)
#         first_char = match[0]
#         expanded_contraction = contraction_mapping.get(match)\
#                                 if contraction_mapping.get(match)\
#                                 else contraction_mapping.get(match.lower())
#         expanded_contraction = first_char+expanded_contraction[1:]
#         return expanded_contraction

#     expanded_sentence = contractions_pattern.sub(expand_match, sentence)
#     return expanded_sentence

# for t in tweets:
#     # https://stackoverflow.com/questions/7395789/replacing-a-weird-single-quote-with-blank-string-in-python
#     t['text_expanded'] = expand_contractions(t['text'], CONTRACTION_MAP)
#     t['text_removed'] = re.sub(r'http\S+', '', t['text_expanded'])
#     t['text_removed'] = re.sub(r'\n', '', t['text_removed'])
#     t['text_removed'] = re.sub(r'RT', '', t['text_removed'])

# # this isn't working due to encoding issue
# # more on this later
# print(expand_contractions("next time someone asks why i'm moving i'll reply", CONTRACTION_MAP))
# print(tweets[3]['text'])
# expand_contractions(tweets[3]['text'], CONTRACTION_MAP)


In [246]:
# # handles @mention, make lowercase
# tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case=False, 
#                                             strip_handles=True, 
#                                             reduce_len=True)

# for t in tweets:
#     t['text_tokenized'] = tknzr.tokenize(t['text_removed'])
# # tweets_tokenized = [tknzr.tokenize(tweet) for tweet in tweets_minus_escape]

In [247]:
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word, 'n')
    return lemma

lemmatizer = WordNetLemmatizer()
# tweets_minus_stop = []
stopword_list = nltk.corpus.stopwords.words('english')

for t in tweets:
    not_a_stopword = []
    for word in t['text_tokenized']:
        word = lemmatize(word)
        if word not in stopword_list:
            not_a_stopword.append(word)
    t['text_tokens'] = not_a_stopword


In [248]:
token_counter = Counter()
for t in tweets:
    for x in set(t['text_tokens']):
        token_counter[x] += 1

In [249]:
# remove punctuations
# have to get encoding right to resolve this issue
# tweets_final = []
# for tweet in tweets_minus_stop:
#     tweet = [''.join(c for c in s if c not in string.punctuation) for s in tweet]
#     tweet = [t for t in tweet if t]
#     tweets_final.append(tweet)

In [284]:
tweets_emotion_file = './data/text_emotion.csv'
features = ['tweet_id', 'sentiment', 'text']
tweets_emotions = pd.read_csv(tweets_emotion_file)
# tweets_emotion = csv2listdict(tweets_emotion_file)

In [285]:
tweets_emotions.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [288]:
cachedStopWords = stopwords.words("english")

def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'[?|$|&|*|%|@|(|)|~]' # add other characters here to remove them
        filtered_sentence = re.sub(PATTERN, r'', sentence).lower()
    else:
        PATTERN = r'[^a-zA-Z0-9 ]' # only extract alpha-numeric characters
        filtered_sentence = re.sub(PATTERN, r'', sentence).lower()
    return filtered_sentence

def sanitize_twitter_commands(text):
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
    return text

def remove_stopwords(text):
    text = ' '.join([word for word in text.split() if word not in cachedStopWords])
    return text.lower()

sanitizer = lambda x: sanitize_twitter_commands(x)
stopwordsremover = lambda x: remove_stopwords(x)
tweets_emotions['sanitized_content'] = tweets_emotions['content'].apply(sanitizer)
tweets_emotions['sanitized_content'] = tweets_emotions['sanitized_content'].apply(stopwordsremover)
tweets_emotions['sentiment_num'] = tweets_emotions.sentiment.map({'neutral':0, 
                                                                  'worry':1, 
                                                                  'happiness':2,
                                                                  'sadness':3,
                                                                  'love':4,
                                                                  'surprise':5,
                                                                  'fun':6,
                                                                  'relief':7,
                                                                  'hate':8,
                                                                  'empty':9,
                                                                  'enthusiasm':10,
                                                                  'boredom':11,
                                                                  'anger':12
                                                                 })
tweets_emotions.head()

Unnamed: 0,tweet_id,sentiment,author,content,sanitized_content,sentiment_num
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,know listenin bad habit earlier started freaki...,9
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhh waitin call,3
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,funeral ceremony gloomy friday,3
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants hang friends soon,10
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,we want trade someone houston tickets one,0


In [269]:
from sklearn.model_selection import train_test_split

X = tweets_emotions.content
y = tweets_emotions.sentiment_num

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [195]:
regex_hashtag = re.compile(r'(?:\A|\s)#([a-z]{1,})(?:\Z|\s)')
print(tweets_emotion[32495]['text'])

# not understanding why only one hashtag is deletd
def remove_hashtag(tweets, regex):
    for t in tweets:
        t['text'] = re.sub(regex_hashtag, '', t['text'])
#         re.sub(regex_hashtag, '', t['text'])
    return tweets

remove_hashtag(tweets_emotion, regex_hashtag)
print(tweets_emotion[32495]['text'])


@heatedskates That may be, I still don't like hearing his name so much.   #blackhawks #canucks
@heatedskates That may be, I still don't like hearing his name so much.  #canucks


In [196]:
# handles @mention, make lowercase
tknzr = nltk.tokenize.casual.TweetTokenizer(preserve_case=False, 
                                            strip_handles=True, 
                                            reduce_len=True)
for t in tweets_emotion:
    t['text_tokenized'] = tknzr.tokenize(t['text'])


In [197]:
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word, 'n')
    return lemma

print(tweets_emotion[32495]['text_tokenized'])

for tweet in tweets_emotion:
    for token in tweet['text_tokenized']:
        token = lemmatize(token)

print(tweets_emotion[32495]['text_tokenized'])

['that', 'may', 'be', ',', 'i', 'still', "don't", 'like', 'hearing', 'his', 'name', 'so', 'much', '.', '#canucks']
['that', 'may', 'be', ',', 'i', 'still', "don't", 'like', 'hearing', 'his', 'name', 'so', 'much', '.', '#canucks']


In [198]:
stops = set(stopwords.words('english'))

for t in tweets_emotion:
    not_a_stopword = []
    for word in t['text_tokenized']:
        word = lemmatize(word)
        if word not in stops:
            not_a_stopword.append(word)
    t['text_tokens'] = not_a_stopword

In [210]:
tweets_emotion[3].keys()

dict_keys(['tweet_id', 'sentiment', 'text', 'text_tokenized', 'text_tokens'])

In [None]:
from sklearn.cross_validation import train_test_split

x = tweets_emotion
y = tweets_emotion['sentiment']


In [199]:
shuffle(tweets_emotion)

t1 = int(len(tweets_emotion)*0.8)
t2 = t1 + int(len(tweets_emotion)*0.1)
             
training = tweets_emotion[:t1]
development = tweets_emotion[t1:t2]
testing = tweets_emotion[t2:]

shuffle(training)
shuffle(development)
shuffle(testing)


In [208]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_dtm = vectorizer.fit_transform([t['text'] for t in testing])
train_dtm

<4000x9515 sparse matrix of type '<class 'numpy.int64'>'
	with 47500 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.anive_bayes import MultinomialNB

nb = MultinomialNB()


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn import cross_validation 


def get_BOW(text):
    BOW = {}
    for word in text:
        BOW[word] = BOW.get(word,0) + 1
    return BOW


def prepare_tweets_data(tweets, feature_extractor):
    feature_matrix = []
    classifications = []
    for tweet in tweets:
        feature_dict = feature_extractor(tweet)
        feature_matrix.append(feature_dict)
        if tweet in positive_tweets:
            classifications.append('positive')
        else:
            classifications.append('negative')
    
    vectorizer = DictVectorizer()
    dataset = vectorizer.fit_transform(feature_matrix)
    return dataset, classifications


def check_accuracy(model, predictions, classifications):
    print "\n"+model+" accuracy"
    print accuracy_score(classifications,predictions)

dataset, classifications = prepare_tweets_data(development, get_BOW)
n_to_test = range(5,25)
clfs = [MultinomialNB(n, True, [0.52, 0.5]) for n in n_to_test]
for clf in clfs:
    predictions = cross_validation.cross_val_predict(clf, dataset, classifications, cv=10)
    check_accuracy("naive bayes", predictions, classifications)
    print(clf.get_params())

n_to_test = range(1,10)
clfs = [LogisticRegression(C=n/float(10)) for n in n_to_test]
for clf in clfs:
    predictions = cross_validation.cross_val_predict(clf, dataset, classifications, cv=10)
    check_accuracy("logistic regression", predictions, classifications)
    print(clf.get_params())

In [None]:
from sklearn.metrics import classification_report

def check_results(predictions, classifications):
    print "accuracy"
    print accuracy_score(classifications,predictions)
    print classification_report(classifications,predictions)

NBclf = MultinomialNB(12, True, [0.52, 0.5]) 
LRclf = LogisticRegression(C=0.8)

training_X, training_Y = prepare_tweets_data(training, get_BOW)
testing_X, testing_Y = prepare_tweets_data(training, get_BOW)

NBclf.fit(training_X, training_Y)
LRclf.fit(training_X, training_Y)

print("\nNaive Bayes\n")
predictions = NBclf.predict(testing_X)
check_results(predictions, testing_Y)


print("\nLogisticRegression\n")
predictions = LRclf.predict(testing_X)
check_results(predictions, testing_Y)