In [1]:
import pandas as pd
import numpy as np
import csv
import time


import re
import nltk

# PREPROCESS THE TWEET MESSAGES

- Language Identification
- Case conversion
- Clen Text From tags (e.g. Html)
- Remove Stopwords 
- Expand Contractions
- Encode Relevant Emoticons and Punctuation
- Remove Special Characters
- Correcting Words

# DEFINE FUNCTIONS

- All functions are modular, the algorithm can be improved by choosing the right mix of feature selection and data cleansing.
- The order is important! The cleaning something (e.g. 1 word character) before processing symbols or special characters (e.g. emoticons) before converting them would compromise the algorithm!

In [2]:
#Create a DICT with the important emticons and punctuactions

# Emoticons
EMOTICONS = \
    [   ('__EMJ_SMILEY',   [':-)', ':)', '(:', '(-:', ] )  ,\
        ('__EMJ_LAUGH',        [':-D', ':D', 'X-D', 'XD', 'xD', ':P', ':p' ] )    ,\
        ('__EMJ_LOVE',     ['<3', ':\*', ] )   ,\
        ('__EMJ_WINK',     [';-)', ';)', ';-D', ';D', '(;', '(-;', ] ) ,\
        ('__EMJ_FROWN',        [':-(', ':(', ] )   ,\
        ('__EMJ_CRY',      [':,(', ':\'(', ':"(', ':(('] ) ,\
    ]

# Punctuations
PUNCTUATIONS = \
    [   ('_P_EXCL',['!', '¡', ] ),\
        ('_P_QUES',['?', '¿', ] ),\
        ('_P_EL',['...', '…', ] ),\
    ]
    
    
CONTRACTION_DIC = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

# MODULAR FUNCTIONS

In [3]:
#Check language 
#It returns the language code
from langdetect import detect
def check_language(msg):
    try:
        language = detect(msg)
    except:
        #if only punctuations are detected, assume it is an emoticon with sentiment polarity
        #Hence, do not drop it 
        language = 'en'
    return language

#Case Conversion - Gengeral String
def lower_case(msg):
    msg = msg.lower()
    return msg

#Case Conversion - Toknes
def lower_case_tokens(tokens):
    tokens = [token.lower() for token in tokens]
    return tokens

#Remove URLS
def remove_urls(msg):
    msg = re.sub(r"http\S+", "", msg)
    return msg

#Remove User Account
def remove_user_account(msg):
    msg = re.sub(r"@[^\s]+[\s]?", "", msg)
    return msg

#Remove Numbers
def remove_numbers(msg):
    msg = re.sub(r"\d+", "", msg)
    return msg

#Remove RT
def remove_retweet(msg):
    msg = re.sub(r'rt', "", msg)
    return msg

#Remove Hashtag
def remove_hashtag(msg):
    msg = re.sub(r"#[^\s]+[\s]?", "", msg)
    return msg

#TOKENIZE
#It's expecially tailored for tweets (different from word_tokenize)
from nltk.tokenize import TweetTokenizer
def tokenizer (msg):
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(msg)
    return tokens

#EXPAND CONTRACTIONS
def expand_contractions(s, CONTRACTION_MAP = CONTRACTION_DIC):
    contractions_re = re.compile('(%s)' % '|'.join(CONTRACTION_MAP.keys()))
    def replace(match):
        return CONTRACTION_MAP[match.group(0)]
    
    return contractions_re.sub(replace, s)

#REMOVE STOP-WORDS
#Keep Meaningful stopwords
meaningful_stops = ['no', 'nor', 'not']
stopwords = nltk.corpus.stopwords.words('english')
stopwords = [word for word in stopwords if word not in meaningful_stops]
def remove_stopwords(tokens):
    cleaned_tokens = [token for token in tokens if token not in stopwords]
    return cleaned_tokens

#CORRECT REPEATING CHARACTERS
from nltk.corpus import wordnet
def remove_repeated_characters(tokens):
    repeat_pattern = re.compile(r'(\w*)(\w)\2(\w*)')
    match_substitution = r'\1\2\3'
    #it checks in a dictionary of english words when to stop correcting
    def replace(old_word):
        if wordnet.synsets(old_word):
            return old_word
        new_word = repeat_pattern.sub(match_substitution, old_word)
        return replace(new_word) if new_word != old_word else new_word
    
    correct_tokens = [replace(word) for word in tokens]
    return correct_tokens

#STEMMING
#from nltk.stem import PorterStemmer, LancasterStemmer, RegexpStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.regexp import RegexpStemmer
from nltk.stem.snowball import SnowballStemmer
def stemming(tokens): #TODO: Add a dictionary
    #st = LancasterStemmer()
    st = PorterStemmer()
    #st = RegexpStemmer()
    #st = SnowballStemmer('english')
    def stemmer_token(old_word):
        stem_token = st.stem(old_word)
        return stem_token
    stem_tokens = [stemmer_token(tkn) for tkn in tokens]
    return stem_tokens


#LEMMING
from nltk.stem import WordNetLemmatizer
def lemming(tokens):
    wnl = WordNetLemmatizer()
    def lemmer_token(old_word):
        lem_token = wnl.lemmatize(old_word)
        return lem_token
    
    lem_tokens = [lemmer_token(tkn) for tkn in tokens]
    return lem_tokens

        

#CONVERT SPECIAL CHARACTERS FOR SENTIMENT ANALYSIS
def replace_special_characters(tokens):
    
    def replace(token, punctuations=PUNCTUATIONS):
        d = {r: punct for punct, replacement in punctuations for r in replacement}
        for punct, replacement in punctuations:
            pattern = "(" + "|".join(map(re.escape, replacement)) + ")"
            token = re.sub(pattern, lambda m: d[m.group()], token)
        return token
    
    punct_tokens = [replace(tkn) for tkn in tokens]
    return punct_tokens
        


#EMOTICONS - Keep only meaningful ones
def replace_emojis(tokens):
    
    def replace(token, emoticons=EMOTICONS):
        d = {r: emote for emote, replacement in emoticons for r in replacement}
        for emote, replacement in emoticons:
            pattern = "(" + "|".join(map(re.escape, replacement)) + ")"
            token = re.sub(pattern, lambda m: d[m.group()], token)
        return token
    
    emj_tokens = [replace(tkn) for tkn in tokens]
    return emj_tokens


#REMOVE SPECIAL CHARACTERS NOT RELEVANT from tokens
#TODO: It may need more research about which character to delete and which to keep
# Details
# [^ - start of a *negated character class
# \s - whitespace
# \w - word char (letter, digit or/and _)
# ' - a single quote
# & - a & symbol -% - a % symbol
# - - a hyphen (since it is at the end, it will be parsed as a literal -)
# ] - end of the character class.
def remove_special(tokens):
    def remove(token):
        pattern = re.compile(r"[^\s\w'-]")
        token = pattern.sub('', token)
        return token
    
    clean_tokens = [remove(tkn) for tkn in tokens]
    return clean_tokens


#REMOVE 1 LETTER WORDS
def remove_one_letter(tokens):
    def remove(token):
        if len(token)>1:
            return token
        pass
    clean_tokens = [remove(tkn) for tkn in tokens]
    return clean_tokens


#CLEAN NULL TOKEN
def filter_null_token(tokens):
    tokens = list(filter(None, tokens))
    return tokens

In [4]:
#Text Normalization for SENTIMENT ANALYSIS
def text_normalization_sentiment (msg):
    #Convert everything in lowcase
    #msg = lower_case(msg)
    #Remove Urls
    msg = remove_urls(msg)
    #Remove 'rt' (retweet) TODO: It needs to preserve a STRING!
    msg = remove_retweet(msg)
    #Remove '@' (mentions)
    msg = remove_user_account(msg)
    #Remove Hashtag
    msg = remove_hashtag(msg)
    #Remove Numbers
    msg = remove_numbers(msg)
    #Expand Contractions
    msg = expand_contractions(msg)
    
    ######### TOKENIZER ## NOW WE DEAL WITH -TOKENS- ########
    msg_tkn = tokenizer(msg)
    #Encode Relevant Emoticons
    msg_tkn = replace_emojis(msg_tkn)
    #Lowcase after tokenization and emotijs
    msg_tkn = lower_case_tokens(msg_tkn)
    #Correcting repeating characters
    msg_tkn = remove_repeated_characters(msg_tkn)
    #Remove stop-words
    msg_tkn = remove_stopwords(msg_tkn)
    
    #NB. Apply Lemmatization before, and after Stemming!
    #Lemmatization
    msg_tkn = lemming(msg_tkn)
    #Stemming (which algorithm?)
    msg_tkn = stemming(msg_tkn)
   
        
    #Special Character (punctuation)
    msg_tkn = replace_special_characters(msg_tkn)
    
    #Remove Special Character left
    msg_tkn = remove_special(msg_tkn)
    
    #Remove 1 letter tokens
    mgs_tkn = remove_one_letter(msg_tkn)
    
    #Attention: Clean from Null tokens
    msg_tkn = filter_null_token(msg_tkn)

    #Join the tokens back (the classifier tfidf needs a string)
    msg_tkn = " ".join(msg_tkn)
    
    return msg_tkn

# LABELED DATASET

In [5]:
#Import LABELED DATASET (BINARY) OF TWEETS
#2 labels dataset: Positive - Negative

df_labeled = pd.read_csv('D:/NLP Project - Resources/Trainded Dataset - Sentiment.csv', error_bad_lines=False)
df_labeled = df_labeled[['Sentiment','SentimentText']]
df_labeled.dropna()
print('Shape Labeled Dataset:', df_labeled.shape)
df_labeled.head(10)

b'Skipping line 8836: expected 4 fields, saw 5\n'
b'Skipping line 535882: expected 4 fields, saw 7\n'


Shape Labeled Dataset: (1578612, 2)


Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...
5,0,or i just worry too much?
6,1,Juuuuuuuuuuuuuuuuussssst Chillin!!
7,0,Sunny Again Work Tomorrow :-| ...
8,1,handed in my uniform today . i miss you ...
9,1,hmmmm.... i wonder how she my number @-)


In [11]:
#Number of positive and negative tweets.

negative = df_labeled.ix[(df_labeled['Sentiment']==0)]
positive = df_labeled.ix[(df_labeled['Sentiment']==1)]
print('Positive labeled tweets:', positive.shape[0])
print('Negative labeled tweets:', negative.shape[0])

Positive labeled tweets: 790177
Negative labeled tweets: 788435


In [21]:
#Keep only english messages
#This would prevent unfiltered non-english messages to reduce the pefromances of the model
#CHECK THE LANGUAGE
start_time = time.time()

df_labeled['language'] = df_labeled.SentimentText.apply(check_language)

print("--- %s seconds to check language---" % (time.time() - start_time))

--- 9575.795402526855 seconds to check language---


In [22]:
#Filter Only English Tweets
df_labeled = df_labeled[df_labeled.language == 'en']
print('Shape Labeled Dataset - Only English texts:', df_labeled.shape)
df_labeled.head(10)

Shape Labeled Dataset - Only English texts: (1470739, 3)


Unnamed: 0,Sentiment,SentimentText,language
0,0,is so sad for my APL frie...,en
1,0,I missed the New Moon trail...,en
2,1,omg its already 7:30 :O,en
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,en
4,0,i think mi bf is cheating on me!!! ...,en
5,0,or i just worry too much?,en
7,0,Sunny Again Work Tomorrow :-| ...,en
8,1,handed in my uniform today . i miss you ...,en
9,1,hmmmm.... i wonder how she my number @-),en
10,0,I must think about positive..,en


In [24]:
#Store the filtered labeled dataset
df_labeled.to_csv('D:/NLP Project - Resources/Trainded Dataset - EnglishFiltered.csv',
                  encoding='utf-8')

# TRAINING MODELS

- ConuntVectorizer: It learns the vocaboluary of the corpus and extracts word count features (Bag of Words).

- Pipeline: connects a series of steps into one ojbect. We can use it to merge the feature extraction and classification in one operation


In [29]:
import scipy
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split

In [30]:
#Import LABELED DATASET (BINARY) OF TWEETS
#2 labels dataset: Positive - Negative
df_labeled = pd.read_csv('D:/NLP Project - Resources/Trainded Dataset - EnglishFiltered.csv')

In [31]:
start_time = time.time()

#Assign the Tweets to X and the Sentiment to Y
#TWEET (1,5 milion of tweets)
df_labeled = df_labeled.loc[2:1450000]
#Normalize tweets for Sentiment Analysis
df_labeled['Normalized Text'] = df_labeled.SentimentText.apply(text_normalization_sentiment)
#Prevent NaN after normalization (it can affect some embedding procedures)
df_labeled = df_labeled.dropna()

print("--- %s seconds to normalize the labeled dataset---" % (time.time() - start_time))

#TWEETS NORMALIZED
X = df_labeled[['Normalized Text']]
#SENTIMENT LABEL
y = df_labeled[['Sentiment']]

--- 3755.5590500831604 seconds to normalize the labeled dataset---


In [32]:
df_labeledeled.to_csv('D:/NLP Project - Resources/Trained Dataset - EnglishFiltered - NORMALIZED.csv',
                 encoding='utf-8')

# SVM Algorithm (linear)

The C parameter tells the SVM optimization how much you want to avoid misclassifying each training example.
For large values of C, the optimization will choose a smaller-margin hyperplane if that hyperplane does
a better job of getting all the training points classified correctly. Conversely,
a very small value of C will cause the optimizer to look for a larger-margin separating hyperplane,
even if that hyperplane misclassifies more points.
For very tiny values of C, you should get misclassified examples, often even if your training data is linearly separable.

In [35]:
# Split dataset in testing and training - particular attention to the index
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X.iloc[:,0], y.iloc[:,0],
                                                                             test_size=0.2, random_state=42)

In [36]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

start_time = time.time()

#C=0.5 --> Small C values increase the separation margin on the hyperplane. It is beneficial for this model, because
#the models need to be able to generalize in a microblog context
vec_clf = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,3))),
                   ('tfidf', TfidfTransformer(use_idf=True)),
                   ('clf', LinearSVC(C=0.5))])
#Fit the SVM model
vec_clf.fit(X_train, y_train)
#Save the classifier
joblib.dump(vec_clf, 'svmClassifier v02.pk1', compress=3)

print("--- %s seconds to train SVM algorithm---" % (time.time() - start_time))

--- 944.1860303878784 seconds to train SVM algorithm---


In [37]:
####Evaluation Classifier: Linear Support Vector#####
y_pred = vec_clf.predict(X_test)
y_confidence = vec_clf.decision_function(X_test)

#RESULTS
print (sklearn.metrics.classification_report(y_test, y_pred))
print ('Distance boundaries')
print (y_confidence.max())
print (y_confidence.min())

             precision    recall  f1-score   support

          0       0.80      0.83      0.81    145285
          1       0.82      0.79      0.81    144715

avg / total       0.81      0.81      0.81    290000

Distance boundaries
3.8552277354
-8.88838657272


# NAIVE BAYES CLASSIFIER

In [None]:
# Split dataset in testing and training - particular attention to the index for Random Forest
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X.iloc[:,0], y.iloc[:,0],
                                                                             test_size=0.2, random_state=42)

In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


start_time = time.time()

naiv_cl = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,3))),
                   ('tfidf', TfidfTransformer(use_idf=True)),
                   ('clf', MultinomialNB())])

naiv_cl = naiv_cl.fit(X_train, y_train)
#Save the classifier
joblib.dump(naiv_cl, 'multinomialNBCassifier V02.pk1', compress=3)

print("--- %s seconds to train Naive Bayes Classifier---" % (time.time() - start_time))

--- 394.0458130836487 seconds to train Naive Bayes Classifier---


In [34]:
####PREDICTION: Classifier: MultinomialNB#####
y_pred = naiv_cl.predict(X_test)
#Percentage of belonging to each class
y_confidence = naiv_cl.predict_proba(X_test)

#RESULTS
print (sklearn.metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.77      0.82      0.80    145285
          1       0.81      0.76      0.78    144715

avg / total       0.79      0.79      0.79    290000



# RANDOM FOREST

In [39]:
# Split dataset in testing and training - particular attention to the index for Random Forest
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X.iloc[:,0], y.iloc[:,0],
                                                                             test_size=0.2, random_state=42)

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

start_time = time.time()

randfor_cl = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,3))),
                       ('tfidf', TfidfTransformer(use_idf=True)),
                       ('clf', RandomForestClassifier())])

randfor_cl = randfor_cl.fit(X_train, y_train)
#Save the classifier
joblib.dump(randfor_cl, 'randomForestCassifier.pk1', compress=3)

print("--- %s seconds to train Random Forest Classifier---" % (time.time() - start_time))

--- 120492.02273583412 seconds to train Random Forest Classifier---


In [41]:
####PREDICTION: Classifier: MultinomialNB#####
y_pred = randfor_cl.predict(X_test)

#RESULTS
print (sklearn.metrics.classification_report(y_test, y_pred))


#Percentage of belonging to each class
y_confidence = randfor_cl.predict_proba(X_test)

             precision    recall  f1-score   support

          0       0.74      0.80      0.77    145285
          1       0.78      0.72      0.75    144715

avg / total       0.76      0.76      0.76    290000



# MAXIMUM ENTROPY (LOGISTIC REGRESSION) CLASSIFIER

In [None]:
# Split dataset in testing and training - particular attention to the index for Random Forest
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X.iloc[:,0], y.iloc[:,0],
                                                                             test_size=0.2, random_state=42)

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

start_time = time.time()

logistic_cl = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,3))),
                       ('tfidf', TfidfTransformer(use_idf=True)),
                       ('clf', LogisticRegression())])

logistic_cl = logistic_cl.fit(X_train, y_train)
#Save the classifier
joblib.dump(randfor_cl, 'logisticCassifier.pk1', compress=3)

print("--- %s seconds to train Logistic Regression Classifier---" % (time.time() - start_time))

--- 512.7569930553436 seconds to train Logistic Regression Classifier---


In [45]:
####PREDICTION: Classifier: LogistiRegression#####
y_pred = logistic_cl.predict(X_test)

#RESULTS
print (sklearn.metrics.classification_report(y_test, y_pred))


#Percentage of belonging to each class
y_confidence = randfor_cl.predict_proba(X_test)

             precision    recall  f1-score   support

          0       0.80      0.82      0.81    145285
          1       0.82      0.79      0.80    144715

avg / total       0.81      0.81      0.81    290000

