# Sentiment emotions classifier

1. Data pre-processing
2. Feature engineering
3. Model building

In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import numpy as np
import emoji
import xgboost, textblob, string, ekphrasis, nltk, re

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from xgboost.sklearn import XGBClassifier

from sklearn.svm import NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from mlxtend.classifier import StackingCVClassifier

from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

import gensim
from gensim.models import Word2Vec

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from gensim.models.doc2vec import TaggedDocument

sp = SpellCorrector(corpus="english") 

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/antonis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/antonis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/antonis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/antonis/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Using TensorFlow backend.
  from pandas import Panel


Reading english - 1grams ...


In [2]:
#Read data
train = pd.read_csv('Data/11 emotions (0:1)/2018-E-c-En-train.txt', sep='\t')
valid = pd.read_csv('Data/11 emotions (0:1)/2018-E-c-En-dev.txt', sep='\t')

tweets = train.append(valid)
tweets = tweets.reset_index(drop = True)
tweets = tweets.drop('ID', axis=1)

tweets = tweets.rename(columns = {'Tweet':'text'})

tweets

Unnamed: 0,text,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,“Worry is a down payment on a problem you may ...,0,1,0,0,0,0,1,0,0,0,1
1,Whatever you decide to do make sure it makes y...,0,0,0,0,1,1,1,0,0,0,0
2,@Max_Kellerman it also helps that the majorit...,1,0,1,0,1,0,1,0,0,0,0
3,Accept the challenges so that you can literall...,0,0,0,0,1,0,1,0,0,0,0
4,My roommate: it's okay that we can't spell bec...,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
7719,@BadHombreNPS @SecretaryPerry If this didn't m...,1,0,1,0,0,0,0,0,0,0,0
7720,Excited to watch #stateoforigin tonight! Come ...,0,0,0,0,1,0,1,0,0,0,0
7721,"Blah blah blah Kyrie, IT, etc. @CJC9BOSS leavi...",1,0,1,0,0,0,0,0,1,0,0
7722,#ThingsIveLearned The wise #shepherd never tru...,0,0,0,0,0,0,0,0,0,0,0


## Data pre-processing

1. Normalization, unpacking, tokenizer
2. Tweet cleaning, lemmatizer, stopword removal

In [3]:
text_processor = TextPreProcessor(
    normalize=['url', 'email', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    fix_html=True,  
    segmenter="twitter", 
    corrector="twitter", 
    unpack_hashtags=True,  
    unpack_contractions=True, 
    spell_correct_elong=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [10]:
stopwords = nltk.corpus.stopwords.words('english')
rooter = nltk.stem.WordNetLemmatizer().lemmatize
punctuation = '!"$%&\'()*+,-./:;=?[\\]^_`{|}~•'

def get_word_and_tag(tokens):
    tagged = pos_tag(tokens)
    cleaned_tags = []
    for word, tag in tagged:
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        cleaned_tags.append((word,pos))
    return cleaned_tags

def clean_tweet(tweet):
    tweet = tweet.lower() # lower case
    tweet = emoji.demojize(tweet) #emojis to text
    tweet = re.sub('['+punctuation + ']+', ' ', tweet) # remove punctuation
    tokens = [word for word in tweet.split(' ') if word not in stopwords] # remove stopwords
    tokens = [word for word in tokens if len(word)>0] #remove double spaces
    
    tokens = [rooter(word,tag) for word,tag in get_word_and_tag(tokens)] # apply word rooter with POS tagging
    tweet = ' '.join(tokens)
    return tweet

In [11]:
tweets['corrected_text'] = [" ".join(text_processor.pre_process_doc(s)) for s in tweets.text]
tweets['corrected_text'] = tweets['corrected_text'].apply(clean_tweet)

In [13]:
#Create train, test sets

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(tweets['corrected_text'], 
                                    tweets.drop(['text','corrected_text'], axis = 1), test_size = 0.2, random_state=0)

## Feature engineering

1. Count vectors

In [14]:
# Create count vectors
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{3,}', max_features=5000)
count_vect.fit(tweets.corrected_text)

x_count = count_vect.transform(tweets.corrected_text)
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

## Model building

In [19]:
def train_model(classifier, train, train_y, valid, valid_y):
    """
    classifier: Classifier object
    train: Train predictors
    train_y: Train y
    valid: Validation predictors
    valid_y: Validation y
    """
    classifier.fit(train, train_y)
    preds = classifier.predict(valid)
    return 'Accuracy: {}'.format(metrics.accuracy_score(valid_y[emotion], preds))

In [20]:
emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

In [23]:
for emotion in emotions:
    classifier = linear_model.LogisticRegression()
    
    print(emotion, train_model(classifier, xtrain_count, train_y[emotion], xvalid_count, valid_y))



anger Accuracy: 0.7902912621359224
anticipation Accuracy: 0.8634304207119741
disgust Accuracy: 0.7411003236245954
fear Accuracy: 0.8977346278317152
joy Accuracy: 0.8187702265372169
love Accuracy: 0.9074433656957929
optimism Accuracy: 0.7805825242718447
pessimism Accuracy: 0.8711974110032362
sadness Accuracy: 0.7799352750809061
surprise Accuracy: 0.9488673139158577
trust Accuracy: 0.940453074433657
