# Special Instruction

To run this notebook, download the Word2Vec embedding trained on twitter data by Frederic Godin and put inside directory ./embeddings/word2vec_twitter_model.bin

In [2]:
#@title Connect Google Drive folder

from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SMU_MITB_NLP/Project/Classical Models

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

#!pip install stanza

ModuleNotFoundError: No module named 'google.colab'

In [3]:
#@title Import Packages & Define Pretrained Models
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn import metrics, model_selection
import numpy as np
# import stanza
import gensim.models
import os, json, re
# from stanza.server import CoreNLPClient
import pandas as pd 
from sklearn import ensemble
import xgboost as xgb
import pickle

# import from the scripts provided by the creator(s) of Twitter Word2vec model to read pre-trained embeddings
# source: https://www.fredericgodin.com/software/
import word2vecReaderUtils as utils
from word2vecReader import *

## Download word2vec_twitter_model.bin on big twitter dataset (4.56GB file)
word_emb_model = Word2Vec.load_word2vec_format('./embeddings/word2vec_twitter_model.bin', binary=True)
emj_emb_model = gensim.models.KeyedVectors.load_word2vec_format('./pre_trained/emoji2vec.bin', binary=True)
# corenlp_dir = './corenlp'
# stanza.install_corenlp(dir=corenlp_dir)
# os.environ["CORENLP_HOME"] = corenlp_dir

FileNotFoundError: [Errno 2] No such file or directory: './embeddings/word2vec_twitter_model.bin'

In [None]:
#@title Load Dataset
def load_dataset(train = True):
    if train:
        path = './data/train_emoji.txt'
    else:
        path = './data/test.txt'
    y = []
    sentences = []
    with open(path, 'r', encoding = 'utf-8') as handle:
        for ind, line in enumerate(handle):
            if ind != 0:
                line = line.rstrip()
                label = int(line.split("\t")[1])
                tweet = line.split("\t")[2]
                y.append(label)
                sentences.append(tweet)
    return sentences, y

corpus, y = load_dataset('train')
for i in range(5):
    print('Sentence {}: {}\nLabel: {}\n'.format(i,corpus[i],y[i]))

Sentence 0: Sweet United Nations video. Just in time for Christmas. #imagine #NoReligion  http://t.co/fej2v3OUBR
Label: 1

Sentence 1: @mrdahl87 We are rumored to have talked to Erv's agent... and the Angels asked about Ed Escobar... that's hardly nothing    ;)
Label: 1

Sentence 2: Hey there! Nice to see you Minnesota/ND Winter Weather
Label: 1

Sentence 3: 3 episodes left I'm dying over here
Label: 0

Sentence 4: "I can't breathe!" was chosen as the most notable quote of the year in an annual list released by a Yale University librarian
Label: 1



# Feature Engineering

### Base Feature

In [None]:
# Splits each tweet into 2 sections, averages word and emoji embeddings for each part separately
# This is the base feature of the Classical Discriminative Models. 
def bisectioned_embeddings_avg(corpus, word_emb_model, emj_emb_model, diff = False):
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    word_emb_model_size = word_emb_model.layer1_size
    emj_emb_model_size = emj_emb_model.vector_size 
    meanVectors = []
    for tweet in corpus:
        t = tokenizer(tweet)
        rightWords, rightEmojis, leftWords, leftEmojis = ([], [], [], [])
         # mRW: mean right words
         # mRE: mean right emojis 
         # mLW: mean left words
         # mLE: mean left emojis
        mRW, mRE, mLW, mLE = (np.zeros(word_emb_model_size), np.zeros(emj_emb_model_size), np.zeros(word_emb_model_size), np.zeros(emj_emb_model_size)) 
        for i in range(int(len(t)/2)):
            if t[i] in word_emb_model and not t[i].startswith('@'):
                rightWords.append(word_emb_model[t[i]])
            if t[i] in emj_emb_model:
                rightEmojis.append(emj_emb_model[t[i]])
        for i in range(int(len(t)/2), len(t)):
            if t[i] in word_emb_model and not t[i].startswith('@'):
                leftWords.append(word_emb_model[t[i]])
            if t[i] in emj_emb_model:
                leftEmojis.append(emj_emb_model[t[i]])
        if len(rightWords)>0:
            mRW = np.mean(rightWords, axis=0)
        if len(rightEmojis)>0:
            mRE = np.mean(rightEmojis, axis=0)
        if len(leftWords)>0:
            mLW = np.mean(leftWords, axis=0)
        if len(leftEmojis)>0:
            mLE = np.mean(leftEmojis, axis=0)

        diff_words = np.abs(mRW - mLW)
        diff_emojis = np.abs(mRE - mLE)

        if diff == True:
            meanVectors.append(np.concatenate((mRW, mRE, mLW, mLE, diff_words, diff_emojis)))
        else:
            meanVectors.append(np.concatenate((mRW, mRE, mLW, mLE)))

    return meanVectors

### Additional Handcrafted Features

In [None]:
def extractEmoticon(tweet):
    """returns all the emoticons in tweet"""
    return re.findall(r'[\U0001f600-\U0001f650]', ' '.join(tweet))

def extractHashtag(tweet):
    t = tweet.split(' ')
    text = []
    hashtagText = []
    oneHashtag = []
    flag = 0
    for w in t:
        if w == "<hashtag>":
            flag = 1
            continue
        if flag == 1:
            if w == "</hashtag>":
                hashtagText.append(oneHashtag)
                oneHashtag = []
                flag = 0
            else:
                oneHashtag.append(w)
        else:
            text.append(w)
    return text, hashtagText

def extractHashtag(tweet):
    t = tweet.split(' ')
    text = []
    hashtagText = []
    oneHashtag = []
    flag = 0
    for w in t:
        if w == "<hashtag>":
            flag = 1
            continue
        if flag == 1:
            if w == "</hashtag>":
                hashtagText.append(oneHashtag)
                oneHashtag = []
                flag = 0
            else:
                oneHashtag.append(w)
        else:
            text.append(w)
    return text, hashtagText

def sentiment(txt):
    """compute sentiment for text"""
    txt = ' '.join(txt)
    if not len(txt): return 2
    output = int(nlp_client.annotate(txt, properties={
                              'annotators': 'tokenize,ssplit,pos,depparse,parse,sentiment',
                                'outputFormat': 'json'
                              })['sentences'][0]['sentimentValue'])
    return output

def contrast(twt):
    """search for emotion contrast in hastag, emoticon and tweet text"""
    contrast = 0 # contrast flag
    txt_sentiment = sentiment(twt[0])
    htag_sentiment = [sentiment(h) for hash_segments in twt[1] for h in hash_segments]
    emoji_sentiment = twt[2]

    if (txt_sentiment in {2,3,4}) and (set(htag_sentiment) & {0,1}):
        contrast = 1
    elif (txt_sentiment in {0,1}) and (set(htag_sentiment) & {3,4}): # maybe later try adding 2
        contrast = 1
    elif (txt_sentiment in {2,3,4}) and (set(emoji_sentiment) & {-1}):
        contrast = 1
    elif (txt_sentiment in {0,1}) and (set(emoji_sentiment) & {1}):
        contrast = 1
    elif {-1,1} in set(emoji_sentiment):
        contrast = 1
    elif ({0,4} in set(htag_sentiment)) or ({0,3} in set(htag_sentiment)) or ({1,4} in set(htag_sentiment)):
        contrast = 1
    elif (set(htag_sentiment) & {0,1}) and (set(emoji_sentiment) & {1}):
        contrast = 1
    elif (set(htag_sentiment) & {3,4}) and (set(emoji_sentiment) & {-1}):
        contrast = 1
    return np.array(contrast)

def polarity_constrast(tweet):
    left = tweet[:(len(tweet)//2)]
    right = tweet[len(tweet)//2:]
    output1 = int(nlp_client.annotate(left,properties={'annotators': 'sentiment','outputFormat': 'json'})['sentences'][0]['sentimentValue'])
    output2 = int(nlp_client.annotate(right, properties={'annotators': 'sentiment','outputFormat': 'json'})['sentences'][0]['sentimentValue'])    
    polarityDiff = 0
    if (output1>2 and output2<2) or (output1<2 and output2>2):
        polarityDiff = 1
    return polarityDiff

def tweet_vecs(twt):

    tags =  ['<allcaps>', '<annoyed>', '<censored>', '<elongated>', '<emphasis>', '<happy>',
             '<hashtag>', '<heart>', '<kiss>', '<laugh>', '<money>', 
             '<repeated>', '<sad>', '<shocking>', '<surprise>', '<wink>']

    twt = twt.split()
    left = twt[:(len(twt)//2)]
    right = twt[len(twt)//2:]
    
    scores = []
    
    for tag in tags:
        scores.append(sum(1 for t in left if t == tag))
    for tag in tags:
        scores.append(sum(1 for t in right if t == tag))
    return scores

def feats(text):
    """apply the tweet_vecs function on all tweets and return a result in a list"""
    return [tweet_vecs(twt) for twt in text]

def create_handcrafted_features(train = True):

    if train:
        corpus, _ = load_dataset(train = True)
        corpus_preprocessed = json.load(open('../data/train_preprocessed.txt','r'))
    else:
        corpus, _ = load_dataset(train = False)
        corpus_preprocessed = json.load(open('../data/test_preprocessed.txt','r'))

    ## Create polarity contrast
    polarity_contrast = []
    for sample in corpus:
        polarity_contrast.append(polarity_constrast(sample))

    ## Create contrast between emojis, hashtags and normal words
    twts = [extractEmoticon(twt) for twt in corpus_preprocessed]
    twts = [[emoji_sentiments[emoji] for emoji in twt] for twt in twts]
    txt = [extractHashtag(tweet) for tweet in corpus_preprocessed]
    txt = [(txt[i][0], txt[i][1], twts[i]) for i in range(len(twts))]
    contrast_feats = [contrast(twt) for twt in txt]
    ekphrasis_feats = [np.array(v) for v in feats(corpus_preprocessed)]

    extra_features = np.concatenate((np.array(polarity_contrast), contrast_feats, ekphrasis_feats), axis=1)

    if train:
        np.save('train_extra_feats.npy', extra_features)
    else:
        np.save('test_extra_feats.npy', extra_features)

    return extra_features

To generate extra features using CoreNLP annotator

In [None]:
# tags =  ['<allcaps>', '<annoyed>', '<censored>', '<elongated>', '<emphasis>', '<happy>',
#             '<hashtag>', '<heart>', '<kiss>', '<laugh>', '<money>', 
#             '<repeated>', '<sad>', '<shocking>', '<surprise>', '<wink>']

# df = pd.read_csv('./pre_trained/Emoji_Sentiment_Data_v1.0.csv')
# emoji_scores = df[['Emoji','Negative','Neutral','Positive','Unicode name']]
# tuples = [tuple(x) for x in emoji_scores.values]
# emoji_sentiments = {}
# idx2lb = {0:-1, 1:0, 2:1}
# for val in tuples:
#     emoji_sentiments[val[0]] = idx2lb[np.argmax(np.array(val[1:4]))]

# # Import client module
# nlp_client = CoreNLPClient(
#     annotators=['sentiment'], 
#     memory='4G', 
#     endpoint='http://localhost:9005', ## Change localhost number if cannot run
#     be_quiet=False)

# ## Generate and save handcrafted features
# create_handcrafted_features(train = True)
# create_handcrafted_features(train = False)

# nlp_client.stop()

# Generate train and test dataset

In [None]:
# Refer to Emoji_Sentiment to get positive/neutral/negative for emojies
def create_dataset(extra_features = True, train = True, diff = False):
    if train:
        corpus, y = load_dataset('train')
        X = bisectioned_embeddings_avg(corpus, word_emb_model, emj_emb_model, diff = diff)
        # handcrafted_features =  create_handcrafted_features(train = True)  ## Create from scratch
        extraFeatures = np.load(open('./train_extra_feats.npy', 'rb'), allow_pickle = True)
        for i in range(len(X)):
            X[i] = np.concatenate((X[i],extraFeatures[i]))
    else:
        corpus, y = load_dataset(train = False)
        X = bisectioned_embeddings_avg(corpus, word_emb_model, emj_emb_model)
        extraFeatures = np.load(open('./test_extra_feats.npy', 'rb'), allow_pickle = True)
        for i in range(len(X)):
            X[i] = np.concatenate((X[i],extraFeatures[i]))
    return X, y

In [None]:
X_train, y_train = create_dataset(extra_features = True, train = True)
X_test, y_test = create_dataset(extra_features = True, train = False)
X_train_diff, y_train_diff = create_dataset(extra_features = True, train = True, diff = True)
X_test_diff, y_test_diff = create_dataset(extra_features = True, train = False, diff = True)

# Training and Evaluation

In [None]:
## Global hyperparameters:

random_state = 2012
k_folds = 5

def evaluate(model, X, y, k_folds):
    preds = cross_val_predict(model, np.array(X), y, cv=k_folds)

    # Modify F1-score calculation depending on the task
    score = metrics.f1_score(y, preds, pos_label=1)
    p = metrics.precision_score(y, preds, pos_label=1)
    r = metrics.recall_score(y, preds, pos_label=1)
    acc = metrics.accuracy_score(y, preds)

    print('\n')
    print(model)
    print ("F1-score Task", score)
    print ("Precision Task", p)
    print ("Recall Task", r)
    print ("Accuracy Task", acc)

def test(model, X, y):
    preds = model.predict(np.array(X))

    # Modify F1-score calculation depending on the task
    score = metrics.f1_score(y, preds, pos_label=1)
    p = metrics.precision_score(y, preds, pos_label=1)
    r = metrics.recall_score(y, preds, pos_label=1)
    acc = metrics.accuracy_score(y, preds)

    print('\n')
    print(model)
    print ("F1-score Task", score)
    print ("Precision Task", p)
    print ("Recall Task", r)
    print ("Accuracy Task", acc)

    false_index = [i for i, x in enumerate(preds == y) if not x]

    return false_index

## Classical Models:
1. Logistic Regression (LR)
2. Support Vector Classifier (SVC)
3. Ensemble of LR and SVC 
4. XgBoost
5. RandomForest

## GRID SEARCH

### Logistic Regression

In [None]:
lr_params = {'C':[0.001,0.1,1,10,100,1000]}

lr_grid = model_selection.GridSearchCV(estimator = LogisticRegression(random_state = random_state),
                                      param_grid = lr_params, cv = k_folds, scoring = "f1", verbose = 1, n_jobs = -1)

lr_grid.fit(X_train, y_train)
print('Best set of hyperparameters:',lr_grid.best_params_)
print('Validation Score on best set of hyperparameters',lr_grid.best_score_)

filename = './model_checkpoint/lr_grid_search.sav'
pickle.dump(lr_grid, open(filename, 'wb'))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best set of hyperparameters: {'C': 1}
Validation Score on best set of hyperparameters 0.6750067161615402


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### Support Vector Classifier

In [None]:
svc_params = {'C':[0.1,1,10,100,1000],
              'gamma':[0.01,0.1,1,10,100]}

svc_grid = model_selection.GridSearchCV(SVC(kernel='rbf', random_state = random_state,cache_size = 1000,verbose = True,
                                                max_iter = 10000),
                                        param_grid = svc_params, cv = k_folds, scoring = 'f1',verbose = 1, n_jobs = -1)

svc_grid.fit(X_train, y_train)
print('Best set of hyperparameters:',svc_grid.best_params_)
print('Validation Score on best set of hyperparameters',svc_grid.best_score_)

filename = './model_checkpoint/svc_grid_search.sav'
pickle.dump(svc_grid, open(filename, 'wb'))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[LibSVM]Best set of hyperparameters: {'C': 10, 'gamma': 0.01}
Validation Score on best set of hyperparameters 0.6838427550312952


### Ensemble of Logistic Regression and Support Vector Classifier

In [None]:
voting_params = {'svm__C': [1, 10, 100],
                 'svm__gamma': [0.001, 0.01, 0.1],
                 'lr__C': [0.1, 1, 10]}
voting_grid = model_selection.GridSearchCV(VotingClassifier(estimators=[('svm', SVC(random_state=random_state, probability=True)),
                                            ('lr', LogisticRegression(random_state=random_state))], voting='soft'),
                                            param_grid = voting_params, cv = k_folds, scoring = 'f1',verbose = 1, n_jobs = -1)
voting_grid.fit(X_train, y_train)
print('Best set of hyperparameters:',voting_grid.best_params_)
print('Validation Score on best set of hyperparameters',voting_grid.best_score_)

filename = './model_checkpoint/voting_grid_search.sav'
pickle.dump(voting_grid, open(filename, 'wb'))

### XGBoost

In [None]:
xgb_params = {'learning_rate': [0.001, 0.01, 0.1, 1], 'max_depth': [10, 20, 30]}

xgb_grid = model_selection.GridSearchCV(xgb.XGBClassifier(n_estimators = 50, random_state = random_state),
                                            param_grid = xgb_params, cv = k_folds, scoring = 'f1',verbose = 1, n_jobs = -1)

xgb_grid.fit(np.array(X_train), y_train)
print('Best set of hyperparameters:',xgb_grid.best_params_)
print('Validation Score on best set of hyperparameters',xgb_grid.best_score_)

filename = './model_checkpoint/xgb_grid.sav'
pickle.dump(xgb_grid, open(filename, 'wb'))

Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best set of hyperparameters: {'learning_rate': 0.1, 'max_depth': 30}
Validation Score on best set of hyperparameters 0.6358337745610353


## Retrain all models on the entire set

In [None]:
### Logistic Regression ###
lr_grid = pickle.load(open('./model_checkpoint/lr_grid_search.sav','rb'))
lr = lr_grid.best_estimator_
lr.fit(X_train, y_train)

### Support Vector Classifier ###
svc_grid = pickle.load(open('./model_checkpoint/svc_grid_search.sav','rb'))
svc = svc_grid.best_estimator_
svc.fit(X_train, y_train)

### Support Vector Classifier ###
voting_grid = pickle.load(open('./model_checkpoint/voting_grid_search.sav','rb'))
voting_classifier = voting_grid.best_estimator_
voting_classifier.fit(X_train, y_train)

### Support Vector Classifier ###
xgb_grid = pickle.load(open('./model_checkpoint/xgb_grid.sav','rb'))
xgb_model = xgb_grid.best_estimator_
xgb_model.fit(np.array(X_train), y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[LibSVM]

XGBClassifier(max_depth=30, missing=nan, n_estimators=50, random_state=2012)

# Testing & Error Analysis

In [None]:
models = [lr, svc, voting_classifier, xgb_model]
error_index = []
for i, model in enumerate(models):
    if i == 3:
        wrong_index = test(model, np.array(X_test), y_test)
    else:
        wrong_index = test(model, X_test, y_test)
    error_index.append(wrong_index)

wrong_corpus = [[],[],[],[]]

corpus, y = load_dataset('train')
for ind, sent in enumerate(corpus):
    for i in range(4):
        if ind in error_index[i]:
            wrong_corpus[i].append((sent, y[ind]))



LogisticRegression(C=1, random_state=2012)
F1-score Task 0.6605744125326372
Precision Task 0.5560439560439561
Recall Task 0.8135048231511254
Accuracy Task 0.6683673469387755


SVC(C=10, cache_size=1000, gamma=0.01, max_iter=10000, random_state=2012,
    verbose=True)
F1-score Task 0.6418835192069392
Precision Task 0.5221774193548387
Recall Task 0.8327974276527331
Accuracy Task 0.6313775510204082


VotingClassifier(estimators=[('svm',
                              SVC(C=100, gamma=0.01, probability=True,
                                  random_state=2012)),
                             ('lr',
                              LogisticRegression(C=0.1, random_state=2012))],
                 voting='soft')
F1-score Task 0.6631299734748011
Precision Task 0.5643340857787811
Recall Task 0.8038585209003215
Accuracy Task 0.6760204081632653


XGBClassifier(max_depth=30, missing=nan, n_estimators=50, random_state=2012)
F1-score Task 0.6377622377622377
Precision Task 0.5643564356435643
Recall Task

In [None]:
error_analysis = {'False Negatives':[], 'False Positives':[]}
for 


[("3 episodes left I'm dying over here", 0),
 ('"I can\'t breathe!" was chosen as the most notable quote of the year in an annual list released by a Yale University librarian',
  1),
 ('Oh, thank GOD - our entire office email system is down... the day of a big event. Santa, you know JUST what to get me for xmas.',
  1),
 ("Cold or warmth both suffuse one's cheeks with pink (colour/tone) ... Do you understand the underlying difference & its texture?",
  0),
 ("Just great when you're mobile bill arrives by text", 1),
 ('But @DarklightDave was trying to find us, and my battery died. Guess how he found us? Yes, that bastard wand! !!!!!',
  1),
 ('@deputymartinski please do..i need the second hand embarrassment so desperatly on my phone',
  1),
 ('@RushOrderTees THX4FLW! FLWtheMUSIC @ElektrikEventz @ElektrikMetro WE R #ElektrikBLOOM #ElektrikFANTASY #iwant2DRIFT #Elev8TheUnderground!',
  0),
 ("I was doing great with this summary of my year until I got to June 27th, and the weekend of @Hart