# SemEval 2017 task 4a 

### Library Imports

In [1]:
import re
from os.path import join
from os.path import isfile
import numpy as np

import pickle
import tabulate

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.corpus import opinion_lexicon

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam

from tqdm import tqdm

from transformers import BertTokenizer
from transformers import BertModel


print("My Pytorch version: " + torch.__version__)


My Pytorch version: 1.13.1+cu117


### User Defined Constants

In [2]:
#Please change these as approprite

DEVICE = torch.device('cuda') 

data_dir = "data/"

testsets = ['twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']
devset = 'twitter-dev-data.txt'
trainset = 'twitter-training-data.txt'


### Constants

In [3]:

#Don't change these

CPU = torch.device('cpu')

#Assigning intgers to each class
NEG = 0
NEUT = 1
POS = 2

LABELS_IDS = {'negative':NEG, 'neutral':NEUT, 'positive':POS}

LABELS = {NEG:'negative',NEUT:'neutral',POS:'positive'}

In [4]:
# EVALUATION CODE
def read_test(testset):
    '''
    readin the testset and return a dictionary
    :param testset: str, the file name of the testset to compare
    '''
    id_gts = {}
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            fields = line.split('\t')
            tweetid = fields[0]
            gt = fields[1]

            id_gts[tweetid] = gt

    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    print the confusion matrix of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    gts = []
    for m, c1 in id_gts.items():
        if c1 not in gts:
            gts.append(c1)

    gts = ['positive', 'negative', 'neutral']

    conf = {}
    for c1 in gts:
        conf[c1] = {}
        for c2 in gts:
            conf[c1][c2] = 0

    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            print("SHOULD NOT BE HERE")
            pred = 'neutral'
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(gts))

    for c1 in gts:
        print(c1.ljust(12), end='')
        for c2 in gts:
            if sum(conf[c1].values()) > 0:
                print('%.3f     ' % (conf[c1][c2] / float(sum(conf[c1].values()))), end='')
            else:
                print('0.000     ', end='')
        print('')

    print('')


def evaluate(id_preds, testset, classifier):
    '''
    print the macro-F1 score of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    acc_by_class = {}
    for gt in ['positive', 'negative', 'neutral']:
        acc_by_class[gt] = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}

    catf1s = {}

    ok = 0
    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            print("SHOULD NOT BE HERE")
            pred = 'neutral'

        if gt == pred:
            ok += 1
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    catcount = 0
    itemcount = 0
    macro = {'p': 0, 'r': 0, 'f1': 0}
    micro = {'p': 0, 'r': 0, 'f1': 0}
    semevalmacro = {'p': 0, 'r': 0, 'f1': 0}

    microtp = 0
    microfp = 0
    microtn = 0
    microfn = 0
    for cat, acc in acc_by_class.items():
        catcount += 1

        microtp += acc['tp']
        microfp += acc['fp']
        microtn += acc['tn']
        microfn += acc['fn']

        p = 0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        catf1s[cat] = f1

        n = acc['tp'] + acc['fn']

        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1

        if cat in ['positive', 'negative']:
            semevalmacro['p'] += p
            semevalmacro['r'] += r
            semevalmacro['f1'] += f1

        itemcount += n

    micro['p'] = float(microtp) / float(microtp + microfp)
    micro['r'] = float(microtp) / float(microtp + microfn)
    micro['f1'] = 2 * float(micro['p']) * micro['r'] / float(micro['p'] + micro['r'])

    semevalmacrof1 = semevalmacro['f1'] / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)

    return semevalmacrof1

In [5]:
#Simplified evaluation function used for development

def evaluate_simple(predictions, true):
    acc_by_class = {}
    for gt in ['positive', 'negative', 'neutral']:
        acc_by_class[gt] = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}

    for i,gt in enumerate(true):
        pred = predictions[i]
        if gt == pred:
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    macro = {'p': 0, 'r': 0, 'f1': 0}
    for cat, acc in acc_by_class.items():

        p = 0
        #Calculating precision
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        #Calculating recall
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0 
        # Calculating F1 scores
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        #Adding scores for only the postive and negative classes
        if cat in ['positive', 'negative']: 
            macro['p'] += p
            macro['r'] += r
            macro['f1'] += f1

    macro_f1 = macro['f1'] / 2

    return macro_f1

## Preprocessing Tweets

In [6]:
lemmatizer = WordNetLemmatizer()

### Slang Replacement

In [7]:
#Parsing slang file

slang_dict = {}

with open(join(data_dir,'slang.txt'),'r') as f:
    lines = f.readlines()
for line in lines:
    line = line.strip()
    aslist = line.split('\t')
    if (len(aslist)!=2):
        print(line)
    slang = aslist[0]
    replace = aslist[1]
    slang_dict[slang]= replace

In [8]:
def convert_slang(tokens):
    tokens_new = []
    for token in tokens:
        if token in slang_dict:
            replacement= slang_dict[token]
            tokens_to_add = replacement.split(" ")
            tokens_new+=tokens_to_add

        else:
            tokens_new.append(token)
    
    return tokens_new

### Emoji Replacement


In [9]:


nltk.download('opinion_lexicon')

pos_list=set(opinion_lexicon.positive())
neg_list=set(opinion_lexicon.negative())

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /dcs/pg22/u5501145/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [10]:
#Parsing Emoji Dictionary

with open(join(data_dir,'Emoji_Dict.p'), 'rb') as f:
    emoji_dict = pickle.load(f)
    emoji_dict = {v: k for k, v in emoji_dict.items()} # Reverses dictionary so emojis are keys
 

for k,v in emoji_dict.items(): #Extracts only sentiment words in emoji descriptions
    raw = v.replace(":","")
    tokens = raw.split('_')
    lem_list = [lemmatizer.lemmatize(t) for t in tokens] #Lemmatising first to give better chance of finding in wordlists
    sentiment_tokens = [t for t in lem_list if t in pos_list or t in neg_list] # keeps only sentiment tokens
    emoji_dict[k]=" ".join(set(sentiment_tokens)) # using set to remove any duplicates
                      
    
def convert_emojis(text): # Checks each word in text if its in the dict and replaces it if found
    for word in text.split(' '):
        if word in emoji_dict:
            text = text.replace(word,emoji_dict[word])
    return text

In [11]:
print(convert_emojis("emoji_1: 😊 \nemoji_2: 😂 \nemoji_3: 😞"))

emoji_1: smiling 
emoji_2: joy 
emoji_3: disappointed


### Regular Expressions

In [12]:
#Patterns

pos_emoticons = r'(:|;|x|X|=)(\)|D|p|P|3)'
neg_emoticons = r'(:|;|=)(\(|\{|\[)'
mentions = r"@[^\s]+"
websites= r"https?:\/\/[^\s]+|www\.[^\s]+|[^\s]+\.(com|net|org|uk|ru|ir|in)|[^\s]+\.co\.[^ ]{2}"
nonstandard_chars= r"[^a-zA-Z0-9\s]" 
one_character_words= r"\b\w{1}\b"
numeric_words= r"\b[0-9]+\b"
repeated_chars = r"(.)\1{2,}"


#Tests

pos_emote_test = ":) ;) XD =D"
neg_emote_test = ":( ;( ={ =["
mention_test = "@shouldntbehere shouldbehere @ @s"
website_test = "https://shouldntbehere.com www.shouldntbehere.com shouldbehere and so should this shouldn'tbehere.com https:// another.net anothertwice.co.nr"
nonstandard_test = "thi{}s $shou#ld ! not conta@in {any sp£ecial symb!ols"
one_character_test = "firstword h 4 shouldbehere a s thirdword b"
numeric_word_test = "5pm 5764 42in 5"
repeated_chars_test = "yayyyy i loooove itttt"

#patterns variable is a list of (pattern, replacement) for use with re.sub
patterns = [(pos_emoticons,'happy'), (neg_emoticons,'sad'),(mentions, ''),(websites,''), (nonstandard_chars,''), (one_character_words,''), (numeric_words, ''),(repeated_chars,r'\1\1')]
test_strings = [pos_emote_test, neg_emote_test,mention_test,website_test, nonstandard_test, one_character_test, numeric_word_test,repeated_chars_test]

print("Testing regular expressions:\n")
for i in range(len(patterns)):
    pattern,replacement = patterns[i]
    test_string = test_strings[i]
    result = re.sub(pattern, replacement, test_string)
    print(result)


Testing regular expressions:

happy happy happy happy
sad sad sad sad
 shouldbehere @ 
  shouldbehere and so should this  https://  
this should  not contain any special symbols
firstword   shouldbehere   thirdword 
5pm  42in 
yayy i loove itt


In [13]:

#Performs regular expression substitution on patterns defined above
def clean_text(text): 
    for pattern,replacement in patterns:
        text = re.sub(pattern, replacement, text)
    return text

### Preprocess function

In [14]:
#Converts Penn Treebank POS tags to wordnet POS tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    elif treebank_tag.startswith('S'):
        return wordnet.ADJ_SAT
    else:
        return wordnet.NOUN

#Applies all preprocessing steps
def preprocess_text(text): 
    text = text.lower()
    text = convert_emojis(text)
    text = clean_text(text)

    tokens = word_tokenize(text)
    tokens = convert_slang(tokens)

    tags = pos_tag(tokens)
    #Lemmatisation using pos_tag, converted from penn treebank to wordnet
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(tag[1])) for token,tag in zip(tokens,tags)]
    
    out = " ".join(tokens) # Untokenize (tokenization needs to be done differently for different features/models)
    return out


### Loading Data and applying pre-processing

This cell takes some time to run (~2 min on my laptop)

In [15]:

tweetids = {}
tweetlabels = {}
processed_tweets = {}
raw_tweets = {}

for filename in [trainset,devset]+testsets:
    
    path = join(data_dir,filename)
    
    with open(path) as file: 
        lines = file.readlines()
    
    data_all = [line.split("\t") for line in lines] # Each entry contains three attributes delimited by \t
    
    tweetids[filename] = [d[0] for d in data_all]
    tweetlabels[filename] = [d[1] for d in data_all]
    raw_tweets[filename]= [d[2] for d in data_all]
    processed_tweets[filename] = [preprocess_text(d[2]) for d in data_all] 
    
data_tuple = (tweetids, tweetlabels, raw_tweets, processed_tweets)



### Data browser

Set `active=True` to allow you browse through data, see raw and processed form.\
To see next example enter any input and press enter\
To stop, enter an empty input

In [16]:
active = False

i=55
while(active):
    user_input = input()
    if (user_input==''):
        break
    else:
        print(raw_tweets['twitter-training-data.txt'][i])
        print(processed_tweets['twitter-training-data.txt'][i])
        print(tweetlabels['twitter-training-data.txt'][i])
        i+=1
    



## Feature Extraction for linear classifiers

### Bag of Words features

Here we fit the tfidf vectorizer on the training set \
The trained vectorizer will be used on both training and testing set using `tdif_features()`

In [17]:

tfidf_vectorizer = TfidfVectorizer()
training_tweets = processed_tweets[trainset]
tfidf_vectorizer.fit(training_tweets)

print("Fitted tfidf vectorizer on training set")
    

Trained tfidf vectorizer on training set


In [18]:
def tfidf_features(tweets):
    vectors = tfidf_vectorizer.transform(tweets)
    return vectors


### Glove features

We first parse the pre-trained glove features from the txt file\
Then setup two functions: 
* `get_glove_vector()` takes a token and returns a vector, while dealing with OOV tokens
* `glove_features()` takes a list of tweets and returns the mean glove vector for each




In [19]:
#Parising glove vector file into a dictionary

glove_dict = {}
EMBEDDING_DIM = 100

with open(join(data_dir,'glove.6B.100d.txt')) as f:
    #line = f.readlines()[0]
    for line in f.readlines():
        aslist = line.split(" ")
        word = aslist[0]
        vector_raw = aslist[1:]
        vector = [float(n.strip()) for n in vector_raw]
        glove_dict[word]=vector

MEAN_GLOVE = np.mean(list(glove_dict.values()),axis=0)

In [20]:
def get_glove_vector(token): # Simple funciton to deal with OOV words in glove
    if token in glove_dict:
        vector = glove_dict[token]
    else:
        vector = MEAN_GLOVE
        
    return vector
    

In [21]:
def glove_features(tweets):
    tokenized_tweets = [word_tokenize(tweet) for tweet in tweets]
    avg_vectors = np.zeros((len(tokenized_tweets),EMBEDDING_DIM))
    for i,tokens in enumerate(tokenized_tweets):
        if len(tokens)!=0:
            vectors = [get_glove_vector(token) for token in tokens]
            avg_vector = np.mean(vectors,axis=0)
            avg_vectors[i] = avg_vector
    return avg_vectors

### Lexicon Features 

We use opinion lexicon from earlier and compute a vector of size 2 representing the average presence in postitive and negative word lists respectively

In [22]:
def lexicon_features(tweets):
    tokenized_tweets = [word_tokenize(tweet) for tweet in tweets]
    avg_vectors = np.zeros((len(tokenized_tweets),2)) 
    for i,tokens in enumerate(tokenized_tweets):
        if len(tokens)!=0:
            vectors = [[token in pos_list, token in neg_list] for token in tokens]
            avg_vectors[i] = np.mean(vectors,axis=0)
    return avg_vectors
    

In [23]:
#Combination of features

def glove_lexicon_features(tweets):
    glove_vectors = glove_features(tweets)
    lexicon_vectors = lexicon_features(tweets)
    return np.concatenate([glove_vectors,lexicon_vectors],axis=1)


## Linear Classifiers

### Experiments

The cell below takes a few minutes to run, it trains 4 different classifiers on 3 different features, and evaluates them.

In [24]:

run_experiments = True 

feature_extractors = {"glove":glove_features, "lexicon":lexicon_features, "glove_lexicon":glove_lexicon_features, 'bow':tfidf_features}

to_test=testsets

total_avg = 0
n=0

if (run_experiments):

    training_set = processed_tweets[trainset] 
    train_labels = tweetlabels[trainset]
    
    FEATURE_LIST = ['glove','glove_lexicon','lexicon','bow']

    CLF_LIST = ['lr','svm','gnb','rf']

    results = []


    for i,feature in enumerate(FEATURE_LIST):
        
        
        feature_extractor = feature_extractors[feature]
        print("Extracting {} features".format(feature))
            
        train_features = feature_extractor(training_set)
        
        print(train_features.shape)
        print(len(train_labels))

        row =[feature]
        
        for j,classifier in enumerate(CLF_LIST):
            # Skeleton: Creation and training of the classifiers
            if classifier == 'svm':
                print('Training ' + classifier)
                clf = SGDClassifier(max_iter=1000, tol=1e-3)
                clf.fit(train_features, train_labels)
                
            elif classifier == 'rf':
                if (feature!='bow'):
                    print('Training ' + classifier)
                    clf = RandomForestClassifier()
                    clf.fit(train_features,train_labels)
                else:
                    clf=None
            elif classifier == 'lr':
                print('Training ' + classifier)
                clf = LogisticRegression(solver = 'saga', max_iter=200)
                clf.fit(train_features,train_labels)
            elif classifier == 'gnb':
                if (feature!='bow'):
                    clf = GaussianNB()
                    clf.fit(train_features,train_labels)
                else:
                    clf = None
            else:
                print('Unknown classifier name' + classifier)
                continue

            # Predition performance of the classifiers
            if (clf!=None):
                test_scores = []
                for testset in to_test:
                    
                    testset_name = testset
                    testset_path = join(data_dir, testset_name)
                    
                    test_set = processed_tweets[testset_name]
                    test_ids = tweetids[testset_name]
                    
                    #print("Extracting testing features for ", testset_name)
                    test_features = feature_extractor(test_set)
                    
                    #print("Predicting labels for ", testset_name)
                    predictions = clf.predict(test_features)

                    id_preds = {i:j for i,j in zip(test_ids,predictions)}
                    
                    score = evaluate(id_preds, testset_path, feature + '-' + classifier)

                    test_scores.append(score)

                row.append(np.mean(test_scores))
                total_avg+=np.mean(test_scores)
                n+=1
        results.append(row)

total_avg = total_avg/n


Extracting glove features
(45101, 100)
45101
Training lr
data/twitter-test1.txt (glove-lr): 0.440
data/twitter-test2.txt (glove-lr): 0.438
data/twitter-test3.txt (glove-lr): 0.473
Training svm
data/twitter-test1.txt (glove-svm): 0.440
data/twitter-test2.txt (glove-svm): 0.458
data/twitter-test3.txt (glove-svm): 0.462
data/twitter-test1.txt (glove-gnb): 0.483
data/twitter-test2.txt (glove-gnb): 0.474
data/twitter-test3.txt (glove-gnb): 0.465
Training rf
data/twitter-test1.txt (glove-rf): 0.322
data/twitter-test2.txt (glove-rf): 0.353
data/twitter-test3.txt (glove-rf): 0.351
Extracting glove_lexicon features
(45101, 102)
45101
Training lr
data/twitter-test1.txt (glove_lexicon-lr): 0.474
data/twitter-test2.txt (glove_lexicon-lr): 0.496
data/twitter-test3.txt (glove_lexicon-lr): 0.507
Training svm
data/twitter-test1.txt (glove_lexicon-svm): 0.552
data/twitter-test2.txt (glove_lexicon-svm): 0.560
data/twitter-test3.txt (glove_lexicon-svm): 0.541
data/twitter-test1.txt (glove_lexicon-gnb): 0

In [25]:
if (run_experiments):

    print(tabulate.tabulate(results,headers= [""]+CLF_LIST,floatfmt=".3f"))

    print("average score is {}".format(total_avg))

                  lr    svm    gnb     rf
-------------  -----  -----  -----  -----
glove          0.450  0.453  0.474  0.342
glove_lexicon  0.492  0.551  0.530  0.443
lexicon        0.400  0.313  0.390  0.467
bow            0.562  0.501
average score is 0.4551056388061755


In [26]:

#Here we retrain the best model (TFIDF+LR (MaxEnt)) and provide more detailed results


to_test = testsets

training_set = processed_tweets['twitter-training-data.txt'] 
train_labels = tweetlabels['twitter-training-data.txt']

train_features = tfidf_features(training_set)

lr_clf = LogisticRegression(solver = 'saga', max_iter=200)

lr_clf.fit(train_features, train_labels)

for testset in to_test:

    test_tweets = processed_tweets[testset]
    test_features = tfidf_features(test_tweets)
    test_ids = tweetids[testset]

    predictions = lr_clf.predict(test_features)
    id_preds = {i:j for i,j in zip(test_ids,predictions)}

    test_labels = tweetlabels[testset]

    confusion(id_preds, join(data_dir,testset), "bow" + '-' + "lr")

    evaluate(id_preds, join(data_dir,testset), "bow" + '-' + "lr")


            positive  negative  neutral
positive    0.740     0.048     0.213     
negative    0.093     0.814     0.093     
neutral     0.248     0.157     0.594     

data/twitter-test1.txt (bow-lr): 0.570
            positive  negative  neutral
positive    0.787     0.048     0.165     
negative    0.120     0.723     0.157     
neutral     0.336     0.109     0.555     

data/twitter-test2.txt (bow-lr): 0.572
            positive  negative  neutral
positive    0.753     0.065     0.182     
negative    0.214     0.658     0.128     
neutral     0.291     0.133     0.575     

data/twitter-test3.txt (bow-lr): 0.545


## LSTM classifiers

In [27]:
#This performs a simple pre-padding

def pad(sequence, desired_length, pad_value):
    if (len(sequence))==desired_length:
        return sequence
    elif (len(sequence)<desired_length):
        diff = desired_length-len(sequence)
        return [pad_value for i in range(diff)]+sequence
    else:
        return sequence[:desired_length] #Cuts sequence to desired length if its longer

#### LSTM Tokenizer

This class creates an object that can be fit to some training data, returning an embedding matrix, and transforms any data to correspond with that embeddings matrix.

* `fit()` takes pre-processed textual data, embeddings dictionary of form {token:vector} and embedding dimension and returns embeddings matrix \
  It also computes and stores the vocabulary for use in the tranform method

* `transform()` takes textual data and returns numeric tokens\
It uses the vocab computed by fit function to assign each token an index, which matches with the corresponding vector in the matrix returned by fit()

Note: Fit function adds two further tokens '[pad]' and '[unk]' to the vocab and adds a zero vector and mean vector their repsecitve embeddings.

To handle OOV tokens, the transform function will use the index of the '[unk]' vector, for any OOV tokens.

Examples:


`tokenizer.fit(['word','another','word'], glove_dict, 100)` returns a 4x100 embeddings matrix and stores the vocab
`tokenizer.transform(['word','another','word']) returns [0, 1, 0]` where each value is an index pointing to the corresponding embedding in the matrix

In [28]:

class LSTMTokenizer():
    def __init__(self):
        self.vocab_list = None
        self.vocab_dict = None
        self.matrix = None
        self.max_length = None
        

    def fit(self,data, vector_dict, vector_dim, max_length = 56):

        self.max_length = max_length
        tokenized_data = [word_tokenize(tweet) for tweet in data]
        all_tokens = [token for sequence in tokenized_data for token in sequence]
        vocab_set = set(all_tokens)
        self.vocab_list = [token for token in all_tokens if token in vocab_set]
    

        matrix = np.zeros((len(self.vocab_list),vector_dim))
        for i,token in enumerate(self.vocab_list):
            if (token in vector_dict):
                matrix[i]=vector_dict[token]

        self.vocab_list.append('[pad]')
        self.vocab_list.append('[unk]')

        self.vocab_dict = {token:index for index,token in enumerate(self.vocab_list)}

        pad_vector = np.zeros((vector_dim))
        unkown_vector = np.mean(matrix,axis=0)

        matrix = np.concatenate([matrix, [pad_vector,unkown_vector]],axis=0)
        self.matrix = matrix
        
        return matrix
    
    def transform(self,data):
        
        if (self.vocab_dict == None):
            raise Exception("LSTM Tokenizer has not been fitted, cannot transform any data.")

        tokenized_data = [word_tokenize(tweet) for tweet in data]

        tokenized_data = [pad(sequence, self.max_length, '[pad]') for sequence in tokenized_data]

        numeric_tokens = np.zeros((len(data),self.max_length))

        for i,sequence in enumerate(tokenized_data):
            for j,token in enumerate(sequence):
                if token not in self.vocab_dict:
                    numeric_tokens[i,j] = self.vocab_dict['[unk]'] # Handling OOV words here
                else:
                    numeric_tokens[i,j] = self.vocab_dict[token]

        return numeric_tokens


### Preparing Training Data for LSTM

In [29]:

lstm_prepare=True

tokenizer_filename = join(data_dir, 'lstm_tokenizer.p')
matrix_filename = join(data_dir, 'embeddings_matrix.p')

if lstm_prepare:

    train_tweets = processed_tweets[trainset]
    train_labels = tweetlabels[trainset]

    dev_tweets  = processed_tweets[devset]
    dev_labels = tweetlabels[devset]

    lstm_tokenizer = LSTMTokenizer()

    #Creating the embeddings matrix for the vocab of training set 
    embeddings_matrix = lstm_tokenizer.fit(train_tweets, glove_dict, EMBEDDING_DIM) 

    print("Fitted LSTM tokenizer to training data")


    #Data is transformed using the tokenizer here:
    X_train = torch.tensor(lstm_tokenizer.transform(train_tweets)).long()
    Y_train = torch.tensor([LABELS_IDS[label] for label in train_labels]).long()

    X_dev = torch.tensor(lstm_tokenizer.transform(dev_tweets)).long()
    Y_dev = torch.tensor([LABELS_IDS[label] for label in dev_labels]).long()


    print("x_train (shape): ", X_train.shape)
    print("y_train (shape): ", Y_train.shape)

    print("x dev (shape): ", X_dev.shape)
    print("y_dev (shape): ", Y_dev.shape)

    BATCH_SIZE = 256

    train_zipped = list(zip(X_train, Y_train))
    dev_zipped = list(zip(X_dev, Y_dev))

    lstm_train_loader = DataLoader(train_zipped, batch_size=BATCH_SIZE)
    lstm_dev_loader  = DataLoader(dev_zipped, batch_size=BATCH_SIZE)

    print("Train and Dev Loaders done preparing")

   






Fitted LSTM tokenizer to training data
x_train (shape):  torch.Size([45101, 56])
y_train (shape):  torch.Size([45101])
x dev (shape):  torch.Size([2000, 56])
y_dev (shape):  torch.Size([2000])
Train and Dev Loaders done preparing


In [30]:
#These functions are used so that the LSTM and BERT models can use the same training loop
#It takes a batch and splits it into its components according to what kind of model is being used

def lstm_batch_splitter(batch):
    return ([batch[0].to(DEVICE)],batch[1].to(DEVICE))

def bert_batch_splitter(batch):
    return ([batch[0].to(DEVICE),batch[1].to(DEVICE)],batch[2].to(DEVICE))
    

### Model Definitions

In [31]:
class LSTM_base(nn.Module):
    def __init__(self, embeddings_matrix, embedding_dim):
        super(LSTM_base, self).__init__()
        
        hidden_dim = 32
        layers = 2 

        self.embedding = torch.nn.Embedding.from_pretrained(torch.from_numpy(embeddings_matrix).float())
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, layers, bidirectional=False, batch_first =True, dropout=0.4)
        self.linear = nn.Linear(hidden_dim, 3)
        self.activation = nn.Tanh()

            
    def forward(self, X_batch):
        out = self.embedding(X_batch)
        out,_ = self.lstm(out)
        out = out[:, -1] # gets final outputs from LSTM
        out = self.activation(out)
        out = self.linear(out) 
        return out


In [32]:
class BiLSTM(nn.Module):
    def __init__(self, embeddings_matrix, embedding_dim):
        super(BiLSTM, self).__init__()
        
        hidden_dim = 32
        layers = 2

        self.embedding = torch.nn.Embedding.from_pretrained(torch.from_numpy(embeddings_matrix).float())
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, layers, bidirectional=True, batch_first =True, dropout=0.4)
        self.linear = nn.Linear(2*hidden_dim, 3)
        self.activation = nn.Tanh()
        self.softmax = nn.Softmax(1)

            
    def forward(self, X_batch):
        out = self.embedding(X_batch)
        out,_ = self.lstm(out)
        out = out[:, -1] # Gets final output from LSTM
        out = self.activation(out)
        out = self.linear(out) 
        return out


### Training Functions

In [33]:
def validate(model, batch_splitter, loss_fn, val_loader):
    with torch.no_grad():
        labels, predictions, losses = [],[],[]
        for batch in val_loader:
            input_args, Y = batch_splitter(batch)
            preds = model(*input_args)
            loss  = loss_fn(preds, Y)
            losses.append(loss.item())

            labels.append(Y)
            predictions.append(preds.argmax(dim=-1))

        #Flatten the labels and predictions
        labels_flat = torch.cat(labels).to(CPU).numpy()
        predictions_flat = torch.cat(predictions).to(CPU).numpy()

        mean_loss = np.mean(losses)
        f1 = evaluate_simple([LABELS[label] for label in labels_flat], [LABELS[pred] for pred in predictions_flat])
        print("Validation Loss : {:.3f}".format(mean_loss))
        print("Validation SEMEVAL F1  : {:.3f}".format(f1))
        
        return mean_loss,f1

In [34]:
def train_loop(model, batch_splitter, loss_fn, optimizer, train_loader, val_loader, epochs=10):
    training_losses = []
    training_f1s = []
    validation_losses = []
    validation_f1s = []

    for i in range(1, epochs+1):
        losses, predictions, labels = [],[],[]

        for batch in tqdm(train_loader):
            
            input_args, Y = batch_splitter(batch)

            Y_preds = model(*input_args)

            
            loss = loss_fn(Y_preds, Y)
            losses.append(loss.item())

            Y_pred_labels = Y_preds.argmax(dim=-1)
            predictions.append(Y_pred_labels)
            
            labels.append(Y)
            
            optimizer.zero_grad()
            loss.backward()            
            optimizer.step()

        
        labels_flat = torch.cat(labels).to(CPU).detach().numpy()
        predictions_flat = torch.cat(predictions).to(CPU).detach().numpy()
        mean_train_loss = np.mean(losses)

        train_f1 = evaluate_simple([LABELS[label] for label in labels_flat], [LABELS[pred] for pred in predictions_flat])
        print("Train Loss : {:.3f}".format(mean_train_loss))
        print("Training SEMEVAL F1 : {:.3f}".format(train_f1))
        val_loss, val_f1= validate(model, batch_splitter, loss_fn, val_loader)

        training_losses.append(mean_train_loss)
        training_f1s.append(train_f1)
        validation_losses.append(val_loss)
        validation_f1s.append(val_f1)

    return (training_losses,training_f1s,validation_losses,validation_f1s)



### Training LSTM-base

In [35]:
train = False
lstm_base_filename = join(data_dir,'lstm_base.p')

if not train:
    print("Attempting to load model from file")
    lstm_base = torch.load(lstm_base_filename)
    print("Loaded successfuly")
else:

    lstm_base = LSTM_base(embeddings_matrix,EMBEDDING_DIM).to(DEVICE)

    EPOCHS=30
    LEARN_RATE = 1e-4

    loss_fn = nn.CrossEntropyLoss(weight = torch.tensor([1,0.5,1]).to(DEVICE))

    optimizer = Adam(lstm_base.parameters(), lr=LEARN_RATE)

    train_losses = []
    valid_losses = []

    #Train loop returns training statistics for use in the next cell
    tl, ta, vl, va = train_loop(lstm_base, lstm_batch_splitter, loss_fn, optimizer, lstm_train_loader, lstm_dev_loader, EPOCHS)

    print("Saving trained model")


    torch.save(lstm_base.to(CPU), lstm_base_filename)



Attempting to load model from file
Loaded successfuly


In [36]:
#If training was just performed we can plot the training graphs here

if (train):
    plt.figure(figsize=(12,5))
    plt.subplot(1,2,1)
    plt.plot(tl,label = 'Training Loss')
    plt.plot(vl, label = 'Validation Loss')
    plt.legend()

    plt.subplot(1,2,2)
    plt.plot(ta, label = 'Training F1')
    plt.plot(va, label = 'Validation F1')
    plt.legend()
    plt.show()

### Evaluating LSTM-Base

In [37]:
to_test = testsets

lstm_base.to(DEVICE)
lstm_base.eval()


for testset in to_test:

    test_tweets = processed_tweets[testset]
    test_labels = tweetlabels[testset]
    test_ids = tweetids[testset]

    X_test = torch.tensor(lstm_tokenizer.transform(test_tweets)).long().to(DEVICE)
    Y_test = torch.tensor([LABELS_IDS[label] for label in test_labels]).long().to(DEVICE)

    with torch.no_grad():
        test_pred = lstm_base(X_test)
        
        test_pred_labels = test_pred.argmax(dim=-1)

        predictions = [LABELS[i.item()] for i in test_pred_labels]

        pred_dict = {id:label for id,label in zip(test_ids,predictions)}


    evaluate(pred_dict, join(data_dir,testset), "lstm-base")

    confusion(pred_dict, join(data_dir,testset), "lstm-base")





data/twitter-test1.txt (lstm-base): 0.599
            positive  negative  neutral
positive    0.574     0.079     0.347     
negative    0.146     0.638     0.215     
neutral     0.171     0.148     0.681     

data/twitter-test2.txt (lstm-base): 0.575
            positive  negative  neutral
positive    0.635     0.072     0.294     
negative    0.136     0.612     0.252     
neutral     0.273     0.106     0.620     

data/twitter-test3.txt (lstm-base): 0.578
            positive  negative  neutral
positive    0.614     0.082     0.305     
negative    0.179     0.481     0.340     
neutral     0.209     0.132     0.659     



### Training Bi-LSTM

In [38]:
train = False
bilstm_filename = join(data_dir,'bilstm.p')

if not train:
    print("Attempting to load model from file")
    bi_lstm = torch.load(bilstm_filename)
    print("Loaded successfully")
else:

    bi_lstm = BiLSTM(embeddings_matrix,EMBEDDING_DIM).to(DEVICE)

    EPOCHS= 30
    LEARN_RATE = 1e-4

    loss_fn = nn.CrossEntropyLoss(weight=torch.tensor([1,0.5,1]).to(DEVICE))

    optimizer = Adam(bi_lstm.parameters(), lr=LEARN_RATE)

    train_losses = []
    valid_losses = []

    tl, ta, vl, va = train_loop(bi_lstm, lstm_batch_splitter, loss_fn, optimizer, lstm_train_loader, lstm_dev_loader, EPOCHS)

    print("Saving trained model")

    torch.save(bi_lstm.to(CPU),bilstm_filename)



Attempting to load model from file
Loaded successfully


### Evaluating Bi-LSTM

In [39]:

to_test = testsets

bi_lstm.to(DEVICE)
bi_lstm.eval()

for testset in to_test:

    test_tweets = processed_tweets[testset]
    test_labels = tweetlabels[testset]
    test_ids = tweetids[testset]

    X_test = torch.tensor(lstm_tokenizer.transform(test_tweets)).long().to(DEVICE)
    Y_test = torch.tensor([LABELS_IDS[label] for label in test_labels]).long().to(DEVICE)



    with torch.no_grad():
        test_pred = bi_lstm(X_test.to(DEVICE))
        
        test_pred_labels = test_pred.argmax(dim=-1)

        predictions = [LABELS[i.item()] for i in test_pred_labels]

        pred_dict = {id:label for id,label in zip(test_ids,predictions)}

    evaluate(pred_dict, join(data_dir,testset), "bi-lstm")
    confusion(pred_dict,join(data_dir,testset),"bi-lstm")


data/twitter-test1.txt (bi-lstm): 0.598
            positive  negative  neutral
positive    0.577     0.082     0.342     
negative    0.130     0.683     0.187     
neutral     0.172     0.149     0.679     

data/twitter-test2.txt (bi-lstm): 0.575
            positive  negative  neutral
positive    0.644     0.071     0.284     
negative    0.115     0.596     0.288     
neutral     0.254     0.109     0.637     

data/twitter-test3.txt (bi-lstm): 0.594
            positive  negative  neutral
positive    0.613     0.082     0.305     
negative    0.175     0.538     0.287     
neutral     0.197     0.125     0.678     



## Transformer Based Classifier

### Preparing training data for BERT

In [40]:

#Tokenizer *must* be retrieved here for evaluation as well as training
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


bert_prepare= False # Only set True if you need to train the model

#If bert_prepare is true we prepare training and dev data to use in the training
if (bert_prepare):

    train_tweets = processed_tweets[trainset]
    train_labels = tweetlabels[trainset]

    dev_tweets = processed_tweets[devset]
    dev_labels = tweetlabels[devset]

    X_train = bert_tokenizer(train_tweets,padding=True, truncation=True, max_length=40)
    Y_train = torch.tensor([LABELS_IDS[label] for label in train_labels]).long() 

    X_dev = bert_tokenizer(dev_tweets,padding=True, truncation=True, max_length=40)
    Y_dev = torch.tensor([LABELS_IDS[label] for label in dev_labels]).long()
    ids_train = torch.tensor(X_train['input_ids'])
    masks_train = torch.tensor(X_train['attention_mask'])

    ids_dev = torch.tensor(X_dev['input_ids'])
    masks_dev = torch.tensor(X_dev['attention_mask'])

    train_zipped = list(zip(ids_train, masks_train, Y_train))
    dev_zipped = list(zip(ids_dev, masks_dev, Y_dev))

    bert_train_loader = DataLoader(train_zipped, batch_size=256)
    bert_dev_loader  = DataLoader(dev_zipped, batch_size=256)



### Model Definition

In [41]:

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.bert.requires_grad = False
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 3) # Bert outputs 768 dimensional encodings

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        out = self.dropout(pooled_output)
        out = self.linear(out)
        return out

### Training BERT-based model

This requires a GPU, to use it set `DEVICE=torch.device('cuda')` in the user defined constants section\
Training took around 6 minutes for 3 epochs on the the lab computer

In [42]:
train=False

bert_filename = join(data_dir,'bert-classifier.p') 

if (train):

    bert_classifier = BertClassifier().to(DEVICE)

    EPOCHS= 3
    LEARN_RATE = 1e-4


    loss_fn = nn.CrossEntropyLoss(weight = torch.tensor([1,0.5,1]).to(DEVICE))

    optimizer = Adam(bert_classifier.parameters(), lr=LEARN_RATE)

    tl, ta, vl, va = train_loop(bert_classifier, bert_batch_splitter, loss_fn, optimizer, bert_train_loader, bert_dev_loader, EPOCHS)

    torch.save(bert_classifier.to(CPU),bert_filename)

else:
    bert_classifier = torch.load(bert_filename)

    print("Loaded bert classifier from file")


Loaded bert classifier from file


### Evaluating BERT-based model

GPU is reccomended for evaluation. It can be done on cpu but took around 15 minutes for one dataset

To use GPU, set `DEVICE=torch.device('cuda')` in the user defined constants section

In [43]:
#Evaluating the bert model on the test set 1 This cell takes a long time to run on cpu

to_test = testsets
eval = True

if (eval):

    for testset in to_test:
        bert_classifier.to(DEVICE)
        bert_classifier.eval()
        test_tweets = processed_tweets[testset]
        test_labels = tweetlabels[testset]
        test_ids = tweetids[testset]

        X_test = bert_tokenizer(test_tweets, padding=True, truncation=True, max_length=40)
        Y_test = torch.tensor([LABELS_IDS[label] for label in test_labels]).long()                                                                              

        ids_test = torch.tensor(X_test['input_ids']).to(DEVICE)
        masks_test = torch.tensor(X_test['attention_mask']).to(DEVICE)


        with (torch.no_grad()):
            test_pred = bert_classifier.to(DEVICE)(ids_test,masks_test)

            test_pred_labels = test_pred.argmax(dim=-1)

            id_preds = {id:LABELS[pred.item()] for id,pred in zip(test_ids,test_pred_labels)}
        

        evaluate(id_preds,join(data_dir,testset),'BERT+Linear')

        confusion(id_preds,join(data_dir,testset),'BERT+Linear')


data/twitter-test1.txt (BERT+Linear): 0.707
            positive  negative  neutral
positive    0.708     0.032     0.260     
negative    0.066     0.789     0.145     
neutral     0.189     0.124     0.688     

data/twitter-test2.txt (BERT+Linear): 0.698
            positive  negative  neutral
positive    0.769     0.028     0.202     
negative    0.065     0.748     0.187     
neutral     0.269     0.100     0.631     

data/twitter-test3.txt (BERT+Linear): 0.662
            positive  negative  neutral
positive    0.744     0.034     0.222     
negative    0.105     0.629     0.266     
neutral     0.257     0.112     0.632     

