In [None]:
# Install the third-party packages used by this notebook (no versions are pinned).
!pip install emoji
!pip install keras_self_attention
!pip install hydra-core
!pip install sentencepiece
!pip install fairseq
!pip install matplotlib
!pip install seaborn

In [None]:
from sklearn.model_selection import train_test_split
import itertools
import emoji
import re
import tensorflow
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Input, GlobalAveragePooling1D, GlobalAveragePooling2D
from keras_self_attention import SeqSelfAttention
from tensorflow.keras.layers import Flatten, Dropout, Dense, Bidirectional, Average, Concatenate, LSTM

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm_notebook as tqdm

In [None]:
def parse_lines(lines):
    """Parse CoNLL-style lines into parallel per-sample lists.

    A line whose first tab-separated field is ``meta`` starts a new sample:
    field 1 is the sample uid and field 2, when present, its sentiment label
    (``''`` otherwise).  Every other non-blank line adds one token (field 0)
    and its token label (field 1, ``''`` when absent) to the current sample.
    Blank lines are ignored so they do not show up as empty tokens.

    Args:
        lines: Sequence of raw text lines (must support ``len``).

    Returns:
        Tuple ``(u, t, l, s, max_length)``: uids, token lists, token-label
        lists, sentiment labels, and the length of the longest token list
        (``0`` when there are no samples).

    Raises:
        ValueError: If a token line appears before the first ``meta`` line.
        IndexError: If a ``meta`` line carries no uid field.
    """
    samples = []    # one (uid, sentiment, tokens, labels) tuple per sample
    current = None  # sample being filled; None until the first 'meta' line

    print("Parsing lines from file...")
    for raw_line in tqdm(lines, total=len(lines)):
        stripped = raw_line.strip()
        if not stripped:
            # Separator line between samples: carries neither token nor label.
            continue

        fields = stripped.split('\t')
        if fields[0] == 'meta':
            sentiment = fields[2] if len(fields) > 2 else ''
            current = (fields[1], sentiment, [], [])
            samples.append(current)
        else:
            if current is None:
                raise ValueError(
                    "token line %r found before the first 'meta' line" % raw_line)
            current[2].append(fields[0])
            current[3].append(fields[1] if len(fields) > 1 else '')

    u = [sample[0] for sample in samples]  # uids
    s = [sample[1] for sample in samples]  # sentiment labels
    t = [sample[2] for sample in samples]  # tokens
    l = [sample[3] for sample in samples]  # token labels
    max_length = max((len(tokens) for tokens in t), default=0)

    return u, t, l, s, max_length

In [None]:
# Mount Google Drive, then copy everything under MyDrive/datasets into the
# current working directory.
from google.colab import drive
drive.mount('/content/drive')
!cp /content/drive/MyDrive/datasets/* .

In [None]:
def _read_lines(path):
    """Return every line of the UTF-8 text file at ``path`` and close the file."""
    with open(path, encoding='utf8') as handle:
        return handle.readlines()


# Raw lines of the three dataset files.
# NOTE(review): these absolute paths assume the files were copied to /content — confirm.
train = _read_lines('/content/train_14k_split_conll.txt')
valid = _read_lines('/content/dev_3k_split_conll.txt')
test = _read_lines('/content/Hindi_test_unalbelled_conll_updated.txt')

# Per split: uids, token lists, token-label lists, sentiment labels, longest sample.
u_train, t_train, l_train, s_train, max_length = parse_lines(train)
u_dev, t_dev, l_dev, s_dev, max_length_dev = parse_lines(valid)
u_test, t_test, l_test, s_test, max_length_test = parse_lines(test)

In [None]:
# Sanity check: number of parsed samples in the train and dev splits.
print(len(s_train),len(s_dev))

### Cleaning

In [None]:
def load_dict_smileys():
    """Return a dict mapping text emoticons to a coarse emotion word.

    Every emoticon maps to one of ``"smiley"``, ``"sad"``, ``"playful"`` or
    ``"love"``.  Entries are listed per emotion and flattened into a single
    dict, keeping the listing order.
    """
    emoticons_by_emotion = (
        ("smiley", (
            ":‑)", ":-]", ":-3", ":->", "8-)", ":-}",
            ":)", ":]", ":3", ":>", "8)", ":}",
            ":o)", ":c)", ":^)", "=]", "=)", ":-))",
            ":‑D", "8‑D", "x‑D", "X‑D",
            ":D", "8D", "xD", "XD",
        )),
        ("sad", (
            ":‑(", ":‑c", ":‑<", ":‑[",
            ":(", ":c", ":<", ":[",
            ":-||", ">:[", ":{", ":@", ">:(",
            ":'‑(", ":'(",
        )),
        ("playful", (
            ":‑P", "X‑P", "x‑p", ":‑p", ":‑Þ", ":‑þ", ":‑b",
            ":P", "XP", "xp", ":p", ":Þ", ":þ", ":b",
        )),
        ("love", (
            "<3",
        )),
    )
    return {
        emoticon: emotion
        for emotion, emoticons in emoticons_by_emotion
        for emoticon in emoticons
    }

# source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
def load_dict_contractions():
    """Return a dict mapping English contractions and informal spellings to their expansions.

    Keys are matched verbatim by the caller, so their capitalisation matters
    (e.g. ``"I'm"`` and ``"Whatcha"`` are capitalised here).
    """
    return {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks"
        }

def tweet_cleaning_for_sentiment_analysis(tweet):
    """Normalise a raw tweet string for sentiment analysis.

    Steps, in order:
      1. Curly apostrophes are replaced by ``'``.
      2. Whitespace-separated emoticons are replaced by their emotion word.
         This happens before lowercasing and before colons are stripped,
         because most emoticon keys contain ``:`` or upper-case letters and
         could never match afterwards.
      3. The text is lowercased, emoji are converted with ``emoji.demojize``
         and every ``:`` is replaced by a space.
      4. Runs of three or more identical characters are collapsed to two.
      5. Contractions are expanded; keys are compared in lowercase so that
         entries such as ``"I'm"`` match the lowercased text.  Emoticons that
         only match after lowercasing are replaced as well.

    Args:
        tweet: The raw tweet text.

    Returns:
        The cleaned tweet, tokens joined by single spaces.
    """
    smileys = load_dict_smileys()
    # Lowercase both sides: the text is lowercased before the lookup, and the
    # expansion should not reintroduce capitals into the lowercased output.
    contractions = {
        contraction.lower(): expansion.lower()
        for contraction, expansion in load_dict_contractions().items()
    }

    tweet = tweet.replace("’", "'")
    # First emoticon pass, on the raw (case- and colon-preserving) tokens.
    tweet = ' '.join(smileys.get(word, word) for word in tweet.split())

    tweet = tweet.lower()
    tweet = emoji.demojize(tweet)
    tweet = tweet.replace(":", " ")
    # replace duplicate character
    tweet = re.sub(r"(.)\1{2,}", r'\1\1', tweet)

    reformed = [contractions.get(word, word) for word in tweet.split()]
    # Second emoticon pass, for tokens that only match once lowercased.
    reformed = [smileys.get(word, word) for word in reformed]
    return ' '.join(reformed)


def clean(t,l):
    """Clean every tokenised tweet of ``t`` in place.

    Each token list is joined with spaces, passed through
    ``tweet_cleaning_for_sentiment_analysis`` and split on single spaces again;
    the result replaces the original entry of ``t``.  ``l`` is returned as
    received.

    Returns:
        The tuple ``(t, l)``.
    """
    for position, tokens in enumerate(t):
        cleaned_text = tweet_cleaning_for_sentiment_analysis(' '.join(tokens))
        t[position] = list(cleaned_text.split(' '))

    return t, l

In [None]:
# Clean the tokens of every split. clean() replaces each token list in place and
# hands the label lists back unchanged, so re-running this cell cleans the
# already-cleaned tokens again.
t_train, l_train = clean(t_train, l_train)
t_dev, l_dev = clean(t_dev, l_dev)
t_test, l_test = clean(t_test, l_test)

In [None]:
# Pool the train and dev samples, then re-split them 80/20 with a fixed seed.
# NOTE: only the tokens and sentiment labels are re-split; u_train/u_dev and
# l_train/l_dev keep their old order and no longer line up with t_train/t_dev.
newTrain = t_train + t_dev
newLabel = s_train + s_dev

t_train, t_dev, s_train, s_dev = train_test_split(newTrain, newLabel, test_size=0.2, random_state=42)
print(len(t_train), len(t_dev), len(s_train), len(s_dev))

In [None]:
# Number of tokens in every training sample.
data = [len(tokens) for tokens in t_train]

# Token count vs. token-label count of sample 1, then the mean tokens per sample.
# NOTE: t_train was re-split in the previous cell while l_train was not, so the
# two lengths on the first line may belong to different tweets.
print(len(t_train[1]), len(l_train[1]))
print(sum(data) / len(data))

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Distribution of the token counts.
sns.displot(data)

plt.show()

In [None]:
# Every sample is padded (at the end) or truncated to this many token ids.
MAX_LEN = 60

# Word-level vocabulary built from the training tokens only; unknown words map to 'UNK'.
tok_w = Tokenizer(char_level=False, lower=True, oov_token='UNK')
tok_w.fit_on_texts(t_train)


def _to_padded_ids(token_lists):
    """Turn token lists into a post-padded id matrix of width MAX_LEN."""
    id_sequences = tok_w.texts_to_sequences(token_lists)
    return pad_sequences(id_sequences, maxlen=MAX_LEN, padding="post")


trainInput_w = _to_padded_ids(t_train)
valInput_w = _to_padded_ids(t_dev)
testInput_w = _to_padded_ids(t_test)

In [None]:
# Vocabulary size of the fitted tokenizer, then the dev / train sample counts.
# (Bare expressions in the middle of a cell are never displayed, so print explicitly.)
print(len(tok_w.word_index))
print(len(s_dev),len(s_train))

In [None]:
# Encode the sentiment strings as integers, fitting on the training labels only.
le = preprocessing.LabelEncoder()
le.fit(s_train)

# Pass the class count explicitly so both one-hot matrices get the same width
# even when a class happens to be missing from one of the splits.
num_classes = len(le.classes_)
trainLabels = to_categorical(le.transform(s_train), num_classes=num_classes)
valLabels = to_categorical(le.transform(s_dev), num_classes=num_classes)

In [None]:
def get_embedding_lookup(embedding_path):
    """Load a text embedding file into a ``word -> vector`` dict.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated coefficients.  Blank lines and lines without any
    coefficient are skipped.

    Args:
        embedding_path: Path of the UTF-8 encoded embedding file.

    Returns:
        Dict mapping each word to a 1-D ``float32`` numpy array.
    """
    embedding_lookup = {}
    # Read as UTF-8 explicitly so the result does not depend on the platform
    # default encoding.
    with open(embedding_path, encoding='utf8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Nothing usable: blank line, or a word with no vector.
                continue
            embedding_lookup[values[0]] = np.asarray(values[1:], dtype=np.float32)

    return embedding_lookup


def get_pretrained_embedding(embedding_path,
                             index2word,
                             max_features: int) -> np.ndarray:
    """Build an embedding matrix for a vocabulary from a pretrained embedding file.

    The matrix is first filled with values drawn from a normal distribution
    whose mean and standard deviation are those of all pretrained
    coefficients.  Rows ``1 .. max_features - 1`` are then overwritten with the
    pretrained vector of the corresponding word when one exists; row 0 and the
    rows of unknown words keep their random values.

    Args:
        embedding_path: Path of the text embedding file.
        index2word: Mapping (or sequence) from row index to word.
        max_features: Number of rows of the returned matrix.

    Returns:
        ``float32`` array of shape ``(max_features, embedding_dim)``.

    Raises:
        ValueError: If the embedding file yields no vectors.
    """
    embedding_lookup = get_embedding_lookup(embedding_path)
    if not embedding_lookup:
        raise ValueError("no embedding vectors found in %r" % (embedding_path,))

    pretrained_embedding = np.stack(list(embedding_lookup.values()))
    embedding_dim = pretrained_embedding.shape[1]
    embeddings = np.random.normal(pretrained_embedding.mean(),
                                  pretrained_embedding.std(),
                                  (max_features, embedding_dim)).astype(np.float32)

    n_found = 0

    for i in range(1, max_features):
        try:
            word = index2word[i]
        except (KeyError, IndexError):
            # max_features may exceed the vocabulary; keep the random row.
            continue
        embedding_vector = embedding_lookup.get(word)
        if embedding_vector is not None:
            embeddings[i] = embedding_vector
            n_found += 1

    print('number of words found:', n_found)
    return embeddings

In [None]:
# Option 1: build the embedding matrix from GloVe vectors.

glove_path = 'glove.42B.300d.txt'
# One row more than the vocabulary; get_pretrained_embedding only fills rows
# 1 .. max_features - 1, so row 0 keeps its random initialisation.
max_features = len(tok_w.word_index) + 1

pretrained_embedding = get_pretrained_embedding(glove_path, tok_w.index_word, max_features)
# Displayed as the cell output.
pretrained_embedding.shape

In [None]:
# Option 2: use XLM-R encodings. Download and unpack the base checkpoint.

!wget https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz
!tar -xvf /content/xlmr.base.tar.gz

# Load the unpacked checkpoint through fairseq and switch it to eval mode.
from fairseq.models.roberta import XLMRModel
xlmr = XLMRModel.from_pretrained('/content/xlmr.base', checkpoint_file='model.pt')
xlmr.eval() 

In [None]:
# Encode every vocabulary word with XLM-R and keep the mean of its 768-wide
# feature rows as the word vector. This replaces any earlier pretrained_embedding.
toklist = list(tok_w.word_index.keys())
pretrained_embedding = []

for i, tok in enumerate(toklist):
    if i % 100 == 0:
        print(i)  # progress: index of the word being encoded
    word_features = xlmr.extract_features(xlmr.encode(tok))
    feature_rows = word_features.reshape(-1, 768).detach().numpy()
    pretrained_embedding.append(list(feature_rows.mean(axis=0)))

pretrained_embedding = np.array(pretrained_embedding)

# Put a random row in front so that row i + 1 belongs to toklist[i].
newrow = np.random.normal(0, 1, 768)
pretrained_embedding = np.vstack([newrow, pretrained_embedding])
print(pretrained_embedding.shape)

### Model Architecture, Optimizer, Loss Function and Hyperparameters

In [None]:
# Baseline model (attention-based CNN) on top of the frozen pretrained embedding.

max_features = len(tok_w.word_index)
maxlen = 60
# Take the embedding width from the weight matrix itself so the layer fits
# whichever pretrained_embedding was built above (GloVe or XLM-R).
embedding_size = pretrained_embedding.shape[1]
if pretrained_embedding.shape[0] != max_features + 1:
    raise ValueError(
        "pretrained_embedding has %d rows, expected %d (vocabulary size + 1)"
        % (pretrained_embedding.shape[0], max_features + 1))

# Convolution
kernel_size = 5
filters = 264
pool_size = 4

model = Sequential()
# Frozen embedding lookup initialised from the pretrained matrix.
model.add(Embedding(max_features+1, embedding_size, weights=[pretrained_embedding],
                    input_length=maxlen, trainable=False))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(SeqSelfAttention(attention_activation='sigmoid'))
# Average over the time axis to get one feature vector per sample.
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
# Three-way softmax over the sentiment classes.
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

In [None]:
# Attention-based CNN + BiLSTM: two branches share one trainable embedding and
# their softmax outputs are multiplied element-wise.

max_features = len(tok_w.word_index)
maxlen = 60
embedding_size = 256

# Convolution
kernel_size = 4
filters = 128
pool_size = 4


# Model is not part of the notebook's top import cell.
from tensorflow.keras.models import Model

inputs = Input(shape=(60,))
emb = Embedding(max_features+1, embedding_size, input_length=maxlen)(inputs)

# --- CNN branch: three parallel convolutions with widths 4, 3 and 2 ---
x4 = Conv1D(filters,
            kernel_size,
            padding='valid',
            activation='relu',
            strides=1)(emb)
x4 = Dropout(0.2)(x4)
x4 = MaxPooling1D(pool_size=pool_size)(x4)
x4 = Dropout(0.2)(x4)

x2 = Conv1D(filters,
            3,
            padding='valid',
            activation='relu',
            strides=1)(emb)
x2 = Dropout(0.2)(x2)
x2 = MaxPooling1D(pool_size=pool_size)(x2)
x2 = Dropout(0.2)(x2)

x3 = Conv1D(filters,
            2,
            padding='valid',
            activation='relu',
            strides=1)(emb)
x3 = Dropout(0.2)(x3)
x3 = MaxPooling1D(pool_size=pool_size)(x3)
x3 = Dropout(0.2)(x3)

# Stack the three pooled feature maps along the time axis.
x = Concatenate(axis = 1)([x4,x2,x3])

x = SeqSelfAttention(attention_activation='sigmoid')(x)
x = Dropout(0.2)(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)

# --- BiLSTM branch ---
x1 = Bidirectional(LSTM(128, return_sequences=True))(emb)
x1 = Dropout(0.2)(x1)
# Collapse the time axis: the LSTM returns one vector per time step, and
# without pooling o2 would keep that axis while o1 and the one-hot labels
# have a single 3-way vector per sample.
x1 = GlobalAveragePooling1D()(x1)
x1 = Dense(64, activation='relu')(x1)
x1 = Dropout(0.2)(x1)

# One softmax head per branch, combined by element-wise product.
o1 = Dense(3, activation='softmax')(x)
o2 = Dense(3, activation='softmax')(x1)
outputs = tensorflow.keras.layers.Multiply()([o1,o2])


model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),  # `lr` is the deprecated spelling
              metrics=['categorical_accuracy'])
model.summary()

In [None]:
# CNN + attention-based BiLSTM: same two-branch layout, but here the
# self-attention layer sits on the LSTM branch instead of the CNN branch.

max_features = len(tok_w.word_index)
maxlen = 60
embedding_size = 256

# Convolution
kernel_size = 4
filters = 128
pool_size = 4


# Model is not part of the notebook's top import cell.
from tensorflow.keras.models import Model

inputs = Input(shape=(60,))
emb = Embedding(max_features+1, embedding_size, input_length=maxlen)(inputs)

# --- CNN branch: three parallel convolutions with widths 4, 3 and 2 ---
x4 = Conv1D(filters,
            kernel_size,
            padding='valid',
            activation='relu',
            strides=1)(emb)
x4 = Dropout(0.2)(x4)
x4 = MaxPooling1D(pool_size=pool_size)(x4)
x4 = Dropout(0.2)(x4)

x2 = Conv1D(filters,
            3,
            padding='valid',
            activation='relu',
            strides=1)(emb)
x2 = Dropout(0.2)(x2)
x2 = MaxPooling1D(pool_size=pool_size)(x2)
x2 = Dropout(0.2)(x2)

x3 = Conv1D(filters,
            2,
            padding='valid',
            activation='relu',
            strides=1)(emb)
x3 = Dropout(0.2)(x3)
x3 = MaxPooling1D(pool_size=pool_size)(x3)
x3 = Dropout(0.2)(x3)

# Stack the three pooled feature maps along the time axis, then average over it.
x = Concatenate(axis = 1)([x4,x2,x3])
x = GlobalAveragePooling1D()(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)

# --- BiLSTM branch with self-attention, averaged over time ---
x1 = Bidirectional(LSTM(128, return_sequences=True))(emb)
x1 = SeqSelfAttention(attention_activation='sigmoid')(x1)
x1 = Dropout(0.2)(x1)
x1 = GlobalAveragePooling1D()(x1)
x1 = Dropout(0.2)(x1)
x1 = Dense(64, activation='relu')(x1)
x1 = Dropout(0.2)(x1)

# One softmax head per branch, combined by element-wise product.
o1 = Dense(3, activation='softmax')(x)
o2 = Dense(3, activation='softmax')(x1)
outputs = tensorflow.keras.layers.Multiply()([o1,o2])


model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),  # `lr` is the deprecated spelling
              metrics=['categorical_accuracy'])
model.summary()

# Train

In [None]:
# Train whichever `model` was built last for one epoch, validating on the dev split.
model.fit([trainInput_w], trainLabels,
          batch_size= 16 ,
          validation_data=(valInput_w,valLabels),
          epochs=1)
# Displayed as the cell output: the configuration of the optimizer in use.
model.optimizer.get_config()

### Make Predictions on Test Set

In [None]:
# Predict a class index for every test sample.
predictions = model.predict([testInput_w])
predictions = np.argmax(predictions,axis=-1)

# Map the indices back to sentiment strings with the encoder that produced the
# training targets, rather than repeating its index order by hand.
predicted_sentiments = le.inverse_transform(predictions)

# write predictions to file
with open('preds.txt', 'w') as out:
    out.write('Uid,Sentiment')
    for uid, sentiment in zip(u_test, predicted_sentiments):
        out.write("\n%s,%s"%(uid, sentiment))


# load correct labels
# NOTE(review): rows are compared by position below, which assumes the label
# file lists the samples in the same order as u_test — confirm.
test = pd.read_csv('test_labels_hinglish.txt')
# load predictions
preds = pd.read_csv('preds.txt')

# compute evaluation metrics (once; both names below expose the same report)
results2 = classification_report(test['Sentiment'],
                                 preds['Sentiment'],
                                 labels=['positive', 'neutral', 'negative'],
                                 output_dict=True, digits=6)
results = {'preds': results2}

In [None]:
# Raw classification report dictionary computed in the previous cell.
print(results2)

In [None]:
# format and print scores
formatted_results = [['model', 'precision', 'recall', 'accuracy', 'f1-score']]
for ki in results.keys():
    scores = results[ki]['macro avg']
    model = [ki, scores['precision'], scores['recall'], results[ki]['accuracy'], scores['f1-score']]
    formatted_results.append(model)
    
formatted_results = pd.DataFrame(formatted_results[1:], columns=formatted_results[0])
print(formatted_results)