In [None]:
# !pip install emoji
# !pip install keras_self_attention
# !pip install matplotlib
# !pip install seaborn

In [None]:
from sklearn.model_selection import train_test_split
import itertools
import emoji
import re
import tensorflow
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Input, GlobalAveragePooling1D, GlobalAveragePooling2D
from keras_self_attention import SeqSelfAttention
from tensorflow.keras.layers import Flatten, Dropout, Dense, Bidirectional, Average, Concatenate, LSTM

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from tqdm import tqdm_notebook as tqdm

In [None]:
def parse_lines(lines):
    """Parse CoNLL-style lines into parallel per-sample lists.

    A sample starts at a line whose first tab-separated field is ``meta``;
    its second field is the sample id and its optional third field is the
    sentiment label. Every following non-blank line, up to the next ``meta``
    line, is a token line: the first field is the token and the optional
    second field is the token label.

    Blank lines are skipped, so they never end up as empty tokens.

    Args:
        lines: sequence of raw text lines (must support ``len``).

    Returns:
        Tuple ``(u, t, l, s, max_length)``: sample ids, token lists,
        token-label lists, sentiment labels (``''`` when absent) and the
        length of the longest token list (``0`` when there are no samples).

    Raises:
        ValueError: if a token line appears before any ``meta`` line.
        IndexError: if a ``meta`` line carries no sample id.
    """
    u = []  # uids
    t = []  # tokens, one list per sample
    l = []  # token labels, parallel to t
    s = []  # sentiment labels

    print("Parsing lines from file...")
    for line_no, raw_line in tqdm(enumerate(lines, start=1), total=len(lines)):
        fields = raw_line.strip().split('\t')
        if fields == ['']:
            # Blank separator line: carries neither a token nor a header.
            continue
        if fields[0] == 'meta':
            u.append(fields[1])
            s.append(fields[2] if len(fields) > 2 else '')
            # Open fresh buffers; token lines below append to the last entry.
            t.append([])
            l.append([])
        else:
            if not t:
                raise ValueError(
                    "line %d: token line found before the first 'meta' line"
                    % line_no)
            t[-1].append(fields[0])
            l[-1].append(fields[1] if len(fields) > 1 else '')

    max_length = max((len(tokens) for tokens in t), default=0)

    return u, t, l, s, max_length

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Copy everything from the Drive "datasets" folder into the current working directory.
!cp /content/drive/MyDrive/datasets/* .

In [None]:
def _read_lines(path):
    """Return every line of the UTF-8 text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the lines have been read.
    """
    with open(path, encoding='utf8') as handle:
        return handle.readlines()


# Raw lines of the three dataset files (line endings are kept; parse_lines strips them).
train = _read_lines('/content/train_14k_split_conll.txt')
valid = _read_lines('/content/dev_3k_split_conll.txt')
test = _read_lines('/content/Hindi_test_unalbelled_conll_updated.txt')

# Each call returns (uids, tokens, token labels, sentiment labels, longest sample length).
u_train, t_train, l_train, s_train, max_length = parse_lines(train)
u_dev, t_dev, l_dev, s_dev, max_length_dev = parse_lines(valid)
u_test, t_test, l_test, s_test, max_length_test = parse_lines(test)

In [None]:
# Number of sentiment labels parsed for the training and dev sets.
print(len(s_train),len(s_dev))

### Cleaning

In [None]:
def tweet_cleaning_for_sentiment_analysis(tweet):
    """Normalise a raw tweet string before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. pass it through ``emoji.demojize`` (emoji become textual names);
      3. replace every ':' with a space;
      4. replace the typographic apostrophe with a plain "'";
      5. shorten any run of three or more identical characters to two;
      6. collapse all whitespace runs to single spaces and trim the ends.

    Args:
        tweet: the tweet text as a single string.

    Returns:
        The cleaned tweet as a single-space-separated string
        (``''`` when nothing but whitespace is left).
    """
    tweet = tweet.lower()
    tweet = emoji.demojize(tweet)
    tweet = tweet.replace(":", " ")
    tweet = tweet.replace("’", "'")
    # replace duplicate character: "(.)\1{2,}" matches 3+ repeats of one character
    tweet = re.sub(r"(.)\1{2,}", r'\1\1', tweet)

    # The contraction/smiley dictionaries used to be loaded here through
    # helpers that this notebook never defines, and the loaded values were
    # never used; the calls only caused a NameError, so they are dropped.
    return ' '.join(tweet.split())


def clean(t,l):
    """Clean every tokenised tweet in ``t`` and re-split it into tokens.

    Each token list is joined with spaces, passed through
    ``tweet_cleaning_for_sentiment_analysis`` and split on whitespace again.
    ``t`` is updated in place (each entry is replaced by its cleaned token
    list) and also returned; ``l`` is returned untouched.

    A tweet that is empty after cleaning becomes ``[]`` rather than ``['']``.

    Note: cleaning can change the number of tokens of a sample, so the
    returned token lists are not guaranteed to stay aligned with ``l``.

    Args:
        t: list of token lists, one per sample.
        l: list of token-label lists, passed through unchanged.

    Returns:
        Tuple ``(t, l)``.
    """
    for i, tokens in enumerate(t):
        cleaned = tweet_cleaning_for_sentiment_analysis(' '.join(tokens))
        # split() (no argument) yields [] for an empty string, unlike split(' ').
        t[i] = cleaned.split()

    return t, l

In [None]:
# Clean the token lists of all three splits; clean() returns the label lists unchanged.
t_train, l_train = clean(t_train, l_train)
t_dev, l_dev = clean(t_dev, l_dev)
t_test, l_test = clean(t_test, l_test)

In [None]:
# Pool the provided train and dev samples, then draw a fresh 80/20 split.
newTrain = t_train + t_dev
newLabel = s_train + s_dev
newTokenLabels = l_train + l_dev

# The token labels go through the same split as the tokens and sentiments so
# that l_train / l_dev keep matching t_train / t_dev sample for sample.
# train_test_split applies one set of indices to every array it is given, so
# adding the third array does not change how the first two are divided.
t_train, t_dev, s_train, s_dev, l_train, l_dev = train_test_split(
    newTrain, newLabel, newTokenLabels, test_size=0.2, random_state=42)
print(len(t_train), len(t_dev), len(s_train), len(s_dev))

In [None]:
# Spot-check one sample: its token count next to its token-label count.
print(len(t_train[1]), len(l_train[1]))
# Mean number of tokens per training sample.
print(sum(map(len, t_train)) / len(t_train))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Token count of every training sample, used for the plot below.
data = list(map(len, t_train))

# Draw the distribution of token counts.
sns.displot(data)

plt.show()

In [None]:
# Word-level tokenizer fitted on the training tokens only; 'UNK' is the
# out-of-vocabulary token.
tok_w = Tokenizer(lower=True, char_level=False, oov_token='UNK')
tok_w.fit_on_texts(t_train)

MAX_LEN = 60


def _to_padded_ids(samples):
    """Encode ``samples`` with ``tok_w`` and post-pad each sequence to ``MAX_LEN``."""
    sequences = tok_w.texts_to_sequences(samples)
    return pad_sequences(sequences, maxlen=MAX_LEN, padding="post")


trainInput_w = _to_padded_ids(t_train)
valInput_w = _to_padded_ids(t_dev)
testInput_w = _to_padded_ids(t_test)

In [None]:
# Number of dev and training sentiment labels.
# (The bare `tok_w.word_index.keys()` / `len(...)` expressions that used to
# precede this line were evaluated and discarded, because only the last
# expression of a cell is displayed; they are removed as dead code.)
print(len(s_dev),len(s_train))

In [None]:
# Encode the sentiment strings as integer class ids, fitted on the training labels.
le = preprocessing.LabelEncoder()
le.fit(s_train)

# Pin the one-hot width to the number of classes seen while fitting.
# Without num_classes, to_categorical sizes each matrix from the largest id
# it is given, so the dev matrix could end up narrower than the training one
# if the dev split happened to lack the highest class id.
num_classes = len(le.classes_)
trainLabels = to_categorical(le.transform(s_train), num_classes=num_classes)
valLabels = to_categorical(le.transform(s_dev), num_classes=num_classes)

In [None]:
# Number of entries in the fitted tokenizer's word index.
print(len(tok_w.word_index))

### Model Architecture, Optimizer, Loss Function and Hyperparameters

In [None]:
# CNN + self-attention sentence classifier.
# NOTE: despite the notebook's earlier naming, this graph contains no LSTM layer.

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

# Size of the tokenizer's word index; the Embedding below gets one extra row
# because word indices start at 1 and index 0 is what the padding uses.
max_features = len(tok_w.word_index)
# Must equal the padded input length, so reuse the padding constant rather
# than repeating the literal.
maxlen = MAX_LEN
embedding_size = 256

# Convolution
kernel_size = 4
filters = 128
pool_size = 4

# Symbolic input: one row of `maxlen` word ids per sample.
inputs = Input(shape=(maxlen,))
emb = Embedding(max_features+1, embedding_size, input_length=maxlen)(inputs)
x1 = Conv1D(filters,
            kernel_size,
            padding='valid',
            activation='relu',
            strides=1)(emb)

x1 = Dropout(0.2)(x1)
x1 = MaxPooling1D(pool_size=pool_size)(x1)
x1 = Dropout(0.2)(x1)

# Self-attention over the pooled convolution features, then average over time.
x = SeqSelfAttention(attention_activation='sigmoid')(x1)
x = Dropout(0.2)(x)
x = GlobalAveragePooling1D()(x)
x = Dropout(0.2)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.2)(x)
# Three output units: one per sentiment class.
outputs = Dense(3, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(learning_rate=0.001),
              metrics=['categorical_accuracy'])
model.summary()

In [None]:
# Train for a single epoch with mini-batches of 16, validating on the held-out split.
model.fit([trainInput_w], trainLabels,
          batch_size= 16 ,
          validation_data=(valInput_w,valLabels),
          epochs=1)
# Last expression of the cell: shows the optimizer's configuration.
model.optimizer.get_config()

### Make Predictions on Test Set

In [None]:
# Predict class probabilities for the test set and keep the most likely class id.
predictions = model.predict([testInput_w])
predictions = np.argmax(predictions,axis=-1)

# Turn class ids back into label strings with the encoder that produced them,
# instead of a hand-written id -> name table that silently depends on the
# encoder's class ordering.
sentiments = le.inverse_transform(predictions)

# write predictions to file
with open('preds.txt', 'w') as out:
    out.write('Uid,Sentiment')
    for uid, sentiment in zip(u_test, sentiments):
        out.write("\n%s,%s"%(uid, sentiment))


# load correct labels (note: this rebinds `test`, which earlier held the raw test lines)
test = pd.read_csv('test_labels_hinglish.txt')
# load predictions
preds = pd.read_csv('preds.txt')

# compute evaluation metrics once; `results` wraps the same report under a model name
# NOTE(review): the two 'Sentiment' columns are compared row by row, which
# presumably relies on both files listing samples in the same order — verify.
results2 = classification_report(test['Sentiment'],
                                 preds['Sentiment'],
                                 labels=['positive', 'neutral', 'negative'],
                                 output_dict=True, digits=6)
results = {'preds': results2}

In [None]:
# Show the full classification report dictionary.
print(results2)

In [None]:
# format and print scores: one row per evaluated model, macro-averaged metrics plus accuracy
score_columns = ['model', 'precision', 'recall', 'accuracy', 'f1-score']
score_rows = []
for ki in results.keys():
    scores = results[ki]['macro avg']
    # Deliberately not named `model`: that name holds the trained Keras model,
    # and rebinding it here would break any later fit/predict call.
    row = [ki, scores['precision'], scores['recall'], results[ki]['accuracy'], scores['f1-score']]
    score_rows.append(row)

formatted_results = pd.DataFrame(score_rows, columns=score_columns)
print(formatted_results)