# Assignment Two:  Sentiment Classification

For this exercise you will be using the "SemEval 2017 task 4" corpus provided on the module website, available through the following link: https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs918/semeval-tweets.tar.bz2 You will focus particularly on Subtask A, i.e. classifying the overall sentiment of a tweet as positive, negative or neutral.

You are requested to produce a Jupyter notebook for the coursework submission. The input to your program is the SemEval data downloaded. Note that TAs need to run your program on their own machine by using the original SemEval data. As such, don’t submit a Python program that takes as input some preprocessed files.

#### Import necessary packages
You may import more packages here.

In [56]:
# Import necessary packages
import re
from os.path import join
import os
from tqdm import tqdm

from collections import Counter

In [57]:
import warnings
from sklearn.exceptions import ConvergenceWarning

with warnings.catch_warnings():
    warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [58]:
import pandas as pd
import numpy as np

In [59]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [60]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityamahamuni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adityamahamuni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adityamahamuni/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adityamahamuni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [61]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, classification_report

In [63]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, Dataset

from transformers import BertModel, BertTokenizer
from transformers import GPT2Model, GPT2Tokenizer, AdamW
from transformers import AdamW

#### Skeleton Code

In [64]:
# Define test sets
testsets = ['twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']

In [65]:
# Skeleton: Evaluation code for the test sets
def read_test(testset):
    '''
    readin the testset and return a dictionary
    :param testset: str, the file name of the testset to compare
    '''
    id_gts = {}
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            fields = line.split('\t')
            tweetid = fields[0]
            gt = fields[1]

            id_gts[tweetid] = gt
            
    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    print the confusion matrix of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    gts = []
    for m, c1 in id_gts.items():
        if c1 not in gts:
            gts.append(c1)

    gts = ['positive', 'negative', 'neutral']

    conf = {}
    for c1 in gts:
        conf[c1] = {}
        for c2 in gts:
            conf[c1][c2] = 0

    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(gts))

    for c1 in gts:
        print(c1.ljust(12), end='')
        for c2 in gts:
            if sum(conf[c1].values()) > 0:
                print('%.3f     ' % (conf[c1][c2] / float(sum(conf[c1].values()))), end='')
            else:
                print('0.000     ', end='')
        print('')

    print('')


def evaluate(id_preds, testset, classifier):
    '''
    print the macro-F1 score of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    acc_by_class = {}
    for gt in ['positive', 'negative', 'neutral']:
        acc_by_class[gt] = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}

    catf1s = {}

    ok = 0
    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'

        if gt == pred:
            ok += 1
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    catcount = 0
    itemcount = 0
    macro = {'p': 0, 'r': 0, 'f1': 0}
    micro = {'p': 0, 'r': 0, 'f1': 0}
    semevalmacro = {'p': 0, 'r': 0, 'f1': 0}

    microtp = 0
    microfp = 0
    microtn = 0
    microfn = 0
    for cat, acc in acc_by_class.items():
        catcount += 1

        microtp += acc['tp']
        microfp += acc['fp']
        microtn += acc['tn']
        microfn += acc['fn']

        p = 0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        catf1s[cat] = f1

        n = acc['tp'] + acc['fn']

        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1

        if cat in ['positive', 'negative']:
            semevalmacro['p'] += p
            semevalmacro['r'] += r
            semevalmacro['f1'] += f1

        itemcount += n

    micro['p'] = float(microtp) / float(microtp + microfp)
    micro['r'] = float(microtp) / float(microtp + microfn)
    micro['f1'] = 2 * float(micro['p']) * micro['r'] / float(micro['p'] + micro['r'])

    semevalmacrof1 = semevalmacro['f1'] / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)

## Build Classifier and running evaluation is at the end. 

### Jump to section - [Build Classifier Section](#build)

### Experimentation Section

#### Load Training, Dev and Test sets

In [66]:
def load_dataset_pandas(filepath):
    df = pd.read_csv(filepath, sep='\t', header=None, names=['tweet_id', 'sentiment', 'tweet'])
    return df

train_set_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-training-data.txt')
dev_set_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-dev-data.txt')
test_set1_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-test1.txt')
test_set2_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-test2.txt')
test_set3_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-test3.txt')

df_train = load_dataset_pandas(train_set_path)
df_dev = load_dataset_pandas(dev_set_path)
df_test1 = load_dataset_pandas(test_set1_path)
df_test2 = load_dataset_pandas(test_set2_path)
df_test3 = load_dataset_pandas(test_set3_path)

In [67]:
print(df_train.head())

             tweet_id sentiment  \
0  335104872099066692  positive   
1  796528524030124618  positive   
2  760964834217238632  positive   
3  147713180324524046  negative   
4  732302280474120023  negative   

                                               tweet  
0  Felt privileged to play Foo Fighters songs on ...  
1  @AaqibAfzaal Pakistan may be an Islamic countr...  
2  Happy Birthday to the coolest golfer in Bali! ...  
3  @SimpplyA TMILLS is going to Tucson! But the 2...  
4  Hmmmmm where are the #BlackLivesMatter when ma...  


#### EDA / Data Pre-Processing

In [68]:
mentions_pattern = re.compile(r'@\w+')
hashtags_pattern = re.compile(r'#\w+')
links_or_urls_pattern = re.compile(r'\b(https?://|www\.)\S+\b')
non_alphanumeric_pattern = re.compile(r'[^a-zA-Z0-9 ]')
single_char_pattern = re.compile(r'\b\w\b')
pure_numbers_pattern = re.compile(r'\b\d+\b')

def clean_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(mentions_pattern, '', tweet)
    tweet = re.sub(hashtags_pattern, '', tweet)
    tweet = re.sub(links_or_urls_pattern, '', tweet)
    tweet = re.sub(non_alphanumeric_pattern, ' ', tweet)
    tweet = re.sub(pure_numbers_pattern, '', tweet)
    tweet = re.sub(single_char_pattern, '', tweet)
    tweet = ' '.join(tweet.split())
    return tweet

In [69]:
df_train['tweet'] = df_train['tweet'].apply(clean_tweet)
df_dev['tweet'] = df_dev['tweet'].apply(clean_tweet)

In [70]:
df_test1['tweet'] = df_test1['tweet'].apply(clean_tweet)
df_test2['tweet'] = df_test2['tweet'].apply(clean_tweet)
df_test3['tweet'] = df_test3['tweet'].apply(clean_tweet)

In [71]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_and_remove_stopwords(tweet):
    tokens = word_tokenize(tweet)
    tagged_tokens = nltk.pos_tag(tokens)

    stop_words = set(stopwords.words('english'))

    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        if word not in stop_words:
            wordnet_tag = get_wordnet_pos(tag)
            if wordnet_tag is None:
                lemmatized_tokens.append(word)
            else:
                lemmatized_tokens.append(lemmatizer.lemmatize(word, wordnet_tag))
    
    cleaned_tweet = ' '.join(lemmatized_tokens)
    return cleaned_tweet

In [72]:
df_train['tweet'] = df_train['tweet'].apply(lemmatize_and_remove_stopwords)
df_dev['tweet'] = df_dev['tweet'].apply(lemmatize_and_remove_stopwords)

df_test1['tweet'] = df_test1['tweet'].apply(lemmatize_and_remove_stopwords)
df_test2['tweet'] = df_test2['tweet'].apply(lemmatize_and_remove_stopwords)
df_test3['tweet'] = df_test3['tweet'].apply(lemmatize_and_remove_stopwords)

## Sentiment Classification

In [19]:
X_train = df_train['tweet']
y_train = df_train['sentiment']

X_dev = df_dev['tweet']
y_dev = df_dev['sentiment']

X_test1 = df_test1['tweet']
y_test1 = df_test1['sentiment']

X_test2 = df_test2['tweet']
y_test2 = df_test2['sentiment']

X_test3 = df_test3['tweet']
y_test3 = df_test3['sentiment']

### 1. Logistic Regression (MaxEnt) Classifier

### Hyperparameter Optimisation

In [73]:
pipeline_lr_base = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='lbfgs'))
])

lr_params = {
    'clf__C': [0.1, 1, 10, 100],
    'clf__max_iter': [1, 5, 10, 20, 50, 100]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lr_grid_search = GridSearchCV(pipeline_lr_base, lr_params, cv=cv, n_jobs=-1, verbose=1)
lr_grid_search.fit(X_train, y_train)

print(f"Best Parameters : {lr_grid_search.best_params_}")
print(f"Best Score : {lr_grid_search.best_score_}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters : {'clf__C': 1, 'clf__max_iter': 100}
Best Score : 0.6461377330900259


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


==================================================

Running Logistic Regression Classifier

LR Best classifier hyperparam search

Fitting 5 folds for each of 24 candidates, totalling 120 fits

Search Complete

**Best Parameters :** {'clf__C': 1, 'clf__max_iter': 100}

**Best Score :** 0.6487362401251902

==================================================

### Running Classifier on the Best hyperparameter obtained after search

In [74]:
best_clf_lr = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
    ('clf', LogisticRegression(solver='lbfgs', C=1, max_iter=100))
])

best_clf_lr.fit(X_train, y_train)
y_pred_lr = best_clf_lr.predict(X_dev)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [75]:
f1_lr = f1_score(y_dev, y_pred_lr, average='macro')
print(f"F1 Score (LR) : {f1_lr}")

F1 Score (LR) : 0.6368676146807313


### 2. SVM

### Hyperparameter Optimisation

In [76]:
pipeline_svm_base = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(kernel='linear'))
])

svm_params = {
    'clf__C': [0.1, 1, 10, 100]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
svm_grid_search = GridSearchCV(pipeline_svm_base, svm_params, cv=cv, n_jobs=-1, verbose=1)
svm_grid_search.fit(X_train, y_train)

print(f"Best Parameters : {svm_grid_search.best_params_}")
print(f"Best Score : {svm_grid_search.best_score_}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters : {'clf__C': 1}
Best Score : 0.6481588497420993


==================================================

Running SVM Classifier

SVM Best classifier hyperparam search

Fitting 5 folds for each of 4 candidates, totalling 20 fits

Search Complete

**Best Parameters :** {'clf__C': 1}

**Best Score :** 0.6544218790286394

==================================================

### Running Classifier on the Best hyperparameter obtained after search

In [77]:
best_clf_svm = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(kernel='linear', C=1))

])

best_clf_svm.fit(X_train, y_train)

In [78]:
y_pred_svm = best_clf_svm.predict(X_dev)

In [79]:
f1_svm = f1_score(y_dev, y_pred_svm, average='macro')
print(f"F1 Score (SVM) : {f1_svm}")

F1 Score (SVM) : 0.6307961951435329


In [80]:
print(classification_report(y_dev, y_pred_svm))

              precision    recall  f1-score   support

    negative       0.67      0.47      0.55       378
     neutral       0.62      0.76      0.68       919
    positive       0.72      0.60      0.66       703

    accuracy                           0.65      2000
   macro avg       0.67      0.61      0.63      2000
weighted avg       0.66      0.65      0.65      2000



### 3. Naive Bayes

### Hyperparameter Optimisation

In [81]:
pipeline_nb_base = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

nb_params = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2, 1e-3]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
nb_grid_search = GridSearchCV(pipeline_nb_base, nb_params, cv=cv, n_jobs=-1, verbose=1)
nb_grid_search.fit(X_train, y_train)

print(f"Best Parameters : {nb_grid_search.best_params_}")
print(f"Best Score : {nb_grid_search.best_score_}")

Fitting 5 folds for each of 16 candidates, totalling 80 fits


Best Parameters : {'clf__alpha': 0.1, 'tfidf__norm': 'l2', 'tfidf__use_idf': False}
Best Score : 0.6144227840949515


==================================================

Running Naive Bayers Classifier

NB Best classifier hyperparam search

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Search Complete

**Best Parameters :** {'clf__alpha': 0.1, 'tfidf__norm': 'l2'}

**Best Score :** 0.6186202473691733

==================================================

### Running Classifier on the Best hyperparameter obtained after search

In [82]:
best_clf_nb = Pipeline([
    ('tfidf', TfidfVectorizer(norm='l2')),
    ('clf', MultinomialNB(alpha=0.1))
])

best_clf_nb.fit(X_train, y_train)

In [83]:
y_pred_nb = best_clf_nb.predict(X_dev)

In [84]:
f1_nb = f1_score(y_dev, y_pred_nb, average='macro')
print(f"F1 Score (NB) : {f1_nb}")

F1 Score (NB) : 0.5754099412057598


In [85]:
print(classification_report(y_dev, y_pred_nb))

              precision    recall  f1-score   support

    negative       0.64      0.37      0.47       378
     neutral       0.59      0.71      0.64       919
    positive       0.62      0.60      0.61       703

    accuracy                           0.61      2000
   macro avg       0.62      0.56      0.58      2000
weighted avg       0.61      0.61      0.60      2000



GloVe

In [86]:
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_dict[word] = vector
    return embeddings_dict

In [87]:
glove_path = os.path.join(os.getcwd(), "glove.6B", "glove.6B.100d.txt")
glove_embeddings = load_glove_embeddings(glove_path)

def tweet_to_embedding(tweet, embeddings_dict, embedding_dim=100):
    tokens = word_tokenize(tweet.lower())
    token_embeddings = [embeddings_dict.get(token, np.zeros(embedding_dim)) for token in tokens]
    if token_embeddings:
        tweet_embedding = np.mean(token_embeddings, axis=0)
    else:
        tweet_embedding = np.zeros(embedding_dim)
    return tweet_embedding

In [88]:
X_train_embeddings = np.array([tweet_to_embedding(tweet, glove_embeddings) for tweet in X_train])
X_dev_embeddings = np.array([tweet_to_embedding(tweet, glove_embeddings) for tweet in X_dev])

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_dev_encoded = label_encoder.transform(y_dev)

## LSTM

### LSTM with GLOVE Embeddings

In [89]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cpu


In [90]:
X_train_tokens = X_train.apply(lambda x: word_tokenize(x.lower()))
X_dev_tokens = X_dev.apply(lambda x: word_tokenize(x.lower()))

In [91]:
word_index = {}
current_index = 1
for tokens in X_train_tokens:
    for token in tokens:
        if token not in word_index:
            word_index[token] = current_index
            current_index += 1

X_train_seq = [[word_index[token] for token in tokens if token in word_index] for tokens in X_train_tokens]
X_dev_seq = [[word_index[token] for token in tokens if token in word_index] for tokens in X_dev_tokens]

In [92]:
max_length = max(max(len(seq) for seq in X_train_seq), max(len(seq) for seq in X_dev_seq))

In [93]:
def pad_sequences(sequences, maxlen):
    padded_sequences = np.zeros((len(sequences), maxlen), dtype=int)
    for i, seq in enumerate(sequences):
        len_seq = len(seq)
        if len_seq > maxlen:
            padded_sequences[i, :] = seq[:maxlen]
        elif len_seq > 0:
            padded_sequences[i, -len_seq:] = seq
    return padded_sequences

In [94]:
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_dev_pad = pad_sequences(X_dev_seq, maxlen=max_length)

In [95]:
vocab_size = len(word_index) + 1
embedding_dim = 100

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [96]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
        hidden = hidden[-1]
        output = self.fc(hidden)
        return output

In [97]:
lstm_model = SentimentLSTM(vocab_size, embedding_dim, hidden_dim=128, output_dim=3, embedding_matrix=embedding_matrix).to(device)

In [98]:
X_train_tensor = torch.tensor(X_train_pad, dtype=torch.long)
X_dev_tensor = torch.tensor(X_dev_pad, dtype=torch.long)

y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_dev_encoded, dtype=torch.long)

X_train_tensor = X_train_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)

X_dev_tensor = X_dev_tensor.to(device)
y_dev_tensor = y_test_tensor.to(device)

In [99]:
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

In [100]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)

In [101]:
num_epochs = 6
lstm_model.train()

for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        output = lstm_model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 0.5453991889953613
Epoch 2, Loss: 0.5743475556373596
Epoch 3, Loss: 0.36705300211906433
Epoch 4, Loss: 0.7404666543006897
Epoch 5, Loss: 1.1280733346939087
Epoch 6, Loss: 0.26729288697242737


In [102]:
lstm_model.eval()

with torch.no_grad():
    predictions = lstm_model(X_dev_tensor)
    _, predicted_labels = torch.max(predictions, 1)
    
accuracy = accuracy_score(y_test_tensor.cpu().numpy(), predicted_labels.cpu().numpy())
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 0.65


In [103]:
print(classification_report(y_test_tensor.cpu().numpy(), predicted_labels.cpu(), target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       0.67      0.43      0.53       378
     neutral       0.62      0.75      0.68       919
    positive       0.69      0.63      0.66       703

    accuracy                           0.65      2000
   macro avg       0.66      0.61      0.62      2000
weighted avg       0.66      0.65      0.64      2000



### BERT with Bi-Directional LSTM

In [232]:
X_train = df_train['tweet']
y_train = df_train['sentiment']

X_dev = df_dev['tweet']
y_dev = df_dev['sentiment']

X_test = df_test1['tweet']
y_test = df_test1['sentiment']

test_tweet_ids = df_test1['tweet_id'].tolist()


In [233]:
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_dev_encoded = label_encoder.transform(y_dev)
y_test_encoded = label_encoder.transform(y_test)

In [234]:
class TweetLoader(Dataset):
    def __init__(self, tweets, labels, tokenizer_name='bert-base-uncased', max_length=128):
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
        self.tweets = tweets
        self.labels = labels
        self.max_length = max_length
    
    def __len__(self):
        return len(self.tweets)
    
    def __getitem__(self, idx):
        tweet = self.tweets[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [235]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [236]:
train_dataset = TweetLoader(X_train, y_train_encoded, tokenizer_name='bert-base-uncased', max_length=128)
dev_dataset = TweetLoader(X_dev, y_dev_encoded, tokenizer_name='bert-base-uncased', max_length=128)
test_dataset = TweetLoader(X_test, y_test_encoded, tokenizer_name='bert-base-uncased', max_length=128)

from torch.utils.data import DataLoader

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [241]:
class BertBiLSTM(nn.Module):
    def __init__(self, bert_model_name, hidden_dim, output_dim, freeze_bert=True):
        super(BertBiLSTM, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
        self.lstm = nn.LSTM(768, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = outputs.last_hidden_state
        
        _, (hidden, _) = self.lstm(last_hidden_state)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        output = self.fc(hidden)
        
        return output

In [238]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [242]:
bert_bilstm_model = BertBiLSTM('bert-base-uncased', hidden_dim=128, output_dim=3).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(bert_bilstm_model.parameters(), lr=2e-5)

In [244]:
num_epochs = 6
bert_bilstm_model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = bert_bilstm_model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

In [None]:
bert_bilstm_model.eval()

true_labels = []
predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = bert_bilstm_model(input_ids, attention_mask)
        _, preds = torch.max(outputs, dim=1)
        
        predictions.extend(preds.detach().cpu().numpy())
        true_labels.extend(batch['labels'].detach().cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f'Test Accuracy: {accuracy:.4f}')

print(classification_report(true_labels, predictions, target_names=label_encoder.classes_))


### Build sentiment classifiers
You need to create your own classifiers (at least 3 classifiers). For each classifier, you can choose between the bag-of-word features and the word-embedding-based features. Each classifier has to be evaluated over 3 test sets. Make sure your classifier produce consistent performance across the test sets. Marking will be based on the performance over all 5 test sets (2 of them are not provided to you).

### Load Dataset
<a id='build'></a>

In [20]:
def load_dataset_pandas(filepath):
    df = pd.read_csv(filepath, sep='\t', header=None, names=['tweet_id', 'sentiment', 'tweet'])
    return df

In [73]:
train_set_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-training-data.txt')
dev_set_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-dev-data.txt')

df_train = load_dataset_pandas(train_set_path)
df_dev = load_dataset_pandas(dev_set_path)

### Pre-process Dataset

In [21]:
mentions_pattern = re.compile(r'@\w+')
hashtags_pattern = re.compile(r'#\w+')
links_or_urls_pattern = re.compile(r'\b(https?://|www\.)\S+\b')
non_alphanumeric_pattern = re.compile(r'[^a-zA-Z0-9 ]')
single_char_pattern = re.compile(r'\b\w\b')
pure_numbers_pattern = re.compile(r'\b\d+\b')

def clean_tweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(mentions_pattern, '', tweet)
    tweet = re.sub(hashtags_pattern, '', tweet)
    tweet = re.sub(links_or_urls_pattern, '', tweet)
    tweet = re.sub(non_alphanumeric_pattern, ' ', tweet)
    tweet = re.sub(pure_numbers_pattern, '', tweet)
    tweet = re.sub(single_char_pattern, '', tweet)
    tweet = ' '.join(tweet.split())
    return tweet

In [22]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def lemmatize_and_remove_stopwords(tweet):
    tokens = word_tokenize(tweet)
    tagged_tokens = nltk.pos_tag(tokens)

    stop_words = set(stopwords.words('english'))

    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        if word not in stop_words:
            wordnet_tag = get_wordnet_pos(tag)
            if wordnet_tag is None:
                lemmatized_tokens.append(word)
            else:
                lemmatized_tokens.append(lemmatizer.lemmatize(word, wordnet_tag))
    
    cleaned_tweet = ' '.join(lemmatized_tokens)
    return cleaned_tweet

In [74]:
df_train['tweet'] = df_train['tweet'].apply(clean_tweet)
df_dev['tweet'] = df_dev['tweet'].apply(clean_tweet)

df_train['tweet'] = df_train['tweet'].apply(lemmatize_and_remove_stopwords)
df_dev['tweet'] = df_dev['tweet'].apply(lemmatize_and_remove_stopwords)

In [75]:
X_train = df_train['tweet']
y_train = df_train['sentiment']

X_dev = df_dev['tweet']
y_dev = df_dev['sentiment']

In [23]:
test_set1_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-test1.txt')
test_set2_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-test2.txt')
test_set3_path = os.path.join(os.getcwd(), 'semeval-tweets', 'twitter-test3.txt')

testsets = [test_set1_path, test_set2_path, test_set3_path]

### GLOVE Embeddings Initialization

In [24]:
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_dict[word] = vector
    return embeddings_dict

In [25]:
glove_path = os.path.join(os.getcwd(), "glove.6B", "glove.6B.100d.txt")
glove_embeddings = load_glove_embeddings(glove_path)

def tweet_to_embedding(tweet, embeddings_dict, embedding_dim=100):
    tokens = word_tokenize(tweet.lower())
    token_embeddings = [embeddings_dict.get(token, np.zeros(embedding_dim)) for token in tokens]
    if token_embeddings:
        tweet_embedding = np.mean(token_embeddings, axis=0)
    else:
        tweet_embedding = np.zeros(embedding_dim)
    return tweet_embedding

In [26]:
X_train_embeddings = np.array([tweet_to_embedding(tweet, glove_embeddings) for tweet in X_train])
X_dev_embeddings = np.array([tweet_to_embedding(tweet, glove_embeddings) for tweet in X_dev])

label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_dev_encoded = label_encoder.transform(y_dev)

### LSTM Initialization for Training

In [36]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


In [27]:
def train_lstm_model(model, train_loader, optimizer, criterion, device, num_epochs=6):
    model.train()
    for epoch in range(num_epochs):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')


In [28]:
X_train_tokens = X_train.apply(lambda x: word_tokenize(x.lower()))
X_dev_tokens = X_dev.apply(lambda x: word_tokenize(x.lower()))


In [29]:
word_index = {}
current_index = 1
for tokens in X_train_tokens:
    for token in tokens:
        if token not in word_index:
            word_index[token] = current_index
            current_index += 1

X_train_seq = [[word_index[token] for token in tokens if token in word_index] for tokens in X_train_tokens]
X_dev_seq = [[word_index[token] for token in tokens if token in word_index] for tokens in X_dev_tokens]

max_length = max(max(len(seq) for seq in X_train_seq), max(len(seq) for seq in X_dev_seq))

In [30]:
def pad_sequences(sequences, maxlen):
    padded_sequences = np.zeros((len(sequences), maxlen), dtype=int)
    for i, seq in enumerate(sequences):
        len_seq = len(seq)
        if len_seq > maxlen:
            padded_sequences[i, :] = seq[:maxlen]
        elif len_seq > 0:
            padded_sequences[i, -len_seq:] = seq
    return padded_sequences

X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_dev_pad = pad_sequences(X_dev_seq, maxlen=max_length)

In [31]:
vocab_size = len(word_index) + 1
embedding_dim = 100

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [32]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        _, (hidden, _) = self.lstm(embedded)
        hidden = hidden[-1]
        output = self.fc(hidden)
        return output

In [51]:
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_dev_encoded = label_encoder.transform(y_dev)

In [37]:
X_train_tensor = torch.tensor(X_train_pad, dtype=torch.long)
X_dev_tensor = torch.tensor(X_dev_pad, dtype=torch.long)

y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_dev_tensor = torch.tensor(y_dev_encoded, dtype=torch.long)

X_train_tensor = X_train_tensor.to(device)
y_train_tensor = y_train_tensor.to(device)

X_dev_tensor = X_dev_tensor.to(device)
y_dev_tensor = y_dev_tensor.to(device)

train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

In [38]:
def get_predictions(model, test):
    test_tokens = test.apply(lambda x: word_tokenize(x.lower()))
    test_seq = [[word_index[token] for token in tokens if token in word_index] for tokens in test_tokens]
    test_pad = pad_sequences(test_seq, maxlen=max_length)
    test_tensor = torch.tensor(test_pad, dtype=torch.long)

    model.eval()

    with torch.no_grad():
        predictions = model(test_tensor)
        _, predicted_labels = torch.max(predictions, 1)
        
    return predicted_labels

In [72]:
# The Data Loading and Data pre-processing cells needs to run before running this

for classifier in ['LR', 'SVM', 'NB', 'LSTM']:
    for features in ['bow', 'glove']:
        if classifier == 'LR':
            print('Training ' + classifier)

            if features == 'bow':
                clf = Pipeline([
                    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                    ('clf', LogisticRegression(
                        solver='lbfgs', C=1, max_iter=100))
                ])

                clf.fit(X_train, y_train)
            else:
                continue

        elif classifier == 'SVM':
            print('Training ' + classifier)

            if features == 'bow':
                clf = Pipeline([
                    ('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                    ('clf', SVC(kernel='linear', C=1))

                ])

                clf.fit(X_train, y_train)

            else:
                continue

        elif classifier == 'NB':
            print('Training ' + classifier)

            if features == 'bow':
                clf = Pipeline([
                    ('tfidf', TfidfVectorizer(norm='l2', ngram_range=(1, 3))),
                    ('clf', MultinomialNB(alpha=0.1))
                ])
                clf.fit(X_train, y_train)

            else:
                continue

        elif classifier == 'LSTM':
            if features == 'bow':
                continue
            print('Training ' + classifier)

            train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
            lstm_model = SentimentLSTM(vocab_size, embedding_dim, hidden_dim=128, output_dim=3, embedding_matrix=embedding_matrix).to(device)

            optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)
            criterion = nn.CrossEntropyLoss()

            train_lstm_model(lstm_model, train_loader, optimizer, criterion, device)

        else:
            print('Unknown classifier name' + classifier)
            continue

        for testset in testsets:
            print(f"Running - Test set : {testset}")
            id_preds = {}

            # Load the dataset
            df_test = load_dataset_pandas(testset)

            # Pre-Process the data in testset
            df_test['tweet'] = df_test['tweet'].apply(clean_tweet)
            df_test['tweet'] = df_test['tweet'].apply(
                lemmatize_and_remove_stopwords)
            
            test_tweet_ids = df_test['tweet_id'].tolist()

            X_test = df_test['tweet']

            if classifier != 'LSTM':
                # Call predict function for performing predictions
                y_pred = clf.predict(X_test)

                # Use the tweet_id for creating the predictions dictionary
                predictions_dict = {str(tweet_id): pred for tweet_id,
                                    pred in zip(test_tweet_ids, y_pred)}

                id_preds = predictions_dict

                testset_name = testset
                testset_path = join('semeval-tweets', testset_name)
                evaluate(id_preds, testset_path, features + '-' + classifier)   

            elif classifier == 'LSTM':
                preds = get_predictions(lstm_model, X_test)

                preds = [label_encoder.inverse_transform([idx])[0] for idx in preds]

                preds = {str(tweet_id): pred for tweet_id, pred in zip(test_tweet_ids, preds)}
                id_preds = preds

                testset_name = testset
                testset_path = join('semeval-tweets', testset_name)
                evaluate(id_preds, testset_path, features + '-' + classifier)


Training LR


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Running - Test set : /Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test1.txt
/Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test1.txt (bow-LR): 0.478
Running - Test set : /Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test2.txt
/Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test2.txt (bow-LR): 0.492
Running - Test set : /Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test3.txt
/Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test3.txt (bow-LR): 0.476
Training LR
Training SVM
Running - Test set : /Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test1.txt
/Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test1.txt (bow-SVM): 0.535
Running - Test set : /Users/adityamahamuni/Work/Warwick/Modules/CS918_NLP/CW/semeval-tweets/twitter-test2.txt
/Use