# a) Traditional machine learning-based approaches

The final model used is SVM with Tf-idf embedding.

Link to the model: https://drive.google.com/file/d/1ahOWV7v8W9PMVukCGJ0ClkYWVaN16YaX/view?usp=drive_link

In [None]:
import pandas as pd
import numpy as np
import re, string
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report

# Fetch the NLTK resources needed by the preprocessing step: tokenizer data
# for word_tokenize, the stopword lists, and the WordNet data used by
# WordNetLemmatizer.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Read data

In [None]:
# Load the training and development splits from the working directory.
# Later cells read the 'Claim', 'Evidence' and 'label' columns of both frames.
training_data = pd.read_csv('train.csv')
dev_data = pd.read_csv('dev.csv')

## Data Preprocess
1. Convert to lower case
2. Remove punctuation
3. Tokenize\
   (optionally remove stopwords; this is disabled by default)
4. lemmatize

In [None]:
# preprocess text
from functools import lru_cache

# Built once instead of on every call: preprocess() runs for every claim and
# evidence sentence, and neither object depends on the input text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_LEMMATIZER = WordNetLemmatizer()


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess(text, remove_stop=False):
    """Normalize one sentence into a space-separated string of lemmas.

    Steps: lower-case, strip ASCII punctuation, tokenize, optionally drop
    English stopwords, then lemmatize each token.

    Args:
        text: The raw sentence. ``None`` or NaN (how pandas represents an
            empty CSV cell) is treated as an empty sentence.
        remove_stop: When True, English stopwords are removed after
            tokenization.

    Returns:
        The processed tokens joined by single spaces ('' for missing input).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower().translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(text)
    if remove_stop:
        stop_words = _english_stop_words()
        tokens = [word for word in tokens if word not in stop_words]

    return ' '.join(_LEMMATIZER.lemmatize(word) for word in tokens)

In [None]:
def get_combined_lst(claim, evidence):
    """Preprocess claim/evidence pairs and interleave them into one list.

    Args:
        claim: pandas Series of claim sentences.
        evidence: pandas Series of evidence sentences; row i belongs to
            claim i.

    Returns:
        A list ``[claim_0, evidence_0, claim_1, evidence_1, ...]``: with
        0-based indexing, claims sit at even positions and their evidence
        at the following odd position.

    Raises:
        ValueError: If the two inputs do not have the same length, since the
            pairing would otherwise be silently wrong.
    """
    claims = list(claim.map(preprocess))
    evidences = list(evidence.map(preprocess))
    if len(claims) != len(evidences):
        raise ValueError(
            'claim and evidence must have the same length, got '
            f'{len(claims)} and {len(evidences)}'
        )

    combined = []
    for claim_text, evidence_text in zip(claims, evidences):
        combined.extend((claim_text, evidence_text))
    return combined

Combine the claim and evidence into one list, where odd rows represent the claim, and even rows represent the evidence.

Eg.\
claim1\
evidence1\
claim2\
evidence2\
...

In [None]:
# Preprocess and interleave the claim/evidence columns of each split.
lst_train = get_combined_lst(training_data['Claim'], training_data['Evidence'])
lst_dev = get_combined_lst(dev_data['Claim'], dev_data['Evidence'])

In [None]:
# Sanity check: index 1 holds the preprocessed evidence of the first dev pair.
lst_dev[1]

'seeing the involvement of the coca grower the bolivian government claimed that the demonstrator were actually agent or pawn of drug trafficker ref'

## Word Embedding

Convert the training data and the development data into feature matrices.

### Tf-idf

In [None]:
# Fit the TF-IDF vectorizer on the training sentences only, then reuse the
# fitted vectorizer to transform the development sentences.
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(lst_train)
tfidf_dev = vectorizer.transform(lst_dev)

Compute the difference between a claim and its corresponding evidence.

In [None]:
# Rows alternate claim, evidence, claim, evidence, ... so [0::2] selects the
# claims and [1::2] the matching evidence; each feature row is the element-wise
# absolute difference of one pair.
diff_train = np.abs(tfidf_train[0::2] - tfidf_train[1::2])
diff_dev = np.abs(tfidf_dev[0::2] - tfidf_dev[1::2])

### Word2Vec
To compare with Tf-idf method.

In [None]:
# Split each preprocessed sentence back into its whitespace-separated tokens,
# keeping the claim/evidence interleaving of the source lists.
w2v_train = [sentence.split() for sentence in lst_train]
w2v_dev = [sentence.split() for sentence in lst_dev]

In [None]:
# Sanity check: token list for the evidence of the first dev pair.
print(w2v_dev[1])

['seeing', 'the', 'involvement', 'of', 'the', 'coca', 'grower', 'the', 'bolivian', 'government', 'claimed', 'that', 'the', 'demonstrator', 'were', 'actually', 'agent', 'or', 'pawn', 'of', 'drug', 'trafficker', 'ref']


In [None]:
# take a token list as input and compute the averaged vector for the entire sentence
def get_average_word2vec(tokens_list, vector, size):
    """Return the mean word vector of a tokenized sentence.

    Args:
        tokens_list: Tokens of one sentence.
        vector: Mapping from token to embedding that supports ``in`` and
            ``[]`` lookups (the notebook passes ``model_w2v.wv``).
        size: Dimensionality of the embeddings.

    Returns:
        A 1-D array of length ``size``. Tokens missing from ``vector``
        contribute a zero vector and still count towards the mean. An empty
        token list yields an all-zero vector.
    """
    # Without this guard an empty sentence would collapse to a scalar.
    if len(tokens_list) == 0:
        return np.zeros(size)

    vectorized = [vector[w] if w in vector else np.zeros(size) for w in tokens_list]
    # Average over the number of tokens; the previous version divided the sum
    # by the embedding dimensionality, which is not a sentence average.
    return np.mean(vectorized, axis=0)

Build the word to vector model:

In [None]:
# Train Word2Vec on the training sentences only. Per the gensim docs, sg=1
# selects the skip-gram architecture and min_count=1 keeps every token that
# appears at least once.
# NOTE(review): no seed is set and workers=4, so the learned vectors (and the
# downstream scores) may vary between runs - confirm whether that matters.
model_w2v = Word2Vec(w2v_train, vector_size=100, window=5, min_count=1, workers=4, sg=1)

**Input features:**

Compute the absolute difference between the averaged vector of each claim and that of its evidence.

In [None]:
def _pairwise_abs_diff(token_lists, keyed_vectors):
    """Return |avg(claim) - avg(evidence)| for each claim/evidence pair.

    ``token_lists`` alternates claim tokens and evidence tokens; a trailing
    unpaired entry is ignored. The embedding size is read from
    ``keyed_vectors`` so it cannot drift from the trained model.
    """
    size = keyed_vectors.vector_size
    return [
        np.abs(get_average_word2vec(claim_tokens, keyed_vectors, size) -
               get_average_word2vec(evidence_tokens, keyed_vectors, size))
        for claim_tokens, evidence_tokens in zip(token_lists[0::2], token_lists[1::2])
    ]


diff_w2v_train = _pairwise_abs_diff(w2v_train, model_w2v.wv)
diff_w2v_dev = _pairwise_abs_diff(w2v_dev, model_w2v.wv)

## Experiments
In this section, we compare the results of the two word embedding techniques: tf-idf and word2vec.

We also compare the performance of different models: logistic regression, support vector machine and naive Bayes.

In [None]:
# Inputs shared by every experiment below: one feature set per embedding
# method, plus the 'label' column of each split as the target.
X_train_tfidf = diff_train  # tf-idf
X_train_w2v = diff_w2v_train  # word2vec
y_train = training_data['label']

X_dev_tfidf = diff_dev  # tf-idf
X_dev_w2v = diff_w2v_dev  # word2vec
y_dev = dev_data['label']

### Logistic regression

In [None]:
# Baseline: logistic regression on the tf-idf difference features.
model_lr_tfidf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
# Score the development split.
pred_lr_tfidf = model_lr_tfidf.predict(X_dev_tfidf)
print('Logistic regression with tf-idf:', accuracy_score(y_dev, pred_lr_tfidf))

Logistic regression with tf-idf: 0.8182585217684779


In [None]:
# Baseline: logistic regression on the word2vec difference features.
model_lr_w2v = LogisticRegression(max_iter=1000).fit(X_train_w2v, y_train)
# Score the development split.
pred_lr_w2v = model_lr_w2v.predict(X_dev_w2v)
print('Logistic regression with word2vec:', accuracy_score(y_dev, pred_lr_w2v))

Logistic regression with word2vec: 0.7472156598042524


### Naive Bayes

In [None]:
# Baseline: multinomial naive Bayes on the tf-idf difference features.
model_nb_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
# Score the development split.
pred_nb_tfidf = model_nb_tfidf.predict(X_dev_tfidf)
print('Naive Bayes with tf-idf:', accuracy_score(y_dev, pred_nb_tfidf))

Naive Bayes with tf-idf: 0.7686466419169761


In [None]:
# Baseline: multinomial naive Bayes on the word2vec difference features.
model_nb_w2v = MultinomialNB().fit(X_train_w2v, y_train)
# Score the development split.
pred_nb_w2v = model_nb_w2v.predict(X_dev_w2v)
print('Naive Bayes with word2vec:', accuracy_score(y_dev, pred_nb_w2v))

Naive Bayes with word2vec: 0.7301721228484644


### Support vector machine

In [None]:
# Baseline: RBF-kernel SVM on the tf-idf difference features.
model_svm_tfidf = SVC(kernel='rbf').fit(X_train_tfidf, y_train)
# Score the development split.
pred_svm_tfidf = model_svm_tfidf.predict(X_dev_tfidf)
print('SVM with tf-idf (rbf):', accuracy_score(y_dev, pred_svm_tfidf))

SVM with tf-idf (rbf): 0.8212959838002025


In [None]:
# Baseline: RBF-kernel SVM on the word2vec difference features.
model_svm_w2v = SVC(kernel='rbf').fit(X_train_w2v, y_train)
# Score the development split.
pred_svm_w2v = model_svm_w2v.predict(X_dev_w2v)
print('SVM with word2vec (rbf):', accuracy_score(y_dev, pred_svm_w2v))

SVM with word2vec (rbf): 0.7593655079311509


In [None]:
# Baseline: linear-kernel SVM on the tf-idf difference features.
model_svm_tfidf2 = SVC(kernel='linear').fit(X_train_tfidf, y_train)
# Score the development split.
pred_svm_tfidf2 = model_svm_tfidf2.predict(X_dev_tfidf)
print('SVM with tf-idf (linear):', accuracy_score(y_dev, pred_svm_tfidf2))

SVM with tf-idf (linear): 0.8170772865339183


In [None]:
# Baseline: linear-kernel SVM on the word2vec difference features.
model_svm_w2v2 = SVC(kernel='linear').fit(X_train_w2v, y_train)
# Score the development split.
pred_svm_w2v2 = model_svm_w2v2.predict(X_dev_w2v)
print('SVM with word2vec (linear):', accuracy_score(y_dev, pred_svm_w2v2))

SVM with word2vec (linear): 0.7315221059736753


### Development dataset results evaluation

In [None]:
# Development-set predictions of every baseline, paired by position with a
# display name for the results table.
pred_results = [pred_lr_tfidf, pred_lr_w2v, pred_nb_tfidf, pred_nb_w2v, pred_svm_tfidf, pred_svm_w2v, pred_svm_tfidf2, pred_svm_w2v2]
models = ['LR(tfidf)', 'LR(w2v)', 'Naive Bayes(tfidf)', 'Naive Bayes(w2v)', 'SVM(tfidf, rbf)', 'SVM(w2v, rbf)', 'SVM(tfidf, linear)', 'SVM(w2v, linear)']


def _round3(value):
    """Round a metric to three decimals and return a plain Python float."""
    return float(np.round(value, 3))


results = []
for model_name, pred in zip(models, pred_results):
    acc = _round3(accuracy_score(y_dev, pred))
    # Macro average: compute the metric per label, then take the unweighted
    # mean. zero_division=0 is the value scikit-learn already falls back to
    # when a label is never predicted; passing it explicitly only removes the
    # undefined-metric warning.
    precision = _round3(precision_score(y_dev, pred, average='macro', zero_division=0))
    recall = _round3(recall_score(y_dev, pred, average='macro', zero_division=0))
    f1 = _round3(f1_score(y_dev, pred, average='macro', zero_division=0))
    # NOTE(review): AUC is computed from hard class predictions, so for this
    # binary task it equals the macro recall column. Ranking-based AUC would
    # need decision_function / predict_proba scores instead.
    auc = _round3(roc_auc_score(y_dev, pred))
    results.append([model_name, acc, precision, recall, f1, auc])

df_eval = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC'])
df_eval

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,AUC
0,LR(tfidf),0.818,0.78,0.731,0.749,0.731
1,LR(w2v),0.747,0.701,0.555,0.538,0.555
2,Naive Bayes(tfidf),0.769,0.766,0.592,0.593,0.592
3,Naive Bayes(w2v),0.73,0.365,0.5,0.422,0.5
4,"SVM(tfidf, rbf)",0.821,0.786,0.734,0.752,0.734
5,"SVM(w2v, rbf)",0.759,0.774,0.566,0.552,0.566
6,"SVM(tfidf, linear)",0.817,0.773,0.741,0.754,0.741
7,"SVM(w2v, linear)",0.732,0.866,0.503,0.427,0.503


From the table above, we can see that tf-idf gives better results than word2vec. In particular, the F1 score is much lower when using word2vec.

With tf-idf, logistic regression and SVM both achieve an accuracy of around 0.82, and SVM (with the rbf kernel) slightly outperforms logistic regression. Naive Bayes gives the worst result.

Hence, in the next stage we fine-tune the hyperparameters of logistic regression and SVM to see whether the model performance can be improved.

## Fine-tune

### Logistic Regression

In [None]:
# Grid search over the regularization parameter of logistic regression.
# NOTE(review): tol=0.1 is a far looser stopping tolerance than the default
# used when the final logistic regression is refitted with the selected C -
# confirm that the search is meant to run with different solver settings.
model = LogisticRegression(max_iter=10000, tol=0.1)

param_grid = {
    # C is the inverse of the regularization strength:
    # a larger C means weaker regularization
    'C': [1, 2, 3, 4, 5, 6, 7, 8]
}

# 5-fold cross-validation on the training split, selecting by accuracy.
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train_tfidf, y_train)

print("Best parameters:", grid.best_params_)
print(f"Best cross-validation score: {grid.best_score_:.2f}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters: {'C': 2}
Best cross-validation score: 0.82


Set C = 2:

In [None]:
# Refit logistic regression with the C chosen by the grid search and report
# accuracy and macro-averaged F1 on the development split.
model_lr = LogisticRegression(C=2, max_iter=1000).fit(X_train_tfidf, y_train)
pred_lr = model_lr.predict(X_dev_tfidf)

print('Accuracy:', accuracy_score(y_dev, pred_lr))
print('F1:', f1_score(y_dev, pred_lr, average='macro'))

Accuracy: 0.8150523118461019
F1: 0.7475688881737531


### SVM

In [None]:
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  # inverse regularization strength
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],  # kernel coefficient, controls the influence of distance
    'kernel': ['rbf'],  # Gaussian kernel
}

# cv and scoring are spelled out to match the logistic regression search
# (both equal the defaults for a classifier). n_jobs=-1 spreads the
# 125 fits over all available cores instead of running them one by one;
# per-fold log lines may then be interleaved or hidden by the notebook.
grid = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy',
                    refit=True, verbose=3, n_jobs=-1)

# fitting the model for grid search
grid.fit(X_train_tfidf, y_train)


print(grid.best_params_)
print(grid.best_estimator_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.750 total time= 4.3min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.755 total time= 4.4min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.759 total time= 4.4min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.754 total time= 4.5min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.754 total time= 4.5min
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.737 total time= 2.0min
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.745 total time= 2.0min
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.748 total time= 2.1min
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.741 total time= 2.1min
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.744 total time= 2.1min
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.726 total time= 2.0min
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

Set C = 10, gamma = 1:

In [None]:
# Refit the RBF SVM with the hyperparameters chosen by the grid search and
# report accuracy and macro-averaged F1 on the development split.
model_svm = SVC(C=10, gamma=1, kernel='rbf').fit(X_train_tfidf, y_train)
pred_svm = model_svm.predict(X_dev_tfidf)

print('Accuracy:', accuracy_score(y_dev, pred_svm))
print('F1:', f1_score(y_dev, pred_svm, average='macro'))

Accuracy: 0.8341208234897064
F1: 0.7735411437691138


The tuned SVM model gives a slightly better result than the tuned logistic regression, so it is selected as our final model.

### Save output file
Write the predictions into a csv file.

In [None]:
# One-column frame holding the final SVM's development-set predictions.
output_df = pd.DataFrame({'prediction': pred_svm})

In [None]:
# Write the predictions without the index so the file has a single
# 'prediction' column.
output_df.to_csv('task_a_pred.csv', index=False)

### Save model

In [None]:
# Persist the trained classifier (file name and contents unchanged).
with open('model_svm.pkl', 'wb') as file:
    pickle.dump(model_svm, file)

# The classifier only accepts features produced by the fitted TF-IDF
# vectorizer, so store the vectorizer as well; without it the saved model
# cannot be applied to new text.
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)
print('Model Saved')

Model Saved


In [None]:
# read the model

# with open('model_svm.pkl', 'rb') as file:
#     clf_svm = pickle.load(file)

## Evaluation
The evaluation of the SVM model using development file can be checked here: https://colab.research.google.com/drive/1QPrZZIAHhJSoKdOt45ULcuHIytDHC5Vp?usp=drive_link