## Demo code

Required files for demo:
- svm model
- bi-gru model
- train.csv file (for preprocessing, embedding)
- test (input) csv file

Please place the two models in the same directory as this notebook; the code loads them by relative path (`model_svm.pkl` and `model_gru.keras`).

In [None]:
import pandas as pd
import numpy as np
import re, string
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import tensorflow as tf
from keras.models import Model, load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# from keras.layers import Input, GRU, Embedding, Dense, concatenate, Dropout, Bidirectional, Attention

# Fetch the NLTK resources used by preprocess():
#   punkt / punkt_tab -> word_tokenize
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
# NLTK >= 3.8.2 loads the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' model, so both are downloaded to keep the notebook runnable on old
# and new NLTK releases. 'wordnet' stays last so the cell result is unchanged.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Input files, resolved relative to the current working directory
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

# training set: loaded with the dtypes pandas infers
train_data = pd.read_csv(TRAIN_CSV)

# test set: every column is cast to str right after loading
test_data = pd.read_csv(TEST_CSV).astype("str")

#### SVM (tf-idf)
Link to the model: https://drive.google.com/file/d/1ahOWV7v8W9PMVukCGJ0ClkYWVaN16YaX/view?usp=sharing

Read model:

In [None]:
# read SVM model
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# open a model_svm.pkl that was obtained from a trusted source.
with open('model_svm.pkl', 'rb') as file:
    model_svm = pickle.load(file)

Preprocess:

In [None]:
# preprocess text
def preprocess(text, remove_stop=False):
    """Normalise a sentence into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, optionally drop
    English stopwords, then lemmatize each token with ``WordNetLemmatizer``.

    Args:
        text: the sentence to clean (must support ``.lower()``).
        remove_stop: when true, tokens found in NLTK's English stopword list
            are discarded before lemmatization.

    Returns:
        The processed tokens joined by single spaces.
    """
    # lowercase, then strip punctuation in a single translate pass
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(strip_punct)

    words = word_tokenize(cleaned)

    if remove_stop:
        english_stops = set(stopwords.words('english'))
        words = [w for w in words if w not in english_stops]

    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(w) for w in words)

def get_combined_lst(claim, evidence):
    """Preprocess two aligned text columns and interleave them into one list.

    Args:
        claim: column of claim sentences; must provide ``.map`` (it is called
            with DataFrame columns in this notebook).
        evidence: column of evidence sentences, aligned by position with
            ``claim``.

    Returns:
        ``[claim_0, evidence_0, claim_1, evidence_1, ...]``: each preprocessed
        claim sits at an even list index and its evidence at the odd index
        right after it. The length is driven by ``claim``.
    """
    claim_texts = list(claim.map(preprocess))
    evidence_texts = list(evidence.map(preprocess))

    # walk by position so every claim is immediately followed by its evidence
    return [
        sentence
        for i in range(len(claim_texts))
        for sentence in (claim_texts[i], evidence_texts[i])
    ]

# preprocess
# Each list interleaves sentences as [claim_0, evidence_0, claim_1, evidence_1, ...].
lst_train = get_combined_lst(train_data['Claim'], train_data['Evidence'])
lst_test = get_combined_lst(test_data['Claim'], test_data['Evidence'])

# tf-idf
# The vectorizer is fitted on the training sentences only; the test sentences
# are then transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer()
tfidf_train = vectorizer.fit_transform(lst_train)
tfidf_test = vectorizer.transform(lst_test)

# difference
# Rows 0::2 are the claims and rows 1::2 the matching evidence, so each
# feature row is the element-wise |claim - evidence| tf-idf vector of one pair.
# NOTE(review): diff_train is not used by any later cell shown in this
# notebook; only diff_test is passed to the SVM.
diff_train = np.abs(tfidf_train[0::2] - tfidf_train[1::2])
diff_test = np.abs(tfidf_test[0::2] - tfidf_test[1::2])

Predict:

In [None]:
# Run the SVM on the |claim - evidence| tf-idf features of the test set
pred_svm = model_svm.predict(diff_test)

# Single-column frame of predictions, written to CSV without the index column
output_svm = pd.DataFrame(data=pred_svm, columns=['prediction'])
output_svm.to_csv(path_or_buf='Group_50_A.csv', index=False)

# show the predictions as the cell result
output_svm

Unnamed: 0,prediction
0,0
1,0
2,0
3,0
4,0
...,...
4686,1
4687,1
4688,1
4689,1


---

#### Bi-GRU
Link to the model: https://drive.google.com/file/d/1WC6CifsnJe2Xs8NFO86beBxayjRTLt-a/view?usp=sharing

Read model:

In [None]:
# read bi-gru model
# Loaded from 'model_gru.keras' (path is relative to the working directory).
# The model is later called with two inputs: the padded claim sequences and
# the padded evidence sequences.
model_gru = load_model('model_gru.keras')

Preprocess:

In [None]:
# Build the word index from the raw training claims followed by the raw
# training evidence (num_words=5000 caps the vocabulary size).
tokenizer = Tokenizer(num_words=5000)
train_texts = list(train_data['Claim']) + list(train_data['Evidence'])
tokenizer.fit_on_texts(train_texts)

# fixed sequence length passed to pad_sequences
max_len = 307

# test text -> integer id sequences -> fixed-length arrays of width max_len
test_claim_seq = pad_sequences(
    tokenizer.texts_to_sequences(test_data['Claim']), maxlen=max_len
)
test_evidence_seq = pad_sequences(
    tokenizer.texts_to_sequences(test_data['Evidence']), maxlen=max_len
)

In [None]:
# Raw model output for each (claim, evidence) pair
# (presumably a probability in [0, 1] -- confirm against the model's output layer).
prob_gru = model_gru.predict([test_claim_seq, test_evidence_seq])

# Threshold at 0.5 into integer 0/1 labels. Casting to int keeps the
# 'prediction' column in the same format as the SVM output (0/1 rather than
# 0.0/1.0), and the raw scores in prob_gru are no longer overwritten in place.
pred_gru = (prob_gru >= 0.5).astype(int)

output_gru = pd.DataFrame(pred_gru, columns=['prediction'])
# output_gru.to_csv('Group_50_B.csv', index=False)
output_gru



Unnamed: 0,prediction
0,0.0
1,1.0
2,1.0
3,0.0
4,0.0
...,...
4686,1.0
4687,1.0
4688,1.0
4689,1.0
