In [18]:
import json
import pandas as pd
import numpy as np
import random
import string
from string import punctuation

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

## Load Data

In [3]:
filename = "dataset/web_science_dataset.jsonl"
json_data = []
with open(filename) as f:
    json_data = f.readlines()
json_data_list = []
for item in json_data:
    json_data_list.append(json.loads(item))
df = pd.DataFrame(json_data_list)

#labels
label_map = dict(zip(df['category'],df['categoryId']))
label_map

{'medical-science': 2,
 'nutrition': 0,
 'psychology': 4,
 'climate-change': 1,
 'physics': 3}

#### Preprocessing the dataset

In [5]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return (' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(text)))

In [6]:
def preprocess_text(df):    

    # remove leading/trailing spaces
    df = df.str.strip()
    
    # convert to lowercase
    df = df.str.lower()
    
    #df = df.replace(to_replace ='http\S+', value = '', regex = True)
    
    # remove punctuation
    translator = str.maketrans('', '', string.punctuation) 
    df = df.str.translate(translator)
    
    # remove non-alphanumeric characters
    df = df.replace(to_replace ='\s*[^A-Za-z0-9]+\s*', value = ' ', regex = True)
    
    # remove digits
    translator = str.maketrans('', '', string.digits) 
    df = df.str.translate(translator)
    
    df = df.str.strip()
    
    #lemmatize
    df = df.apply(lemmatize_text)
    
    return df

In [7]:
df['question'] = preprocess_text(df['question'])
df.head()

Unnamed: 0,question,questionId,questionUrl,category,categoryId,answer,answerUrl,answerId
0,can headbanging cause brain damage,14138,https://skeptics.stackexchange.com/questions/1...,medical-science,2,A number of injuries have been attributed to t...,https://skeptics.stackexchange.com/questions/1...,14139
1,doe the shangrila diet work according to it su...,10103,https://skeptics.stackexchange.com/questions/1...,nutrition,0,The Shangri-La diet depends on two theories:\n...,https://skeptics.stackexchange.com/questions/1...,16121
2,can phobia be genetic but created in one gener...,18713,https://skeptics.stackexchange.com/questions/1...,psychology,4,This question has remained unanswered yet not ...,https://skeptics.stackexchange.com/questions/1...,22322
3,do of u american think that global warming is ...,36010,https://skeptics.stackexchange.com/questions/3...,climate-change,1,The&nbsp;40% figure most likely comes from Pew...,https://skeptics.stackexchange.com/questions/3...,36011
4,doe boiling the same water twice make it dange...,11118,https://skeptics.stackexchange.com/questions/1...,nutrition,0,The claims\n\n\nevery time the same water is b...,https://skeptics.stackexchange.com/questions/1...,11119


#### Stratified splits

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train_data, test_data, train_labels, test_labels = train_test_split(
        df, df['category'],stratify=df['category'], test_size=0.4)

test_data, valid_data, test_labels, valid_labels = train_test_split(
        test_data, test_labels, stratify=test_labels, test_size=0.5)

In [10]:
train_data.head()

Unnamed: 0,question,questionId,questionUrl,category,categoryId,answer,answerUrl,answerId
68,doe garlic enhance immune system,20713,https://skeptics.stackexchange.com/questions/2...,nutrition,0,I'll take it that having potent anti-cancer pr...,https://skeptics.stackexchange.com/questions/2...,20727
548,doe fda not test anything in order to approve it,4309,https://skeptics.stackexchange.com/questions/4309,nutrition,0,The first two parts are easy to address. From ...,https://skeptics.stackexchange.com/questions/4...,4312
645,is coconut oil the best for hightemperature co...,23317,https://skeptics.stackexchange.com/questions/2...,nutrition,0,"Coconut Oil does suffer from this behaviour, a...",https://skeptics.stackexchange.com/questions/2...,23362
753,is it safe to stand by the window during a thu...,22562,https://skeptics.stackexchange.com/questions/2...,physics,3,It's not just your culture. Advice from the U...,https://skeptics.stackexchange.com/questions/2...,22565
13,is eating clovenhoofed animal bottomfeeders or...,2330,https://skeptics.stackexchange.com/questions/2330,nutrition,0,Judiasm 101 has some interesting insights into...,https://skeptics.stackexchange.com/questions/2...,2347


### Load pre-trained word embeddings: WordToVec

In [12]:
from gensim.models import KeyedVectors
from gensim.utils import tokenize
import pandas as pd
import numpy as np

In [15]:
EMBEDDING_FILE = "C:\\Users\\Reen\\Desktop\\web science\\WordEmbeddings\\GoogleNews-vectors-negative300.bin.gz"
word_to_vec_model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True,limit=50000)

In [17]:
pretrained_embeddings = np.concatenate([word_to_vec_model.vectors, np.zeros(shape=(1,300))], axis=0) #last vector padding

# Model

In [20]:
# Parts of code provided in the tutorial was used. Reference: liar_liar_bilstm.ipynb

In [21]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from torch import nn
from torch.optim import Adam
from typing import List, Tuple

In [22]:
## defining global parameters

In [23]:
seed = 1000
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

num_labels = len(label_map)
batch_size = 64
lr = 2e-4
lstm_dim = 200
n_epochs = 20
device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")

## Loading data into the model

In [31]:
class ClassificationDatasetReader():
    
    def __init__(self,df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        row = self.df.values[idx]
        input_seq,seq_lens = text_to_batch_bilstm([row[0]]) #sending a list of one row
        label = label_map[row[3]]
        return input_seq, seq_lens, label

In [32]:
def collate_batch_bilstm(input_data: Tuple) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_ids = [i[0][0] for i in input_data]
    seq_lens = [i[1][0] for i in input_data]
    labels = [i[2] for i in input_data]

    max_length = max([len(i) for i in input_ids])

    input_ids = [(i + [25000] * (max_length - len(i))) for i in input_ids]

    assert (all(len(i) == max_length for i in input_ids))
    return torch.tensor(input_ids), torch.tensor(seq_lens), torch.tensor(labels)

In [33]:
def text_to_batch_bilstm(text: List) -> Tuple[List, List]:
    oov_word_token = 500000 #oov token
    
    sentences = []
    for t in text:
        t = t.lower()
        t = ''.join([c for c in t if c not in punctuation])
    sentences.append(t)
        
    tokens = [ list(tokenize(t)) for t in sentences]
    #print(tokens)
    
    input_ids = [[word_to_vec_model.vocab[token].index for token in sentence if token in word_to_vec_model] for sentence in tokens]
    #print(input_ids)
    
    #input_ids = [tokenizer.encode_ids_with_eos(t) for t in text]

    return input_ids, [len(ids) for ids in input_ids]

In [34]:
# Create the dataset readers and read data
train_dataset = ClassificationDatasetReader(train_data)
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch_bilstm)

valid_dataset = ClassificationDatasetReader(valid_data)
valid_dl = DataLoader(valid_dataset, batch_size=len(valid_data), collate_fn=collate_batch_bilstm)

## Creating the model

In [50]:
# Define the model
class LSTMNetwork(nn.Module):

    def __init__(self, pretrained_embeddings: torch.tensor, lstm_dim: int, dropout_prob: float = 0.1, n_classes: int = 5):
        
        super(LSTMNetwork, self).__init__()
        
        #LSTM input
        self.input_size = pretrained_embeddings.shape[1] #features in input
        self.hidden_size = 200 
        self.num_layers = 1 #default 
        
        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pretrained_embeddings.shape[0] - 1)
        self.lstm = nn.LSTM(self.input_size,self.hidden_size,self.num_layers,batch_first=True,dropout=dropout_prob,bidirectional=True)
        self.ff = nn.Linear(2*self.hidden_size,n_classes)
        self.n_classes = n_classes
        
    def forward(self, inputs, input_lens, labels = None):
        
        # Get embeddings (b x sl x edim)
        embeds = self.embeddings(inputs)
        lstm_out, hidden = self.lstm(embeds)
        
        # Get the last output for classification (b x 2*lstm_dim)
        ff_in = lstm_out.gather(1, input_lens.view(-1,1,1).expand(lstm_out.size(0), 1, lstm_out.size(2)) - 1).squeeze()
        #print(input_lens.view(-1,1,1).expand(lstm_out.size(0), 1, lstm_out.size(2)))
        
        # Get logits (b x 2)
        logits = self.ff(ff_in).view(-1, self.n_classes)
        
        outputs = (logits,)
        if labels is not None:
            # Xentropy loss
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            outputs = (loss,) + outputs

        return outputs


In [51]:
# Create the model
model = LSTMNetwork(
    pretrained_embeddings=torch.FloatTensor(pretrained_embeddings), 
    lstm_dim=lstm_dim, 
    dropout_prob=0, 
    n_classes=len(label_map)
  ).to(device)

## Training 

In [52]:
def accuracy(logits, labels):
    return torch.sum(torch.argmax(logits, dim=-1) == labels).type(torch.float) / float(labels.shape[0])

In [53]:
def train(model, train_dl, valid_dl, optimizer, n_epochs, device):
    
    losses = []
    best_acc = 0.0
  
    for ep in range(n_epochs):
        loss_epoch = []
        
        for batch in tqdm(train_dl):
            
            model.train()
            optimizer.zero_grad()
            
            batch = tuple(t.to(device) for t in batch)
            input_ids = batch[0]
            seq_lens = batch[1]
            labels = batch[2]

            loss, logits = model(input_ids, seq_lens, labels=labels)
            losses.append(loss.item())
            loss_epoch.append(loss.item())
      
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
        #gc.collect()
    
        # Evaluate
        model.eval()
        with torch.no_grad():
            for batch in valid_dl:

                batch = tuple(t.to(device) for t in batch)
                input_ids = batch[0]
                seq_lens = batch[1]
                labels = batch[2]

                val_loss, logits = model(input_ids, seq_lens, labels=labels)
                acc = accuracy(logits, labels)

                print(f'Validation accuracy: {acc}, train loss: {sum(loss_epoch) / len(loss_epoch)}',"val_loss: ",val_loss.item())
                best_model = model.state_dict()

                if acc > best_acc:
                  #best_model = model.state_dict()
                    best_acc = acc
                #gc.collect()

    model.load_state_dict(best_model)
    return model, losses

In [55]:
# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)

# Train
model, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.37383174896240234, train loss: 1.5731640815734864 val_loss:  1.530668020248413


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.37383174896240234, train loss: 1.5062084436416625 val_loss:  1.4802632331848145


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.37383174896240234, train loss: 1.477591335773468 val_loss:  1.4621939659118652


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.37383174896240234, train loss: 1.4493348956108094 val_loss:  1.4399510622024536


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.37383174896240234, train loss: 1.4258828282356262 val_loss:  1.4084078073501587


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.37383174896240234, train loss: 1.3812713146209716 val_loss:  1.34652578830719


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.40186914801597595, train loss: 1.3012239575386046 val_loss:  1.243157982826233


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.5327102541923523, train loss: 1.215136206150055 val_loss:  1.1528927087783813


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.5747663378715515, train loss: 1.1518130421638488 val_loss:  1.0899977684020996


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.5700934529304504, train loss: 1.1002074837684632 val_loss:  1.065401315689087


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.5560747385025024, train loss: 1.0564054250717163 val_loss:  1.0314016342163086


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.630841076374054, train loss: 1.0074251472949982 val_loss:  1.008083462715149


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.6355140209197998, train loss: 0.9604259192943573 val_loss:  0.954371988773346


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.663551390171051, train loss: 0.8983135044574737 val_loss:  0.896719217300415


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.7102803587913513, train loss: 0.8419132351875305 val_loss:  0.8528652787208557


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.7242990136146545, train loss: 0.7886219799518586 val_loss:  0.7867127656936646


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.7196261286735535, train loss: 0.7240154325962067 val_loss:  0.7450224757194519


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.7336448431015015, train loss: 0.6837927997112274 val_loss:  0.7154514789581299


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.7757009267807007, train loss: 0.6431334972381592 val_loss:  0.7034911513328552


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Validation accuracy: 0.7710280418395996, train loss: 0.6063642382621766 val_loss:  0.6997005343437195
