In [39]:
import json
import pandas as pd
import numpy as np
import random
import string
from string import punctuation
import torch.nn.functional as F
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.metrics import accuracy_score

In [2]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [3]:
filename = "dataset/web_science_dataset.jsonl"
json_data = []
with open(filename) as f:
    json_data = f.readlines()
json_data_list = []
for item in json_data:
    json_data_list.append(json.loads(item))
df_ = pd.DataFrame(json_data_list)

In [4]:
df = pd.read_csv('cleaned_data.csv',delimiter=",")
df.head()
df.shape

(957, 7)

In [5]:
map_ = dict(zip(df_['questionId'].astype(int),df_['answer']))
df['answer'] = df['questionId'].map(map_)

map_cat = dict(zip(df_['questionId'].astype(int),df_['categoryId']))
df['categoryId'] = df['questionId'].map(map_cat)

df.head()

Unnamed: 0,Question,Answer URL,Answer Label,Question Rating,Answer Quality,Factual,questionId,answer,categoryId
0,does water have a memory as claimed in homeopathy,https://skeptics.stackexchange.com/questions/2#27,no,1.666667,2.333333,1,2,No\n\nWater forms strong intermolecular forces...,3
1,does chamomile help you to relax,https://skeptics.stackexchange.com/questions/3...,na,2.0,2.0,1,3,There is a website by the NIH about Chamomile ...,0
2,are there benefits to the eca stack for bodybu...,https://skeptics.stackexchange.com/questions/2...,yes,1.666667,2.0,1,22,It appears that the combination of ephedrine a...,0
3,can positive thinking provide an improved outc...,https://skeptics.stackexchange.com/questions/2...,yes,2.666667,2.666667,1,26,To add to Krzysztof's answer. There were also ...,2
4,are vegetables good for me,https://skeptics.stackexchange.com/questions/3...,yes,2.0,2.333333,1,32,"\n From a young age, most people are told tha...",0


In [6]:
df['answer'].apply(lambda x: np.mean(len(x)))

0      1328.0
1       599.0
2      1912.0
3      2044.0
4      2535.0
        ...  
952    2051.0
953    2216.0
954     404.0
955    3757.0
956    1149.0
Name: answer, Length: 957, dtype: float64

In [7]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return (' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(text)))

In [8]:
def preprocess_text(df):    

    # remove leading/trailing spaces
    df = df.str.strip()
    
    # convert to lowercase
    df = df.str.lower()
    
    df = df.replace(to_replace ='http\S+', value = '', regex = True)
    
    # remove punctuation
    translator = str.maketrans('', '', string.punctuation) 
    df = df.str.translate(translator)
    
    # remove non-alphanumeric characters
    df = df.replace(to_replace ='\s*[^A-Za-z0-9]+\s*', value = ' ', regex = True)
    
    # remove digits
    translator = str.maketrans('', '', string.digits) 
    df = df.str.translate(translator)
    
    df = df.str.strip()
    
    #lemmatize
    #df = df.apply(lemmatize_text)
    
    return df

In [9]:
# convert to lower case, remove leading/trailing spaces
df['Question'] = df['Question'].astype(str).str.lower().str.strip()
df['Answer Label'] = df['Answer Label'].astype(str).str.lower().str.strip()

In [10]:
df['Question'] = preprocess_text(df['Question'])
df['answer'] = preprocess_text(df['answer'])
print(np.unique(df['Answer Label']))
df['Answer Label'] = df['Answer Label'].replace('nan','na')

answer_label_map = {'yes':2,'no':0,'na':1}
df['Answer Label'] = df['Answer Label'].map(answer_label_map)
#df['Answer Quality'] = ((df['Answer Quality']-1)/(3-1))

['na' 'no' 'yes']


In [11]:
for i, row in df.iterrows():
    text = remove_stopwords(row['answer'])
    #print(text)
    new_text = " ".join(text.split(" ")[:100])
    #print(new_text)
    df.at[i,'answer'] = new_text

mean_answer_lengths = df['answer'].apply(lambda x: np.mean(len(x)))
mean_answer_lengths

0      799.0
1      344.0
2      895.0
3      726.0
4      741.0
       ...  
952    837.0
953    777.0
954    260.0
955    741.0
956    841.0
Name: answer, Length: 957, dtype: float64

In [13]:
df.head()

Unnamed: 0,Question,Answer URL,Answer Label,Question Rating,Answer Quality,Factual,questionId,answer,categoryId
0,does water have a memory as claimed in homeopathy,https://skeptics.stackexchange.com/questions/2#27,0,1.666667,2.333333,1,2,water forms strong intermolecular forces molec...,3
1,does chamomile help you to relax,https://skeptics.stackexchange.com/questions/3...,1,2.0,2.0,1,3,website nih chamomile listing evidence effecti...,0
2,are there benefits to the eca stack for bodybu...,https://skeptics.stackexchange.com/questions/2...,2,1.666667,2.0,1,22,appears combination ephedrine caffeine positiv...,0
3,can positive thinking provide an improved outc...,https://skeptics.stackexchange.com/questions/2...,2,2.666667,2.666667,1,26,add krzysztofs answer studies suggesting peopl...,2
4,are vegetables good for me,https://skeptics.stackexchange.com/questions/3...,2,2.0,2.333333,1,32,young age people told vegetables good eat vege...,0


In [14]:
def read_file(filename):
    with open(filename) as f:
        data = f.readlines()
        data = [int(i.strip()) for i in data ]
    return data

In [15]:
train_ids = read_file('quality_prediction_data/training_ids.txt')
test_ids = read_file('quality_prediction_data/testing_ids.txt')

In [16]:
train_data = df.loc[df['questionId'].isin(train_ids)]
test_data = df.loc[df['questionId'].isin(test_ids)]

In [17]:
from sklearn.model_selection import train_test_split

train_data, valid_data = train_test_split(df,stratify=df['categoryId'].values, test_size=0.2)
train_data.shape
valid_data.shape
valid_data.head()

Unnamed: 0,Question,Answer URL,Answer Label,Question Rating,Answer Quality,Factual,questionId,answer,categoryId
713,is this description of the processes which tak...,https://skeptics.stackexchange.com/questions/2...,1,2.333333,3.0,1,28612,lets run list recommended daily intake sugar w...,0
905,does heads up texas holdem have more possible ...,https://skeptics.stackexchange.com/questions/4...,2,2.0,2.666667,1,43220,according superhuman ai headsup nolimit poker ...,3
196,do saunas affect your health,https://skeptics.stackexchange.com/questions/4...,0,2.0,2.5,1,4996,benefits listed sentence detoxification negati...,2
862,is climate change causing jellyfish to wash up...,https://skeptics.stackexchange.com/questions/3...,2,2.333333,2.333333,1,39904,articles ive climate change factor contributed...,1
286,does honey ever go bad,https://skeptics.stackexchange.com/questions/7...,0,2.666667,2.666667,1,7247,common knowledge sugar help microorganism grow...,0


In [18]:
unique_labels = set(train_data['Answer Label'])
unique_labels

{0, 1, 2}

In [19]:
label_map = {v:k for k,v in enumerate(unique_labels)}
label_map

{0: 0, 1: 1, 2: 2}

In [20]:
from gensim.models import KeyedVectors
from gensim.utils import tokenize
import pandas as pd
import numpy as np

In [21]:
EMBEDDING_FILE = "C:\\Users\\Reen\\Desktop\\web science\\WordEmbeddings\\GoogleNews-vectors-negative300.bin.gz"
word_to_vec_model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True,limit=50000)

In [22]:
pretrained_embeddings = np.concatenate([word_to_vec_model.vectors, np.zeros(shape=(1,300))], axis=0) #last vector padding

In [23]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
from torch import nn
from torch.optim import Adam
from typing import List, Tuple

In [24]:
seed = 1000
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

num_labels = len(label_map)
batch_size = 64
lr = 2e-4
lstm_dim = 20
n_epochs = 20
#device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")

In [25]:
class ClassificationDatasetReader():
    
    def __init__(self,df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,idx):
        row = self.df.values[idx]
        input_seq,seq_lens = text_to_batch_bilstm([row[7]]) #sending a list of one row
        label = row[2]
        return input_seq, seq_lens, label

In [26]:
def collate_batch_bilstm(input_data: Tuple) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    input_ids = [i[0][0] for i in input_data]
    seq_lens = [i[1][0] for i in input_data]
    labels = [i[2] for i in input_data]

    max_length = max([len(i) for i in input_ids])

    input_ids = [(i + [50000] * (max_length - len(i))) for i in input_ids]

    assert (all(len(i) == max_length for i in input_ids))
    return torch.tensor(input_ids), torch.tensor(seq_lens), torch.tensor(labels)

In [27]:
def text_to_batch_bilstm(text: List) -> Tuple[List, List]:
    oov_word_token = 500000 #oov token
    
    sentences = []
    for t in text:
        t = t.lower()
        t = ''.join([c for c in t if c not in punctuation])
    sentences.append(t)
        
    tokens = [ list(tokenize(t)) for t in sentences]
    #print(tokens)
    
    input_ids = [[word_to_vec_model.vocab[token].index for token in sentence if token in word_to_vec_model] for sentence in tokens]
    #print(input_ids)
    
    #input_ids = [tokenizer.encode_ids_with_eos(t) for t in text]

    return input_ids, [len(ids) for ids in input_ids]

In [28]:
# Create the dataset readers and read data
train_dataset = ClassificationDatasetReader(train_data)
train_dl = DataLoader(train_dataset, batch_size=len(train_data), shuffle=True, collate_fn=collate_batch_bilstm)

valid_dataset = ClassificationDatasetReader(valid_data)
valid_dl = DataLoader(valid_dataset, batch_size=len(valid_data), collate_fn=collate_batch_bilstm)

test_dataset = ClassificationDatasetReader(test_data)
test_dl = DataLoader(test_dataset, batch_size=len(test_data), collate_fn=collate_batch_bilstm)
valid_data.shape

(192, 9)

In [29]:
train_data.shape

(765, 9)

In [30]:
# Define the model
class LSTMNetwork(nn.Module):

    def __init__(self, pretrained_embeddings: torch.tensor, lstm_dim: int, dropout_prob: float = 0.1, n_classes: int = 5):
        
        super(LSTMNetwork, self).__init__()
        
        #LSTM input
        self.input_size = pretrained_embeddings.shape[1] #features in input
        self.hidden_size = 200 
        self.num_layers = 1 #default 
        
        self.embeddings = nn.Embedding.from_pretrained(pretrained_embeddings, padding_idx=pretrained_embeddings.shape[0] - 1)
        self.lstm = nn.LSTM(self.input_size,self.hidden_size,self.num_layers,batch_first=True,dropout=dropout_prob,bidirectional=True)
        self.ff = nn.Linear(2*self.hidden_size,n_classes)
        self.n_classes = n_classes
        
    def forward(self, inputs, input_lens, labels = None):
        
        # Get embeddings (b x sl x edim)
        embeds = self.embeddings(inputs)
        lstm_out, hidden = self.lstm(embeds)
        
        # Get the last output for classification (b x 2*lstm_dim)
        ff_in = lstm_out.gather(1, input_lens.view(-1,1,1).expand(lstm_out.size(0), 1, lstm_out.size(2)) - 1).squeeze()
        #print(input_lens.view(-1,1,1).expand(lstm_out.size(0), 1, lstm_out.size(2)))
        
        # Get logits (b x 2)
        logits = self.ff(ff_in).view(-1, self.n_classes)
        
        outputs = (logits,)
        if labels is not None:
            # Xentropy loss
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            outputs = (loss,) + outputs

        return outputs
      

In [31]:
# Create the model
model = LSTMNetwork(
    pretrained_embeddings=torch.FloatTensor(pretrained_embeddings), 
    lstm_dim=lstm_dim, 
    dropout_prob=0, 
    n_classes=len(label_map)
  ).to(device)

In [32]:
def accuracy(logits, labels):
    return torch.sum(torch.argmax(logits, dim=-1) == labels).type(torch.float) / float(labels.shape[0])

In [33]:
def train(model, train_dl, valid_dl, optimizer, n_epochs, device):
    
    losses = []
    best_acc = 0.0
  
    for ep in range(n_epochs):
        loss_epoch = []
        
        for batch in tqdm(train_dl):
            
            model.train()
            optimizer.zero_grad()
            
            batch = tuple(t.to(device) for t in batch)
            input_ids = batch[0]
            seq_lens = batch[1]
            labels = batch[2]

            loss, logits = model(input_ids, seq_lens, labels=labels)
            losses.append(loss.item())
            loss_epoch.append(loss.item())
      
            #print(loss.dtype)
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
        #gc.collect()
    
        # Evaluate
        model.eval()
        with torch.no_grad():
            for batch in valid_dl:

                batch = tuple(t.to(device) for t in batch)
                input_ids = batch[0]
                seq_lens = batch[1]
                labels = batch[2]

                val_loss, logits = model(input_ids, seq_lens, labels=labels)
                acc = accuracy(logits, labels)

                print(f'Validation accuracy: {acc}, train loss: {sum(loss_epoch) / len(loss_epoch)}',"val_loss: ",val_loss.item())
                best_model = model.state_dict()

                if acc > best_acc:
                  #best_model = model.state_dict()
                    best_acc = acc
                #gc.collect()

    model.load_state_dict(best_model)
    return model, losses

In [34]:
# Create the optimizer
optimizer = Adam(model.parameters(), lr=lr)

# Train
model, losses = train(model, train_dl, valid_dl, optimizer, n_epochs, device)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3958333432674408, train loss: 1.0959408283233643 val_loss:  1.0925304889678955


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.4010416865348816, train loss: 1.0944340229034424 val_loss:  1.0913115739822388


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.4166666865348816, train loss: 1.0929973125457764 val_loss:  1.0901834964752197


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.4114583432674408, train loss: 1.091625690460205 val_loss:  1.0891424417495728


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.40625, train loss: 1.090315341949463 val_loss:  1.0881855487823486


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3958333432674408, train loss: 1.0890611410140991 val_loss:  1.087310552597046


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3958333432674408, train loss: 1.0878580808639526 val_loss:  1.0865134000778198


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.390625, train loss: 1.0867005586624146 val_loss:  1.0857908725738525


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3958333432674408, train loss: 1.0855833292007446 val_loss:  1.0851398706436157


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.375, train loss: 1.0844999551773071 val_loss:  1.084555745124817


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.375, train loss: 1.083444595336914 val_loss:  1.0840353965759277


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3802083432674408, train loss: 1.082411527633667 val_loss:  1.083574652671814


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3645833432674408, train loss: 1.0813931226730347 val_loss:  1.083168387413025


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.359375, train loss: 1.0803834199905396 val_loss:  1.0828124284744263


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.359375, train loss: 1.0793747901916504 val_loss:  1.0825012922286987


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3697916865348816, train loss: 1.078360915184021 val_loss:  1.082229733467102


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3645833432674408, train loss: 1.077335000038147 val_loss:  1.081992506980896


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.359375, train loss: 1.0762903690338135 val_loss:  1.0817841291427612


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.3697916865348816, train loss: 1.0752224922180176 val_loss:  1.0815997123718262


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Validation accuracy: 0.375, train loss: 1.0741257667541504 val_loss:  1.0814348459243774
