## Optimizing Stack Overflow Question Retrieval using BERT-based Re-ranking

### Useful libraries

In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import spacy
spacy.prefer_gpu()
import re
import joblib
import numpy as np

from transformers import BertTokenizer, BertModel
import torch
import nltk

from gensim.models import Word2Vec
from sklearn.neighbors import KDTree

In [None]:
# check if GPU is available in torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# check if GPU is available in keras
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

### Extracting data from web pages

In [13]:
# load web pages data
stack_data_raw = r"path to web pages data"
stack_data_df = pd.read_csv(stack_data_raw)
stack_data_df.head()

Unnamed: 0,id,codice sorgente
0,189602,"[<!DOCTYPE html>\n\n<html class=""html__respons..."
1,189603,"[<!DOCTYPE html>\n\n<html class=""html__respons..."
2,189604,"[<!DOCTYPE html>\n\n<html class=""html__respons..."
3,189605,"[<!DOCTYPE html>\n\n<html class=""html__respons..."
4,189606,"[<!DOCTYPE html>\n\n<html class=""html__respons..."


In [14]:
# creating 2 new columns for tags and answers
stack_data_df['answer'] = None
stack_data_df['tags'] = None

stack_data_df = stack_data_df.rename(columns={'id': 'header', 'codice sorgente': 'question'})
stack_data_df.head()

Unnamed: 0,header,question,answer,tags
0,189602,"[<!DOCTYPE html>\n\n<html class=""html__respons...",,
1,189603,"[<!DOCTYPE html>\n\n<html class=""html__respons...",,
2,189604,"[<!DOCTYPE html>\n\n<html class=""html__respons...",,
3,189605,"[<!DOCTYPE html>\n\n<html class=""html__respons...",,
4,189606,"[<!DOCTYPE html>\n\n<html class=""html__respons...",,


In [15]:
# extracting question, context question, accepted answer, tags from html
missing_info = 0

for idx,soup in enumerate(stack_data_df['question']):

    soup = BeautifulSoup(soup,"html.parser")

    try:
      # Getting the answer
      answer = soup.find('div', class_ = ['answer', 'js-answer'], itemprop='acceptedAnswer') 
      answer = answer.find('div', class_ = ['s-prose js-post-body'], itemprop='text')   
      stack_data_df['answer'][idx] = answer.text

    except:
      missing_info+=1
      continue
    
    # Getting the header
    header = soup.find('h1', class_ = ['fs-headline1 ow-break-word mb8 flex--item fl1'], itemprop ='name')
    stack_data_df['header'][idx] = header.text

    # Getting the question
    question = soup.find('div', class_ = ['question js-question'], id ='question') 
    question = question.find('div', class_ = ['s-prose js-post-body'], itemprop='text') 
    stack_data_df['question'][idx] = question.text

    # Getting the tags
    tags = soup.find('div', class_ = ['d-flex ps-relative fw-wrap'])
    tags = tags.text.split()[1:]
    # se è missing riempio con lista vuota
    if len(tags) == 0:
      tags = []
    stack_data_df['tags'][idx] =  tags

stack_data_df.to_csv(fr'C:\Users\kevin\Desktop\Text Mining data\df_21.csv', index = False)
print(f"In this scraping process we have {missing_info} of {idx} question/answer that were dropped because of missing information")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stack_data_df['answer'][idx] = answer.text
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stack_data_df['header'][idx] = header.text


In this scraping process we have 19 of 10397 question/answer that were dropped because of missing information


### Pre-Processing: from raw data to cleaned text

In [None]:
df = pd.read_csv('data.csv')
df.head()

In [None]:
# Cleaning the question 
# 1. lower text
# 2. remove the new line character
df['cleaned_question'] = df['question'].apply(lambda x:x.lower().replace('\n', ''))
df['cleaned_answer'] = df['answer'].apply(lambda x:str(x).lower().replace('\n', ''))
df['cleaned_header'] = df['header'].apply(lambda x:str(x).lower().replace('\n', ''))
df.head()

In [50]:
# drop rows with nan question or answer
print(df.shape)
df.dropna(subset=['question', 'answer', 'header'], inplace=True)
print(df.shape)

(244095, 7)
(244095, 7)


In [32]:
# print rows with missing question or answer: OKS, no missing
print(df[df['question'].isna()])
print(df[df['answer'].isna()])

Empty DataFrame
Columns: [header, question, answer, tags, cleaned_question, cleaned_answer]
Index: []
Empty DataFrame
Columns: [header, question, answer, tags, cleaned_question, cleaned_answer]
Index: []


In [2]:
# Expand contractions

contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not","can't": "can not","can't've": "cannot have",
"'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have",
"didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have",
"hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",
"he'll've": "he will have","how'd": "how did","how'd'y": "how do you","how'll": "how will","i'd": "i would",
"i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have",
"isn't": "is not","it'd": "it would","it'd've": "it would have","it'll": "it will","it'll've": "it will have",
"let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not",
"mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have",
"needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
"oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
"shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will",
"she'll've": "she will have","should've": "should have","shouldn't": "should not",
"shouldn't've": "should not have","so've": "so have","that'd": "that would","that'd've": "that would have",
"there'd": "there would","there'd've": "there would have",
"they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have",
"they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would",
"we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
"weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are",
"what've": "what have","when've": "when have","where'd": "where did",
"where've": "where have","who'll": "who will","who'll've": "who will have","who've": "who have",
"why've": "why have","will've": "will have","won't": "will not","won't've": "will not have",
"would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
"you're": "you are","you've": "you have"}

In [33]:
# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

# Expanding Contractions
df['cleaned_question'] = df['cleaned_question'].apply(lambda x:expand_contractions(x))
df['cleaned_answer'] = df['cleaned_answer'].apply(lambda x:expand_contractions(x))
df['cleaned_header'] = df['cleaned_header'].apply(lambda x:expand_contractions(x))

In [35]:
print(df['question'][4][0:100])
print(df['cleaned_question'][4][0:100])


I'm a beginner at Python and I don't know what to set command to so I can open one of the links in 
i am a beginner at python and i do not know what to set command to so i can open one of the links in


In [36]:
# Function for Cleaning Text
# For cleaning the documents, I have created a function clean_text() which will remove:
# 1. the words with digits
# 2. replace newline characters with space, 
# 3. remove URLs 
# 4. replace everything that isn’t English alphabets with space

def clean_text(text):
    text = re.sub('\w*\d\w*','', text)
    #text=re.sub('\n',' ',text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub('[^a-z]',' ',text)
    return text

# Cleaning corpus using RegEx
df['cleaned_question'] = df['cleaned_question'].apply(lambda x: clean_text(x))
df['cleaned_answer'] = df['cleaned_answer'].apply(lambda x: clean_text(x))
df['cleaned_header'] = df['cleaned_header'].apply(lambda x: clean_text(x))

In [37]:
print(df['answer'][2][0:100])
print(df['cleaned_answer'][2][0:100])


Use floor division:
>>> 111111111111111111//10
11111111111111111
>>> 


use floor division           


In [38]:
# Removing extra spaces
df['cleaned_question'] = df['cleaned_question'].apply(lambda x: re.sub(' +',' ',x))
df['cleaned_answer'] = df['cleaned_answer'].apply(lambda x: re.sub(' +',' ',x))
df['cleaned_header'] = df['cleaned_header'].apply(lambda x: re.sub(' +',' ',x))

In [45]:
print(df['question'][200].replace('\n',''))
print('--------------------------------------------------------')
print(df['cleaned_question'][200])

I have two dataframes with different indexing that I want to sum the same column from the two dataframes. Based on a suggestion, I tried the following but removes disregards other columns like catdf = df.set_index('date')tmp = tmp.set_index('date')result = df['Anomaly'].add(tmp['Anomaly'], fill_value=0).reset_index()df    date       cat    Anomaly0 2018-12-06    a      01 2019-01-07    b      02 2019-02-06    a      13 2019-03-06    a      04 2019-04-06    b      0tmp    date        cat   Anomaly0 2018-12-06     a      01 2019-01-07     b      14 2019-04-06     b      0result    date           Anomaly0 2018-12-06         0.01 2019-01-07         1.0 2 2019-02-06         1.03 2019-03-06         0.04 2019-04-06         0.0What I want actually is to sum based on index and keep the category column and int dtype of Anomaly:result    date          cat    Anomaly0 2018-12-06       a         01 2019-01-07       b         12 2019-02-06       a         13 2019-03-06       a         04 2019-04-06 

In [57]:
# Stopwords removal & Lemmatizing tokens using SpaCy
# The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base 
# form. However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the 
# hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to 
# doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to 
# return the base or dictionary form of a word, which is known as the lemma.

nlp = spacy.load('en_core_web_sm',disable=['ner','parser'])
nlp.max_length = 5000000


# Removing Stopwords and Lemmatizing words
df['lemmatized_question']=df['cleaned_question'].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop==False)]))
df['lemmatized_answer']=df['cleaned_answer'].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop==False)]))
df['lemmatized_header']=df['cleaned_header'].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop==False)]))

In [59]:
df.to_csv("data_preprocessed.csv", index=False)

### Word2Vec: from words to vectors

In [None]:
pre_data = pd.read_csv("data_preprocessed.csv")
pre_data.head(2)

In [7]:
# shuffle the data, divide into 3/4 training and 1/4 test
pre_data = pre_data.sample(frac=1, random_state = 19091999).reset_index(drop=True)
train = pre_data[:int(len(pre_data)*0.85)]
test = pre_data[int(len(pre_data)*0.85):]
print(train.shape, test.shape, pre_data.shape)

(207480, 10) (36615, 10) (244095, 10)


In [58]:
train.iloc[0]

header                                    Sliding window reports caching
question               \nI have api endpoint that takes as an input s...
answer                 \nYou may consider using redis pipeline rather...
tags                                       ['flask', 'caching', 'redis']
cleaned_question       i have api endpoint that takes as an input som...
cleaned_answer         you may consider using redis pipeline rather t...
cleaned_header                            sliding window reports caching
lemmatized_question    api endpoint take input datum date date field ...
lemmatized_answer      consider redis pipeline individual command loo...
lemmatized_header                              slide window report cache
Name: 0, dtype: object

In [65]:
# Combining corpus and queries for training
combined=pd.concat([train.rename(columns={'lemmatized_header':'text1'})['text1'],\
                             train.rename(columns={'lemmatized_question':'text2'})['text2'],\
                             train.rename(columns={'lemmatized_answer':'text3'})['text3']])\
                             .sample(frac=1).reset_index(drop=True)
print(combined.shape)
combined = combined.to_list()
# Creating data for the model training
corpus_tokens = [str(sentence).split() for sentence in combined]
print(corpus_tokens[0])

['have', 'follow', 'code', 'def', 'choose', 'set', 'lst', 'k', 'k', 'return', 'len', 'lst', 'k', 'return', 'lst', 'return', 'choose', 'set', 'lst', 'k', 'lst', 'choose', 'set', 'lst', 'k', 'choose', 'set', 'lst', 'k', 'work', 'possible', 'loop', 'instead', 'write', 'lst', 'choose', 'set', 'lst', 'k', 'function', 'return', 'list', 'contain', 'different', 'list', 'length', 'k', 'create', 'original', 'list', 'member', 'order', 'important']


In [67]:
# Training a word2vec model from the given data set
w2v_model = Word2Vec(corpus_tokens, vector_size=500, min_count=2, window=5, sg=1, workers=8) 

# save the model
w2v_model.save(r"word2vec4.model")

In [4]:
train = pd.read_csv(r'training.csv')
test = pd.read_csv(r'test.csv')

In [3]:
# NOTA: devono esserci tutti e 3 i file nella cartella Models (word2vec.model, word2vec.model.trainables.syn1neg.npy, word2vec.model.wv.vectors.npy)
w2v_model = Word2Vec.load(r"/home/kevin/workspace/text mining/Project/Models/word2vec_v4/word2vec4.model")

In [None]:
# Function returning vector reperesentation of a document: per ogni documento ritorna la media dei vettori delle parole che lo compongono
def get_embedding_w2v(doc_tokens, w2v_model=w2v_model):
    embeddings = []
    if len(doc_tokens)<1:
        print("collection is empty")
        return np.zeros(500)
    else:
        for tok in doc_tokens:
            try:
                embeddings.append(w2v_model.wv[tok])
                #print("embedding riuscito per: ", tok)
            except:
                embeddings.append(np.random.rand(500))
                print("embedding fallito per: ", tok)
                
        # mean the vectors of individual words to get the vector of the document
        return np.mean(embeddings, axis=0) 

# Getting Word2Vec Vectors for Testing Corpus and Queries
test['answer_vector'] = test['lemmatized_answer'].apply(lambda x : get_embedding_w2v(str(x).split(), w2v_model))

### Create KDTree for fast retrieval

In [15]:
answer_retrievals = np.array(test['answer_vector'].tolist())
tree = KDTree(answer_retrievals) 

joblib.dump(tree, "kdtree_word2vec_from_scratch2.joblib")
tree = joblib.load("kdtree_word2vec_from_scratch2.joblib")

['kdtree_word2vec_from_scratch2.joblib']

In [4]:
def ranking_ir(query, tree, data, k):
  
  # pre-process Query
  query=query.lower().replace('\n', '')
  query=expand_contractions(query)
  query=clean_text(query)
  query=re.sub(' +',' ', query)
  query = ' '.join([token.lemma_ for token in list(nlp(query)) if (token.is_stop==False)])

  # generating vector
  vector = get_embedding_w2v(query.split())

  # ranking documents
  dist, ind = tree.query(np.expand_dims(vector, axis = 0), k = k)   
  
  return data.iloc[ind.tolist()[0]][['cleaned_answer', 'tags']]

In [None]:
# Recall@5, Recall@10, Recall@20
recall5 = []
recall10 = []
recall20 = []
for idx, question in enumerate(test['header']):
    if idx % 1000 == 0:
        print(idx)
    results = ranking_ir(question, tree, test, 20)
    if idx in results[0:5].index.tolist():
        recall5.append(1)
    if idx in results[0:10].index.tolist():
        recall10.append(1)
    if idx in results[0:20].index.tolist():
        recall20.append(1)
recall5_result = sum(recall5)/len(test['header'])
recall10_result = sum(recall10)/len(test['header'])
recall20_result = sum(recall20)/len(test['header'])
print("Recall@5: ", recall5_result)
print("Recall@10: ", recall10_result)
print("Recall@20: ", recall20_result)


### BERT: training

In [11]:
def augment_train(train):
  new_train = pd.DataFrame({'header': [], 'answer':[], 'label':[]})
  for idx, header in enumerate(train['cleaned_header'].values):
    if idx % 1000 == 0:
      print(idx)
    results = ranking_ir(header, tree,test, 9)
    rand_answers = list(results['cleaned_answer'].values)
    rand_answers.insert(0, train['cleaned_answer'][idx])
    headers = [header]*9
    labels = [1,0,0,0,0,0,0,0,0,0]
    current_df = pd.DataFrame({'header': headers, 'answer': rand_answers, 'label':labels})
    new_train = pd.concat([new_train, current_df], ignore_index = True)
  return new_train

In [None]:
train_aug2 = augment_train(train)
train_aug2.head()

In [5]:
import pandas as pd
train = pd.read_csv('train_aug.csv')
train.head(15)

Unnamed: 0,header,answer,label
0,sliding window reports caching,you may consider using redis pipeline rather t...,1.0
1,sliding window reports caching,see below a corrected version of your code exp...,0.0
2,sliding window reports caching,use a transaction with spkdb transaction for k...,0.0
3,sliding window reports caching,fo open headervalue txt r data l split for l i...,0.0
4,sliding window reports caching,had this same issue myself once i had worked i...,0.0
5,sliding window reports caching,try this import rewith open results txt as inf...,0.0
6,sliding window reports caching,askopenfilename returns a string containing th...,0.0
7,sliding window reports caching,there are too many questions on this topic you...,0.0
8,sliding window reports caching,this seems like an example of a repeated probl...,0.0
9,sliding window reports caching,i solved my question i was looking to implemen...,0.0


In [None]:
# Now we sample the negative examples in order to insert some noise in the data, otherwise the model predict always the most relevant class

import pandas as pd
train = pd.read_csv('train_aug.csv')

# Select the rows with label 0
zero_rows = train[train['label'] == 0]

# Randomly select 50% of the rows with label 0
random_zero_rows = zero_rows.sample(frac=0.7)

# Get the index values of the selected rows
random_zero_row_indices = random_zero_rows.index

# Drop the selected rows from the DataFrame
train = train.drop(random_zero_row_indices)

# reset the index
train = train.reset_index(drop = True)

from sklearn.utils import shuffle
test = train.iloc[700000:767676]  # 767676 length
train = train.iloc[:700000]


train = train.reset_index(drop = True)
print(train.shape, test.shape)

train.tail(30)

In [9]:
# count the number of 0 and 1 labels
train['label'].value_counts(normalize=True)

0.0    0.729775
1.0    0.270225
Name: label, dtype: float64

In [None]:
# Il training del modello è stato spezzato in 3 fasi, quindi si è ottenuto 1 modello ogni 2 epoche. Per questa ragione si è caricato il modello dopo 2 epoche 
# e si è continuato il training per altre 2 epoche per 2 volte.

from transformers import BertModel, BertTokenizer
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.load_state_dict(torch.load(r'Models/BERT/v3/model3.pt'))

# Load the classification head
classification_head = torch.nn.Sequential(torch.nn.Dropout(0.4), torch.nn.Linear(768, 1),torch.nn.Sigmoid()).to(device)
classification_head.load_state_dict(torch.load(r'Models/BERT/v3/classification_head3.pt'))

In [None]:
from transformers import BertModel, BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
import warnings
import numpy as np
import random
from sklearn.model_selection import train_test_split


with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    model = BertModel.from_pretrained('bert-base-uncased')

# Set the number of epochs and the batch size
num_epochs = 2
batch_size = 100

# Set the device to use for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# freeze the BERT model
for param in model.parameters():
    param.requires_grad = False

# Instantiate the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Model e Classification head sono commmentati poichè il training è stato spezzato in 3 fasi, quindi il modello è stato salvato e caricato in seguito
# send the model to the device
#model = BertClassificationModel()
#model = model.to(device)

# Add a classification head on top of the BERT model
#classification_head = torch.nn.Sequential(torch.nn.Dropout(0.4), torch.nn.Linear(768, 1),torch.nn.Sigmoid()).to(device)


# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)



# Set the optimizer and the loss function
optimizer = torch.optim.Adam(classification_head.parameters(), lr=2e-5, eps=1e-8)  #classification_head.parameters() se uso la head separata e non il modello
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = 700000)
loss_fn = torch.nn.BCEWithLogitsLoss()

# Track the number of correct predictions
accuracy_test = []
precision_test = []
recall_test = []
f1_test = []
total_train_loss = []
total_test_loss = []

skipped_train = 0
skipped_test = 0

# Training loop
for epoch in range(num_epochs):

    # Shuffle the training data
    train = train.sample(frac=1).reset_index(drop=True)   

    # Reset the total loss for this epoch.
    train_epoch_loss = 0
    test_epoch_loss = 0
    
    model.train()

    # Split the training data into batches
    num_batches = (len(train) // batch_size) + 1

    for i in range(num_batches):
        # Get the next batch of data
        print('TRAIN:', i)
        batch = train[i * batch_size:(i + 1) * batch_size]
        try:
            encoding = tokenizer(
                        text = batch['header'].to_list(),
                        text_pair = batch['answer'].to_list(),
                        add_special_tokens = True,
                        max_length = 512,
                        padding  = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                        truncation = True
                   ).to(device)
        except:
            skipped_train += 1        # Skip the batch if there is an error
            continue 

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass through the model
        ids = encoding['input_ids']
        mask = encoding['attention_mask']
        token_type_ids = encoding['token_type_ids']
        output = model(ids, mask, token_type_ids)[0]
        output = output[:, 0, :] + output[:, 1, :]   # take the first and last token embeddings and sum them up
        output = classification_head(output) 

        # Compute the loss
        labels = torch.tensor(batch['label'].values).float().to(device)
        loss = loss_fn(output.squeeze(), labels)
        train_epoch_loss += loss.item()

        # Backward pass
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(classification_head.parameters(), 1.0)

        # Update the model's weights
        optimizer.step()
        scheduler.step()


    # udate total training loss
    total_train_loss.append(train_epoch_loss / num_batches)


    # Calculate the accuracy on the validation set
    with torch.no_grad():
        model.eval()
        num_correct = 0
        num_true_positives = 0
        num_false_positives = 0
        num_true_negatives = 0
        num_false_negatives = 0
        num_total = 0
        num_batches_test = (len(test) // batch_size) + 1

        for i in range(num_batches_test):
            print('TEST:', i)
            batch_test = test[i * batch_size : (i + 1) * batch_size]
            try: 
                encoding = tokenizer(
                        text = batch_test['header'].to_list(),
                        text_pair = batch_test['answer'].to_list(),
                        add_special_tokens = True,
                        max_length = 512,
                        padding  = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                        truncation = True
                   ).to(device)
            except:
                skipped_test += 1
                continue  

            ids = encoding['input_ids']
            mask = encoding['attention_mask']
            token_type_ids = encoding['token_type_ids']

            output = model(ids, mask, token_type_ids)[0]
            output = output[:, 0, :] + output[:, 1, :]
            output = classification_head(output)

            # Compute the loss
            labels = torch.tensor(batch_test['label'].values).float().to(device)
            loss = loss_fn(output.squeeze(), labels)
            test_epoch_loss += loss.item()

            # Compute the accuracy
            predicted_label = (output > 0.5).long()                                           # long: trasforma vettore Boolean in 0 e 1
            label = torch.tensor(batch_test['label'].values).to(device)[:, np.newaxis]
            num_correct += (predicted_label == label).sum().item()

            # Calculate the true and false positives and negatives
            num_true_positives += ((predicted_label == 1) & (label == 1)).sum().item()
            num_true_negatives += ((predicted_label == 0) & (label == 0)).sum().item()
            num_false_positives += ((predicted_label == 1) & (label == 0)).sum().item()
            num_false_negatives += ((predicted_label == 0) & (label == 1)).sum().item()
            
            
        # update total test loss
        total_test_loss.append(test_epoch_loss / num_batches_test)

        # Calculate the accuracy
        accuracy = num_correct / (len(test) - skipped_test)
        accuracy_test.append(accuracy)

        # Compute precision
        precision = num_true_positives / (num_true_positives + num_false_positives)
        precision_test.append(precision)

        # Compute recall
        recall = num_true_positives / (num_true_positives + num_false_negatives)
        recall_test.append(recall)

        # Compute f1
        f1 = 2 * precision * recall / (precision + recall)
        f1_test.append(f1)

In [4]:
print(f'Accuracy: {accuracy_test}')
print('Precision', precision_test)
print('Recall', recall_test)
print('F1', f1_test)

Accuracy: [0.872637452749055, 0.8729949197967919]
Precision [0.897346525747695, 0.894356005788712]
Recall [0.5939569844459329, 0.5979013172583166]
F1 [0.7147910975773587, 0.7166815343443355]


In [5]:
# save the model
torch.save(model.state_dict(), r'Models/BERT/v4/model4.pt')
# save the classification head
torch.save(classification_head.state_dict(), r'Models/BERT/v4/classification_head4.pt')


In [None]:
from matplotlib import pyplot as plt
# Plot the training and validation loss
plt.plot(total_train_loss, label='Training loss')
plt.plot(total_test_loss, label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
from matplotlib import pyplot as plt
# Plot the training and validation loss
plt.plot(accuracy_test, label='Accuracy')
plt.title('Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### BERT: re-ranking

In [18]:
from transformers import BertModel, BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
import warnings
import numpy as np
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.load_state_dict(torch.load(r'/home/kevin/workspace/text mining/Project/Models/BERT/v4/model4.pt'))

# tokenizer 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the classification head
classification_head = torch.nn.Sequential(torch.nn.Dropout(0.4), torch.nn.Linear(768, 1),torch.nn.Sigmoid()).to(device)
classification_head.load_state_dict(torch.load(r'/home/kevin/workspace/text mining/Project/Models/BERT/v4/classification_head4.pt'))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [None]:
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()

num_precision_5 = 0
num_precision_10 = 0
num_precision_20 = 0

# precision lists
prec_5 = []
prec_10 = []
prec_20 = []

# import the test set
#test = pd.read_csv('test.csv')
average_precisions = []

with torch.no_grad():
    model.eval()

    for idx, question in enumerate(test['header']):
        # get the top 1000 results
        results = ranking_ir(question, tree, test, 1000)

        # take the questions and put them in a list
        answers = results['cleaned_answer'].to_list()
        
        # create a list with 1000 times the same question
        questions = [question] * 125

        scores = []
        for i in range(8):
            encoding = tokenizer(
                text = questions,
                text_pair = answers[i*125:(i+1)*125],
                add_special_tokens = True,
                max_length = 512,
                padding  = True,
                return_attention_mask = True,
                return_tensors = 'pt',
                truncation = True
            ).to(device)

            ids = encoding['input_ids']
            mask = encoding['attention_mask']
            token_type_ids = encoding['token_type_ids']

            output = model(ids, mask, token_type_ids)[0]
            output = output[:, 0, :] + output[:, 1, :]
            output = classification_head(output)
            scores.append(output.squeeze().cpu().numpy())

        scores = np.array(scores).flatten()
        results['score'] = scores
        results = results.sort_values(by=['score'], ascending=False)

        # check if test['cleaned_answer'][idx] is in the top 5,10 or 20 results
        if idx in results[0:5].index.tolist():
            num_precision_5 += 1
        if idx in results[0:10].index.tolist():
            num_precision_10 += 1
        if idx in results[0:20].index.tolist():
            num_precision_20 += 1
        
        # MAP
        precisions11 = []
        for idx2 in range(11):
            if idx in results[0:idx2+1].index.tolist():
                precisions11.append(1)
            else:
                precisions11.append(0)
        average_precisions.append(np.mean(precisions11))


        # print the progress made in precisions
        if idx % 10 == 0:
            print(f'Precision@5: {num_precision_5 / (idx + 1)}')
            print(f'Precision@10: {num_precision_10 / (idx + 1)}')
            print(f'Precision@20: {num_precision_20 / (idx + 1)}')
            prec_5.append(num_precision_5 / (idx + 1))
            prec_10.append(num_precision_10 / (idx + 1))
            prec_20.append(num_precision_20 / (idx + 1))


# compute precision@5, precision@10 and precision@20
precision_5 = num_precision_5 / len(test['header'])
precision_10 = num_precision_10 / len(test['header'])
precision_20 = num_precision_20 / len(test['header'])

In [None]:
print(f'Precision@5: {num_precision_5 / (idx + 1)}')
print(f'Precision@10: {num_precision_10 / (idx + 1)}')
print(f'Precision@20: {num_precision_20 / (idx + 1)}')

In [33]:
# MAP score
np.mean(average_precisions)

0.1942628137523729

### RUN ALL THIS BLOCK IF YOU WANT TO CHECK FOR MODEL PERFORMANCE
* Note: before running this block, you need to insert the right paths for:
    1. the test dataset
    2. w2v model (all 3 files must be in the folder)
    3. bert model (both model and classification head)
    4. kdtree. 
* Note2: in order to shorter the time, this code only consider 100 questions from the test dataset. If you want to check the performance on the whole test dataset, please take into consideration that the code will take a lot of time to run (2 days, using a GPU accelerated environment).

In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import spacy
spacy.prefer_gpu()
import re
import joblib
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
import nltk
from gensim.models import Word2Vec
from sklearn.neighbors import KDTree
from transformers import BertModel, BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
import warnings
import numpy as np
import random

# SOSTITUIRE CON I PATH GIUSTI
w2v_model = Word2Vec.load(r"word2vec4.model")
test = pd.read_csv(r'test.csv')
tree = joblib.load("kdtree_word2vec_from_scratch2.joblib")
# Load the model
model = BertModel.from_pretrained('bert-base-uncased').to(device)
model.load_state_dict(torch.load(r'model4.pt'))
# Load the classification head
classification_head = torch.nn.Sequential(torch.nn.Dropout(0.4), torch.nn.Linear(768, 1),torch.nn.Sigmoid()).to(device)
classification_head.load_state_dict(torch.load(r'classification_head4.pt'))

test_cases = 1000   # set to len(test) for complete evaluation




# Expand contractions

contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not","can't": "can not","can't've": "cannot have",
"'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have",
"didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have",
"hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",
"he'll've": "he will have","how'd": "how did","how'd'y": "how do you","how'll": "how will","i'd": "i would",
"i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have",
"isn't": "is not","it'd": "it would","it'd've": "it would have","it'll": "it will","it'll've": "it will have",
"let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not",
"mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have",
"needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
"oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
"shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will",
"she'll've": "she will have","should've": "should have","shouldn't": "should not",
"shouldn't've": "should not have","so've": "so have","that'd": "that would","that'd've": "that would have",
"there'd": "there would","there'd've": "there would have",
"they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have",
"they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would",
"we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
"weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are",
"what've": "what have","when've": "when have","where'd": "where did",
"where've": "where have","who'll": "who will","who'll've": "who will have","who've": "who have",
"why've": "why have","will've": "will have","won't": "will not","won't've": "will not have",
"would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
"you're": "you are","you've": "you have"}






# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)





def clean_text(text):
    text = re.sub('\w*\d\w*','', text)
    #text=re.sub('\n',' ',text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub('[^a-z]',' ',text)
    return text






nlp = spacy.load('en_core_web_sm',disable=['ner','parser'])
nlp.max_length = 5000000







def get_embedding_w2v(doc_tokens, w2v_model=w2v_model):
    embeddings = []
    if len(doc_tokens)<1:
        print("collection is empty")
        return np.zeros(500)
    else:
        for tok in doc_tokens:
            try:
                embeddings.append(w2v_model.wv[tok])
                #print("embedding riuscito per: ", tok)
            except:
                embeddings.append(np.random.rand(500))
                #print("embedding fallito per: ", tok)
                
        # mean the vectors of individual words to get the vector of the document
        return np.mean(embeddings, axis=0) 

# Getting Word2Vec Vectors for Testing Corpus and Queries
test['answer_vector'] = test['lemmatized_answer'].apply(lambda x : get_embedding_w2v(str(x).split(), w2v_model))







def ranking_ir(query, tree, data, k):
  
  # pre-process Query
  query=query.lower().replace('\n', '')
  query=expand_contractions(query)
  query=clean_text(query)
  query=re.sub(' +',' ', query)
  query = ' '.join([token.lemma_ for token in list(nlp(query)) if (token.is_stop==False)])

  # generating vector
  vector = get_embedding_w2v(query.split())

  # ranking documents
  dist, ind = tree.query(np.expand_dims(vector, axis = 0), k = k)   
  
  return data.iloc[ind.tolist()[0]][['cleaned_answer', 'tags']]







device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



# tokenizer 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')








# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()

num_precision_5 = 0
num_precision_10 = 0
num_precision_20 = 0

# precision lists
prec_5 = []
prec_10 = []
prec_20 = []

# import the test set
#test = pd.read_csv('test.csv')
average_precisions = []

with torch.no_grad():
    model.eval()

    for idx, question in enumerate(test['header'][0:test_cases]):
        # get the top 1000 results
        results = ranking_ir(question, tree, test, 1000)

        # take the questions and put them in a list
        answers = results['cleaned_answer'].to_list()
        
        # create a list with 1000 times the same question
        questions = [question] * 125

        scores = []
        for i in range(8):
            encoding = tokenizer(
                text = questions,
                text_pair = answers[i*125:(i+1)*125],
                add_special_tokens = True,
                max_length = 512,
                padding  = True,
                return_attention_mask = True,
                return_tensors = 'pt',
                truncation = True
            ).to(device)

            ids = encoding['input_ids']
            mask = encoding['attention_mask']
            token_type_ids = encoding['token_type_ids']

            output = model(ids, mask, token_type_ids)[0]
            output = output[:, 0, :] + output[:, 1, :]
            output = classification_head(output)
            scores.append(output.squeeze().cpu().numpy())

        scores = np.array(scores).flatten()
        results['score'] = scores
        results = results.sort_values(by=['score'], ascending=False)

        # check if test['cleaned_answer'][idx] is in the top 5,10 or 20 results
        if idx in results[0:5].index.tolist():
            num_precision_5 += 1
        if idx in results[0:10].index.tolist():
            num_precision_10 += 1
        if idx in results[0:20].index.tolist():
            num_precision_20 += 1
        
        # MAP
        precisions11 = []
        for idx2 in range(11):
            if idx in results[0:idx2+1].index.tolist():
                precisions11.append(1)
            else:
                precisions11.append(0)
        average_precisions.append(np.mean(precisions11))


        # print the progress made in precisions
        if idx % 10 == 0:
            print(f'Precision@5: {num_precision_5 / (idx + 1)}')
            print(f'Precision@10: {num_precision_10 / (idx + 1)}')
            print(f'Precision@20: {num_precision_20 / (idx + 1)}')
            prec_5.append(num_precision_5 / (idx + 1))
            prec_10.append(num_precision_10 / (idx + 1))
            prec_20.append(num_precision_20 / (idx + 1))


# compute precision@5, precision@10 and precision@20
precision_5 = num_precision_5 / len(test['header'])
precision_10 = num_precision_10 / len(test['header'])
precision_20 = num_precision_20 / len(test['header'])


# print results
print(f'Precision@5: {precision_5}')
print(f'Precision@10: {precision_10}')
print(f'Precision@20: {precision_20}')
print(f'MAP: {np.mean(average_precisions)}')