# **INF8460 A20 Project: Open-domain question answering**

<br>

Equipe 8:


*   Cedric Sadeu (Glove, ranking with classification)
*   Mamoudou Sacko (pretraitement + TF-IDF, cosine ranking)
*   Oumayma Messoussi (PCP Bert, ML/DL for ranking)

<br>

---

<br>

In [4]:
!pip install transformers



In [5]:
import io
import os
import math
import nltk
import time
import torch
import random
import sklearn
import zipfile
import operator
import requests
import functools
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import multiprocessing
from functools import partial
from typing import Dict, List, Tuple
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor
from transformers import pipeline, Trainer, TrainingArguments
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, BertTokenizer, BertModel, BertForQuestionAnswering

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Mount Google Drive so the project folder is reachable under /content/drive.
# This import only exists on Google Colab runtimes.
from google.colab import drive
drive.mount('/content/drive')
!ls '/content/drive/My Drive/Colab Notebooks/INF8460/Project/'

Mounted at /content/drive
data				  LSTMSiameseTextSimilarity  yahooLTR_C14.tgz
inf8460_projet_A20_equipe8.ipynb  output


### Lecture des données

In [7]:
def read_data(path: str) -> Tuple[List[int], List[str]]:
    """Load the corpus CSV located at ``path``.

    Args:
        path: location of a CSV file that has an ``id`` and a ``paragraph`` column.

    Returns:
        A pair ``(ids, paragraphs)`` holding the two columns as plain Python lists,
        in file order.
    """
    frame = pd.read_csv(path)
    return frame["id"].tolist(), frame["paragraph"].tolist()

def read_questions(path: str) -> Tuple[List[int], List[str], List[int], List[str]]:
    """Load the question CSV located at ``path``.

    Args:
        path: location of a CSV file that has the columns ``id``, ``question``,
            ``paragraph_id`` and ``answer``.

    Returns:
        A 4-tuple ``(ids, questions, paragraph_ids, answers)``; each element is the
        corresponding column as a plain Python list, in file order.
    """
    frame = pd.read_csv(path)
    wanted_columns = ("id", "question", "paragraph_id", "answer")
    return tuple(frame[column].tolist() for column in wanted_columns)

def save_to_csv(path: str, corpus):
    """Write the leading rows of ``corpus`` to a CSV file inside ``output_path``.

    Args:
        path: file name (or relative path) appended to the module-level
            ``output_path`` directory.
        corpus: mapping of column name -> column values; its keys become the
            CSV header, in the mapping's own order.

    Only the rows returned by ``DataFrame.head()`` (the first five by default)
    are written.
    NOTE(review): confirm this truncation is intended and not a debugging leftover.
    """
    column_names = list(corpus.keys())
    frame = pd.DataFrame(corpus, columns=column_names)
    destination = os.path.join(output_path, path)
    frame.head().to_csv(destination, index=False, header=True)

In [8]:
# Project folders on the mounted Google Drive.
# `data_path` is the actual data directory: joining a relative "data" prefix with an
# absolute file path is a no-op, because os.path.join drops every component that
# precedes an absolute one.
data_path = "/content/drive/My Drive/Colab Notebooks/INF8460/Project/data"
output_path = "/content/drive/My Drive/Colab Notebooks/INF8460/Project/output"

# train_data -> (ids, paragraphs); train_ids -> (ids, questions, paragraph_ids, answers)
train_data = read_data(os.path.join(data_path, "corpus.csv"))
train_ids = read_questions(os.path.join(data_path, "train_ids.csv"))


# Collapse every run of whitespace into a single space and lower-case the text.
paragraphs = [" ".join(sentence.split()).lower() for sentence in train_data[1]]
questions = [" ".join(sentence.split()).lower() for sentence in train_ids[1]]

### Prétraitement

In [9]:
class Preprocess(object):
    """Text cleaning helper: tokenise, drop stop words, optionally lemmatise.

    Attributes:
        stopwords: set of English stop words taken from NLTK.
        lemmatize: when true, ``preprocess_pipeline`` lemmatises the cleaned tokens.
    """

    def __init__(self, lemmatize=True):
        self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.lemmatize = lemmatize

    def preprocess_pipeline(self, data):
        """Clean every document of ``data`` and return one token list per document."""
        tokens_per_doc = self._clean_doc(data)
        return self._lemmatize(tokens_per_doc) if self.lemmatize else tokens_per_doc

    def _clean_doc(self, data):
        """Tokenise each document on ``\\w+`` and keep the lower-cased tokens that are
        not stop words, are longer than one character and are purely alphabetic."""
        splitter = nltk.tokenize.RegexpTokenizer(r"\w+")
        cleaned = []
        for document in data:
            kept = []
            for raw_token in splitter.tokenize(document):
                lowered = raw_token.lower()
                if lowered in self.stopwords:
                    continue
                if len(raw_token) <= 1 or not raw_token.isalpha():
                    continue
                kept.append(lowered)
            cleaned.append(kept)
        return cleaned

    def _lemmatize(self, data):
        """Apply the WordNet lemmatiser to every token of every document."""
        wordnet = nltk.stem.WordNetLemmatizer()
        lemmatized = []
        for document in data:
            lemmatized.append([wordnet.lemmatize(token) for token in document])
        return lemmatized

    def convert_to_reviews(self, tokenized_reviews):
        """Join each token list back into a single space-separated string."""
        return [" ".join(tokens) for tokens in tokenized_reviews]

In [10]:
pre = Preprocess()

# One list of cleaned, lemmatised tokens per paragraph / question.
paragraphs_tokenized = pre.preprocess_pipeline(paragraphs)
questions_tokenized = pre.preprocess_pipeline(questions)

# Space-joined versions of the token lists, as expected by the scikit-learn vectorizers.
paragraphs_text = pre.convert_to_reviews(paragraphs_tokenized)
questions_text = pre.convert_to_reviews(questions_tokenized)



---

<br>

## **1. Plongements lexicaux**

### TF-IDF

In [11]:
def buildVocab(X) -> Dict[str, int]:
  """Learn the vocabulary of the documents in ``X``.

  X: iterable of text documents (case is preserved, ``lowercase=False``).
  Returns the fitted ``CountVectorizer.vocabulary_`` mapping (term -> feature index).
  """
  # min_df=1 (the scikit-learn default) keeps every term, exactly what min_df=0 did;
  # recent scikit-learn releases validate an integer min_df as >= 1 and reject 0.
  vectorizer = CountVectorizer(min_df=1, lowercase=False)
  vectorizer.fit(X)
  return vectorizer.vocabulary_

def getTfIdfReprentation(vocab, data) -> object:
  """Return the TF-IDF matrix of ``data`` over the fixed vocabulary ``vocab``.

  vocab: vocabulary handed to ``TfidfVectorizer`` (in this notebook, the mapping
         returned by ``buildVocab``).
  data: iterable of text documents; the vectorizer is fitted on it and then
        used to transform it.
  The vectorizer is configured with ``ngram_range=(1, 3)``.
  """
  tfidf_model = TfidfVectorizer(vocabulary=vocab, ngram_range=(1, 3))
  return tfidf_model.fit_transform(data)

In [12]:
# Paragraphs: vocabulary, then TF-IDF matrix over that vocabulary.
paragraphs_vocab = buildVocab(paragraphs_text)
paragraphs_tfidf = getTfIdfReprentation(paragraphs_vocab, paragraphs_text)

# Questions: same two steps, with a vocabulary learned from the questions only.
# NOTE(review): the two matrices therefore live in different feature spaces
# (their column counts differ); confirm this is intended before comparing them.
questions_vocab = buildVocab(questions_text)
questions_tfidf = getTfIdfReprentation(questions_vocab, questions_text)

print('paragraphs_tfidf:', paragraphs_tfidf.shape)
print('questions_tfidf:', questions_tfidf.shape)

paragraphs_tfidf: (83327, 138070)
questions_tfidf: (106176, 34996)


In [13]:
# Persist the paragraph ids together with their TF-IDF representation.
# NOTE(review): the TF-IDF values are sparse matrices; confirm that DataFrame
# construction inside save_to_csv yields one row per document as expected.
corpus = {'id': train_data[0], 'paragraph': paragraphs_tfidf }
save_to_csv("corpus.csv", corpus)

# The 'question' column holds the question vectors (questions_tfidf), not the
# paragraph matrix.
# NOTE(review): this rebinds `train_ids` from the tuple returned by read_questions
# to a dict, so the cell cannot be re-run without re-running the loading cell first.
train_ids = {'id': train_ids[0], 'question': questions_tfidf, 'paragraph_id': train_ids[2], 'answer': train_ids[3] }
save_to_csv("train_ids.csv", train_ids)

### GloVe

In [20]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!rm glove.6B.50d.txt
!rm glove.6B.100d.txt
!rm glove.6B.200d.txt

--2020-11-17 23:35:47--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-11-17 23:35:48--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-11-17 23:35:48--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-1

In [17]:
def read_from_csv(path):
    """
    Load a CSV file and return its content column by column.

    path: location of the CSV file.
    Columns that contain only missing values are discarded; the remaining table is
    transposed, so the result is a list with one inner list per kept column.
    """
    frame = pd.read_csv(path)
    non_empty = frame.dropna(axis=1, how='all')
    columns_as_rows = non_empty.to_numpy().T
    return columns_as_rows.tolist()

def get_lines_gloves(line):
    """
    Parse one line of a GloVe text file.

    line: a string made of whitespace-separated fields, the word first and the
          vector coefficients after it.
    Returns a tuple (word, embedding vector as a float numpy array).
    """
    fields = line.split()
    coefficients = np.asarray(fields[1:], dtype=float)
    return fields[0], coefficients

def get_gloves_dict(path = "glove.6B.300d.txt"):
    """
    Load a GloVe text file into a dictionary.

    path: location of a GloVe text file (one "word v1 v2 ... vd" entry per line).
    Returns a dict {word: embedding vector}; lines are parsed in parallel by
    get_lines_gloves using a process pool.
    """
    with open(path, "r", encoding="UTF-8") as f:
        # Blank lines carry no entry and would make the parser fail on fields[0].
        lines = [line for line in f if line.strip()]
    # The context manager releases the worker processes even when parsing raises;
    # map() has already collected every result by the time the block exits.
    with multiprocessing.Pool() as pool:
        pairs = pool.map(get_lines_gloves, lines)
    return dict(pairs)

def get_plong_doc(doc, embeddings_dict, len_vec_emb):
    """
    Build one embedding vector for a document by averaging its word embeddings.

    doc: a string representing a document of the corpus, e.g. 'il est'
    embeddings_dict: a dict {word: embedding vector of length len_vec_emb}
    len_vec_emb: the length of the embedding vectors (d)
    Returns a numpy array of shape (len_vec_emb,): the sum of the embeddings of
    every token occurrence divided by the number of tokens. Tokens missing from
    embeddings_dict count as zero vectors. A document without any token yields
    the all-zero vector.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    # Same tokenisation as CountVectorizer's defaults: lower-case the text, then
    # keep runs of two or more word characters.
    tokens = re.findall(r"(?u)\b\w\w+\b", doc.lower())
    vec = np.zeros(len_vec_emb, dtype=float)
    if not tokens:
        return vec
    # Weight each distinct word by its number of occurrences (term frequency).
    # CountVectorizer.vocabulary_ cannot be used for this: it maps a word to its
    # feature index, not to its count.
    for word, count in Counter(tokens).items():
        embedding = embeddings_dict.get(word)
        if embedding is not None:
            vec += embedding * count
    return vec / len(tokens)

def get_plong_corpus(corpus, embeddings_dict):
    """
    Compute one embedding vector per document of a corpus, in parallel.

    corpus: iterable of strings, e.g. ['je vais', 'il est']; each string is one
            document of the corpus
    embeddings_dict: a dict {word: embedding vector}; every vector is assumed to
                     have the same length as the first one
    Returns a list with one embedding vector per document (see get_plong_doc),
    in corpus order. An empty corpus returns an empty list.
    Raises ValueError when embeddings_dict is empty, because the embedding
    dimension cannot be determined.
    """
    corpus = list(corpus)
    if not corpus:
        # Nothing to embed: avoid starting worker processes for no work.
        return []
    if not embeddings_dict:
        raise ValueError("embeddings_dict is empty: cannot infer the embedding dimension")
    # Read the dimension from the first vector without copying the whole dict.
    len_vec_emb = len(next(iter(embeddings_dict.values())))
    embed_doc = partial(get_plong_doc, embeddings_dict=embeddings_dict, len_vec_emb=len_vec_emb)
    # The context manager releases the worker processes even when a worker raises.
    with multiprocessing.Pool() as pool:
        return pool.map(embed_doc, corpus)

In [18]:
# Re-read the corpus file written by save_to_csv and learn its vocabulary.
path = os.path.join(output_path, "corpus.csv")
datat = read_from_csv(path)

# datat[1] is the second column kept by read_from_csv.
# NOTE(review): presumably the 'paragraph' column; verify against the saved file.
vectorizer = CountVectorizer()
vectorizer.fit(datat[1])
X = vectorizer.vocabulary_

In [22]:
# Load every GloVe vector, then keep only the words that also occur in the
# corpus vocabulary X.
glove_dict = get_gloves_dict()
key_set = set(X).intersection(set(glove_dict))
glove_dict_vocab_corpus = dict((key, glove_dict[key]) for key in key_set)

In [23]:
# One embedding vector per document of datat[1], built from the GloVe vectors
# restricted to the corpus vocabulary.
plongement_doc = get_plong_corpus(datat[1], glove_dict_vocab_corpus)

### Plongements contextuels pré-entraînés / non pré-entraînés



> #### Huggingface ready pipeline



In [None]:
# Smoke test of the ready-made Hugging Face question-answering pipeline on a toy
# question/context pair.
question = "How many parameters does BERT-large have?"
answer_text = r"""BERT-large is really big... it has 24-layers and an embedding size of 1,024, 
                  for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take 
                  a couple minutes to download to your Colab instance."""

# No model name is given, so the pipeline's default question-answering model is used.
nlp = pipeline("question-answering")
# The result is read below through its 'answer', 'score', 'start' and 'end' keys.
result = nlp(question=question, context=answer_text)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=473.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=260793700.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…






Answer: '340M', score: 0.7121, start: 111, end: 115




> #### DistilBERT SQuAD pre-trained



In [None]:
# Extractive QA with DistilBERT fine-tuned on SQuAD, run on a toy example.
# NOTE(review): `questions` rebinds the name that held the lower-cased training
# questions earlier in the notebook; re-run the loading cell before reusing it.
questions = ["How many parameters does BERT-large have?"]
answer_text = r"""BERT-large is really big... it has 24-layers and an embedding size of 1,024, 
                  for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take 
                  a couple minutes to download to your Colab instance."""

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad",
                                                      return_dict=True, output_hidden_states=True)
# `return_dict` is a model option, so it is not passed to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

for question in questions:
    # Encode "[CLS] question [SEP] context [SEP]" as PyTorch tensors.
    inputs = tokenizer(question, answer_text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Display tokens and ids, with a blank line around each [SEP] token.
    # The loop variable is `token_id` so that the builtin `id` is not shadowed.
    for token, token_id in zip(text_tokens, input_ids):
        is_separator = token_id == tokenizer.sep_token_id
        if is_separator:
            print('')
        print('{:<12} {:>6,}'.format(token, token_id))
        if is_separator:
            print('')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden_states = outputs.hidden_states[-1]
    print(last_hidden_states.shape)

    # Most likely start token, and one past the most likely end token (slice bound).
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}, start: {answer_start}, end: {answer_end}")

[CLS]           101
How           1,731
many          1,242
parameters   11,934
does          1,674
B               139
##ER          9,637
##T           1,942
-               118
large         1,415
have          1,138
?               136

[SEP]           102

B               139
##ER          9,637
##T           1,942
-               118
large         1,415
is            1,110
really        1,541
big           1,992
.               119
.               119
.               119
it            1,122
has           1,144
24            1,572
-               118
layers        8,798
and           1,105
an            1,126
em            9,712
##bed         4,774
##ding        3,408
size          2,060
of            1,104
1               122
,               117
02            5,507
##4           1,527
,               117
for           1,111
a               170
total         1,703
of            1,104
340          16,984
##M           2,107
parameters   11,934
!               106
Alto         17,76

> #### BERT base pre-trained

In [None]:
# Select the first GPU only when one is present, so the cell also runs on a
# CPU-only runtime. The model and inputs below are not moved to it and stay on
# their default device.
if torch.cuda.is_available():
    torch.cuda.set_device(0)

questions = ["How many parameters does BERT-large have?"]
answer_text = r"""BERT-large is really big... it has 24-layers and an embedding size of 1,024, 
                  for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take 
                  a couple minutes to download to your Colab instance."""

model = BertModel.from_pretrained("bert-base-cased", return_dict=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

for question in questions:
    # Only the question is encoded here; `answer_text` is not used by this cell.
    inputs = tokenizer(question, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Display tokens and ids, with a blank line around each [SEP] token.
    # The loop variable is `token_id` so that the builtin `id` is not shadowed.
    for token, token_id in zip(text_tokens, input_ids):
        is_separator = token_id == tokenizer.sep_token_id
        if is_separator:
            print('')
        print('{:<12} {:>6,}'.format(token, token_id))
        if is_separator:
            print('')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Contextual embeddings of the last layer, one vector per input token.
    last_hidden_states = outputs.last_hidden_state
    print(last_hidden_states.shape)

[CLS]           101
How           1,731
many          1,242
parameters   11,934
does          1,674
B               139
##ER          9,637
##T           1,942
-               118
large         1,415
have          1,138
?               136

[SEP]           102

torch.Size([1, 13, 768])


> #### DistilBERT SQuAD training



---

<br>

## **2. Ordonnancement**



> #### cosine similarity





> #### LambdaMART with lightgbm





> #### LSTM Siamese text similarity



In [None]:
# Mount Google Drive (Colab only).
# NOTE(review): the same import and mount already appear near the top of this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive.
# NOTE(review): no later cell visible in this notebook reads `dpath`.
dpath = '/content/drive/My Drive/Colab Notebooks/INF8460/Project/'
!ls '/content/drive/My Drive/Colab Notebooks/INF8460/Project/'

data				  LSTMSiameseTextSimilarity
inf8460_projet_A20_equipe8.ipynb  yahooLTR_C14.tgz


In [None]:
# !git clone https://github.com/amansrivastava17/lstm-siamese-text-similarity.git

In [None]:
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/INF8460/Project/LSTMSiameseTextSimilarity/')
!wget https://github.com/brmson/dataset-sts/tree/master/data/sts/sick2014/SICK_train.txt

--2020-11-14 22:43:07--  https://github.com/brmson/dataset-sts/tree/master/data/sts/sick2014/SICK_train.txt
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/brmson/dataset-sts/blob/master/data/sts/sick2014/SICK_train.txt [following]
--2020-11-14 22:43:08--  https://github.com/brmson/dataset-sts/blob/master/data/sts/sick2014/SICK_train.txt
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘SICK_train.txt.1’

SICK_train.txt.1        [<=>                 ]       0  --.-KB/s               SICK_train.txt.1        [ <=>                ]   1.36M  --.-KB/s    in 0.05s   

2020-11-14 22:43:08 (30.3 MB/s) - ‘SICK_train.txt.1’ saved [1430770]



In [None]:
# Train the Siamese BiLSTM text-similarity model on the sample sentence pairs.
# `model`, `inputHandler` and `config` are modules of the external LSTM-Siamese project.
from model import SiameseBiLSTM
from inputHandler import word_embed_meta_data, create_test_data
from config import siamese_config
import pandas as pd

############ Data Preparation ##########

# NOTE(review): relative path; the `git clone` that would create this folder is
# commented out in an earlier cell. Confirm the file exists in the working directory.
df = pd.read_csv('lstm-siamese-text-similarity/sample_data.csv')

sentences1 = list(df['sentences1'])
sentences2 = list(df['sentences2'])
is_similar = list(df['is_similar'])
del df

######## Word Embedding ############

# NOTE(review): `tokenizer` rebinds the name used for the Hugging Face tokenizers
# in the earlier BERT cells.
tokenizer, embedding_matrix = word_embed_meta_data(sentences1 + sentences2,  siamese_config['EMBEDDING_DIM'])

embedding_meta_data = {
	'tokenizer': tokenizer,
	'embedding_matrix': embedding_matrix
}

## creating sentence pairs
sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]
del sentences1
del sentences2

######## Training ########

class Configuration(object):
    """Empty attribute container; the hyper-parameters are attached to an instance below."""

CONFIG = Configuration()

# Copy the hyper-parameters from the project's siamese_config mapping.
CONFIG.embedding_dim = siamese_config['EMBEDDING_DIM']
CONFIG.max_sequence_length = siamese_config['MAX_SEQUENCE_LENGTH']
CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']
CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']
CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']
CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']

siamese = SiameseBiLSTM(CONFIG.embedding_dim , CONFIG.max_sequence_length, CONFIG.number_lstm_units , CONFIG.number_dense_units, CONFIG.rate_drop_lstm, CONFIG.rate_drop_dense, CONFIG.activation_function, CONFIG.validation_split_ratio)

# The returned value is handed to keras `load_model` in the testing cell
# (presumably the path of the best checkpoint; verify against the project code).
best_model_path = siamese.train_model(sentences_pair, is_similar, embedding_meta_data, model_save_directory='./')

Embedding matrix shape: (3052, 50)
Null word embeddings: 1
Epoch 1/200
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200


In [None]:
######## Testing ########

from operator import itemgetter
from keras.models import load_model

# Reload the model saved by the training cell.
model = load_model(best_model_path)

test_sentence_pairs = [
    ('What can make Physics easy to learn?', 'How can you make physics easy to learn?'),
    ('How many times a day do a clocks hands overlap?', 'What does it mean that every time I look at the clock the numbers are the same?'),
]

test_data_x1, test_data_x2, leaks_test = create_test_data(tokenizer, test_sentence_pairs, siamese_config['MAX_SEQUENCE_LENGTH'])

# One score per sentence pair, flattened into a plain list.
preds = list(model.predict([test_data_x1, test_data_x2, leaks_test], verbose=1).ravel())

# (sentence 1, sentence 2, score) triples, highest score first.
results = sorted(
    ((first, second, score) for (first, second), score in zip(test_sentence_pairs, preds)),
    key=itemgetter(2),
    reverse=True,
)
print(results)

[('What can make Physics easy to learn?', 'How can you make physics easy to learn?', 0.39372748), ('How many times a day do a clocks hands overlap?', 'What does it mean that every time I look at the clock the numbers are the same?', 0.169769)]
