<a href="https://colab.research.google.com/github/the-SQuAD-squad/IR/blob/deploy/end2end.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Init { form-width: "25%" }

import os
import random
import math
import numpy as np
import tensorflow as tf
import json
import pandas as pd
import re
import string
import nltk
nltk.download('punkt')
from tqdm.notebook import tqdm

!pip3 install wikipedia-api > /dev/null
import wikipediaapi

from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# None disables truncation of cell contents. The old -1 sentinel is deprecated
# since pandas 1.0 and is no longer accepted by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# fix random seeds
# One seed (editable through the Colab form field) is pushed into every RNG this
# notebook touches: Python's hash seed variable, `random`, NumPy and TensorFlow.
seed_value = 42 #@param {type:"integer"}

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably only affects child processes -- confirm if hash order matters.
os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

tf.compat.v1.set_random_seed(seed_value)

# Limit TensorFlow to one thread for both intra-op and inter-op parallelism
# (presumably to reduce run-to-run non-determinism) and install that session
# as the one used by the Keras backend.
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




In [None]:
#@title Get links from Wiki page { form-width: "25%" }

# Wikipedia client created with 'en', and the seed page whose outgoing links
# define the document collection used by the rest of the notebook.
wiki_wiki = wikipediaapi.Wikipedia('en')
page_py = wiki_wiki.page("Tom Cruise filmography")
def get_links(page, excluded_substrings=("Unauthorized", "Being Tom Cruise")):
    """Return the links of ``page`` that should be part of the corpus.

    A link is kept when its ``ns`` attribute is 0 and its title contains none of
    ``excluded_substrings``. The default exclusions remove a couple of pages that
    are not strictly related to the Tom Cruise filmography.

    The mapping returned by ``page.links`` is left untouched: a new dict is built
    instead of popping entries from the page object's own mapping, so calling this
    function does not alter what ``page.links`` holds.

    Args:
        page: object exposing ``links``, a mapping ``title -> linked page`` whose
            values have an ``ns`` attribute.
        excluded_substrings: iterable of title fragments; any title containing one
            of them is dropped.

    Returns:
        dict ``title -> linked page`` with the surviving links, in the same
        relative order as in ``page.links``.
    """
    excluded_substrings = tuple(excluded_substrings)
    return {
        title: linked_page
        for title, linked_page in page.links.items()
        if linked_page.ns == 0
        and not any(fragment in title for fragment in excluded_substrings)
    }

# title -> page mapping of the filtered links of the seed page; the following
# cell iterates over it and reads each page's text.
links = get_links(page_py)

In [None]:
#@title Get Wiki pages { form-width: "25%" }

# Read the text of every linked page, keyed by page title.
pages_text = {}
for title in tqdm(links):
    page_py = links[title]
    pages_text[title] = page_py.text

# remove the last sections of the wiki page from the text
for title in pages_text:
    stop_index = pages_text[title].rfind("References")
    # rfind returns -1 when the marker is missing; slicing with [:-1] would
    # silently chop the last character, so pages without the marker stay as-is.
    if stop_index != -1:
        pages_text[title] = pages_text[title][:stop_index]

HBox(children=(FloatProgress(value=0.0, max=127.0), HTML(value='')))




In [None]:
#@title preprocess wiki pages { form-width: "25%" }
def preprocess_text(text):
    """Clean raw page texts before they are vectorized.

    For every string in ``text``, in this order:
      1. newlines are replaced by a space;
      2. the punctuation/symbol characters listed in the pattern below are removed;
      3. a space is inserted between a run of digits and the letters glued to it
         (e.g. ``1980february`` -> ``1980 february``);
      4. leading/trailing whitespace is stripped and runs of two or more
         whitespace characters are collapsed into a single space.

    Args:
        text: iterable of strings.

    Returns:
        list of cleaned strings, in input order.
    """
    newline = re.compile(r"\n")
    punctuation = re.compile(r"([(.;:!\'ˈ~?,\"(\[\])\\\/\-–\t```<>_#$€@%*+—°′″“”×’^₤₹‘])")
    # we noticed that in the text sometimes we find numbers and the following word merged together (ex: 1980february),
    # so we put a space between the number and the word.
    # The text is not lower-cased at this point, so letters of either case must be
    # matched; otherwise "1980February" would stay glued together.
    digits_then_letters = re.compile(r"(\d+)([A-Za-z]+)")
    # raw string: '\s' in a plain string literal is an invalid escape sequence
    repeated_whitespace = re.compile(r"\s{2,}")

    cleaned = []
    for line in text:
        line = newline.sub(" ", line)
        line = punctuation.sub("", line)
        line = digits_then_letters.sub(r"\1 \2", line)
        cleaned.append(repeated_whitespace.sub(" ", line.strip()))
    return cleaned
    
# Titles and cleaned texts are both taken in pages_text order, so position i of
# one list corresponds to position i of the other.
titles = list(pages_text)
pages_text_preprocessed = preprocess_text([pages_text[title] for title in titles])

In [None]:
#@title tf-idf wiki { form-width: "25%" }
# Retrieval sanity check: map a few sample questions to their closest page.
questions = [
    "When was Tom Cruise born?",
    "What was the first film Tom Cruise acted in?",
    "What does Tom Cruise believe in?",
    "What is Tom Cruise character's name in Mission Impossible?",
    "What is Vanilla Sky?",
    "Who directed Mission Impossible?",
]

vectorizer = TfidfVectorizer()

# fit the vectorizer on the preprocessed wikipedia text;
# the questions are only projected into that fitted space
passages_vectorized = vectorizer.fit_transform(pages_text_preprocessed)
questions_vectorized = vectorizer.transform(questions)

# one row per question, one column per page
results = cosine_similarity(questions_vectorized, passages_vectorized)

for i, row in enumerate(results):
    index = np.argmax(row)  # column of the most similar page
    print(questions[i], titles[index], sep=" ---> ")

When was Tom Cruise born? ---> Tom Cruise
What was the first film Tom Cruise acted in? ---> Tom Cruise
What does Tom Cruise believe in? ---> Tom Cruise
What is Tom Cruise character's name in Mission Impossible? ---> Mission: Impossible (film series)
What is Vanilla Sky? ---> Vanilla Sky
Who directed Mission Impossible? ---> Mission: Impossible (film series)


In [None]:
#@title model definition { form-width: "25%" }

# Install a pinned Transformers release before importing it.
!pip install 'Transformers==4.3'
import transformers

# Fixed length of every model input; shorter inputs are padded up to it by the
# inference code.
max_seq_length = 512
# Checkpoint name shared by the encoder below and by the tokenizer loaded in a later cell.
pretrained_model_str = "roberta-base"
# Pre-trained encoder. output_attentions=True is requested here, although the
# cells visible in this notebook only read `last_hidden_state`.
bert_hf_layer = transformers.TFRobertaModel.from_pretrained(
    pretrained_model_str, output_attentions=True)

#@title model definition { form-width: "25%" }

# Three integer inputs of length max_seq_length: token ids, attention mask and
# segment (token type) ids.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')

#pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])

#HUGGINGFACE 🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗
# Per-position hidden states of the encoder.
sequence_output = bert_hf_layer(input_ids=input_word_ids, attention_mask=input_mask, 
                                token_type_ids=input_type_ids).last_hidden_state

#do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()

# Two independent bias-free linear heads score every position as a possible
# answer start / answer end; Flatten drops the trailing size-1 axis so each head
# yields one logit per input position.
start_logits = tf.keras.layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
start_logits = tf.keras.layers.Flatten(name="flatten_start")(start_logits)

end_logits = tf.keras.layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
end_logits = tf.keras.layers.Flatten(name="flatten_end")(end_logits)

# Softmax over positions turns the logits into start/end probability distributions.
start_probs = tf.keras.layers.Activation(tf.keras.activations.softmax, name="softmax_start")(start_logits)
end_probs = tf.keras.layers.Activation(tf.keras.activations.softmax, name="softmax_end")(end_logits)

# The model outputs [start_probs, end_probs], in that order; the inference code
# relies on this ordering when it unpacks the predictions.
model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], 
                    outputs=[start_probs, end_probs],
                    name="BERT_QA")

# Training objects. The model is not compiled in the cells shown in this notebook,
# so these only matter if training is resumed.
# from_logits=False because the model outputs are already softmax probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

model.summary(line_length=150)

# load weights from the SQuAD v2 training
!wget https://api.wandb.ai/files/buio/SQUAD/jkgwaatn/model-best.h5
model.load_weights("model-best.h5")

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Model: "BERT_QA"
______________________________________________________________________________________________________________________________________________________
Layer (type)                                     Output Shape                     Param #           Connected to                                      
input_word_ids (InputLayer)       

In [None]:
#@title transformer input preparation { form-width: "25%" }

import nltk.data
from  transformers import AutoTokenizer

# Tokenizer loaded from the same checkpoint name (pretrained_model_str) as the encoder.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_str)

def preprocess_bert(text):
    """Tokenize ``text`` with the module-level ``tokenizer`` and return its token ids.

    Only ``input_ids`` is needed by the callers, so no offset mapping is requested
    from the tokenizer (it used to be computed and then thrown away).

    Args:
        text: string to tokenize.

    Returns:
        The ``input_ids`` produced by ``tokenizer`` for ``text``.
    """
    return tokenizer(text).input_ids

def _pack_sentences(sentence_lengths, budget):
    """Greedily group consecutive sentences into ``(start, stop)`` index windows.

    A window takes as many consecutive sentences as fit in ``budget`` tokens and
    always at least one, so every sentence ends up in exactly one window.
    """
    windows = []
    start = 0
    total_sentences = len(sentence_lengths)
    while start < total_sentences:
        stop = start + 1
        used = sentence_lengths[start]
        while stop < total_sentences and used + sentence_lengths[stop] <= budget:
            used += sentence_lengths[stop]
            stop += 1
        windows.append((start, stop))
        start = stop
    return windows


def custom_inference(context, question):
    """Answer ``question`` from ``context`` with the span-prediction ``model``.

    The context is split into sentences, consecutive sentences are packed into
    passages that fit into the model input together with the question, and the
    model is run once per passage. Every usable predicted span is printed with
    its score (highest start probability + highest end probability) and the
    highest-scoring one is returned.

    Compared with the previous chunking loop, no sentence is dropped any more:
    the sentence that overflowed a passage and the last sentence of the context
    used to be skipped, which left a single-sentence context with an empty passage.

    Args:
        context: text in which the answer is searched.
        question: question string.

    Returns:
        The best candidate answer as decoded by ``tokenizer``, or an apology
        message when no passage yields a usable span.
    """
    no_answer = "I'm really sorry, I wasn't able to find an answer :("

    tokenizer_nltk = nltk.data.load('tokenizers/punkt/english.pickle')  # sentence tokenizer
    # normalise whitespace inside every sentence and inside the question
    sentences = [" ".join(str(line).split()) for line in tokenizer_nltk.tokenize(context)]
    tokenized_question = preprocess_bert(" ".join(str(question).split()))

    # The question is appended to the passage without its leading special token.
    question_tail = tokenized_question[1:]
    passage_budget = max_seq_length - len(question_tail)
    if passage_budget < 3:
        # not even room for one passage token between its two special tokens
        return no_answer

    # Per-sentence lengths include the special tokens added to every sentence, so
    # the packed total over-estimates the joined passage length; the truncation
    # in the loop below covers the remaining cases.
    sentence_lengths = [len(preprocess_bert(sentence)) for sentence in sentences]

    prob = []
    candidate_ans = []
    for first, last in _pack_sentences(sentence_lengths, passage_budget):
        tokenized_passage = preprocess_bert(" ".join(sentences[first:last]))
        if len(tokenized_passage) > passage_budget:
            # A single over-long sentence: cut the tail but keep the closing token,
            # otherwise the input would not match the fixed model input length.
            tokenized_passage = tokenized_passage[:passage_budget - 1] + tokenized_passage[-1:]

        input_ids = tokenized_passage + question_tail
        # NOTE(review): segment id 1 is used for the question. Confirm that the
        # encoder checkpoint supports more than one token type.
        token_type_ids = [0] * len(tokenized_passage) + [1] * len(question_tail)
        attention_mask = [1] * len(input_ids)
        padding_length = max_seq_length - len(input_ids)
        if padding_length > 0:
            # pad with id 0; padded positions are masked out by attention_mask
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)

        predictions = model.predict([np.expand_dims(np.array(input_ids), axis=0),
                                     np.expand_dims(np.array(attention_mask), axis=0),
                                     np.expand_dims(np.array(token_type_ids), axis=0)])
        start, end = list(np.argmax(predictions, axis=-1).squeeze())
        if start > end:
            continue  # inverted span: no usable answer in this passage

        prob_start, prob_end = list(np.max(predictions, axis=-1).squeeze())
        # Only passage tokens are sliced, so a span pointing into the question or
        # the padding decodes to a shorter (possibly empty) string.
        predicted_ans = tokenizer.decode(tokenized_passage[start : end+1])
        if predicted_ans != '' and predicted_ans != "<s>":
            candidate_ans.append(predicted_ans)
            prob.append(prob_start + prob_end)

    print(*zip(prob, candidate_ans), sep='\n')
    if not candidate_ans:
        return no_answer
    return candidate_ans[np.argmax(prob)]

In [None]:
#@title custom inference { form-width: "25%" }

# Sample questions for the end-to-end demo; change the index below to try another one.
# One string per line: a missing comma used to merge the "ghost protocol" and
# "Mission Impossible III" questions into a single string through implicit
# literal concatenation.
questions = [
    "Who is the first bride of Tom Cruise?",
    "When was Tom Cruise born?",
    "What is Mission Impossible?",
    "What does Tom Cruise believe in?",
    "What is the genre of Mission Impossible?",
    "Who is the villain in Mission impossible 3?",
    "Who is the villain in Mission impossible ghost protocol?",
    "What is Tom Cruise character's name in Mission Impossible III?",
    "What is Vanilla Sky?",
    "Who directed Mission Impossible?",
]
question = questions[0]
# Retrieval step: score every page against the question and keep the closest one.
question_vectorized = vectorizer.transform([question])
result = cosine_similarity(question_vectorized, passages_vectorized)
index = np.argmax(result[0])
print("TF-IDF result:")
print(question, titles[index], sep=" ---> ")

# Reading step: the reader receives the stored page text from pages_text,
# not the cleaned version used to fit the vectorizer.
context = pages_text[titles[index]]
predicted_ans = custom_inference(context, question)
print()
print("BERT answer:")
print(predicted_ans)

TF-IDF result:
Who is the first bride of Tom Cruise? ---> Tom Cruise
(1.5755904, ' Mimi Rogers')
(1.9074783, ' Jack South')
(1.8383498, ' Nicole Kidman')
(1.4113784, ' Julian Sands')
(1.4067633, ' Cameron Diaz')
(1.7063069, ' Kathryn Bigelow')
(1.9589767, ' Mimi Rogers')
(1.3722897, ' Katie Holmes')
(1.9474514, ' Mimi Rogers')
(1.8999519, ' Kidman')

BERT answer:
 Mimi Rogers
