# MVP Cognitive Search Application Using the DistilBERT Model

### Importing Libraries

In [5]:
import os
import pandas as pd
from ast import literal_eval

from cdqa.utils.converters import pdf_converter
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model

### Downloading the pre-trained model: DistilBERT fine-tuned on SQuAD 1.1 (Stanford Question Answering Dataset)

In [6]:
# Fetch the 'distilbert-squad_1.1' pre-trained reader into ./models.
download_model(
    model='distilbert-squad_1.1',
    dir='./models',
)


Downloading trained model...
distilbert_qa.joblib already downloaded


### Converting text from the PDFs into a pandas DataFrame

In [8]:
# Convert the PDFs under ./PDF_docs/ into a DataFrame, then preview
# its first four rows as the cell output.
df = pdf_converter(
    directory_path='./PDF_docs/',
)
df.head(n=4)

Unnamed: 0,title,paragraphs
0,09_080661e_Golpanol_ALS,"[Technical Information, June 2016Supersedes is..."
1,PRD_30042690_Golpanol_ALS,"[Product specification, Golpanol ALS, Test pr..."
2,08_150301e_Glucopon_GD_70,"[Technical Information , March 2015, 08_150301..."
3,30594438_Glucopon_GD_70,"[Product specification, Glucopon GD 70, Test ..."


### Using the pre-trained language model: DistilBERT

In [9]:
# Build the QA pipeline around the downloaded reader file.
cdqa_pipeline = QAPipeline(
    reader='./models/distilbert_qa.joblib',
    max_df=1.0,
)
# Fit the retriever on the converted PDF DataFrame; the call's return value
# is left as the last expression so the pipeline repr is displayed.
cdqa_pipeline.fit_retriever(df=df)

QAPipeline(reader=BertQA(adam_epsilon=1e-08,
                         bert_model='distilbert-base-uncased',
                         do_lower_case=True, fp16=False,
                         gradient_accumulation_steps=1, learning_rate=5e-05,
                         local_rank=-1, loss_scale=0, max_answer_length=30,
                         n_best_size=20, no_cuda=False,
                         null_score_diff_threshold=0.0, num_train_epochs=3.0,
                         output_dir=None, predict_batch_size=8, seed=42,
                         server_ip='', ser..._size=8,
                         verbose_logging=False, version_2_with_negative=False,
                         warmup_proportion=0.1, warmup_steps=0),
           retrieve_by_doc=False,
           retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
                                   max_df=1.0, min_df=2, ngram_range=(1, 2),
                                   preprocessor=None, stop_words='english',
           

### Questions from the technical information document:

In [10]:
# Query the pipeline and print the question next to the first element
# of the returned prediction.
question_1 = 'What is the shelf life of Golpanol ALS?'
prediction = cdqa_pipeline.predict(question_1)
print(f'Question : {question_1}')
print(f'Answer: {prediction[0]}')

Question : What is the shelf life of Golpanol ALS?
Answer: 2 years


In [11]:
# Query the pipeline and print the question next to the first element
# of the returned prediction.
question_2 = 'What is the appearance of Golpanol ALS?'
prediction = cdqa_pipeline.predict(question_2)
print(f'Question : {question_2}')
print(f'Answer: {prediction[0]}')

Question : What is the appearance of Golpanol ALS?
Answer: clear, colorless or yellowish liquid


In [12]:
# Query the pipeline and print the question next to the first element
# of the returned prediction.
question_3 = 'What is Golpanol ALS pH value?'
prediction = cdqa_pipeline.predict(question_3)
print(f'Question : {question_3}')
print(f'Answer: {prediction[0]}')

Question : What is Golpanol ALS pH value?
Answer: Unit Value


### Questions from the product specification document:

In [13]:
# Query the pipeline and print the question next to the first element
# of the returned prediction.
question_4 = 'What is the PRD number of Golpanol ALS?'
prediction = cdqa_pipeline.predict(question_4)
print(f'Question : {question_4}')
print(f'Answer: {prediction[0]}')

Question : What is the PRD number of Golpanol ALS?
Answer: 30042690


In [14]:
# Query the pipeline and print the question next to the first element
# of the returned prediction.
question_5 = 'What is the density value of Golpanol ALS?'
prediction = cdqa_pipeline.predict(question_5)
print(f'Question : {question_5}')
print(f'Answer: {prediction[0]}')

Question : What is the density value of Golpanol ALS?
Answer: 1.19 – 1.23pH value


In [15]:
# Query the pipeline and print the question next to the first element
# of the returned prediction.
question_6 = 'What is the chemical nature of Golpanol ALS?'
prediction = cdqa_pipeline.predict(question_6)
print(f'Question : {question_6}')
print(f'Answer: {prediction[0]}')

Question : What is the chemical nature of Golpanol ALS?
Answer: Sodium allyl sulfonate
