In [None]:
# Mount Google Drive at /content/drive (Colab-specific helper); the data folder used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
folder = '/content/drive/MyDrive/text_data'
files = ['S10_question_answer_pairs.txt','S09_question_answer_pairs.txt','S08_question_answer_pairs.txt']

# Read every tab-separated question/answer file, keeping one row per distinct
# question within each file, and report how many rows each file contributes.
frames = []
for file in files:
    filename = os.path.join(folder, file)
    df_tmp = pd.read_csv(filename, encoding='latin1', sep='\t').drop_duplicates(subset="Question")
    print(filename, len(df_tmp))
    frames.append(df_tmp)

# Concatenate once instead of growing the frame inside the loop: this avoids
# re-copying the accumulated rows on every iteration and avoids concatenating
# with an empty DataFrame. Per-file index labels are kept as they were read
# (not reset), so labels can repeat across files.
df = pd.concat(frames)

/content/drive/MyDrive/text_data/S10_question_answer_pairs.txt 832
/content/drive/MyDrive/text_data/S09_question_answer_pairs.txt 598
/content/drive/MyDrive/text_data/S08_question_answer_pairs.txt 1033


In [None]:
# Summary of the combined frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2463 entries, 0 to 1714
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ArticleTitle              832 non-null    object
 1   Question                  2461 non-null   object
 2   Answer                    2190 non-null   object
 3   DifficultyFromQuestioner  1570 non-null   object
 4   DifficultyFromAnswerer    2185 non-null   object
 5   ArticleFile               2461 non-null   object
 6   ï»¿ArticleTitle           1631 non-null   object
dtypes: object(7)
memory usage: 153.9+ KB


In [None]:
# normalize the name of columns

# Some of the input files start with a UTF-8 byte-order mark. Because the files
# are decoded as latin1, the mark becomes a 3-character prefix on the first
# header, so the titles of those files land in a separate BOM-prefixed column
# while 'ArticleTitle' stays empty for their rows. Merge that column back
# instead of dropping it, otherwise every row from those files loses its title.
bom_prefix = b'\xef\xbb\xbf'.decode('latin1')
for bom_col in [col for col in df.columns if str(col).startswith(bom_prefix)]:
    clean_col = bom_col[len(bom_prefix):]
    if clean_col in df.columns:
        # Fill positionally (to_numpy): index labels repeat across the
        # concatenated files, so label-based alignment is not safe here.
        df[clean_col] = df[clean_col].where(df[clean_col].notna(), df[bom_col].to_numpy())
        df = df.drop(columns=bom_col)
    else:
        df = df.rename(columns={bom_col: clean_col})

# Rename by name rather than by position so the cell does not depend on the
# column order and can be re-run safely.
df = df.rename(columns={
    'ArticleTitle': 'articleTitle',
    'Question': 'question',
    'Answer': 'answer',
    'DifficultyFromQuestioner': 'difficultyFromQuestioner',
    'DifficultyFromAnswerer': 'difficultyFromAnswerer',
    'ArticleFile': 'articleFile',
})

In [None]:
# Number of missing (NaN) values in each column.
df.isna().sum()

articleTitle                1631
question                       2
answer                       273
difficultyFromQuestioner     893
difficultyFromAnswerer       278
articleFile                    2
dtype: int64

In [None]:
# drop na values
print('original df length: ',len(df))

# A row is only usable if it has a question, a reference answer and an article title.
df = df.dropna(subset=['question', 'answer', 'articleTitle'])

# remove badly formatted questions ('#' is matched as a literal character)
df = df[~df.question.str.contains('#', regex=False, na=False)]

# remove yes/no questions: compare a normalised form of the answer (trimmed,
# trailing '.'/',' removed, lower-cased) so that every spelling variant such as
# 'Yes.', 'no,' or 'YES' is caught, not only a hand-written list of variants.
answer_normalized = df.answer.astype(str).str.strip().str.rstrip('.,').str.strip().str.lower()
df = df[~answer_normalized.isin(['yes', 'no'])]

print('new df length: ',len(df))

original df length:  2463
new df length:  511


In [None]:
# Do not truncate long text cells when frames are displayed, then preview the first rows.
pd.set_option('display.max_colwidth', None)
df.head(5)

Unnamed: 0,articleTitle,question,answer,difficultyFromQuestioner,difficultyFromAnswerer,articleFile
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemistry.,easy,easy,S10_set4_a10
2,Alessandro_Volta,Did Alessandro Volta invent the remotely operated pistol?,Alessandro Volta did invent the remotely operated pistol.,easy,easy,S10_set4_a10
4,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy,S10_set4_a10
6,Alessandro_Volta,Who did Alessandro Volta marry?,Alessandro Volta married Teresa Peregrini.,medium,medium,S10_set4_a10
8,Alessandro_Volta,What did Alessandro Volta invent in 1800?,"In 1800, Alessandro Volta invented the voltaic pile.",medium,easy,S10_set4_a10


----

In [None]:
pip install transformers



In [None]:
# load question-answering model

from transformers import pipeline

# Name the checkpoint and revision explicitly instead of relying on the
# pipeline's implicit default, so re-running the notebook loads the same
# weights (these are the values the default resolved to).
QA_MODEL_NAME = "distilbert-base-cased-distilled-squad"
QA_MODEL_REVISION = "626af31"
qa_model = pipeline("question-answering", model=QA_MODEL_NAME, revision=QA_MODEL_REVISION)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
def answer_question_given_article(question, article_name,
                                  folder_name='/content/drive/MyDrive/text_data/text_data'):
    '''
    Answer `question` using the full text of the article titled `article_name` as context.

    The article file name is looked up in the global `df` (first `articleFile`
    value recorded for that title) and read from `folder_name` with the
    '.txt.clean' suffix.

    Parameters
    ----------
    question : str
        The question to answer.
    article_name : str
        Value of the `articleTitle` column identifying the article.
    folder_name : str, optional
        Directory containing the '<articleFile>.txt.clean' files.

    Returns
    -------
    dict
        The result of `qa_model`: the answer, its score and the start/end
        position of the answer in the article text.

    Raises
    ------
    IndexError
        If no article file is known for `article_name`.
    '''
    article_files = df.loc[df.articleTitle == article_name, 'articleFile'].dropna().unique()
    if len(article_files) == 0:
        raise IndexError(f'no article file found for article title {article_name!r}')
    article_path = os.path.join(folder_name, str(article_files[0]) + '.txt.clean')

    # NOTE(review): the file is opened with the platform default encoding - confirm
    # it matches the encoding of the .txt.clean files.
    with open(article_path, 'r') as file:
        # Replace line breaks with a space (not an empty string) so the last word
        # of one line is not glued to the first word of the next; the text length
        # and therefore the answer offsets are unchanged.
        context = file.read().replace('\n', ' ')

    return qa_model(question, context)

In [None]:
%%time

# question example and the time it takes to answer it
question_example = """In 1602, the British East India Company's first voyage, commanded by Sir who, arrived in Aceh and sailed on to Banten where they were allowed to build a trading post?"""
answer_question_given_article(question_example, "Jakarta")

CPU times: user 29.3 s, sys: 104 ms, total: 29.4 s
Wall time: 41.5 s


{'score': 0.8335646986961365,
 'start': 3262,
 'end': 3281,
 'answer': 'Sir James Lancaster'}

In [None]:
# Reference row for the example question, to compare with the model's answer.
df.loc[df['question'] == question_example]

Unnamed: 0,articleTitle,question,answer,difficultyFromQuestioner,difficultyFromAnswerer,articleFile
663,Jakarta,"In 1602, the British East India Company's first voyage, commanded by Sir who, arrived in Aceh and sailed on to Banten where they were allowed to build a trading post?",James Lancaster,,medium,S10_set3_a5


In [None]:
# Evaluate on a small random subset only: answering a single question takes
# tens of seconds (see the timing of the example question).
SAMPLE_SIZE = 30
SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run

# Cap the sample size so the cell does not fail when fewer rows are available.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# Raw pipeline result (dict) for every sampled question.
df_sample['answer_from_model'] = df_sample.apply(lambda x: answer_question_given_article(x.question, x.articleTitle), axis=1)

# Unpack the fields of interest into their own columns.
df_sample['models_answer'] = df_sample['answer_from_model'].map(lambda x:x['answer'])

df_sample['models_score'] = df_sample['answer_from_model'].map(lambda x:x['score'])

In [None]:
# Reference answer next to the model's answer and the score it reported.
cols_of_interst = [
    'question',
    'answer',
    'models_answer',
    'models_score',
]
df_sample[cols_of_interst].head()

Unnamed: 0,question,answer,models_answer,models_score
100,Do worker ants have wings?,"No, worker ants do not have wings.",do not have workers,0.447633
549,Which guitars use three single-coil pickups?,Fender Statocaster type guitars,Gibson Les Pauls,0.854811
1129,What are the two main groups of Portuguese dialects?,Those of Brazil and those of the Old World,Galician and the Fala,0.91756
1131,What event marked the end of the Old Portuguese period?,The publication of the Cancioneiro Geral by Garcia de Resende in 1516,Portugal established a colonial and commercial empire,0.192172
172,Is Berlin the capital city of Germany?,Berlin is the capital city of Germany.,Berlin as a German state has accumulated more debt than any other city in Germany,0.583601


In [None]:
pip install -Uq sentence-transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [None]:
def how_similar(sent1, sent2, model):
    '''
    Return the cosine similarity between the embeddings of two sentences.

    Each input is converted with str() and encoded separately by `model`
    (first `sent1`, then `sent2`); the single similarity value is returned
    as a Python scalar.
    '''
    first_vec, second_vec = [
        model.encode(str(text), convert_to_tensor=True)
        for text in (sent1, sent2)
    ]
    similarity = util.pytorch_cos_sim(first_vec, second_vec)
    return similarity.item()

from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used to score how close each model answer is to the reference answer.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
df_sample['answers_similarity'] = df_sample.apply(
    lambda row: how_similar(row['answer'], row['models_answer'], model=model),
    axis=1,
)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# let's inspect: rows ordered from the most to the least similar pair of answers
(
    df_sample
    .loc[:, ['question', 'answer', 'models_answer', 'answers_similarity', 'models_score']]
    .sort_values(by='answers_similarity', ascending=False)
)

Unnamed: 0,question,answer,models_answer,answers_similarity,models_score
1456,At what age can a zebra breed?,five or six,five or six,1.0,0.058898
615,Where was Isaac Newton buried?,Westminster Abbey,Westminster Abbey,1.0,0.934218
1140,"(where French has a similar phenomenon, with alveolar affricates instead of postalveolars?",Quebec.,Quebec,0.970008,0.729869
309,Do linguists often view Chinese as a language family?,"Yes, linguists often view Chinese as a language family.",Linguists often view Chinese as a language family,0.943564,0.337965
1173,Is San Franscico a popular tourist destination?,Yes. San Francisco is a popular international tourist destination.,San Francisco is a popular international tourist destination,0.88192,0.490606
172,Is Berlin the capital city of Germany?,Berlin is the capital city of Germany.,Berlin as a German state has accumulated more debt than any other city in Germany,0.687285,0.583601
1160,Who laid down the Peter and Paul Fortress?,Peter the Great,Tsar Peter I of Russia,0.620028,0.691974
1103,What are a piano's keys generally made of?,Piano keys are generally made of spruce or basswood.,spruce or basswood,0.542312,0.872846
106,Where are bullet ants located?,Bullet ants are located in Central and South America.,Central and South America,0.536672,0.25201
1208,Give an example of the ten Beta World Cities.,San Trancisco,San Francisco,0.533199,0.768252


Not bad for a model that has never seen any of these examples and has not been fine-tuned on this specific corpus.