Installation modul

In [50]:
# !pip install python-Levenshtein
# !pip install bert-serving-server bert-serving-client
# !pip install transformers
# !pip install pandas
# !pip install --upgrade pip
# !pip install tensorflow
# !pip install tf-nightly
# !pip install transformers
# !py -3.9 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
# !py -3.9 -m pip install -Uq sentence-transformers --user

Load data

In [4]:
import pandas as pd 
import os

In [34]:
files = os.listdir('dataset/')
files = files[0:3]
files

['S08_question_answer_pairs.txt',
 'S09_question_answer_pairs.txt',
 'S10_question_answer_pairs.txt']

In [35]:
df = pd.DataFrame()

for file in files:
  sub_df = pd.read_csv('dataset/' + file, sep='\t', encoding='latin-1').drop_duplicates(subset='Question')
  df = pd.concat([df, sub_df], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2463 entries, 0 to 2462
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ï»¿ArticleTitle           1631 non-null   object
 1   Question                  2461 non-null   object
 2   Answer                    2188 non-null   object
 3   DifficultyFromQuestioner  1570 non-null   object
 4   DifficultyFromAnswerer    2185 non-null   object
 5   ArticleFile               2461 non-null   object
 6   ArticleTitle              832 non-null    object
dtypes: object(7)
memory usage: 134.8+ KB


In [36]:
df.drop('ArticleTitle', axis=1, inplace=True)
df.rename(columns={'ï»¿ArticleTitle':'ArticleTitle'}, inplace=True)
df

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of...,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1...,yes,easy,medium,S08_set3_a4
2,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
3,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,S08_set3_a4
4,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,S08_set3_a4
...,...,...,...,...,...,...
2458,,What do zebras eat?,mainly grass,medium,hard,S10_set1_a9
2459,,What are zebras hunted for?,mainly for their skins,medium,medium,S10_set1_a9
2460,,What areas do the Grevy's Zebras inhabit?,semi-arid grasslands of Ethiopia and northern ...,hard,hard,S10_set1_a9
2461,,Which species of zebra is known as the common ...,"Plains Zebra (Equus quagga, formerly Equus bur...",hard,medium,S10_set1_a9


Data cleaning

In [37]:
df.dropna(subset=['Question'], inplace=True)
df.dropna(subset=['Answer'], inplace=True)
df.dropna(subset=['ArticleTitle'], inplace=True)
df = df[~df['Question'].str.contains('#')]
df = df[~df['Answer'].isin(['no','yes','Yes','No','No,','Yes,','No.','Yes.','yes.','no.'])]

modeling

In [11]:
from transformers import pipeline



In [12]:
model = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [46]:
def AnswerQuestion(q, article):
    folder = 'dataset/text_data/'
    files = df[df['ArticleTitle'] == article]['ArticleFile'].unique()[0] + '.txt.clean'
    path = os.path.join(folder, files)
    
    with open(path, 'r', errors='ignore') as my_files:
        context = my_files.read().replace('\n', '')
        # print(context)

    return model(q, context)

AnswerQuestion('Who made Volta a count?', 'Alessandro_Volta')   

{'score': 0.7745566964149475, 'start': 4404, 'end': 4412, 'answer': 'Napoleon'}

In [47]:
sample = df.sample(30)
sample['AnswerFromModel'] = sample.apply(lambda x: AnswerQuestion(x['Question'], x['ArticleTitle']), axis=1)
sample['ModelingAnswer'] = sample['AnswerFromModel'].apply(lambda x: x['answer'])
sample['ModelingScore'] = sample['AnswerFromModel'].apply(lambda x: x['score'])
sample.drop('AnswerFromModel', axis=1, inplace=True)
sample.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile,ModelingAnswer,ModelingScore
165,Canada,"In addition to Port Royal, where else did Samu...",Quebec City,medium,medium,S08_set2_a8,Quebec City,0.995903
840,Romania,"What are approximately 42,000 years old and ha...",the remains (the lower jaw) of the oldest mode...,,hard,S08_set2_a7,The remains (the lower jaw),0.30682
870,Singapore,Where is the National Orchid Garden?,Singapore Botanic Gardens.,medium,medium,S08_set2_a2,Singapore,0.594911
875,Singapore,What is the punishment for first-degree murder?,There are laws which allow capital punishment ...,hard,hard,S08_set2_a2,capital punishment,0.874122
1377,James_Watt,James Watt was born where?,"Greenock, a seaport on the Firth of Clyde",easy,medium,S09_set4_a2,London,0.967558


In [48]:
sample.drop('DifficultyFromQuestioner', axis=1, inplace=True)
sample.drop('DifficultyFromAnswerer', axis=1, inplace=True)
# sample.drop('ArticleFile', axis=1, inplace=True)
sample.head()

Unnamed: 0,ArticleTitle,Question,Answer,ArticleFile,ModelingAnswer,ModelingScore
165,Canada,"In addition to Port Royal, where else did Samu...",Quebec City,S08_set2_a8,Quebec City,0.995903
840,Romania,"What are approximately 42,000 years old and ha...",the remains (the lower jaw) of the oldest mode...,S08_set2_a7,The remains (the lower jaw),0.30682
870,Singapore,Where is the National Orchid Garden?,Singapore Botanic Gardens.,S08_set2_a2,Singapore,0.594911
875,Singapore,What is the punishment for first-degree murder?,There are laws which allow capital punishment ...,S08_set2_a2,capital punishment,0.874122
1377,James_Watt,James Watt was born where?,"Greenock, a seaport on the Firth of Clyde",S09_set4_a2,London,0.967558


Compare Model Answering

In [54]:
from sentence_transformers import util, SentenceTransformer

In [52]:
def SimilarityAnswer(answ, model_answ, model):
    answ_em = model.encode(answ, convert_to_tensor=True)
    model_answ_em = model.encode(model_answ, convert_to_tensor=True)

    return util.pytorch_cos_sim(answ_em, model_answ_em).item()

In [55]:
model_st = SentenceTransformer('sentence-transformers/all-miniLM-L6-v2')
sample['SimilarityAnswer'] = sample.apply(lambda x: SimilarityAnswer(x['Answer'], x['ModelingAnswer'], model_st), axis=1)
sample.sort_values(by='SimilarityAnswer', ascending=False)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Unnamed: 0,ArticleTitle,Question,Answer,ArticleFile,ModelingAnswer,ModelingScore,SimilarityAnswer
777,polar_bear,How heavy is a male polar bear?,300-600 kg (660-1320 lb),S08_set1_a4,300-600 kg (660-1320 lb),0.50193,1.0
632,Liechtenstein,When was the first factory opened?,1836,S08_set2_a1,1836,0.952051,1.0
1223,Copenhagen,What transnational bridge was completed in 2000?,Oresund Bridge,S09_set3_a8,Oresund Bridge,0.951993,1.0
1444,Michael_Faraday,Who was the sponsor and mentor of Faraday?,John 'Mad Jack' Fuller,S09_set4_a7,John 'Mad Jack' Fuller,0.986463,1.0
165,Canada,"In addition to Port Royal, where else did Samu...",Quebec City,S08_set2_a8,Quebec City,0.995903,1.0
716,otter,What is an otter's den called?,Holt,S08_set1_a7,holt,0.893469,1.0
1618,Violin,How many strings does a violin usually have?,four,S09_set2_a2,four,0.664392,1.0
180,Canada,European books and maps began referring to thi...,1545,S08_set2_a8,1545,0.965857,1.0
833,Romania,"Who was the first gymnast to score a perfect ""...",Nadia ComÄneci,S08_set2_a7,Nadia ComÄƒneci,0.921662,0.9532
912,turtle,How do turtles reproduce?,They lay eggs,S08_set1_a9,lay eggs,0.055917,0.87709


In [56]:
sample.to_csv('dataset/result.csv', index=False)

In [57]:
result = pd.read_csv('dataset/result.csv')
result

Unnamed: 0,ArticleTitle,Question,Answer,ArticleFile,ModelingAnswer,ModelingScore,SimilarityAnswer
0,Canada,"In addition to Port Royal, where else did Samu...",Quebec City,S08_set2_a8,Quebec City,0.995903,1.0
1,Romania,"What are approximately 42,000 years old and ha...",the remains (the lower jaw) of the oldest mode...,S08_set2_a7,The remains (the lower jaw),0.30682,0.790763
2,Singapore,Where is the National Orchid Garden?,Singapore Botanic Gardens.,S08_set2_a2,Singapore,0.594911,0.566212
3,Singapore,What is the punishment for first-degree murder?,There are laws which allow capital punishment ...,S08_set2_a2,capital punishment,0.874122,0.600196
4,James_Watt,James Watt was born where?,"Greenock, a seaport on the Firth of Clyde",S09_set4_a2,London,0.967558,0.242339
5,Liechtenstein,When was the first factory opened?,1836,S08_set2_a1,1836,0.952051,1.0
6,Ulysses_S._Grant,Who achieved international fame as the leading...,Grant,S08_set3_a5,Ulysses S. Grant Ulysses S. Grant,0.778269,0.612087
7,Violin,How many strings does a violin usually have?,four,S09_set2_a2,four,0.664392,1.0
8,Finland,What body of water lies to the south of Finland?,the Gulf of Finland,S08_set2_a4,Barents Sea harbour,0.537658,0.308042
9,Grover_Cleveland,When did he die?,June 24 1908,S08_set3_a6,March 18 1837 June 24 1908,0.768992,0.735608
