In [1]:
import random

import pandas as pd
from tqdm import tqdm

In [2]:
def _read_sentences(path):
    """Load a one-sentence-per-line text file into a single-column DataFrame.

    Returns a DataFrame with one column, 'sentences', holding each non-empty
    line of the file in order.
    """
    # pd.read_csv(path, sep='\n') is rejected by current pandas (ValueError:
    # the line terminator cannot be used as separator), so read lines directly.
    with open(path, encoding='utf-8') as handle:
        lines = [line.rstrip('\r\n') for line in handle]
    # Blank lines are dropped, mirroring read_csv's default skip_blank_lines=True.
    return pd.DataFrame({'sentences': [line for line in lines if line]})


oos = _read_sentences('samples.txt')
triggers = _read_sentences('triggers.txt')
distractors_tbl = _read_sentences('neg_distractors.txt')
context = pd.read_csv('weather_app_sample_convos.csv')

In [3]:
def oos_context_gen(oos, triggers, context, distractors = True, randomize = False, len_oos = 20, distractors_source = None):
    """Generate one random input string for the weather app plus its label.

    Parameters
    ----------
    oos : DataFrame with a 'sentences' column; `len_oos` rows are sampled
        (without replacement) and joined with spaces.
    triggers : DataFrame with a 'sentences' column; one row is sampled.
    context : DataFrame with 'Context' and 'Location' columns; one row is
        sampled, its 'Context' goes into the text and its 'Location' is the label.
    distractors : if truthy, one sampled distractor sentence is inserted
        between the context and the trigger.
    randomize : if truthy, the text segments are shuffled before joining.
    len_oos : number of OOS sentences to sample.
    distractors_source : DataFrame with a 'sentences' column to draw the
        distractor from. When None, the module-level `distractors_tbl` is used
        (the original behaviour).

    Returns
    -------
    (text, label) : the space-joined segments and the sampled 'Location' value.

    Raises
    ------
    ValueError (from DataFrame.sample) if `len_oos` exceeds the number of rows
    in `oos` or any sampled table is empty.
    """
    # Sampling order (oos, context, triggers, distractors) is kept as in the
    # original so a seeded NumPy RNG yields the same draws.
    oos_text = ' '.join(oos.sample(len_oos)['sentences'].values)

    context_row = context.sample(1)
    context_text = context_row['Context'].values[0]
    label = context_row['Location'].values[0]

    trigger = triggers.sample(1)['sentences'].values[0]

    segments = [oos_text, context_text]
    if distractors:
        if distractors_source is None:
            # Backward-compatible fallback to the table loaded at notebook level.
            distractors_source = distractors_tbl
        segments.append(distractors_source.sample(1)['sentences'].values[0])
    segments.append(trigger)

    if randomize:
        random.shuffle(segments)

    return ' '.join(segments), label
        
    

### With distractors and no randomization

In [8]:
# Collect rows in a list and build the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
records = []
for _ in tqdm(range(5000)):
    sent, lab = oos_context_gen(oos, triggers, context)
    records.append({'sentence': sent, 'label': lab})
input_samples = pd.DataFrame(records, columns=['sentence', 'label'])

input_samples.to_csv('input_samples_dist_norandom.csv')

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:25<00:00, 196.81it/s]


In [10]:
# The file was written with its index, so read that first column back as the
# index instead of letting it become a spurious 'Unnamed: 0' data column.
input_samples = pd.read_csv('input_samples_dist_norandom.csv', index_col=0)

In [11]:
from transformers import pipeline
from tqdm import tqdm

In [12]:
def test_loc(questions=["what is the location?"]):
    """
    Function to test qa model against a list of location related questions. 
    Calculates accuracy (disregards nan values)
    """
    num_correct = []
    for ques in tqdm(questions):
        curr_correct = 0
        for index, row in tqdm(input_samples.iterrows()):
            context = row["sentence"]
            nlp_res = nlp(question=ques, context=context)
            
            if nlp_res['answer'].lower() in str(row['label']).lower(): 
                curr_correct += 1
        num_correct.append(curr_correct/len(input_samples))
            
    return num_correct

In [13]:
# The same Hub checkpoint id supplies both the model weights and the tokenizer.
model_name = "mrm8488/bert-tiny-5-finetuned-squadv2"
nlp = pipeline(
    task="question-answering",
    model=model_name,
    tokenizer=model_name,
)

In [14]:
# Discard incomplete rows, then keep a random 100-row evaluation subset.
input_samples = (
    input_samples
    .dropna()
    .sample(n=100)
)

In [15]:
# Evaluate with the single default location question, spelled out explicitly.
accuracy = test_loc(questions=["what is the location?"])

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  6.25it/s][A
2it [00:00,  6.90it/s][A
4it [00:00,  7.85it/s][A
5it [00:00,  8.07it/s][A
6it [00:00,  8.54it/s][A
8it [00:00,  8.73it/s][A
9it [00:00,  9.07it/s][A
10it [00:01,  9.01it/s][A
11it [00:01,  7.53it/s][A
12it [00:01,  7.30it/s][A
13it [00:01,  7.53it/s][A
14it [00:01,  7.76it/s][A
15it [00:01,  7.46it/s][A
16it [00:01,  7.68it/s][A
17it [00:02,  8.13it/s][A
18it [00:02,  8.07it/s][A
19it [00:02,  8.54it/s][A
20it [00:02,  8.32it/s][A
21it [00:02,  8.45it/s][A
22it [00:02,  8.33it/s][A
23it [00:02,  8.10it/s][A
24it [00:02,  8.29it/s][A
25it [00:03,  7.87it/s][A
26it [00:03,  7.38it/s][A
27it [00:03,  4.93it/s][A
28it [00:03,  5.48it/s][A
30it [00:03,  6.38it/s][A
31it [00:03,  7.01it/s][A
32it [00:04,  7.40it/s][A
33it [00:04,  8.02it/s][A
34it [00:04,  8.51it/s][A
35it [00:04,  5.58it/s][A
3

In [16]:
# One accuracy value per question passed to test_loc (a single entry here,
# because the default one-question list was used).
accuracy

[0.03]

### Without distractors and with randomization

In [17]:
# Collect rows in a list and build the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
records = []
for _ in tqdm(range(5000)):
    sent, lab = oos_context_gen(oos, triggers, context, distractors = False, randomize = True)
    records.append({'sentence': sent, 'label': lab})
input_samples = pd.DataFrame(records, columns=['sentence', 'label'])

input_samples.to_csv('input_samples_nodist_random.csv')

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:22<00:00, 221.20it/s]


In [18]:
# The file was written with its index, so read that first column back as the
# index instead of letting it become a spurious 'Unnamed: 0' data column.
input_samples = pd.read_csv('input_samples_nodist_random.csv', index_col=0)

In [19]:
# Discard incomplete rows, then keep a random 100-row evaluation subset.
input_samples = (
    input_samples
    .dropna()
    .sample(n=100)
)

In [20]:
# Evaluate with the single default location question, spelled out explicitly.
accuracy = test_loc(questions=["what is the location?"])

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
1it [00:00,  8.12it/s][A
3it [00:00,  8.91it/s][A
4it [00:00,  8.73it/s][A
5it [00:00,  8.01it/s][A
6it [00:00,  8.50it/s][A
8it [00:00,  9.07it/s][A
9it [00:00,  8.81it/s][A
10it [00:01,  4.88it/s][A
12it [00:01,  6.07it/s][A
13it [00:01,  6.87it/s][A
15it [00:01,  7.88it/s][A
17it [00:01,  8.61it/s][A
19it [00:02,  9.31it/s][A
21it [00:02,  9.57it/s][A
23it [00:02,  9.89it/s][A
25it [00:02, 10.74it/s][A
27it [00:02, 10.08it/s][A
29it [00:03,  9.92it/s][A
31it [00:03, 10.21it/s][A
33it [00:03, 10.37it/s][A
35it [00:03, 10.27it/s][A
37it [00:03,  9.77it/s][A
39it [00:04, 10.13it/s][A
41it [00:04, 10.40it/s][A
43it [00:04, 11.02it/s][A
45it [00:04, 11.30it/s][A
47it [00:04, 11.06it/s][A
49it [00:05,  7.92it/s][A
50it [00:05,  8.04it/s][A
51it [00:05,  8.52it/s][A
52it [00:05,  8.58it/s][A
53it [00:05,  8.93it/s][A
5

In [21]:
# One accuracy value per question passed to test_loc (a single entry here,
# because the default one-question list was used).
accuracy

[0.14]

### No distractors and no randomization and reduced length (15)

In [22]:
# Collect rows in a list and build the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
records = []
for _ in tqdm(range(5000)):
    sent, lab = oos_context_gen(oos, triggers, context, distractors = False, randomize = False, len_oos = 15)
    records.append({'sentence': sent, 'label': lab})
input_samples = pd.DataFrame(records, columns=['sentence', 'label'])

input_samples.to_csv('input_samples_nodist_norandom.csv')

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:25<00:00, 197.14it/s]


In [23]:
# Discard incomplete rows, then keep a random 100-row evaluation subset.
input_samples = (
    input_samples
    .dropna()
    .sample(n=100)
)

In [24]:
# Evaluate with the single default location question, spelled out explicitly.
accuracy = test_loc(questions=["what is the location?"])

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
2it [00:00, 10.34it/s][A
4it [00:00, 11.10it/s][A
6it [00:00, 11.80it/s][A
8it [00:00, 12.31it/s][A
10it [00:00, 12.04it/s][A
12it [00:00, 12.75it/s][A
14it [00:01, 12.45it/s][A
16it [00:01, 11.69it/s][A
18it [00:01, 12.07it/s][A
20it [00:01, 12.44it/s][A
22it [00:01, 12.36it/s][A
24it [00:01, 12.60it/s][A
26it [00:02, 12.79it/s][A
28it [00:02, 12.13it/s][A
30it [00:02, 12.26it/s][A
32it [00:02, 12.82it/s][A
34it [00:02, 12.96it/s][A
36it [00:02, 13.50it/s][A
38it [00:03, 13.41it/s][A
40it [00:03, 13.01it/s][A
42it [00:03, 12.62it/s][A
44it [00:03, 12.88it/s][A
46it [00:03, 12.52it/s][A
48it [00:03, 12.41it/s][A
50it [00:04, 11.63it/s][A
52it [00:04, 11.94it/s][A
54it [00:04, 11.35it/s][A
56it [00:04, 12.04it/s][A
58it [00:04, 11.47it/s][A
60it [00:04, 11.24it/s][A
62it [00:05, 10.88it/s][A
64it [00:05, 10.87it/s][

In [25]:
# One accuracy value per question passed to test_loc (a single entry here,
# because the default one-question list was used).
accuracy

[0.12]

### Reducing length even further

In [26]:
# Collect rows in a list and build the frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every iteration.
records = []
for _ in tqdm(range(5000)):
    sent, lab = oos_context_gen(oos, triggers, context, distractors = False, randomize = False, len_oos = 10)
    records.append({'sentence': sent, 'label': lab})
input_samples = pd.DataFrame(records, columns=['sentence', 'label'])

100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:21<00:00, 233.13it/s]


In [27]:
# Discard incomplete rows, then keep a random 100-row evaluation subset.
input_samples = (
    input_samples
    .dropna()
    .sample(n=100)
)

In [None]:
# Evaluate with the single default location question, spelled out explicitly.
accuracy = test_loc(questions=["what is the location?"])