In [12]:
import os

base_dir = 'indicators'
indicators = {}
for filename in os.listdir(base_dir):
    with open(os.path.join(base_dir, filename), 'r') as f:
        indicators[filename] = []
        lines = f.readlines()
        for line in lines:
            indicators[filename].append(line.strip())

In [13]:
indicators

{'INSERT': ['aboard',
  'about',
  'absorbing',
  'accepting',
  'accepts',
  'accommodated',
  'acquiring',
  'admits',
  'admitted',
  'amid',
  'amidst',
  'among',
  'around',
  'assuming',
  'astride',
  'bagging',
  'besieged',
  'besieges',
  'between',
  'boarding',
  'boxed',
  'boxes',
  'boxing',
  'breaking',
  'bridged',
  'bridges',
  'bringing in',
  'brought round',
  'buried',
  'captures',
  'carried by',
  'carries',
  'carrying',
  'catches',
  'catching',
  'caught',
  'circled',
  'circling',
  'cleaving',
  'clutched',
  'clutches',
  'collected by',
  'collecting',
  'concealing',
  'contained',
  'contained by',
  'containing',
  'contents',
  'covered',
  'covers',
  'cutting',
  'devouring',
  'dividing',
  'during',
  'eating',
  'embraced',
  'embracing',
  'encircled',
  'encircles',
  'enclosing',
  'enfolded',
  'enfolds',
  'engaged in',
  'entering',
  'enveloped',
  'enveloping',
  'envelops',
  'external',
  'fencing',
  'filling',
  'filled with',
 

In [14]:

import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')
print(stopwords.words('english'))



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/abdelrahman.sadallah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
print(f'Sizes before removing stop words:')
for k, v in indicators.items():
    print(f'{k}: {len(v)}')

Sizes before removing stop words:
INSERT: 158
ANAGRAM: 1152
SUBSTRING_FINAL: 38
SUBSTRING: 112
REVERSE: 110
SUBSTRING_INIT: 47


In [16]:
# Removing stop words

for indicator in indicators:
    indicators[indicator] = [word for word in indicators[indicator] if word.lower() not in stopwords.words('english')]

In [17]:
print(f'Sizes after removing stop words:')
for k, v in indicators.items():
    print(f'{k}: {len(v)}')

Sizes after removing stop words:
INSERT: 154
ANAGRAM: 1150
SUBSTRING_FINAL: 38
SUBSTRING: 110
REVERSE: 110
SUBSTRING_INIT: 47


In [18]:
all_indicators = set( [word for indicator in indicators.values() for word in indicator] )


print(f'All indicators: {len(all_indicators)}')

All indicators: 1578


In [19]:
from datasets import load_dataset, concatenate_datasets

a = load_dataset('json', data_files='data/clue_json/guardian/naive_random/train.json', split='train')
b = load_dataset('json', data_files='data/clue_json/guardian/naive_random/val.json', split='train')
c = load_dataset('json', data_files='data/clue_json/guardian/naive_random/test.json', split='train')


In [20]:
# dataset = concatenate_datasets([a, b, c])
dataset = a

In [21]:
indicators_dict = {}

for idicator_type in indicators.keys():
    temp = {'indicators': indicators[idicator_type],  'examples': []}
    indicators_dict[idicator_type] = temp

In [22]:
indicators_dict

{'INSERT': {'indicators': ['aboard',
   'absorbing',
   'accepting',
   'accepts',
   'accommodated',
   'acquiring',
   'admits',
   'admitted',
   'amid',
   'amidst',
   'among',
   'around',
   'assuming',
   'astride',
   'bagging',
   'besieged',
   'besieges',
   'boarding',
   'boxed',
   'boxes',
   'boxing',
   'breaking',
   'bridged',
   'bridges',
   'bringing in',
   'brought round',
   'buried',
   'captures',
   'carried by',
   'carries',
   'carrying',
   'catches',
   'catching',
   'caught',
   'circled',
   'circling',
   'cleaving',
   'clutched',
   'clutches',
   'collected by',
   'collecting',
   'concealing',
   'contained',
   'contained by',
   'containing',
   'contents',
   'covered',
   'covers',
   'cutting',
   'devouring',
   'dividing',
   'eating',
   'embraced',
   'embracing',
   'encircled',
   'encircles',
   'enclosing',
   'enfolded',
   'enfolds',
   'engaged in',
   'entering',
   'enveloped',
   'enveloping',
   'envelops',
   'external',
 

In [23]:
dataset

Dataset({
    features: ['target', 'input', 'idx'],
    num_rows: 85428
})

In [24]:
from tqdm import tqdm

clues_with_indicators = 0 

for i in tqdm(range(len(dataset))):
    t = dataset[i]['input']
    found = 0
    for indicator_type in indicators_dict.keys():
        for indicator_word in indicators_dict[indicator_type]['indicators']:
            if indicator_word  in t:
                clues_with_indicators += 1
                indicators_dict[indicator_type]['examples'].append({'clue': t, 'labels': dataset[i]['target']})
                found = 1
                break
        if found:
            break
        


100%|██████████| 85428/85428 [00:34<00:00, 2480.75it/s]


In [25]:
clues_with_indicators, len(dataset), clues_with_indicators/len(dataset)

(52993, 85428, 0.6203235473146977)

In [26]:
indicators_dict['SUBSTRING_FINAL']['examples']

[{'clue': 'Miser books sequel to unfinished blue movie (9)',
  'labels': 'skinflint'},
 {'clue': 'Mended and put back in stock room? (8)', 'labels': 'restored'},
 {'clue': 'Managed to come back with speed to provide the commentary (7)',
  'labels': 'narrate'},
 {'clue': 'Service may be suspended for its passengers (8)',
  'labels': 'monorail'},
 {'clue': 'Traitor finds Our Thing leaderless, which takes the biscuit (7)',
  'labels': 'ratafia'},
 {'clue': 'Perpetual happiness finally found by not a large charwoman? (6)',
  'labels': 'steady'},
 {'clue': 'Finishes beheaded, not carried over (3,7)',
  'labels': 'end stopped'},
 {'clue': "Literary man's endless fun, hosting evening of poetry (6)",
  'labels': 'jeeves'},
 {'clue': 'Director and topless celebrity against denial (9)',
  'labels': 'tarantino'},
 {'clue': 'Young dogs chewing the end of bedroom foot attire (5)',
  'labels': 'pumps'},
 {'clue': "Ducks missing Chandler's debut in Friends? (7)",
  'labels': 'quakers'},
 {'clue': 'Re

In [27]:
for indicator_type in indicators_dict.keys():
    print(len(indicators_dict[indicator_type]['examples']))

11302
31061
3432
2906
1364
2928


In [28]:
import json

with open('data/indicators_examples.json', 'w') as f:
    json.dump(indicators_dict, f)

In [29]:
import json

indicators_dict = json.load(open('data/indicators_examples.json'))

In [49]:
from utils import generate_prompt


ex = {'clue': 'this is back clue (10)', 'labels': 'label'}
shots =  [ {'clue': 'this is  clue', 'labels': 'this is a label'}, {'clue': 'this is an clue', 'labels': 'this is a label'}]
prompt = generate_prompt(ex, 'see this example', False, shots=shots,indicator_type_shots = 1, indicators_dict=indicators_dict)


print(prompt['prompt'])

### Instruction: see this example

### Input:
Such finally gets bagged by countryman? (8)

### Response:
pheasant

### Input:
It's intended to encourage laying in a little money (4,3)

### Response:
nest egg

### Input:
this is back clue (10)


In [50]:
answer_lengthes = []

for t in [prompt['prompt']]:
    l = t.split('\n')[-1]
    answer_lengthes. append( l[l.rfind("(")+1:l.rfind(")")].split(',')) 

In [51]:
answer_lengthes

[['10']]