In [None]:
%%time
# Install the third-party packages used by this notebook (quiet mode).
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# whose APIs differ from the calls used in the cells of this notebook.
!pip install -q pandas transformers tensorflow-hub faiss-gpu annoy torch torchvision elasticsearch elasticsearch-dsl seaborn
# Upgrade scikit-learn separately to the newest available release.
!pip install -q -U scikit-learn

In [None]:
# Widen the classic-notebook page container to 95% of the browser window.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
from collections import Counter

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, Q, SF
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelWithLMHead, modeling_utils
import faiss
from annoy import AnnoyIndex
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.notebook import tqdm_notebook as tqdmnb
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Hugging Face model identifier. It selects the encoder/tokenizer and is also
# embedded in the file names of the cached encodings and the Annoy index.
MODEL_TO_USE = 'bert-large-uncased'

In [None]:
# Load the tokenizer and the encoder that belong to MODEL_TO_USE.
# The two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_USE)
model = AutoModel.from_pretrained(MODEL_TO_USE)

In [None]:
%%time
# Argument metadata: drop rows with missing values and renumber rows 0..n-1.
# NOTE(review): rows are dropped from `args` but not from `arg_representations`;
# confirm the two stay aligned row-for-row.
# NOTE(review): pickle files execute code on load - only read trusted files.
args = (
    pd.read_pickle(f'args_encoded_{MODEL_TO_USE}.pkl')
    .dropna()
    .reset_index(drop=True)
)

# Pre-computed embedding matrix for the arguments.
arg_representations = np.load(f'arg_representations_{MODEL_TO_USE}.npy')

# Row-normalized copy of the embeddings; the fitted normalizer is reused for queries.
normalizer = Normalizer()
normalized_representation = normalizer.fit_transform(arg_representations)

# Raw dataset with the argument texts and their context columns.
dataset = pd.read_pickle('dataset.pkl')

In [None]:
%%time
# Collects one DataFrame of encoded inputs per chunk; concatenated after the loop.
tokenized = []

# Tokenize the dataset in 5 chunks so progress can be tracked with a progress bar.
for chunk in tqdmnb(np.array_split(dataset, 5), total=5):
    # Pad to the tokenizer's maximum length and also request the overflowing tokens.
    tokenized_chunk = tokenizer.batch_encode_plus(list(chunk['text'].values), max_length=tokenizer.max_len, pad_to_max_length=True, return_overflowing_tokens=True)
    # token_type_ids are discarded and do not end up in the resulting frame.
    tokenized_chunk.pop('token_type_ids')
    
    # presumably maps each encoded row back to the position of its source text
    # within the chunk - TODO confirm against the installed transformers version
    overflow_index = tokenized_chunk.pop('overflow_to_sample_mapping')
    
    # Repeating indices are included as lists of the corresponding index
    overflow_index = np.hstack(overflow_index)
    text_ids = chunk['id'].values
    # Pick the text id found at each mapped position.
    text_ids = text_ids[overflow_index]
    
    df = pd.DataFrame(tokenized_chunk)
    df['id'] = text_ids
    tokenized.append(df)
# Stack the per-chunk frames and renumber the rows 0..n-1.
tokenized = pd.concat(tokenized)
tokenized.reset_index(inplace=True, drop=True)

In [None]:
# Frequency of each distinct `input_ids` value.
# NOTE(review): if the column holds Python lists, value_counts may fail because
# lists are unhashable - confirm.
tokenized['input_ids'].value_counts()

In [None]:
# Display the tokenized frame (pandas truncates long frames in its repr).
tokenized

In [None]:
# NOTE(review): `test` and `overflow_map` are not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel - leftover scratch cell?
pd.DataFrame(test[np.hstack(overflow_map[0])])

In [None]:
# NOTE(review): `test` is not assigned anywhere in the visible notebook - leftover scratch cell?
test.shape

In [None]:
# (rows, columns) of the argument metadata frame.
args.shape

In [None]:
# Show the first three dataset rows without truncating long text columns.
# `None` is the supported way to disable the column-width limit; the former
# `-1` sentinel is deprecated in pandas.
with pd.option_context('display.max_colwidth', None):
    display(dataset[:3])

## Index hyperparameters

In [None]:
# Dimensionality of the normalized argument embeddings.
d = normalized_representation.shape[1]
# Second argument of faiss.IndexPQ (number of PQ sub-quantizers in the FAISS API).
m=64
# Third argument of faiss.IndexPQ (bits per sub-quantizer code).
n_bits=8
# Number of inverted lists (cells) for the IVF indexes.
nlist = 1024

## Create a FAISS PQ-index

In [None]:
%%time
# Product-quantization index over the normalized embeddings:
# train the codebooks first, then add every vector.
pq = faiss.IndexPQ(d, m, n_bits)
pq.train(normalized_representation)
pq.add(normalized_representation)

## Create a FAISS dot-product IVF-index

In [None]:
%%time
# Inverted-file index scored by inner product; the flat IP index is the coarse
# quantizer that assigns vectors to the `nlist` cells.
quantizer = faiss.IndexFlatIP(d)
index_dp = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
# Train the cell centroids, then add all normalized vectors.
index_dp.train(normalized_representation)
index_dp.add(normalized_representation)

## Create a FAISS IVF/PQ-index

In [None]:
%%time
# IVF index whose cells store PQ-compressed vectors.
# Finer product quantization than the plain PQ index: 256 sub-quantizers.
m = 256
# Flat L2 index used as the coarse quantizer that assigns vectors to the nlist cells.
quantizer = faiss.IndexFlatL2(d)
# Reuse the shared `n_bits` hyperparameter instead of repeating the literal 8.
index_ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, m, n_bits)
# Train and populate with the *normalized* embeddings, like the other indexes,
# so that the normalized query vectors are compared in the same space.
index_ivfpq.train(normalized_representation)
index_ivfpq.add(normalized_representation)

In [None]:
# Sanity check: 10 nearest neighbours of the first corpus vector in the IVF/PQ index.
D_faiss, I_faiss = index_ivfpq.search(normalized_representation[:1], 10)

In [None]:
# Indices of the neighbours returned by the search above.
I_faiss

## Create an Annoy index with 1000 trees

In [None]:
%%time
# Vector length for Annoy, taken from the data instead of a hard-coded 1024 so
# the cell keeps working when MODEL_TO_USE has a different hidden size.
f = normalized_representation.shape[1]
annoy = AnnoyIndex(f, 'angular')
# Item ids are the row positions of the embedding matrix.
for i, arg in tqdmnb(enumerate(normalized_representation), total=normalized_representation.shape[0]):
    annoy.add_item(i, arg)
print("Now building Index...")
# Build the forest with 1000 trees; the trailing `;` suppresses the cell output.
annoy.build(1000);

In [None]:
# Persist the built index; the file name encodes the model and the number of trees.
annoy.save(f'{MODEL_TO_USE}_1000_trees_angular.ann')

## Load Annoy index from disk

In [None]:
# Load the index that this notebook builds and saves (1000 trees). The cell
# previously pointed at a `_100_trees_` file that no cell in the notebook writes,
# so a fresh top-to-bottom run could not find it.
u= AnnoyIndex(f,'angular')
u.load(f'{MODEL_TO_USE}_1000_trees_angular.ann')

In [None]:
# NOTE(review): `indexl2` is not assigned anywhere in the visible notebook, and
# `query` is only assigned later (as a plain string), so this cell fails on a
# fresh kernel. FAISS `search` calls elsewhere in this notebook also pass the
# number of neighbours, which is missing here - leftover scratch cell?
D, I = indexl2.search(np.expand_dims(query, 0))

In [None]:
# NOTE(review): `indices` is not assigned anywhere in the visible notebook - leftover scratch cell?
indices

In [None]:
# L2-normalize the raw embedding matrix with the FAISS helper.
# NOTE(review): this modifies `arg_representations` itself (nothing is assigned),
# so re-running cells that read it afterwards sees normalized data - confirm intended.
faiss.normalize_L2(arg_representations)

In [None]:
# Free-text query to retrieve arguments for.
query = "Computer Science is a bad university degree."
# Minimum cosine similarity for a neighbour to count as a "best" match, per index.
THRESHOLD_ANNOY = 0.8
THRESHOLD_FAISS = 0.95

In [None]:
%%time


def _similarity_by_id(ids, sims):
    """Map each id to its highest similarity, keeping first-seen order."""
    best = {}
    for arg_id, sim in zip(ids, sims):
        if arg_id not in best or sim > best[arg_id]:
            best[arg_id] = sim
    return best


# --- Encode the query with the same encoder/normalizer as the corpus ----------
tokenized_query = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(query))
encoded_query = tokenizer.prepare_for_model(tokenized_query, max_length=512, add_special_tokens=True, pad_to_max_length=True)

inp, mask = encoded_query['input_ids'], encoded_query['attention_mask']
# Add the batch dimension expected by the model.
inp, mask = torch.tensor(inp).unsqueeze(0), torch.tensor(mask).unsqueeze(0)

with torch.no_grad():
    # The hidden state at position 0 of the first model output is the query embedding.
    query_rep = model(inp, attention_mask=mask)[0][:,0,:].numpy()
query_rep_normalized = normalizer.transform(query_rep)

# --- Approximate nearest neighbours from both indexes --------------------------
D_faiss, I_faiss = pq.search(query_rep_normalized, 100)
I_annoy, D_annoy = u.get_nns_by_vector(query_rep_normalized.squeeze(), 100, search_k=-1, include_distances=True)

# FAISS pads missing neighbours with index -1. Filter indices AND distances with
# the same mask so they stay aligned (previously only the indices were filtered).
valid = I_faiss > -1
I_faiss, D_faiss = I_faiss[valid], D_faiss[valid]

# Annoy's angular distance is sqrt(2 * (1 - cos)), so cos = 1 - d^2 / 2.
cos_sim_annoy = 1 - np.square(D_annoy)/2
# FAISS L2 indexes already return *squared* distances, and for unit vectors
# ||a - b||^2 = 2 - 2 * cos, so the distance must not be squared a second time.
cos_sim_faiss = 1 - D_faiss/2

# Ids in result order (closest first), still aligned with the similarities.
ids_annoy = args['id'][I_annoy].values
ids_faiss = args['id'][list(I_faiss)].values

arg_ids_annoy = set(ids_annoy)
arg_ids_faiss = set(ids_faiss)
intersection = list(arg_ids_annoy.intersection(arg_ids_faiss))

# Pair ids with similarities BEFORE any set conversion: a set has arbitrary
# order, so zipping it with the similarity array attached scores to wrong ids.
annoy_matches = _similarity_by_id(ids_annoy, cos_sim_annoy)
faiss_matches = _similarity_by_id(ids_faiss, cos_sim_faiss)

# Matches that clear the per-index similarity threshold.
best_annoy = {k:v for k,v in annoy_matches.items() if v>THRESHOLD_ANNOY}
best_faiss = {k:v for k,v in faiss_matches.items() if v>THRESHOLD_FAISS}

# Per-document score weights: (number of best matches) ** (similarity - offset).
# NOTE(review): weights are computed for every retrieved neighbour, not only the
# thresholded "best" ones - confirm this is intended.
best_annoy_weights = {k:len(best_annoy)**(v-0.5) for k,v in annoy_matches.items()}
best_faiss_weights = {k:len(best_faiss)**(v-0.7) for k,v in faiss_matches.items()}

# One function_score weight function per document id, FAISS first then Annoy.
faiss_functions = [SF({'weight': weight, 'filter': Q('term', _id=arg_id)}) for arg_id, weight in best_faiss_weights.items()]
annoy_functions = [SF({'weight': weight, 'filter': Q('term', _id=arg_id)}) for arg_id, weight in best_annoy_weights.items()]
functions = faiss_functions + annoy_functions

In [None]:
%%time
# Create the Elasticsearch client before its first use, so this cell also works
# when the notebook is run top to bottom on a fresh kernel.
es = Elasticsearch(['127.0.0.1:9200/'], verify_certs=True)

s = Search(using=es, index="arg_index")
# Lexical match on the argument text, OR-ed with boosted id lookups for the
# best Annoy and FAISS neighbours.
q = Q("match", text=query)  | Q("terms", _id=list(best_annoy.keys()), boost=0.5) | Q("terms", _id=list(best_faiss.keys()), boost=0.4)
# Re-weight the hits with the per-document weight functions.
scored_query = Q('function_score', query=q,functions=functions)
s.query = scored_query
# Keep the top 25 hits.
s = s[:25]
response = s.execute()

In [None]:
# Print the `conclusion` field of every returned hit, one per line.
for hit in response:
    print(hit.conclusion)

In [None]:
# Annoy matches above THRESHOLD_ANNOY, keyed by argument id.
best_annoy

In [None]:
# Five most frequent discussion titles among the best Annoy matches.
(
    dataset.loc[dataset['id'].isin(best_annoy), 'context.discussionTitle']
    .value_counts()
    .head(5)
)

In [None]:
# Twenty most frequent discussion titles among all Annoy neighbours.
(
    dataset.loc[dataset['id'].isin(list(arg_ids_annoy)), 'context.discussionTitle']
    .value_counts()
    .head(20)
)

In [None]:
# Five most frequent discussion titles among all FAISS neighbours.
(
    dataset.loc[dataset['id'].isin(list(arg_ids_faiss)), 'context.discussionTitle']
    .value_counts()
    .head(5)
)

In [None]:
# Five most frequent discussion titles among arguments found by both indexes.
(
    dataset.loc[dataset['id'].isin(intersection), 'context.discussionTitle']
    .value_counts()
    .head(5)
)

In [None]:
# Client for the Elasticsearch node at 127.0.0.1:9200.
# NOTE(review): the search cell earlier in the notebook already uses `es`, so on
# a fresh kernel this cell has to be run before it.
es = Elasticsearch(['127.0.0.1:9200/'], verify_certs=True)

## Experimenting with BERT MLM

In [None]:
from transformers import BertTokenizer, BertForMaskedLM

# NOTE(review): this rebinds the global `tokenizer` and `model` that the
# retrieval cells use; re-running those cells afterwards picks up the
# masked-LM model instead of the MODEL_TO_USE encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

In [None]:
# Fill every [MASK] position with the masked-LM's single most likely token.
inp= "Hello my name is [MASK] and I am here to [MASK] you to the [MASK] of [MASK]."
# encode() tokenizes, maps tokens to ids and adds the special tokens in one step.
inp_tens = torch.tensor(tokenizer.encode(inp)).unsqueeze(0)
# Look the mask id up on the tokenizer instead of hard-coding 103; the result is
# always a 1-D index tensor, including when there are zero or one masks.
mask_indices = (inp_tens.squeeze(0) == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
with torch.no_grad():  # inference only, no autograd graph needed
    preds = model(inp_tens)[0].squeeze(0)
# Work on a copy so the original input ids are left untouched, and fill all
# mask positions in one vectorized assignment.
hallucinated = inp_tens.squeeze(0).clone()
hallucinated[mask_indices] = preds[mask_indices].argmax(dim=-1)
tokenizer.decode(hallucinated.tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [None]:
inp = 'Once upon a midnight dreary while I pondered weak and [MASK].'
# encode() tokenizes, maps tokens to ids and adds the special tokens in one step.
inp_tens = torch.tensor(tokenizer.encode(inp)).unsqueeze(0)
# Mask id comes from the tokenizer instead of the hard-coded 103. The final
# squeeze() collapses a single hit to a 0-d tensor so `.item()` can be called on it.
mask_indices = (inp_tens.squeeze(0) == tokenizer.mask_token_id).nonzero(as_tuple=True)[0].squeeze()

In [None]:
# Position of the mask token as a plain Python number (requires exactly one hit).
mask_indices.item()

In [None]:
# Switch to GPT-2 for the text-generation experiments.
# NOTE(review): this rebinds the global `tokenizer` and `model` once more.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained('gpt2')

In [None]:
%%time

input_context = 'cow milk good because'
input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0)  # encode input context, add batch dimension
# Sample 5 continuations of the context (10 beams, top-k/top-p sampling).
outputs = model.generate(max_length=30, input_ids=input_ids, do_sample=True, num_beams=10, top_k=100 , top_p=0.1, num_return_sequences=5, temperature=0.8, repetition_penalty=20)
# Older transformers releases return (batch, num_sequences, length), newer ones
# (batch * num_sequences, length). Flattening to one row per sequence handles
# both, whereas `outputs[0][i]` picks a single token id on the newer layout.
generated = outputs.reshape(-1, outputs.shape[-1])
for i, sequence in enumerate(generated):
    print('Generated {}: {}'.format(i, tokenizer.decode(sequence.tolist(), skip_special_tokens=True)))

In [None]:
%%time

input_context = 'cow milk bad because'
input_ids = torch.tensor(tokenizer.encode(input_context)).unsqueeze(0)  # encode input context, add batch dimension
# Sample 5 continuations of the context (5 beams, top-k/top-p sampling).
outputs = model.generate(max_length=30, input_ids=input_ids, do_sample=True, num_beams=5, top_k=100 , top_p=0.9, num_return_sequences=5, temperature=1.2, repetition_penalty=20)
# Older transformers releases return (batch, num_sequences, length), newer ones
# (batch * num_sequences, length). Flattening to one row per sequence handles
# both, whereas `outputs[0][i]` picks a single token id on the newer layout.
generated = outputs.reshape(-1, outputs.shape[-1])
# Print every returned sequence (the loop used to stop after 3 of the 5 requested).
for i, sequence in enumerate(generated):
    print('Generated {}: {}'.format(i, tokenizer.decode(sequence.tolist(), skip_special_tokens=True)))

In [None]:
# Running word-frequency tally; the CountVectorizer cells add their counts to it.
cnt = Counter()

In [None]:
%%time
# Bag-of-words over the dataset texts, English stop words removed, keeping only
# terms that occur in at least 2000 documents.
texts = dataset['text'].values
vectorizer = CountVectorizer(stop_words='english', min_df=2000)
x = vectorizer.fit(texts)
bow = x.transform(texts)
# Column sums give the total number of occurrences of each vocabulary term.
ocurrences = bow.sum(axis=0)
# Add the totals to the running tally.
# NOTE(review): re-running this cell adds the same counts to `cnt` again.
cnt.update({word: ocurrences[0, i] for word, i in x.vocabulary_.items()})

In [None]:
# Number of distinct words tallied so far. `word_frequencies` is not defined in
# this notebook; `cnt` is the counter that holds the word frequencies.
len(cnt)

In [None]:
# The 1000 most frequent words with their counts.
cnt.most_common(1000)

In [None]:
# Load the arguments CSV from the local Data directory.
judgments = pd.read_csv('./Data/arguments.csv')

In [None]:
# Display the `Premise` column of the judgments table.
judgments['Premise']

In [None]:
%%time
# Bag-of-words over the judgment premises, English stop words removed.
premises = judgments['Premise'].values
vectorizer = CountVectorizer(stop_words='english')
x = vectorizer.fit(premises)
bow = x.transform(premises)
# Column sums give the total number of occurrences of each vocabulary term.
ocurrences = bow.sum(axis=0)
# Add the totals to the running tally.
# NOTE(review): re-running this cell adds the same counts to `cnt` again.
cnt.update({word: ocurrences[0, i] for word, i in x.vocabulary_.items()})

In [None]:
# Show the first four judgment rows without truncating long text columns.
# `None` is the supported way to disable the column-width limit; the former
# `-1` sentinel is deprecated in pandas.
with pd.option_context('display.max_colwidth', None):
    display(judgments[:4])

In [None]:
# Print the rate schedule 1, 2, 4, 8 two times in a row.
for _pass in range(2):
    for rate in (1, 2, 4, 8):
        print(rate)