## Explore Data

In [64]:
import os
import pandas as pd

In [65]:
def read_files(path):
    file_contents = dict()

    for filename in os.listdir(path):
        if filename.endswith('.txt'):
            with open(os.path.join(path, filename), 'r') as f:
                content = f.read()
                file_contents[filename] = content

    print("... Reading files in path : ", path)
    print("Number of files read: ", len(file_contents))

    return file_contents

In [66]:
effectiveness_contents = read_files('data/effectiveness/train') # 4191
label_contents = read_files('data/label/train') # 15594

... Reading files in path :  data/effectiveness/train
Number of files read:  4191
... Reading files in path :  data/label/train
Number of files read:  15594


In [67]:
train = pd.read_csv('data/effectiveness/train.csv')
train.sample(3)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
28977,cff308d450a7,C44019DF831B,Many teenagers today have to take drivers ed c...,Evidence,Effective
36325,71d69054dc5e,E018497ED277,to affect deployment or presentation in their ...,Claim,Adequate
18282,03e1659babca,35279D8353D6,First i want to say that i agree and disagree ...,Position,Adequate


In [68]:
train = (train[train['discourse_effectiveness'] == 'Effective']
         .reset_index()
         .reset_index()[['level_0', 'discourse_text']]
         .replace(r'\n',' ', regex=True) 
        )
train['discourse_text'].str.strip()
train.head(3)

Unnamed: 0,level_0,discourse_text
0,0,Limiting the usage of cars has personal and pr...
1,1,With so many things in this world that few peo...
2,2,It is no secret that morning traffic jams and ...


In [69]:
test = pd.read_csv('data/effectiveness/test.csv')
test.sample(3)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
8,739a6d00f44a,D72CB1C11673,Taking other peoples advice and doing what the...,Evidence
9,bcfae2c9a244,D72CB1C11673,You can learn from others experiences by seeki...,Concluding Statement
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim


In [70]:
test = test.reset_index()[['index', 'discourse_text']].replace(r'\n',' ', regex=True) 
test['discourse_text'].str.strip()
test.head(3)

Unnamed: 0,index,discourse_text
0,0,Making choices in life can be very difficult. ...
1,1,Seeking multiple opinions can help a person ma...
2,2,it can decrease stress levels


##  Retrieval Augmented Generation
> * Learn to retrieve a sequence from an existing corpus of human-written prototypes (e.g., dialogue responses)
> * Learn to edit the retrieved sequence by adding, removing, and modifying tokens in the prototype – this will still result in a more “human-like” generation


### FastRAG

`ColBERT` (dense retriever) 
1. use NN to encode all documents into representative vectors
2. encodes query into a vector and using vector similarity search

`PLAID` engine
* use aset of filtering steps to improve latency times for ColBERT-based indexes


    `PLAIDDocumentStore` document store class
    * `collection_path` is the path to the documents collection, in the form of a TSV file with columns being "id,content,title" where the title is optional.
    * `checkpoint_path` is the path for the encoder model, needed to encode queries into vectors at run time. Could be a local path to a model or a model hosted on HuggingFace hub. In order to use our trained model based on NaturalQuestions, provide the path Intel/ColBERT-NQ; see Model Hub for more details.
    * `index_path` location of the indexed documents. The index contains the optimized and compressed vector representation of all the documents. Index can be created by the user given a collection and a checkpoint, or can be specified via a path.

`Fusion-in-Decoder` (`FiD`)
* transformer-based generative model (based on T5 architecture)

#### Create Index

In [71]:
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher

In [72]:
dataroot = 'data'
dataset = 'effective'
datasplit = 'train'

queries = os.path.join(dataroot, dataset, datasplit, 'questions.search.tsv')
collection = os.path.join(dataroot, dataset, datasplit, 'collection.tsv')

with open(collection, 'w') as write_tsv:
   write_tsv.write(train.to_csv(sep='\t', index=False, header=False))

with open(queries, 'w') as write_tsv:
   write_tsv.write(test.to_csv(sep='\t', index=False, header=False))

tsv_read = pd.read_csv(collection, sep='\t')
tsv_read.head(3)

Unnamed: 0,0,"Limiting the usage of cars has personal and professional support all across the globe and yet it has yet to be embraced everywhere. Statistical proof show where it may help and real life examples of some of the effects of reducing, or getting rid of altogether, cars in one's daily life. While ""recent studies suggest that Americans are buying fewer cars, driving less and getting fewer licenses as each year goes by"" (Source 4), is that really enough or for the right reason? There are plenty of reasons to stop, or limit, the amount of cars being driven on the roads for every kind of person, from the hippie to the businessman, from the mom to the college student."
0,1,With so many things in this world that few peo...
1,2,It is no secret that morning traffic jams and ...
2,3,the environment suffers greatly from the many ...


In [73]:
queries = Queries(path=queries)
collection = Collection(path=collection)

f'Loaded {len(queries)} queries and {len(collection):,} passages'

[Mar 01, 18:56:44] #> Loading the queries from data/effective/train/questions.search.tsv ...
[Mar 01, 18:56:44] #> Got 10 queries. All QIDs are unique.

[Mar 01, 18:56:44] #> Loading collection...
0M 


'Loaded 10 queries and 9,326 passages'

In [74]:
nbits = 2
index_name = f'{dataset}.{datasplit}.{nbits}bits'

with Run().context(RunConfig(nranks=5, experiment='notebook')):

    config = ColBERTConfig(
        nbits=nbits,
    )
    indexer = Indexer(checkpoint='downloads/colbertv2.0', config=config)
    print('start indexing")
    indexer.index(name=index_name, collection=collection, overwrite=True)



[Mar 01, 18:56:47] #> Note: Output directory /Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/experiments/notebook/indexes/effective.train.2bits already exists


#> Starting...
#> Starting...
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 20,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 400000,
    "save_every": null,
    "warmup": 20000,
    "warmup_bert": null,
    "relu": false,
    "nway": 64,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": "bert-base-uncased",
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 180,
    "mask_punctuat



[Mar 01, 18:56:51] [0] 		 # of sampled PIDs = 9326 	 sampled_pids[:3] = [6825, 166, 4892]
[Mar 01, 18:56:51] [0] 		 #> Encoding 1866 passages..
#> Starting...
[Mar 01, 18:56:54] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Mar 01, 18:56:54] [1] 		 #> Encoding 1866 passages..




#> Starting...
[Mar 01, 18:56:59] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Mar 01, 18:57:00] [2] 		 #> Encoding 1866 passages..




#> Starting...
[Mar 01, 18:57:05] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Mar 01, 18:57:05] [3] 		 #> Encoding 1866 passages..




[Mar 01, 18:57:13] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Mar 01, 18:57:13] [4] 		 #> Encoding 1862 passages..


100%|██████████| 30/30 [20:00<00:00, 40.00s/it]


[Mar 01, 19:16:52] [0] 		 avg_doclen_est = 68.29689025878906 	 len(local_sample) = 1,866
[Mar 01, 19:16:52] [0] 		 Creaing 8,192 partitions.
[Mar 01, 19:16:52] [0] 		 *Estimated* 636,936 embeddings.
[Mar 01, 19:16:52] [0] 		 #> Saving the indexing plan to /Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/experiments/notebook/indexes/effective.train.2bits/plan.json ..


Process Process-8:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ran

[Mar 01, 19:16:55] [1] 		 avg_doclen_est = 67.5959243774414 	 len(local_sample) = 1,866


Process Process-9:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ran

[Mar 01, 19:17:08] [2] 		 avg_doclen_est = 59.00053405761719 	 len(local_sample) = 1,866


Process Process-10:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ra

[Mar 01, 19:17:10] [3] 		 avg_doclen_est = 59.81671905517578 	 len(local_sample) = 1,866


Process Process-11:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ra

[Mar 01, 19:17:11] [4] 		 avg_doclen_est = 70.16970825195312 	 len(local_sample) = 1,862


Process Process-12:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ra

KeyboardInterrupt: 

0

In [75]:
indexer.get_index() 

'/Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/experiments/notebook/indexes/effective.train.2bits'

In [76]:
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index=index_name)

[Mar 01, 19:27:29] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Mar 01, 19:27:29] #> Loading codec...




FileNotFoundError: [Errno 2] No such file or directory: '/Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/experiments/notebook/indexes/effective.train.2bits/centroids.pt'

#### Store

In [1]:
from fastrag.stores import PLAIDDocumentStore
import fastrag, torch

# https://github.com/IntelLabs/fastRAG/blob/main/models.md

store = PLAIDDocumentStore(index_path="",
                           checkpoint_path="downloads/,
                           collection_path=collection)




TypeError: PLAIDDocumentStore.__init__() missing 1 required positional argument: 'index_path'

#### Retriever

In [None]:
from fastrag.retrievers.colbert import ColBERTRetriever
retriever = ColBERTRetriever(store)

In [None]:
# display the answer
res['answers'][0].answer