## Explore Data

In [1]:
import os
import pandas as pd

In [4]:
def read_files(path):
    file_contents = dict()

    for filename in os.listdir(path):
        if filename.endswith('.txt'):
            with open(os.path.join(path, filename), 'r') as f:
                content = f.read()
                file_contents[filename] = content

    print("... Reading files in path : ", path)
    print("Number of files read: ", len(file_contents))

    return file_contents

In [8]:
dataroot = '/Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/data/'
effectiveness_contents = read_files(os.path.join(dataroot, 'effectiveness/train')) # 4191
label_contents = read_files(os.path.join(dataroot, 'label/train')) # 15594

... Reading files in path :  /Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/data/effectiveness/train
Number of files read:  4191
... Reading files in path :  /Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/data/label/train
Number of files read:  15594


In [9]:
train = pd.read_csv(os.path.join(dataroot, 'effectiveness/train.csv'))
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36765 entries, 0 to 36764
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   discourse_id             36765 non-null  object
 1   essay_id                 36765 non-null  object
 2   discourse_text           36765 non-null  object
 3   discourse_type           36765 non-null  object
 4   discourse_effectiveness  36765 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


In [7]:
train['discourse_text'].map(lambda x: len(x)).sort_values(ascending=False)

19526    4099
2380     3808
15688    3558
25551    3301
5679     3135
         ... 
9997        6
2909        6
22782       6
21443       6
26718       4
Name: discourse_text, Length: 36765, dtype: int64

In [11]:
train = (train[train['discourse_effectiveness'] == 'Effective']
         .reset_index()
         .reset_index()[['level_0', 'discourse_text']]
         .replace(r'\n',' ', regex=True) 
        )
train['discourse_text'].str.strip()
train.head(3)

Unnamed: 0,level_0,discourse_text
0,0,Limiting the usage of cars has personal and pr...
1,1,With so many things in this world that few peo...
2,2,It is no secret that morning traffic jams and ...


In [12]:
train.info

<bound method DataFrame.info of       level_0                                     discourse_text
0           0  Limiting the usage of cars has personal and pr...
1           1  With so many things in this world that few peo...
2           2  It is no secret that morning traffic jams and ...
3           3  the environment suffers greatly from the many ...
4           4  "Passenger cars are responsible for 12 percent...
...       ...                                                ...
9321     9321  We as humans need social interaction, that is ...
9322     9322  One major difference between regular school an...
9323     9323  A significant problem in schools today, is stu...
9324     9324  While home school give students the opportunit...
9325     9325  Technology is a major factor in our world toda...

[9326 rows x 2 columns]>

In [11]:
test = pd.read_csv(os.path.join(dataroot, 'effectiveness/test.csv'))
test.sample(3)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead
8,739a6d00f44a,D72CB1C11673,Taking other peoples advice and doing what the...,Evidence


In [12]:
test = test.reset_index()[['index', 'discourse_text']].replace(r'\n',' ', regex=True) 
test['discourse_text'].str.strip()
test.head(3)

Unnamed: 0,index,discourse_text
0,0,Making choices in life can be very difficult. ...
1,1,Seeking multiple opinions can help a person ma...
2,2,it can decrease stress levels


##  Retrieval Augmented Generation
> * Learn to retrieve a sequence from an existing corpus of human-written prototypes (e.g., dialogue responses)
> * Learn to edit the retrieved sequence by adding, removing, and modifying tokens in the prototype – this will still result in a more “human-like” generation


### FastRAG

`ColBERT` (dense retriever) 
1. use NN to encode all documents into representative vectors
2. encodes query into a vector and using vector similarity search

`PLAID` engine
* use a set of filtering steps to improve latency times for ColBERT-based indexes


    `PLAIDDocumentStore` document store class
    * `collection_path` is the path to the documents collection, in the form of a TSV file with columns being "id,content,title" where the title is optional.
    * `checkpoint_path` is the path for the encoder model, needed to encode queries into vectors at run time. Could be a local path to a model or a model hosted on HuggingFace hub. In order to use our trained model based on NaturalQuestions, provide the path Intel/ColBERT-NQ; see Model Hub for more details.
    * `index_path` location of the indexed documents. The index contains the optimized and compressed vector representation of all the documents. Index can be created by the user given a collection and a checkpoint, or can be specified via a path.

`Fusion-in-Decoder` (`FiD`)
* transformer-based generative model (based on T5 architecture)

#### Create Index

In [7]:
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher

ModuleNotFoundError: No module named 'colbert'

In [13]:
dataroot = 'data'
dataset = 'effective'
datasplit = 'train'


collection = os.path.join(dataroot, dataset, datasplit, 'collection.tsv')

In [14]:
queries = os.path.join(dataroot, dataset, datasplit, 'questions.search.sample.tsv')
with open(queries, 'w') as write_tsv:
   write_tsv.write(test.to_csv(sep='\t', index=False, header=False))

In [25]:
df = pd.read_csv('/Users/az/Documents/Stanford/Classes/CS224N/final/argue-better-fastRAG/data/effective/res_t5.csv')
df = df.drop(columns=['idx','res'])
df['query'] = df['query'].replace(r'"',' ', regex=True) 
queries = os.path.join(dataroot, dataset, datasplit, 'questions.search.test.tsv')
with open(queries, 'w') as write_tsv:
   write_tsv.write(df.to_csv(sep='\t', index=False, header=False))

In [None]:
with open(collection, 'w') as write_tsv:
   write_tsv.write(train.to_csv(sep='\t', index=False, header=False))

with open(queries, 'w') as write_tsv:
   write_tsv.write(test.to_csv(sep='\t', index=False, header=False))

tsv_read = pd.read_csv(collection, sep='\t')
tsv_read.head(3)

In [4]:
queries = Queries(path=queries)
collection = Collection(path=collection)

f'Loaded {len(queries)} queries and {len(collection):,} passages'

[Mar 02, 22:47:17] #> Loading the queries from data/effective/train/questions.search.tsv ...
[Mar 02, 22:47:17] #> Got 10 queries. All QIDs are unique.

[Mar 02, 22:47:17] #> Loading collection...
0M 


'Loaded 10 queries and 9,326 passages'

In [8]:
import argparse
import logging
from pathlib import Path

from fastrag.stores import PLAIDDocumentStore

nbits = 2
gpus = 0
ranks = 1
index_name = f'{dataset}.{datasplit}.{nbits}bits'

In [9]:
store = PLAIDDocumentStore(
    index_path=index_name,
    checkpoint_path="Intel/ColBERT-NQ",
    collection_path=collection,
    create=True,
    nbits=nbits,
    gpus=gpus,
    ranks=ranks,
    doc_maxlen=120,
    query_maxlen=60,
    kmeans_niters=4,
)



[Mar 02, 23:11:16] #> Note: Output directory effective.train.2bits/ already exists


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3433, in run_code
  File "/var/folders/br/ld72h9496vs6sr3jsfw4qkhm0000gn/T/ipykernel_42688/3750005940.py", line 1, in <module>
    store = PLAIDDocumentStore(
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/haystack/nodes/base.py", line 48, in wrapper_exportable_to_yaml
    init_func(self, *args, **kwargs)
  File "/Users/az/Documents/Stanford/Classes/CS224N/final/argue-better-fastRAG/fastrag/stores/plaid.py", line 62, in __init__
    self._create_index()
  File "/Users/az/Documents/Stanford/Classes/CS224N/final/argue-better-fastRAG/fastrag/stores/plaid.py", line 99, in _create_index
    indexer.index("", collection=self.collection_path, overwrite=True)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexer.py", line 77, in index
    self.__launch(collection)
  File "/Users/az/opt/an

ERROR:posthog:error uploading: Could not find a suitable TLS CA certificate bundle, invalid path: /Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/certifi/cacert.pem


In [74]:
with Run().context(RunConfig(nranks=5, experiment='notebook')):

    config = ColBERTConfig(
        nbits=nbits,
    )
    indexer = Indexer(checkpoint='downloads/colbertv2.0', config=config)
    print("start indexing")
    indexer.index(name=index_name, collection=collection, overwrite=True)



[Mar 01, 18:56:47] #> Note: Output directory /Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/experiments/notebook/indexes/effective.train.2bits already exists


#> Starting...
#> Starting...
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 20,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 400000,
    "save_every": null,
    "warmup": 20000,
    "warmup_bert": null,
    "relu": false,
    "nway": 64,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": "bert-base-uncased",
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 180,
    "mask_punctuat



[Mar 01, 18:56:51] [0] 		 # of sampled PIDs = 9326 	 sampled_pids[:3] = [6825, 166, 4892]
[Mar 01, 18:56:51] [0] 		 #> Encoding 1866 passages..
#> Starting...
[Mar 01, 18:56:54] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Mar 01, 18:56:54] [1] 		 #> Encoding 1866 passages..




#> Starting...
[Mar 01, 18:56:59] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Mar 01, 18:57:00] [2] 		 #> Encoding 1866 passages..




#> Starting...
[Mar 01, 18:57:05] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Mar 01, 18:57:05] [3] 		 #> Encoding 1866 passages..




[Mar 01, 18:57:13] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...




[Mar 01, 18:57:13] [4] 		 #> Encoding 1862 passages..


100%|██████████| 30/30 [20:00<00:00, 40.00s/it]


[Mar 01, 19:16:52] [0] 		 avg_doclen_est = 68.29689025878906 	 len(local_sample) = 1,866
[Mar 01, 19:16:52] [0] 		 Creaing 8,192 partitions.
[Mar 01, 19:16:52] [0] 		 *Estimated* 636,936 embeddings.
[Mar 01, 19:16:52] [0] 		 #> Saving the indexing plan to /Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/experiments/notebook/indexes/effective.train.2bits/plan.json ..


Process Process-8:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ran

[Mar 01, 19:16:55] [1] 		 avg_doclen_est = 67.5959243774414 	 len(local_sample) = 1,866


Process Process-9:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ran

[Mar 01, 19:17:08] [2] 		 avg_doclen_est = 59.00053405761719 	 len(local_sample) = 1,866


Process Process-10:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ra

[Mar 01, 19:17:10] [3] 		 avg_doclen_est = 59.81671905517578 	 len(local_sample) = 1,866


Process Process-11:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ra

[Mar 01, 19:17:11] [4] 		 avg_doclen_est = 70.16970825195312 	 len(local_sample) = 1,862


Process Process-12:
Traceback (most recent call last):
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/infra/launcher.py", line 117, in setup_new_process
    return_val = callee(config, *args)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 31, in encode
    encoder.run(shared_lists)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/indexing/collection_indexer.py", line 57, in run
    distributed.barrier(self.rank)
  File "/Users/az/opt/anaconda3/envs/cs224n/lib/python3.10/site-packages/colbert/utils/distributed.py", line 38, in barrier
    torch.distributed.barrier(device_ids=[ra

KeyboardInterrupt: 

0

In [75]:
indexer.get_index() 

'/Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/experiments/notebook/indexes/effective.train.2bits'

In [76]:
with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index=index_name)

[Mar 01, 19:27:29] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Mar 01, 19:27:29] #> Loading codec...




FileNotFoundError: [Errno 2] No such file or directory: '/Users/az/Documents/Stanford/Classes/CS224N/final/argue-better/experiments/notebook/indexes/effective.train.2bits/centroids.pt'

#### Store

In [1]:
from fastrag.stores import PLAIDDocumentStore
import fastrag, torch

# https://github.com/IntelLabs/fastRAG/blob/main/models.md

store = PLAIDDocumentStore(index_path="",
                           checkpoint_path="downloads/,
                           collection_path=collection)




TypeError: PLAIDDocumentStore.__init__() missing 1 required positional argument: 'index_path'

#### Retriever

In [None]:
from fastrag.retrievers.colbert import ColBERTRetriever
retriever = ColBERTRetriever(store)

In [None]:
# display the answer
res['answers'][0].answer