In [6]:
import os
import pandas as pd

In [2]:
def read_files(path):
    file_contents = dict()

    for filename in os.listdir(path):
        if filename.endswith('.txt'):
            with open(os.path.join(path, filename), 'r') as f:
                content = f.read()
                file_contents[filename] = content

    print("... Reading files in path : ", path)
    print("Number of files read: ", len(file_contents))

    return file_contents

In [3]:
effectiveness_contents = read_files('data/effectiveness/train') # 4191
label_contents = read_files('data/label/train') # 15594

... Reading files in path :  data/effectiveness/train
Number of files read:  4191
... Reading files in path :  data/label/train
Number of files read:  15594


In [73]:
train = pd.read_csv('data/effectiveness/train.csv')
train.sample(3)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
16014,44ede99f7d41,160060CC2418,"To reiterate, students should be required to perform a community service bec...",Concluding Statement,Effective
1942,25212280c666,2352FF50900C,"While yes, the student can use online sources and copy information from that...",Counterclaim,Effective
7555,2ad86450e30f,8836AD393A34,"reducing automoblie use in the US should be a goal, but it cannot be approac...",Concluding Statement,Adequate


In [74]:
train = (train[train['discourse_effectiveness'] == 'Effective']
         .reset_index()
         .reset_index()[['level_0', 'discourse_text']]
        )
train.head(3)

Unnamed: 0,level_0,discourse_text
0,0,Limiting the usage of cars has personal and professional support all across ...
1,1,"With so many things in this world that few people agree on, this is a nice c..."
2,2,It is no secret that morning traffic jams and 5'o'clock traffic is often eno...


In [75]:
test = pd.read_csv('data/effectiveness/test.csv')
test.sample(3)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type
9,bcfae2c9a244,D72CB1C11673,You can learn from others experiences by seeking the advice that someone giv...,Concluding Statement
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim


In [76]:
test = df.reset_index()[['index', 'discourse_text']]
test.head(3)

Unnamed: 0,index,discourse_text
0,0,Making choices in life can be very difficult. People often ask for advice wh...
1,1,Seeking multiple opinions can help a person make a better choice
2,2,it can decrease stress levels


# Retrieval Augmented Generation
* Learn to retrieve a sequence from an existing corpus of human-written prototypes (e.g., dialogue responses)
* Learn to edit the retrieved sequence by adding, removing, and modifying tokens in the prototype – this will still result in a more “human-like” generation


### Models
`ColBERT` (dense retriever) 
1. use NN to encode all documents into representative vectors
2. encodes query into a vector and using vector similarity search

`PLAID` engine
* use aset of filtering steps to improve latency times for ColBERT-based indexes


    `PLAIDDocumentStore` document store class
    * `collection_path` is the path to the documents collection, in the form of a TSV file with columns being "id,content,title" where the title is optional.
    * `checkpoint_path` is the path for the encoder model, needed to encode queries into vectors at run time. Could be a local path to a model or a model hosted on HuggingFace hub. In order to use our trained model based on NaturalQuestions, provide the path Intel/ColBERT-NQ; see Model Hub for more details.
    * `index_path` location of the indexed documents. The index contains the optimized and compressed vector representation of all the documents. Index can be created by the user given a collection and a checkpoint, or can be specified via a path.

`Fusion-in-Decoder` (`FiD`)
* transformer-based generative model (based on T5 architecture)

In [35]:
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher

In [None]:
import os
import sys
sys.path.insert(0, '../')
!mkdir -p downloads/

# ColBERTv2 checkpoint trained on MS MARCO Passage Ranking (388MB compressed)
!wget https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz -P downloads/
!tar -xvzf downloads/colbertv2.0.tar.gz -C downloads/

# The LoTTE dev and test sets (3.4GB compressed)
!wget https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/lotte.tar.gz -P downloads/
!tar -xvzf downloads/lotte.tar.gz -C downloads/

In [86]:
dataroot = 'downloads/lotte'
dataset = 'lifestyle'
datasplit = 'dev'

queries = os.path.join(dataroot, dataset, datasplit, 'questions.search.tsv')
collection = os.path.join(dataroot, dataset, datasplit, 'collection.tsv')
tsv_read = pd.read_csv(collection, sep='\t')
tsv_read.head(3)

Unnamed: 0,0,"In my experience rabbits are very easy to housebreak. They like to pee and poop in the same place every time, so in most cases all you have to do is put a little bit of their waste in the litter box and they will happily use the litter box. It is very important that if they go somewhere else, miss the edge or kick waste out of the box that you clean it up well and immediately as otherwise those spots will become existing places to pee and poop. When you clean the box, save a little bit of waste and put it in the cleaned box so it smells right to them. For a more foolproof method, you can get a piece of wood soaked with their urine and put that in the box along with droppings or cage them so that they are only in their litter box for a week. Generally, if I try the first method and find that they are not using only the box on the first day, I go for the litter box only for a week method. The wood block works well if you are moving from a hutch outdoors to a litter box indoors. If you have an indoor cage, you can use the cage itself as the litter box (or attach a litter box to the section of the cage the rabbit has used for waste.) Be sure to use clay or newsprint litter as the other types aren't necessarily good for rabbits. Wood litter is okay if you are sure it isn't fir. The most important thing is to clean anywhere they have an accident. High sided boxes help with avoiding kicking soiled litter out of the box, which is the biggest cause of failure in my experience."
0,1,"...rabbits can be easily trained to use a litter tray, sometimes with more r..."
1,2,It could be a multitude of things. Lack of exercise plays a big role in how ...
2,3,I've had a lot of success with crate training. Dogs won't relieve themselves...


In [87]:
queries = Queries(path=queries)
collection = Collection(path=collection)

f'Loaded {len(queries)} queries and {len(collection):,} passages'

[Mar 01, 18:14:59] #> Loading the queries from downloads/lotte/lifestyle/dev/questions.search.tsv ...
[Mar 01, 18:14:59] #> Got 417 queries. All QIDs are unique.

[Mar 01, 18:14:59] #> Loading collection...
0M 


'Loaded 417 queries and 268,893 passages'

In [None]:
with Run().context(RunConfig(nranks=1, experiment="msmarco")):

    config = ColBERTConfig(
        nbits=2,
    )
    indexer = Indexer(checkpoint="Intel/ColBERT-NQ", config=config)
    indexer.index(name="msmarco.nbits=2", collection="data/train.tsv", overwrite=True)

In [1]:
from fastrag.stores import PLAIDDocumentStore
import fastrag, torch

# https://github.com/IntelLabs/fastRAG/blob/main/models.md

store = PLAIDDocumentStore(index_path="",
                           checkpoint_path="downloads/,
                           collection_path="data/train.tsv")




TypeError: PLAIDDocumentStore.__init__() missing 1 required positional argument: 'index_path'

In [None]:
from fastrag.retrievers.colbert import ColBERTRetriever
retriever = ColBERTRetriever(store)

In [None]:
# display the answer
res['answers'][0].answer