In [1]:
#pip install transformers datasets faiss-cpu

In [2]:
import transformers
import datasets
import pandas as pd
from datasets import Dataset
import numpy as np
import faiss
import torch
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, RagConfig
from transformers import AutoConfig, RagRetriever, BartConfig
from datasets import load_from_disk
from nltk.corpus import stopwords
import nltk
import string
from nltk.stem import WordNetLemmatizer
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
import torch
from transformers import RagTokenizer, BartTokenizer
from transformers import AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset_path = "FinalModel/dataset"
dataset = load_from_disk(dataset_path)

In [4]:
df = dataset.to_pandas()

In [5]:
df = df.drop(["__index_level_0__"], axis = 1)

In [6]:
df

Unnamed: 0,text,title
0,The B.Tech./ Dual Degree/ B.S. programmes con...,b.tech./
1,"sciences, engineering and technology and other...",sequence studies broadly consists
2,of three phases.,Three phases. three
3,The first phase is an intense study of science...,First phase intense study
4,of concepts than what was done in school.,concepts done school.
...,...,...
1222,Consolidated statement of the Academic Perform...,Student performance is based
1223,for all the semesters completed.,Two more semesters
1224,: Under-Graduate Academic Performance Evaluat...,Under-graduate academic
1225,: Under-Graduate Programmes Committee,Under-graduate programmes


In [7]:
nltk.download('stopwords')
english_stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akshaankhan2004/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
word_net_lemmatizer = WordNetLemmatizer()

In [9]:
def remove_punct(text):
    punct_free = "".join([i for i in text if i not in string.punctuation])
    return punct_free

In [10]:
df["preprocessed_text"] = df['text'].apply(lambda x: remove_punct(x))


In [11]:
df["preprocessed_text"]

0       The BTech Dual Degree BS  programmes consist o...
1       sciences engineering and technology and other ...
2                                       of three phases  
3       The first phase is an intense study of science...
4              of concepts than what was done in school  
                              ...                        
1222    Consolidated statement of the Academic Perform...
1223                     for all the semesters completed 
1224      UnderGraduate Academic Performance Evaluatio...
1225                  UnderGraduate Programmes Committee 
1226                        Undergraduate Research Award 
Name: preprocessed_text, Length: 1227, dtype: object

In [12]:
def tokenize_lemmatize(df):
    preprocessed = []
    for sen in df["preprocessed_text"]:
        tokens = sen.split()
        tokens = [word_net_lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in english_stopwords]
        sen1 = " ".join(tokens)
        preprocessed.append(sen1)
    df["preprocessed_text"] = preprocessed
    return df
        

In [13]:
df69 = tokenize_lemmatize(df)

In [14]:
df69["preprocessed_text"]

0       btech dual degree b programme consist course b...
1       science engineering technology related topic s...
2                                             three phase
3       first phase intense study science mathematics ...
4                                     concept done school
                              ...                        
1222    consolidated statement academic performance st...
1223                                   semester completed
1224    undergraduate academic performance evaluation ...
1225                    undergraduate programme committee
1226                         undergraduate research award
Name: preprocessed_text, Length: 1227, dtype: object

In [15]:
torch.set_grad_enabled(False)
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

In [16]:
ds_with_embeddings = dataset.map(lambda example: {'embeddings': ctx_encoder(**ctx_tokenizer(example["text"], return_tensors="pt"))[0][0].numpy()})
ds_with_embeddings.add_faiss_index(column='embeddings')

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1227/1227 [00:58<00:00, 20.92 examples/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:00<00:00, 698.88it/s]


Dataset({
    features: ['text', 'title', '__index_level_0__', 'embeddings'],
    num_rows: 1227
})

In [17]:
ds_with_embeddings.save_faiss_index('embeddings', 'FinalModel/index3.faiss')
ds_with_embeddings.get_index("embeddings").save("FinalModel/index3.faiss")

In [18]:
df0 = ds_with_embeddings.to_pandas()
df90 = df0.drop(["__index_level_0__"], axis = 1)
ds_with_embeddings = Dataset.from_pandas(df90)
ds_with_embeddings.save_to_disk("FinalModel/Dataset3")

Saving the dataset (1/1 shards): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1227/1227 [00:00<00:00, 342751.32 examples/s]


MODEL1

In [19]:
dataset_path = "FinalModel/dataset3"  # dataset saved via *dataset.save_to_disk(...)*
index_path = "FinalModel/index3.faiss"  # faiss index saved via *dataset.get_index("embeddings").save(...)*


retriever_config = AutoConfig.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_encoder_config = AutoConfig.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
generator_config = AutoConfig.from_pretrained("gpt2-large")
retriever_name_or_path = "facebook/rag-sequence-base"

retriever = RagRetriever.from_pretrained(
    retriever_name_or_path=retriever_name_or_path,
    retriever_config=retriever_config,
    question_encoder_config=question_encoder_config,
    generator_config=generator_config,
    index_name="custom",
    passages_path=dataset_path,
    index_path=index_path,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [20]:
question_encoder = AutoModel.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

Some weights of DPRQuestionEncoder were not initialized from the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base and are newly initialized: ['bert_model.encoder.layer.3.attention.self.value.weight', 'bert_model.encoder.layer.1.output.LayerNorm.weight', 'bert_model.encoder.layer.0.output.dense.weight', 'bert_model.encoder.layer.1.attention.output.dense.weight', 'bert_model.encoder.layer.7.output.LayerNorm.bias', 'bert_model.encoder.layer.11.attention.output.dense.weight', 'bert_model.encoder.layer.11.attention.self.key.bias', 'bert_model.encoder.layer.4.output.LayerNorm.weight', 'bert_model.encoder.layer.5.attention.output.LayerNorm.bias', 'bert_model.encoder.layer.5.intermediate.dense.bias', 'bert_model.embeddings.position_embeddings.weight', 'bert_model.encoder.layer.3.attention.output.dense.bias', 'bert_model.encoder.layer.10.attention.self.key.weight', 'bert_model.encoder.layer.0.attention.self.key.bias', 'bert_model.encoder.layer.5.intermediate.dense.weight', 'bert_mode

In [21]:

rag_config = RagConfig(
    dataset="custom",
    index_name="custom",
    index_path=index_path,  # Replace with the correct path
    passages_path=dataset_path,  # Replace with the correct path
    question_encoder={"model_type": "bart"},  # Use a compatible model for the question encoder
    generator={"model_type": "bart"},
)



In [22]:
from transformers import AutoModelForCausalLM

In [23]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

In [24]:
#g_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

In [25]:
generator_model = AutoModelForCausalLM.from_pretrained("gpt2-large")

In [26]:
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq",
                                                retriever=retriever,
                                                config=rag_config,
                                                question_encoder=question_encoder,
                                                tokenizer="facebook/rag-sequence-nq",)
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
model.generator = generator_model

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [27]:
num_parameters = sum(p.numel() for p in model.parameters())
num_parameters

1289213184

In [28]:
def query_to_answer(queries, model):
    outp = []
    for query in queries:
        input_dict = tokenizer.prepare_seq2seq_batch(query, padding=True, truncation=True, return_tensors="pt")
        generated = model.generate(input_ids=input_dict["input_ids"])
        outp.append(tokenizer.batch_decode(generated, skip_special_tokens=True)[0]) 
    return outp

In [29]:
queries = ["What is ARP", 
           "What is required number of credits to complete a minor?", 
           "What is compensatory time for PwD students", 
           "What is number of additional credits required for Honors degree", 
           "What is NP grade", 
           "What is PP grade"]

In [30]:
answers = query_to_answer(queries, model)
answers



[' the academic rehabilitation program',
 ' 30',
 ' 20 minutes',
 ' 24',
 ' not pass',
 ' pass']

In [31]:
torch.save(model, 'FinalModel/FinalRAG-GPT2-LARGE.pth')

In [32]:
loaded_model = torch.load("FinalModel/FinalRAG-GPT2-LARGE.pth")

In [33]:
answers = query_to_answer(queries, loaded_model)
answers

[' the academic rehabilitation program',
 ' 30',
 ' 20 minutes',
 ' 24',
 ' not pass',
 ' pass']