In [1]:
!pip uninstall -y pyarrow
!pip install -qq -U pyarrow==15.0.2
!pip install -qq -U datasets
!pip install -qq -U transformers
!pip install -qq -U langchain-community
!pip install -qq -U sentence_transformers==2.2.2
!pip install -qq -U faiss-gpu

Found existing installation: pyarrow 15.0.2
Uninstalling pyarrow-15.0.2:
  Successfully uninstalled pyarrow-15.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.8.2 requires cubinlinker, which is not installed.
cudf 24.8.2 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.8.2 requires ptxcompiler, which is not installed.
cuml 24.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.8.2 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires cloudpickle~=2.2.1, but you have cloudpickle 3.0.0 which is incompatible.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 whic

In [2]:
import pandas as pd
from datasets import Dataset
import torch
from transformers import DataCollatorForSeq2Seq
from transformers import  Trainer, TrainingArguments
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import TextStreamer,pipeline

In [3]:
path = '/kaggle/input/newsqa/NewsQA.feather'

In [4]:
df = pd.read_feather(path)
df

Unnamed: 0,question,answer,paragraph
0,Who is the managing director of Synergee Capital?,Vikram Dalal,"""Investors can use a combination of governmen..."
1,What is the yield of 30- and 40-year governmen...,7%,"""Investors can use a combination of governmen..."
2,What is the name of the ETF 2027 that a conser...,SDL,"According to financial planners, an example o..."
3,When would a conservative fixed income investo...,2027,"According to financial planners, an example o..."
4,What year would a conservative fixed income in...,2040,"According to financial planners, an example o..."
...,...,...,...
481753,When does Uncle Sam reopen for fully vaccinate...,November 8,NEW DELHI: This could be the last expansion of...
481754,When will there be three more weekly flights b...,from second week of November,It currently has 23 weekly flights to America....
481755,What type of 777s would have helped AI have mo...,Boeing,It currently has 23 weekly flights to America....
481756,What was the first wave of AI nonstops?,second,"Before the second wave this summer, AI had abo..."


In [5]:
df = df[:5000]

In [6]:
# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [7]:
dataset = dataset.train_test_split(test_size=0.2)  # Adjust `test_size` as needed

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'paragraph'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['question', 'answer', 'paragraph'],
        num_rows: 1000
    })
})

In [9]:
model_name = "google-t5/t5-small"

In [10]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [11]:
def preprocess_function(examples):
    # Concatenate question and context
    
    template = """
    Use the following piece of context to answer the question in less than 30 words.

    Context: {context}

    Question: {question}

    Answer: """

    # Apply the template to create inputs by formatting the question and context
    inputs = [
        template.format(question=question, context=context) 
        for question, context in zip(examples['question'], examples['paragraph'])
    ]

    # Tokenize the inputs
    tokenized_inputs = tokenizer(inputs,
                                 padding="max_length",
                                 truncation=True,
                                 return_tensors="pt",
                                 max_length=400)

    # Tokenize the answers as labels
    labels = tokenizer(examples['answer'],
                       padding="max_length",
                       truncation=True,
                       return_tensors="pt",
                       max_length=30)

    return {
        'input_ids': tokenized_inputs['input_ids'].squeeze(),
        'attention_mask': tokenized_inputs['attention_mask'].squeeze(),
        'labels': labels['input_ids'].squeeze()
    }

In [12]:
trained_dataset = dataset["train"].map(preprocess_function,remove_columns= ['question', 'answer', 'paragraph'], batch_size=64,batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [13]:
validation_dataset = dataset["test"].map(preprocess_function,remove_columns= ['question', 'answer', 'paragraph'], batch_size=64,batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
trained_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4000
})

In [15]:
validation_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [17]:
EPOCHS = 20

In [18]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    overwrite_output_dir=True, # This reduces the amt of disk space that gets used.
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    num_train_epochs=EPOCHS,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1, 
    report_to="none",
    load_best_model_at_end=True
)


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=trained_dataset,
    eval_dataset=validation_dataset
)

In [20]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,0.064312
2,No log,0.041741
3,No log,0.035411
4,No log,0.032183
5,0.438900,0.029776
6,0.438900,0.028346
7,0.438900,0.026523
8,0.438900,0.025242
9,0.438900,0.025004
10,0.029400,0.02429


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2000, training_loss=0.12722103691101075, metrics={'train_runtime': 1270.1419, 'train_samples_per_second': 62.985, 'train_steps_per_second': 1.575, 'total_flos': 8458862592000000.0, 'train_loss': 0.12722103691101075, 'epoch': 20.0})

In [21]:
loader = DataFrameLoader(df, page_content_column="paragraph")


text_splitter = RecursiveCharacterTextSplitter(chunk_size=800,
                                               chunk_overlap=20)

documents = loader.load_and_split(text_splitter=text_splitter)


model_name1 = "sentence-transformers/all-MiniLM-L6-v2"

model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name1, model_kwargs=model_kwargs)

vectordb = FAISS.from_documents(
        documents = documents, 
        embedding = embeddings
    )

  embeddings = HuggingFaceEmbeddings(model_name=model_name1, model_kwargs=model_kwargs)


In [22]:
streamer = TextStreamer(tokenizer,
                        skip_prompt = True,
                        skip_special_tokens = True)

pipe =  pipeline(task = 'text2text-generation',
                         model = model,
                         tokenizer = tokenizer,
                         temperature=0.5,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
                         top_p=0.25,  # select from top tokens whose probability add up to 15%
                         top_k=0,  # select from top 0 tokens (because zero, relies on top_p)
                         max_new_tokens=10,  # mex number of tokens to generate in the output
                         repetition_penalty=1.1,  # without this output begins repeating
                         do_sample = True,
                         )

llm = HuggingFacePipeline(pipeline = pipe)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
  llm = HuggingFacePipeline(pipeline = pipe)


In [23]:
prompt_template = """
                Use following piece of context to answer the question in less than 30 words.

                Context : {context}

                Question : {question}

                Answer : """


PROMPT = PromptTemplate(
    template = prompt_template, 
    input_variables = ["context", "question"]
)


retriever = vectordb.as_retriever(search_kwargs = {"k": 5})

qa_chain = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
    retriever = retriever, 
    chain_type_kwargs = {"prompt": PROMPT},
    return_source_documents = True,
    verbose = False
)

In [24]:
def predict(idx):
    question = df.iloc[idx]['question']
    print("\nQuestion : ",question)
    print("\nContext : ",df.iloc[idx]['paragraph'])
    print("\nActual : ",df.iloc[idx]['answer'])
    result = qa_chain(question)
    print("\nPredicted : ", result['result'])
    return result

In [25]:
predict(8)

  result = qa_chain(question)
Token indices sequence length is longer than the specified maximum sequence length for this model (555 > 512). Running this sequence through the model will result in indexing errors



Question :  Who is receiving their first dose of the COVID-19 vaccine?


Actual :  Americans

Predicted :  Americans


{'query': 'Who is receiving their first dose of the COVID-19 vaccine?',
 'result': 'Americans',

In [26]:
predict(10)


Question :  How many doses of the COVID-19 vaccine did President Joe Biden want to deliver by July 4?


Actual :  at least one

Predicted :  at least one


{'query': 'How many doses of the COVID-19 vaccine did President Joe Biden want to deliver by July 4?',
 'result': 'at least one',

In [32]:
predict(22)


Question :  According to what agency, 138 million Americans have been fully vaccinated with one of the two mRNA vaccines?

Context :   Moderna said it is aware of reports of heart inflammation cases following administration of mRNA vaccines. It said it is working with public health and regulatory authorities to assess the issue. Over 138 million Americans have so far been fully vaccinated with one of the two mRNA vaccines, according to CDC data as of Monday.

Actual :  CDC

Predicted :  CDC


{'query': 'According to what agency, 138 million Americans have been fully vaccinated with one of the two mRNA vaccines?',
 'result': 'CDC',
 'source_documents': [Document(metadata={'question': 'On what date did the CDC release data on the mRNA vaccine?', 'answer': 'Monday'}, page_content='Moderna said it is aware of reports of heart inflammation cases following administration of mRNA vaccines. It said it is working with public health and regulatory authorities to assess the issue. Over 138 million Americans have so far been fully vaccinated with one of the two mRNA vaccines, according to CDC data as of Monday.'),
  Document(metadata={'question': 'According to what agency, 138 million Americans have been fully vaccinated with one of the two mRNA vaccines?', 'answer': 'CDC'}, page_content='Moderna said it is aware of reports of heart inflammation cases following administration of mRNA vaccines. It said it is working with public health and regulatory authorities to assess the issue. Over

In [30]:
predict(34)


Question :  Who did Blinken say would act with even greater impunity if they did not?

Context :   Secretary of State Antony Blinken said on Wednesday that the United States was ready to confront China where need be, calling the Asian power the “biggest geopolitical test” of the century. In his first major speech, Blinken vowed that President Joe Biden’s administration will emphasise diplomacy over military action and build cooperation with the world on global challenges such as climate change and Covid-19. “We will manage the biggest geopolitical test of the 21st century: our relationship with China,” Blinken said at the State Department. He promised to champion the rights of Hong Kong and the ethnic Uighurs, saying that if not, “China will act with even greater impunity”. “China is the only country with the economic, diplomatic, military and technological power to seriously challenge the stable and open international system — all the rules, values and relationships that make the wor

{'query': 'Who did Blinken say would act with even greater impunity if they did not?',
 'result': 'China',
 'source_documents': [Document(metadata={'question': 'Who ordered an air strike in Syria against Iranian-linked Iraqi Shiite paramilitaries?', 'answer': 'Biden'}, page_content='Secretary of State Antony Blinken said on Wednesday that the United States was ready to confront China where need be, calling the Asian power the “biggest geopolitical test” of the century. In his first major speech, Blinken vowed that President Joe Biden’s administration will emphasise diplomacy over military action and build cooperation with the world on global challenges such as climate change and Covid-19. “We will manage the biggest geopolitical test of the 21st century: our relationship with China,” Blinken said at the State Department. He promised to champion the rights of Hong Kong and the ethnic Uighurs, saying that if not, “China will act with even greater impunity”. “China is the only country wit

In [29]:
import shutil
shutil.make_archive('/kaggle/working/FinetunedFlanT5', 'zip', '/kaggle/working/my_awesome_model/checkpoint-2000')


'/kaggle/working/FinetunedFlanT5.zip'