In [None]:
#### Fine-Tuning Step-by-Step

#### Step 1: Load the Dataset, tokenizer, and model

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The tokenizer and the model are both loaded from the same checkpoint name,
# so it is spelled out once and reused.
BASE_CHECKPOINT = "facebook/bart-base"

# CNN/DailyMail (config "3.0.0") supplies the "article"/"highlights" pairs
# that the later cells tokenize and train on.
dataset = load_dataset("cnn_dailymail", "3.0.0")

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_CHECKPOINT)


In [None]:
#### Step 2: Preprocess the Data

def preprocess(example):
    """Tokenize articles and their reference summaries for seq2seq training.

    Args:
        example: A single row or a batch of rows (as passed by
            ``dataset.map(..., batched=True)``); its ``"article"`` and
            ``"highlights"`` entries are read.

    Returns:
        The tokenizer output for the articles (truncated/padded to 512 tokens)
        with an added ``"labels"`` entry: the summary token ids
        (truncated/padded to 128 tokens) in which every padding position has
        been replaced by -100.
    """
    inputs = tokenizer(example["article"], truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(example["highlights"], truncation=True, padding="max_length", max_length=128)

    # Padding positions must not contribute to the training loss. -100 is the
    # label value the loss ignores, so pad ids are swapped for it here.
    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], int):
        # Single (non-batched) example: one flat list of token ids.
        inputs["labels"] = mask_padding(label_ids)
    else:
        # Batched call: one list of token ids per example.
        inputs["labels"] = [mask_padding(sequence) for sequence in label_ids]
    return inputs

# Run preprocess over the dataset in batched mode, so each call receives
# a batch of rows rather than a single row.
tokenized_data = dataset.map(
    preprocess,
    batched=True,
)


In [None]:
#### Step 3: Use the Trainer API

from transformers import TrainingArguments, Trainer

from transformers import TrainingArguments

# Training hyperparameters, collected in one mapping so they are easy to scan
# and tweak before being handed to TrainingArguments.
training_options = {
    "output_dir": "./results",
    "per_device_train_batch_size": 4,
    "num_train_epochs": 1,
    "logging_dir": "./logs",
    "save_total_limit": 2,
    "logging_steps": 10,
}
args = TrainingArguments(**training_options)

# Wire the model to the tokenized train and validation splits.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
)

# Start the fine-tuning run.
trainer.train()


In [None]:
##### Case Study: Fine-Tuning a Text Summarizer


from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import evaluate
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Load model and tokenizer
model_name = "facebook/bart-large-cnn"  # Common summarization model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create summarization pipeline
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Evaluate on a small, fixed slice of the validation split to keep this quick
small_validation = dataset["validation"].select(range(9))

# Materialize the two columns as plain Python lists
sample_texts = list(small_validation["article"])
reference_summaries = list(small_validation["highlights"])

# Generate one summary per article. A failure on one article is reported and
# recorded, but does not abort the whole run.
generated_summaries = []
failed_indices = []
for index, text in enumerate(sample_texts):
    summary = None
    placeholder = "Summary generation failed."
    try:
        # Let the tokenizer truncate over-long articles to the model's input
        # limit. Slicing the raw string to 1024 *characters* would discard
        # most of the article, because the model's limit is counted in tokens.
        summary_output = summarizer(
            text,
            max_length=130,
            min_length=30,
            do_sample=False,
            truncation=True,
        )

        # Check if the output is valid
        if isinstance(summary_output, list) and summary_output:
            summary = summary_output[0]["summary_text"]
        else:
            print("Warning: Failed to generate summary, using placeholder")
    except Exception as e:
        print(f"Error processing text: {e}")
        placeholder = "Error in summary generation."

    if summary is None:
        failed_indices.append(index)
        summary = placeholder
    generated_summaries.append(summary)

# Make sure we have the same number of generated and reference summaries
assert len(generated_summaries) == len(reference_summaries), "Mismatch in number of summaries"

# Score only the samples that produced a real summary: placeholder strings
# would otherwise be compared against the references and distort ROUGE.
failed = set(failed_indices)
scored_predictions = [s for i, s in enumerate(generated_summaries) if i not in failed]
scored_references = [r for i, r in enumerate(reference_summaries) if i not in failed]
if failed_indices:
    print(f"Excluding {len(failed_indices)} failed sample(s) from ROUGE evaluation")

# Evaluate with ROUGE
rouge = evaluate.load("rouge")
if scored_predictions:
    results = rouge.compute(predictions=scored_predictions, references=scored_references)
else:
    # Nothing to score; keep the cell running and make the reason visible.
    print("No summaries were generated; skipping ROUGE evaluation.")
    results = {}

# Print results
print("ROUGE Evaluation:")
for key, value in results.items():
    print(f"{key}: {value:.4f}")



In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

# Step 1: Load PDF Document
loader = PyPDFLoader("HubSpots Guide to Data Analytics.pdf")
documents = loader.load()

# Step 2: Split into smaller chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=150)
docs = splitter.split_documents(documents)

# Step 3: Create a vector store with HuggingFace embeddings
embedding = HuggingFaceEmbeddings()
vectorstore = FAISS.from_documents(docs, embedding)

# Step 4: Retrieve top-k relevant documents
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # Increased k for more context
query = "What is data visualization ?"

# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated and emits a warning on every call.
relevant_docs = retriever.invoke(query)
context = " ".join(doc.page_content for doc in relevant_docs)

# Truncate context if it's too long
context = context[:2000]  # Character cap on the retrieved context placed in the prompt

# Step 5: Load a better generative QA pipeline
model_name = "google/flan-t5-large"
qa_pipeline = pipeline(
    "text2text-generation",
    model=model_name,
    tokenizer=model_name,
    max_length=300,  # Allow longer responses
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    do_sample=True
)

# Step 6: Refine prompt to encourage longer, clearer answers
prompt = f"""You are a helpful assistant.
Based on the following context, answer the question clearly and in 2-4 informative sentences.

Context: {context}

Question: {query}
Answer:"""

response = qa_pipeline(prompt)[0]["generated_text"]

# The emoji below were mis-encoded (mojibake) in the original strings.
print("\n📄 Query:", query)
print("✅ Generated Answer:\n", response.strip())


  embedding = HuggingFaceEmbeddings()
Device set to use cpu



📄 Query: What is data visualization ?
✅ Generated Answer:
 The term “data visualization” refers to how you can display numbers, statistics, and other data in a diagram or graph to make it easier to understand and present.


In [6]:
# app.py

import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

# Load model and setup pipeline
@st.cache_resource
def load_qa_pipeline():
    """Build the Flan-T5 text2text-generation pipeline used to answer questions.

    Decorated with ``st.cache_resource`` so the pipeline object is shared
    across Streamlit reruns.

    Returns:
        A ``text2text-generation`` pipeline for ``google/flan-t5-large`` with
        sampling enabled.
    """
    checkpoint = "google/flan-t5-large"
    # Generation settings forwarded to the pipeline as keyword arguments.
    generation_settings = {
        "max_length": 300,
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 0.95,
        "repetition_penalty": 1.2,
        "do_sample": True,
    }
    return pipeline(
        "text2text-generation",
        model=checkpoint,
        tokenizer=checkpoint,
        **generation_settings,
    )

import os
import tempfile


@st.cache_resource
def _build_retriever(pdf_bytes):
    """Index a PDF's text in FAISS and return a top-5 retriever over it.

    Cached on the raw bytes so that Streamlit reruns (for example, each new
    question) reuse the index instead of re-embedding the same document.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.

    Returns:
        A retriever over the document chunks, or ``None`` when the PDF yields
        no text chunks to index.
    """
    # PyPDFLoader takes a filesystem path, not an in-memory upload, so the
    # bytes are written to a temporary file first. The file is closed before
    # it is read and removed afterwards (delete=False keeps this working on
    # Windows, where an open temp file cannot be reopened by path).
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_bytes)
        tmp_path = tmp.name
    try:
        documents = PyPDFLoader(tmp_path).load()
    finally:
        os.remove(tmp_path)

    # Split into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=150)
    docs = splitter.split_documents(documents)
    if not docs:
        return None

    # Create vector store
    embedding = HuggingFaceEmbeddings()
    vectorstore = FAISS.from_documents(docs, embedding)
    return vectorstore.as_retriever(search_kwargs={"k": 5})


qa_pipeline = load_qa_pipeline()

# The emoji in the UI strings below were mis-encoded (mojibake) originally.
st.title("📄 PDF Q&A using RAG (FAISS + Flan-T5)")
st.markdown("Upload a PDF, ask any question based on its content.")

# Upload PDF
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

if uploaded_file:
    st.success("PDF uploaded successfully!")

    # Process PDF (cached per file content)
    retriever = _build_retriever(uploaded_file.getvalue())

    if retriever is None:
        st.error("No extractable text was found in this PDF.")
    else:
        # Accept user query
        query = st.text_input("Ask a question based on the PDF content:")

        if query:
            # `invoke` replaces the deprecated `get_relevant_documents`.
            relevant_docs = retriever.invoke(query)
            context = " ".join(doc.page_content for doc in relevant_docs)
            context = context[:2000]

            prompt = f"""You are a helpful assistant.
Based on the following context, answer the question clearly and in 2-4 informative sentences.

Context: {context}

Question: {query}
Answer:"""

            response = qa_pipeline(prompt)[0]["generated_text"]
            st.markdown("### ✅ Answer:")
            st.write(response.strip())


2025-05-01 13:02:55.527 
  command:

    streamlit run C:\Users\1508\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]
Device set to use cpu


In [2]:
pip install langchain

Collecting langchain
  Downloading langchain-0.3.24-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-core<1.0.0,>=0.3.55 (from langchain)
  Downloading langchain_core-0.3.56-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Downloading langsmith-0.3.38-py3-none-any.whl.metadata (15 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.4,>=0.1.17->langchain)
  Downloading orjson-3.10.18-cp312-cp312-win_amd64.whl.metadata (43 kB)
Downloading langchain-0.3.24-py3-none-any.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.3/1.0 MB ? eta -:--:--
   ------------------------------- -------- 0.8/1.0 MB 2.4 MB/s eta 0:00:01
   ---------------------------------------- 1.0/1.0 MB 2.8 MB/s eta 0:00:00
Downloading langchain_core-0.3

In [2]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting typing-inspection>=0.4.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading typing_inspection-0.4.0-py3-none-any.whl.metadata

In [5]:
pip install pypdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.4.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-4.1.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl (15.0 MB)
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
    --------------------------------------- 0.3/15.0 MB ? eta -:--:--
   -- ------------------------------------- 1.0/15.0 MB 3.1 MB/s eta 0:00:05
   ---- ----------------------------------- 1.8/15.0 MB 3.7 MB/s eta 0:00:04
   ------ --------------------------------- 2.4/15.0 MB 3.4 MB/s eta 0:00:04
   -------- ------------------------------- 3.1/15.0 MB 3.3 MB/s eta 0:00:04
   ---------- ----------------------------- 3.9/15.0 MB 3.4 MB/s eta 0:00:04
   ------------ --------------------------- 4.7/15.0 MB 3.4 MB/s eta 0:00:03
   ------------- -------------------------- 5.0/15.0 MB 3.4 MB/s eta 0:00:03
   --------------- ------------------------ 5.8/15.0 MB 3.3 MB/s eta 0:00:03
   ---------------- ----------------------- 6.3/15.0 MB 3.2 MB/s eta 0:00:03
   ---