In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers datasets peft accelerate bitsandbytes -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
import torch

In [6]:
path = '/content/drive/MyDrive/AI/symbiosis.csv'
df = pd.read_csv(path)
df.head()


Unnamed: 0,Question,Answer
0,What is SNAP?,"SNAP is the Symbiosis National Aptitude Test, ..."
1,Who conducts SNAP?,SNAP is conducted by Symbiosis International (...
2,How many times can I attempt SNAP in a year?,You can attempt SNAP up to two times in a year...
3,When is SNAP conducted?,SNAP is usually held in December across multip...
4,What is the eligibility for SNAP?,You must be a graduate with at least 50% marks...


In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Question', 'Answer'],
        num_rows: 101
    })
})

In [7]:
# Load dataset
dataset = load_dataset("csv", data_files=path)

# Model & tokenizer
model_name = "google/flan-t5-small"   # good for Q&A
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [9]:
# Preprocess
def preprocess(example):
    inputs = [q for q in example["Question"]]
    targets = [a for a in example["Answer"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [13]:
# Training arguments
args = TrainingArguments(
    output_dir="faq-bot",
    eval_strategy="no", # Changed from "epoch" to "no"
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="./logs",
)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=130, training_loss=1.8948629525991587, metrics={'train_runtime': 32.9312, 'train_samples_per_second': 30.67, 'train_steps_per_second': 3.948, 'total_flos': 46937352437760.0, 'train_loss': 1.8948629525991587, 'epoch': 10.0})

In [15]:
# Save
model.save_pretrained("faq_model")
tokenizer.save_pretrained("faq_model")


('faq_model/tokenizer_config.json',
 'faq_model/special_tokens_map.json',
 'faq_model/spiece.model',
 'faq_model/added_tokens.json',
 'faq_model/tokenizer.json')

UI

In [16]:
!pip install streamlit pyngrok -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m93.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
%%writefile app.py
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model
tokenizer = AutoTokenizer.from_pretrained("faq_model")
model = AutoModelForSeq2SeqLM.from_pretrained("faq_model")

st.title("🎓 College FAQ Chatbot")

user_input = st.text_input("Ask a question:")

if user_input:
    inputs = tokenizer(user_input, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_length=128)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    st.write("**Answer:**", answer)


Writing app.py


In [21]:
!pip install gradio -q

In [22]:
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load fine-tuned model
tokenizer = AutoTokenizer.from_pretrained("faq_model")
model = AutoModelForSeq2SeqLM.from_pretrained("faq_model")

def answer_question(question):
    inputs = tokenizer(question, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Build Gradio interface
demo = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=2, placeholder="Ask your question here..."),
    outputs="text",
    title="🎓 College FAQ Chatbot",
    description="Ask me anything about college admissions, hostel, placements, etc."
)

demo.launch(share=True)  # share=True gives a public link

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://769eaf086f5fb97500.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
"""from pyngrok import ngrok

# Start streamlit app
!streamlit run app.py &

# Get public URL
url = ngrok.connect(8501)
print("Streamlit URL:", url)"""

In [27]:
"""from sentence_transformers import SentenceTransformer
import faiss
import gradio as gr


# Load embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode all questions into vectors (convert to numpy)
question_embeddings = embedder.encode(df["Question"].tolist(), convert_to_numpy=True)

# Build FAISS index
dim = question_embeddings.shape[1]   # embedding dimension
index = faiss.IndexFlatL2(dim)
index.add(question_embeddings)

# Search function
def get_answer(query):
    query_vec = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(query_vec, k=1)   # top-1 match
    answer = df.iloc[I[0][0]]["Answer"]
    return answer

# Gradio UI
demo = gr.Interface(
    fn=get_answer,
    inputs=gr.Textbox(lines=2, placeholder="Ask your question here..."),
    outputs="text",
    title="🎓 College FAQ Chatbot",
    description="Ask me anything about college admissions, hostel, placements, etc."
)

demo.launch(share=True)  # share=True gives a public link
"""

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ddd1d72bf71567f33a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


