In [1]:
!pip install -U sentence-transformers faiss-cpu


Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from

In [2]:
import os
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import faiss
import pickle


In [3]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
file_path = "/content/drive/MyDrive/Smart_Counsel_AI/all_college_docs_cleaned.txt"

# Records in the file are separated by a line that holds only '---'.
# Each record is trimmed and blank ones are dropped.
with open(file_path, "r", encoding="utf-8") as f:
    trimmed = (record.strip() for record in f.read().split("\n---\n"))
    docs = list(filter(None, trimmed))

print(f"✅ Total documents: {len(docs)}")
print("🔹 Sample document:\n", docs[0])


✅ Total documents: 563024
🔹 Sample document:
 College: R V College of Engineering (RVCE)
Location: Bangalore
Branch: Computer Science And Engineering
Exam: COMEDK
Year: 2020
Category: GM
Cutoff Rank: 3621
Fees (Annual): ₹207605
Hostel Fee: ₹77417
Seats: 33
Avg Package: ₹6.491660457272833 LPA
Top Recruiters: Bosch, Siemens, HAL
NIRF Rank: 272.0


In [5]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the sentence encoder and move it onto the selected device.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
model = model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Create embeddings in batches
# encode() does its own mini-batching, so the whole corpus is passed in one call and
# `batch_size` is handed to it explicitly. The previous hand-written loop sliced `docs`
# into chunks of 64 but never forwarded `batch_size`, so each chunk was re-split using
# the library default, and the results were accumulated as one small array per document.
batch_size = 64

# With convert_to_numpy=True the result is a single 2-D array with one row per entry
# of `docs`, in the same order; downstream code can keep treating it as a sequence of vectors.
all_embeddings = model.encode(
    docs,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
    device=device,
)


🔄 Embedding batches: 100%|██████████| 8798/8798 [09:53<00:00, 14.83it/s]


In [7]:
import faiss
import numpy as np

# Stack the per-document vectors into one float32 matrix (the dtype FAISS operates on).
embedding_matrix = np.array(all_embeddings, dtype='float32')

# Flat L2 index sized to the embedding width; every vector is stored as-is.
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

# Final check: report how many vectors the index now holds.
vector_count = index.ntotal
print(f"✅ FAISS index created with {vector_count} vectors of dimension {dimension}.")


✅ FAISS index created with 563024 vectors of dimension 384.


In [8]:
import pickle

# Persist the two retrieval artefacts so they can be reloaded without re-embedding.
# NOTE(review): both paths are relative to the working directory — presumably Colab's
# ephemeral disk rather than the mounted Drive; confirm that is intended.

# 1) The FAISS index itself.
faiss.write_index(index, "faiss_index.index")

# 2) The document texts, in the same order as the vectors added to the index.
with open("documents.pkl", "wb") as docs_file:
    pickle.Pickler(docs_file).dump(docs)

print("💾 Saved both FAISS index and document list!")


💾 Saved both FAISS index and document list!


In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load model if not already loaded.
# The guard makes the code match that intent: when an earlier cell has already created
# `model`, it is reused instead of being instantiated a second time.
if "model" not in globals():
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def search_similar(query: str, top_k: int = 5):
    """Return the documents whose embeddings are closest to ``query``.

    The query is embedded with the module-level ``model``, looked up in the
    module-level FAISS ``index``, and the returned row ids are mapped back to
    text through the module-level ``docs`` list.

    Args:
        query: Free-text search string.
        top_k: Maximum number of documents to return; must be positive.

    Returns:
        A list of at most ``top_k`` document strings, in the order the index
        returned them. An empty list is returned (with a printed warning) when
        the query is blank or ``top_k`` is not positive.
    """
    if not query.strip():
        print("⚠️ Empty query passed. Please enter a valid search term.")
        return []

    # FAISS rejects a non-positive k; handle it the same way as an empty query.
    if top_k <= 0:
        print("⚠️ top_k must be a positive integer.")
        return []

    # Embed the query
    query_embedding = model.encode([query], convert_to_numpy=True)

    # Search the FAISS index (distances are not needed here)
    _, indices = index.search(query_embedding, top_k)

    # Retrieve corresponding documents.
    # FAISS pads the id row with -1 when fewer than top_k neighbours exist;
    # without the filter, docs[-1] would silently return the last document.
    results = [docs[i] for i in indices[0] if i >= 0]

    print(f"🔍 Top {top_k} results for: '{query}'")
    return results


In [10]:
# Run one example search and print each hit under a numbered heading.
query = "top colleges in Bangalore for CSE under 20k rank"
results = search_similar(query)

separator = '-' * 60
for rank, hit in enumerate(results, start=1):
    print(f"\n📌 Result {rank}:\n{hit}\n{separator}")


🔍 Top 5 results for: 'top colleges in Bangalore for CSE under 20k rank'

📌 Result 1:
College: B M S College of Engineering (BMSCE)
Location: Bangalore
Branch: Data Science
Exam: COMEDK
Year: 2024
Category: TULU
Cutoff Rank: 8743
Fees (Annual): ₹308000
Hostel Fee: ₹90698
Seats: 51
Avg Package: ₹9.5 LPA
Top Recruiters: Microsoft, Adobe, Wipro
NIRF Rank: 229.0
------------------------------------------------------------

📌 Result 2:
College: B M S College of Engineering (BMSCE)
Location: Bangalore
Branch: Data Science
Exam: COMEDK
Year: 2024
Category: OTHERS
Cutoff Rank: 9021
Fees (Annual): ₹308000
Hostel Fee: ₹90698
Seats: 33
Avg Package: ₹9.5 LPA
Top Recruiters: Microsoft, Adobe, Wipro
NIRF Rank: 229.0
------------------------------------------------------------

📌 Result 3:
College: B M S College of Engineering (BMSCE)
Location: Bangalore
Branch: Civil Engineering
Exam: COMEDK
Year: 2024
Category: TULU
Cutoff Rank: 16710
Fees (Annual): ₹309376
Hostel Fee: ₹93059
Seats: 51
Avg Package: 

In [11]:
from transformers import pipeline

# ✅ Load the Hugging Face summarization pipeline backed by facebook/bart-large-cnn.
# NOTE(review): this was described as "very fast on T4"; that is not verified here — TODO confirm with timings.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize(text: str, max_len: int = 130, min_len: int = 30) -> str:
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Input text to condense.
        max_len: Forwarded to the pipeline as ``max_length``.
        min_len: Forwarded to the pipeline as ``min_length``.

    Returns:
        The ``summary_text`` of the first pipeline result, or a warning string
        when ``text`` is empty or whitespace-only.
    """
    stripped = text.strip()
    if stripped:
        # Deterministic decoding (do_sample=False); the original, unstripped text is summarized.
        generated = summarizer(
            text,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
        )
        first = generated[0]
        return first['summary_text']
    return "⚠️ Empty input text"

# === Example Test ===
# One hand-written record in the same field layout as the corpus documents.
sample_text = '''College: RV College of Engineering (RVCE)
Location: Bangalore
Branch: Computer Science And Engineering
Exam: COMEDK
Year: 2020
Category: GM
Cutoff Rank: 3621
Fees (Annual): ₹207605
Hostel Fee: ₹77417
Seats: 33
Avg Package: ₹6.49 LPA
Top Recruiters: Bosch, Siemens, HAL
NIRF Rank: 272'''

# Show the input first, then its generated summary.
print("\n📄 Original:", sample_text, sep="\n")

print("\n📝 Summary:")
demo_summary = summarize(sample_text)
print(demo_summary)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Your max_length is set to 130, but your input_length is only 106. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=53)



📄 Original:
College: RV College of Engineering (RVCE)
Location: Bangalore
Branch: Computer Science And Engineering
Exam: COMEDK
Year: 2020
Category: GM
Cutoff Rank: 3621
Fees (Annual): ₹207605
Hostel Fee: ₹77417
Seats: 33
Avg Package: ₹6.49 LPA
Top Recruiters: Bosch, Siemens, HAL
NIRF Rank: 272

📝 Summary:
College: RV College of Engineering (RVCE) Branch: Computer Science And Engineering (COMEDK) Exam: COMEDK                Year: 2020                Category: GM                Cutoff Rank: 3621                Hostel Fee: ₹77417                Seats: 33                Average Package: ⁹6.49 LPA                Top Recruiters: Bosch, Siemens, HAL.


In [12]:
import os
import pickle
from tqdm import tqdm
from transformers import pipeline

# === Load Hugging Face summarizer (fast & ungated) ===
# NOTE(review): this builds the same facebook/bart-large-cnn pipeline that the previous
# cell already assigned to `summarizer`; presumably repeated so this cell can run on its own — confirm.
print("📦 Loading summarizer model: facebook/bart-large-cnn")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# === Settings ===
# Number of documents handed to the summarizer per call in the loop below.
BATCH_SIZE = 4
# Upper bound on how many documents (from the start of `docs`) get summarized.
MAX_DOCS = 5000              # ⛔ Avoid 10k+ without strong GPU
# Checkpoint cadence, measured in documents processed.
SAVE_EVERY = 200             # Save after every 200 summaries
# Pickle file used both to checkpoint progress and to resume from it.
OUTFILE = "summarized_docs_batched.pkl"

# === Smart summarizer with dynamic max_length ===
def summarize(text: str, min_len: int = 30, max_len: int = None) -> str:
    """Summarize ``text``, sizing ``max_length`` from the real token count.

    The input is measured with the pipeline's own tokenizer rather than a
    words-times-1.3 guess. For these key/value records the guess was roughly
    half the true token count, so the computed limit always collapsed to
    ``min_len + 10``.

    Args:
        text: Input text to condense.
        min_len: Forwarded to the pipeline as ``min_length``.
        max_len: Optional explicit ``max_length``. When omitted it is set to
            70% of the input's token count, but never below ``min_len + 10``.

    Returns:
        The generated summary; a warning string for blank input; or a string
        prefixed with ``[SUMMARY_ERROR]`` when tokenizing or generation raises.
    """
    if not text.strip():
        return "⚠️ Empty input text"
    try:
        if max_len is None:
            # truncation=True caps the count at the model's maximum input length.
            token_count = len(summarizer.tokenizer(text, truncation=True)["input_ids"])
            max_len = max(min_len + 10, int(token_count * 0.7))  # 70% of input length
        output = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)
        return output[0]['summary_text']
    except Exception as e:
        # Deliberate best-effort: report the failure in-band instead of aborting a long run.
        return f"[SUMMARY_ERROR] {e}"

# === Load previous summaries if available ===
# Resume point = number of summaries already stored in the checkpoint file.
# NOTE(review): pickle.load executes arbitrary code from the file; only safe while OUTFILE is self-produced.
checkpoint_found = os.path.exists(OUTFILE)
if not checkpoint_found:
    summaries = []
    start_idx = 0
    print("🆕 Starting from scratch...")
else:
    with open(OUTFILE, "rb") as f:
        summaries = pickle.load(f)
    start_idx = len(summaries)
    print(f"🧠 Loaded {start_idx} previously summarized docs.")

# === Summarize in batches ===
# The real stopping point: the corpus may be shorter than MAX_DOCS.
total_docs = min(MAX_DOCS, len(docs))

for i in tqdm(range(start_idx, total_docs, BATCH_SIZE)):
    end = min(i + BATCH_SIZE, total_docs)
    texts = [t.strip() for t in docs[i:end]]

    # Keep exactly one entry per document so summaries[k] always belongs to docs[k].
    # Dropping blank documents would shift every later summary and break the
    # `start_idx = len(summaries)` resume logic.
    batch_summaries = ["⚠️ Empty input text"] * len(texts)
    non_empty = [k for k, t in enumerate(texts) if t]

    if non_empty:
        try:
            outputs = summarizer(
                [texts[k] for k in non_empty],
                max_length=130,
                min_length=30,
                do_sample=False,
            )
            for k, out in zip(non_empty, outputs):
                batch_summaries[k] = out['summary_text']
        except Exception as e:
            # Best-effort: mark the attempted documents and keep going.
            # NOTE: "[ERROR]" entries are checkpointed and are not retried on resume.
            print(f"⚠️ Error at index {i}: {e}")
            for k in non_empty:
                batch_summaries[k] = "[ERROR]"

    summaries.extend(batch_summaries)

    # Save progress: every SAVE_EVERY documents, and always after the last batch
    # (measured against total_docs so a corpus shorter than MAX_DOCS is saved too).
    if end % SAVE_EVERY == 0 or end >= total_docs:
        # Write to a temp file and swap it in, so an interrupt mid-write
        # cannot leave a truncated checkpoint behind.
        tmp_path = OUTFILE + ".tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(summaries, f)
        os.replace(tmp_path, OUTFILE)
        print(f"💾 Saved {len(summaries)} summaries to {OUTFILE}")

# === View few sample outputs ===
# Show the first three summaries next to the first 400 characters of their source documents.
print("\n✅ Sample Summaries:\n")
preview = summaries[:3]
for position, text in enumerate(preview, start=1):
    source_excerpt = docs[position - 1][:400]
    print(f"📄 Original #{position}:\n{source_excerpt}")
    print(f"\n📝 Summary #{position}:\n{text}\n{'-'*60}")


📦 Loading summarizer model: facebook/bart-large-cnn


Device set to use cuda:0


🆕 Starting from scratch...


  0%|          | 0/1250 [00:00<?, ?it/s]Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manuall

💾 Saved 200 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
  4%|▍         | 51/1250 [03:39<1:20:40,  4.04s/it]Your max_length is set to 130, but your input_length is only 114. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_leng

💾 Saved 400 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
  8%|▊         | 101/1250 [06:23<1:10:13,  3.67s/it]Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_len

💾 Saved 600 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
 12%|█▏        | 151/1250 [08:57<53:39,  2.93s/it]Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 800 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
 16%|█▌        | 201/1250 [11:16<49:00,  2.80s/it]Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 1000 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
 20%|██        | 251/1250 [14:16<1:28:16,  5.30s/it]Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_len

💾 Saved 1200 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
 24%|██▍       | 301/1250 [18:38<1:04:08,  4.06s/it]Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_len

💾 Saved 1400 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
 28%|██▊       | 351/1250 [21:26<54:27,  3.63s/it]Your max_length is set to 130, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 1600 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
 32%|███▏      | 401/1250 [23:59<35:57,  2.54s/it]Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 1800 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
 36%|███▌      | 451/1250 [26:23<34:09,  2.57s/it]Your max_length is set to 130, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 2000 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
 40%|████      | 501/1250 [28:49<44:12,  3.54s/it]Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 2200 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
 44%|████▍     | 551/1250 [31:46<50:17,  4.32s/it]Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 2400 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
 48%|████▊     | 601/1250 [35:08<45:46,  4.23s/it]Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 2600 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
 52%|█████▏    | 651/1250 [38:15<35:53,  3.60s/it]Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 2800 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
 56%|█████▌    | 701/1250 [41:22<33:49,  3.70s/it]Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 3000 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 130, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 130, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
 60%|██████    | 751/1250 [45:24<42:11,  5.07s/it]Your max_length is set to 130, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 3200 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
 64%|██████▍   | 801/1250 [48:43<24:29,  3.27s/it]Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 3400 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
 68%|██████▊   | 851/1250 [51:32<22:48,  3.43s/it]Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 3600 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
 72%|███████▏  | 901/1250 [54:56<20:22,  3.50s/it]Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 3800 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
 76%|███████▌  | 951/1250 [58:03<17:39,  3.54s/it]Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_lengt

💾 Saved 4000 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)
 80%|████████  | 1001/1250 [1:02:08<20:40,  4.98s/it]Your max_length is set to 130, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_le

💾 Saved 4200 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 130, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
 84%|████████▍ | 1051/1250 [1:05:26<10:55,  3.29s/it]Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_le

💾 Saved 4400 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
Your max_length is set to 130, but your input_length is only 119. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=59)
 88%|████████▊ | 1101/1250 [1:08:17<06:38,  2.68s/it]Your max_length is set to 130, but your input_length is only 118. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_le

💾 Saved 4600 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 130, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 130, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
 92%|█████████▏| 1151/1250 [1:11:01<06:06,  3.70s/it]Your max_length is set to 130, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_le

💾 Saved 4800 summaries to summarized_docs_batched.pkl


Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
Your max_length is set to 130, but your input_length is only 120. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=60)
 96%|█████████▌| 1201/1250 [1:13:41<02:30,  3.07s/it]Your max_length is set to 130, but your input_length is only 121. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_le

💾 Saved 5000 summaries to summarized_docs_batched.pkl

✅ Sample Summaries:

📄 Original #1:
College: R V College of Engineering (RVCE)
Location: Bangalore
Branch: Computer Science And Engineering
Exam: COMEDK
Year: 2020
Category: GM
Cutoff Rank: 3621
Fees (Annual): ₹207605
Hostel Fee: ₹77417
Seats: 33
Avg Package: ₹6.491660457272833 LPA
Top Recruiters: Bosch, Siemens, HAL
NIRF Rank: 272.0

📝 Summary #1:
College: R V College of Engineering (RVCE) Branch: Computer Science And Engineering (COMEDK) Exam: COMEDK                Year: 2020                Category: GM                Cutoff Rank: 3621                Hostel Fee: ₹77417                Seats: 33                Average Package: � $6.491660457272833.
------------------------------------------------------------
📄 Original #2:
College: R V College of Engineering (RVCE)
Location: Bangalore
Branch: Computer Science And Engineering
Exam: COMEDK
Year: 2020
Category: GM
Cutoff Rank: 2707
Fees (Annual): ₹207605
Hostel Fee: ₹77417
Seats: 33
A




In [13]:
from google.colab import files

# Hand each generated artifact to Colab's download helper, one file at a
# time, in the same order as before: FAISS index, documents, summaries.
for artifact_name in (
    "faiss_index.index",
    "documents.pkl",
    "summarized_docs_batched.pkl",
):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

##  ✅ Smart Counsel AI - Data Pipeline Overview

| 🔢 Step | 🧠 Process          | ⚙️ What Was Done                                                      | 📦 Model / Tool Used                          |
|--------:|--------------------|------------------------------------------------------------------------|-----------------------------------------------|
| 1️⃣     | Text Cleaning       | Cleaned raw college data (cutoffs, placements, docs, etc.)             | `Python`, `Pandas`                            |
| 2️⃣     | Text Chunking       | Split text into ~500-character chunks with smart overlap               | `RecursiveCharacterTextSplitter`             |
| 3️⃣     | Embedding           | Converted each chunk into vector embeddings                            | `sentence-transformers/all-MiniLM-L6-v2`     |
| 4️⃣     | Vector Store        | Built FAISS vector index for fast similarity search                    | `FAISS`                                       |
| 5️⃣     | Summarization       | Generated short 1-2 line summaries for a 5,000-chunk subset            | `facebook/bart-large-cnn`                    |
| 6️⃣     | Data Saving         | Saved all files for later use (index, docs, summaries)                 | `pickle`, `faiss.write_index()`              |
| 7️⃣     | Retrieval (RAG)     | Next step: retrieve relevant chunks for the user query                 | `FAISS`, `MiniLM`                            |
| 8️⃣     | Answer Generation   | Next step: use retrieved chunks to generate final smart answers        | `GPT-3.5`, optionally `Mistral-7B`            |
| 9️⃣     | Streamlit App Dev   | Next step: build UI pages like predictor, explorer, Q&A, simulator     | `Streamlit`, `Python`, `FAISS`, `OpenAI API` |



## 🎉✅ **All preprocessing steps completed!**  
- 📚 Text cleaned and chunked  
- 🧠 Vectors embedded and stored in FAISS  
- 📝 Summaries generated and cached  
- 💾 All assets saved locally for later use (`faiss_index.index`, `documents.pkl`, `summarized_docs_batched.pkl`)  

🚀 You're now fully ready to build your **RAG-based Answer Generator** and design the **Streamlit App** with smart features, GPT/Mistral Q&A, and a beautiful UI! 🌟💬📊  
