In [1]:
import pandas as pd
# Kaggle input path of the IIPC data export read by this notebook
IIPC_CSV = '/kaggle/input/iipc-v3/IIPC_Data.csv'

df = pd.read_csv(IIPC_CSV)
df.head()

Unnamed: 0,ark_url,title,date,creator,subject,description,item_type,source_url,full_text,cleaned_text
0,https://digital.library.unt.edu/ark:/67531/met...,The End of Term Archive: Collaboratively Prese...,2018-11-13,"Phillips, Mark Edward",digital libraries; government documents; web a...,Presentation the 2018 International Internet P...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,"Mark Phillips, UNT Libraries \nAbbie Grotke, L...",---\n# **The End of Term Archive: Collaborativ...
1,https://digital.library.unt.edu/ark:/67531/met...,Warcbase: Building a Scalable Web Archiving Pl...,2015-04-28,"Lin, Jimmy",digital preservation; web archiving; historica...,Presentation for the 2015 International Intern...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,Warcbase: Building a Scalable Web Archiving Pl...,---\n## Warcbase: Building a Scalable Web Arch...
2,https://digital.library.unt.edu/ark:/67531/met...,WARC Standard Revision Workshop,2015-04-28,"Oury, Clément",digital preservation; web archiving; web crawl,Presentation for the 2015 International Intern...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,1\nWARC standard revision workshop\nClément Ou...,---\n*1. Workshop Overview**\n**Topic:** WARC ...
3,https://digital.library.unt.edu/ark:/67531/met...,Co-Developing Access to the UK Web Archive,2015-04-27,"Hockx-Yu, Helen",digital preservation; web archiving; development,Presentation for the 2015 International Intern...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,www.bl.uk\n1\nCo-developing access to \nthe UK...,---\n*Title:** Co-developing Access to the UK ...
4,https://digital.library.unt.edu/ark:/67531/met...,Web Archiving for all: Building WebRecorder.io,2015-04-28,"Kreymer, Ilya",digital preservation; web archiving; web crawl,Presentation for the 2015 International Intern...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,WebRecorder.io\nBuilding a new archiving servi...,*WebRecorder.io: Building a New Archiving Serv...


In [2]:
# Keep only rows that have a cleaned_text value, and store that column as str
df = (
    df.loc[df['cleaned_text'].notna()]
    .assign(cleaned_text=lambda frame: frame['cleaned_text'].astype(str))
)

In [3]:
from tqdm import tqdm
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name that is later used for embeddings
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")

# Same guard as the previous cell: no missing cleaned_text, values stored as str
df = df.dropna(subset=['cleaned_text']).copy()
df['cleaned_text'] = df['cleaned_text'].astype(str)

# Chunk settings
CHUNK_SIZE = 450  # maximum number of tokens per chunk
OVERLAP = 50      # tokens shared between two consecutive chunks

def chunk_by_tokens(text, tokenizer, chunk_size=CHUNK_SIZE, overlap=OVERLAP):
    """Split ``text`` into overlapping chunks measured in tokenizer tokens.

    The text is encoded once (without special tokens), sliced into windows of
    ``chunk_size`` tokens that advance by ``chunk_size - overlap`` tokens, and
    each window is decoded back to a string.

    Args:
        text: String to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and ``decode(token_ids, skip_special_tokens=True)``.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens repeated at the start of the next chunk;
            must be smaller than ``chunk_size``.

    Returns:
        List of decoded chunk strings in document order; empty when the text
        encodes to zero tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), step):
        end = start + chunk_size
        # NOTE: the chunk is the tokenizer's reconstruction of the tokens, not
        # a slice of the original string, so casing/spacing may differ.
        chunks.append(tokenizer.decode(tokens[start:end], skip_special_tokens=True))
        if end >= len(tokens):
            # This window already reached the last token; another iteration
            # would only re-emit overlap tokens that are already covered.
            break
    return chunks

# Expand every document row into one record per token chunk
chunked_rows = []

for _, doc in tqdm(df.iterrows(), total=len(df), desc="Token chunking"):
    base_fields = doc.to_dict()
    # Each record keeps all original columns; cleaned_text is replaced by the
    # chunk and chunk_id records the chunk's position within its document.
    chunked_rows.extend(
        {**base_fields, 'cleaned_text': piece, 'chunk_id': position}
        for position, piece in enumerate(chunk_by_tokens(doc['cleaned_text'], tokenizer))
    )

# Final DataFrame
df = pd.DataFrame(chunked_rows)


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Token chunking:   0%|          | 0/575 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2173 > 512). Running this sequence through the model will result in indexing errors
Token chunking: 100%|██████████| 575/575 [00:03<00:00, 182.89it/s]


In [4]:
# Preview the chunked frame: each row is now one chunk, numbered by chunk_id
df.head()

Unnamed: 0,ark_url,title,date,creator,subject,description,item_type,source_url,full_text,cleaned_text,chunk_id
0,https://digital.library.unt.edu/ark:/67531/met...,The End of Term Archive: Collaboratively Prese...,2018-11-13,"Phillips, Mark Edward",digital libraries; government documents; web a...,Presentation the 2018 International Internet P...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,"Mark Phillips, UNT Libraries \nAbbie Grotke, L...",- - - # * * the end of term archive : collabor...,0
1,https://digital.library.unt.edu/ark:/67531/met...,The End of Term Archive: Collaboratively Prese...,2018-11-13,"Phillips, Mark Edward",digital libraries; government documents; web a...,Presentation the 2018 International Internet P...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,"Mark Phillips, UNT Libraries \nAbbie Grotke, L...",", and others utilize tools like archive - it. ...",1
2,https://digital.library.unt.edu/ark:/67531/met...,The End of Term Archive: Collaboratively Prese...,2018-11-13,"Phillips, Mark Edward",digital libraries; government documents; web a...,Presentation the 2018 International Internet P...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,"Mark Phillips, UNT Libraries \nAbbie Grotke, L...",- - # # * * 5. funding and development * * * *...,2
3,https://digital.library.unt.edu/ark:/67531/met...,The End of Term Archive: Collaboratively Prese...,2018-11-13,"Phillips, Mark Edward",digital libraries; government documents; web a...,Presentation the 2018 International Internet P...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,"Mark Phillips, UNT Libraries \nAbbie Grotke, L...",cocciolo _ peet _ eot. pdf ] ( http : / / www....,3
4,https://digital.library.unt.edu/ark:/67531/met...,The End of Term Archive: Collaboratively Prese...,2018-11-13,"Phillips, Mark Edward",digital libraries; government documents; web a...,Presentation the 2018 International Internet P...,Presentation,https://digital.library.unt.edu/ark:/67531/met...,"Mark Phillips, UNT Libraries \nAbbie Grotke, L...",change by percentage was observed. `. gov ` an...,4


In [5]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle

# Load the embedding model
model = SentenceTransformer('BAAI/bge-base-en-v1.5')


# (label, column) pairs in the order they appear in the text that gets embedded
COMBINED_FIELDS = [
    ("Title", 'title'),
    ("Creator", 'creator'),
    ("Subject", 'subject'),
    ("Description", 'description'),
    ("Text", 'cleaned_text'),
    ("Item Type", 'item_type'),
    ("Date", 'date'),
    ("URL", 'ark_url'),
    ("Source URL", 'source_url'),
]

# Missing values become empty strings so every row still gets every label.
# NOTE(review): the tokenizer warned about a 512-token model limit during
# chunking, and chunks are cut at 450 tokens before this metadata is added;
# confirm whether the combined text is truncated by the encoder.
labelled_fields = [label + ": " + df[column].fillna('') for label, column in COMBINED_FIELDS]

combined_text = labelled_fields[0]
for labelled_field in labelled_fields[1:]:
    combined_text = combined_text + "\n" + labelled_field
df['combined_text'] = combined_text


# Embed the combined text of every chunk
combined_texts = df['combined_text'].tolist()
embeddings = model.encode(combined_texts, show_progress_bar=True)

# Store the vectors together with the exact texts they were computed from
payload = {
    'embeddings': embeddings,
    'combined_texts': combined_texts,
}
with open('/kaggle/working/embeddings_v2.pkl', 'wb') as f:
    pickle.dump(payload, f)


print("✅ Embeddings and all metadata saved successfully.")


2025-08-03 23:12:39.398628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754262759.591078      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754262759.643126      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/74 [00:00<?, ?it/s]

✅ Embeddings and all metadata saved successfully.


In [6]:
pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pickle
import numpy as np
import faiss
# Read back the vectors and texts written by the embedding cell.
# (pickle is only safe here because the file was produced by this notebook.)
with open('/kaggle/working/embeddings_v2.pkl', 'rb') as f:
    data = pickle.load(f)

# Cast to float32 before handing the matrix to FAISS
embeddings = np.array(data['embeddings'], dtype='float32')

# Create FAISS index sized to the embedding dimension, then add every vector
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)


In [8]:
def retrieve_top_k(query, k=5):
    """Return the stored texts whose embeddings are closest to ``query``.

    The query is embedded with the notebook-level ``model`` and searched
    against the notebook-level FAISS ``index``; hits are mapped back to
    ``data['combined_texts']``.

    Args:
        query: Question or search string.
        k: Maximum number of results to return.

    Returns:
        List of ``{'combined_text': str}`` dicts in the order returned by the
        index search. May contain fewer than ``k`` items when the index holds
        fewer than ``k`` vectors.
    """
    query_embedding = model.encode([query])
    _, indices = index.search(np.array(query_embedding).astype('float32'), k)
    texts = data['combined_texts']
    # FAISS fills unused result slots with -1; skip them so a negative index
    # does not silently wrap around to the last stored text.
    return [{'combined_text': texts[int(i)]} for i in indices[0] if i >= 0]


In [None]:
import google.generativeai as genai

import os

# Configure your Gemini API key
# Prefer the GOOGLE_API_KEY environment variable so a real key never has to be
# pasted into (and saved with) the notebook; the placeholder remains as the
# fallback when the variable is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "add_your_api_here"))

MODEL_NAME = "gemini-2.5-flash-lite"  # Gemini 2.5 Flash-Lite

AiModel = genai.GenerativeModel(MODEL_NAME)

def generate_response(query, context_docs):
    """Ask the Gemini model to answer ``query`` from the retrieved documents.

    Args:
        query: The user's question.
        context_docs: Iterable of dicts with a ``'combined_text'`` key, as
            returned by ``retrieve_top_k``.

    Returns:
        The ``text`` attribute of the response returned by
        ``AiModel.generate_content``.
    """
    # Every document is followed by a "----" separator line in the prompt.
    context = "\n".join(f"{doc['combined_text']}\n----" for doc in context_docs)

    prompt = f"""You are an expert assistant specialized in web archiving, using ONLY the provided IIPC conference materials below.

Each document includes metadata fields like Title, Creator, Date, Subject, and Content, clearly labeled.

Answer the user's questions strictly based on this context about web archiving topics discussed in the IIPC conferences.

Format your answer in clear, simple language without using markdown symbols such as asterisks, underscores, or other formatting characters.

Use plain text bullet points or paragraphs to make the answer easy to read and understand.

If the question asks about a specific year or topic, base your answers only on the relevant information found in the context.

Do NOT make any assumptions or use knowledge beyond what is in the provided documents.

If the answer is not found in the context, respond with: "I don't know."

Context:
{context}

Question: {query}
Answer:"""

    response = AiModel.generate_content(prompt)
    return response.text



In [10]:
# End-to-end check: retrieve the closest chunks for a sample question and
# pass them to generate_response as the context for the answer.
query = "title What You See No One Saw ,what is the date  of publish?"
results = retrieve_top_k(query)
answer = generate_response(query, results)

print("\n🤖 Gemini Answer:\n", answer)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


🤖 Gemini Answer:
 2025-04-10
