In [1]:
!pip install kaggle


Collecting kaggle
  Using cached kaggle-1.7.4.5-py3-none-any.whl.metadata (16 kB)
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode (from kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Using cached kaggle-1.7.4.5-py3-none-any.whl (181 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Installing collected packages: text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.7.4.5 python-slugify-8.0.4 text-unidecode-1.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
# Export KAGGLE_CONFIG_DIR as the expanded form of ~/Desktop so that commands launched
# later from this kernel inherit it.
# NOTE(review): presumably this is where the Kaggle API credentials file is kept — confirm.
os.environ.update({'KAGGLE_CONFIG_DIR': os.path.expanduser('~/Desktop')})


In [3]:
# Download the antoinebourgois2/wikipedia-ai-glossary dataset from Kaggle with the
# Kaggle CLI, placing the downloaded file under ./data.
!kaggle datasets download -d antoinebourgois2/wikipedia-ai-glossary -p ./data


Dataset URL: https://www.kaggle.com/datasets/antoinebourgois2/wikipedia-ai-glossary
License(s): CC0-1.0
wikipedia-ai-glossary.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
import zipfile

# Unpack every member of the downloaded archive into ./data/ (read mode is the default).
with zipfile.ZipFile('./data/wikipedia-ai-glossary.zip') as archive:
    archive.extractall(path='./data/')


In [5]:
import os
# Show the entries of ./data to check what the extraction produced.
data_dir_entries = os.listdir('./data')
print(data_dir_entries)


['Wikipedia_AI_Glossary.csv', 'wikipedia-ai-glossary.zip']


In [6]:
import pandas as pd

# Read the extracted glossary CSV and print its column index to see what fields exist.
df = pd.read_csv(filepath_or_buffer='./data/Wikipedia_AI_Glossary.csv')
column_index = df.columns
print(column_index)


Index(['Link', 'Title', 'Wikipedia_page_description',
       'High_dimensional_embeddings'],
      dtype='object')


In [24]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from transformers import pipeline

In [25]:
# -------------------------------
# 2️⃣ Dataset
# -------------------------------
# Reload the glossary and discard rows that lack either the term or its description.
df = pd.read_csv('./data/Wikipedia_AI_Glossary.csv').dropna(
    subset=["Title", "Wikipedia_page_description"]
)

# Put every description on a single line: newlines become spaces, then both ends are trimmed.
df["Wikipedia_page_description"] = df["Wikipedia_page_description"].map(
    lambda description: description.replace("\n", " ").strip()
)

# Plain Python list of the cleaned descriptions, in row order.
texts = df["Wikipedia_page_description"].tolist()

# Quick sanity output: row count, first term and its cleaned description.
print(f"Dataset boyutu: {len(df)}")
print(f"Örnek terim: {df['Title'].iloc[0]}")
print(f"Örnek açıklama: {texts[0]}")

Dataset boyutu: 343
Örnek terim: abductive logic programming
Örnek açıklama: Logic programming using abductive reasoningAbductive logic programming ALP is a high-level knowledge-representation framework that can be used to solve problems declaratively, based on abductive reasoning. It extends normal logic programming by allowing some predicates to be incompletely defined, declared as abducible predicates. Problem solving is effected by deriving hypotheses on these abducible predicates abductive hypotheses as solutions of problems to be solved. These problems can be either observations that need to be explained as in classical abduction or goals to be achieved as in normal logic programming. It can be used to solve problems in diagnosis, planning, natural language and machine learning. It has also been used to interpret negation as failure as a form of abductive reasoning.


In [29]:
# -------------------------------
# 3️⃣ Embeddings (batched)
# -------------------------------
embedder = SentenceTransformer('all-MiniLM-L6-v2')
batch_size = 50

# Let `encode` do the batching itself through its `batch_size` argument instead of
# slicing `texts` by hand and calling `encode` once per slice.
#
# `embeddings` is bound only after the whole encode call has succeeded, so a failed run
# can no longer leave an empty or partial list behind for later cells to consume.
# Wrapping the result in `list(...)` keeps `embeddings` a list with one vector per text.
#
# NOTE(review): the saved output of this cell shows `RuntimeError: Numpy is not available`.
# That looks like a problem with the kernel environment (presumably an incompatible
# torch / NumPy pair) rather than with this code — confirm and fix the environment.
embeddings = list(
    embedder.encode(texts, batch_size=batch_size, show_progress_bar=True)
)



Batches: 100%|████████████████████████████████████| 2/2 [00:00<00:00,  7.19it/s]


RuntimeError: Numpy is not available

In [30]:
# -------------------------------
# 4️⃣ Chroma Vector Store
# -------------------------------
client = chromadb.Client(Settings(anonymized_telemetry=False))

# Drop any existing "ai_glossary" collection so the cell always starts from a fresh one.
if "ai_glossary" in [c.name for c in client.list_collections()]:
    client.delete_collection("ai_glossary")

collection = client.create_collection("ai_glossary")

# Pairing texts with embeddings via zip() would silently stop at the shorter of the two,
# so an empty or incomplete `embeddings` (e.g. after the embedding cell failed) would
# produce an empty or partial index without any error. Fail loudly instead.
if len(embeddings) != len(texts):
    raise ValueError(
        f"Expected one embedding per text, got {len(embeddings)} embeddings "
        f"for {len(texts)} texts; re-run the embedding cell."
    )

# Insert everything in one call; ids are the row positions as strings, as before.
# Nothing is added when there are no texts at all.
if texts:
    collection.add(
        documents=texts,
        embeddings=[emb.tolist() for emb in embeddings],
        ids=[str(i) for i in range(len(texts))],
    )


In [31]:
# -------------------------------
# 5️⃣ QA Model (Tiny Flan-T5)
# -------------------------------
# Text-to-text generation pipeline; both the model and the tokenizer are loaded from
# the "google/flan-t5-small" checkpoint.
qa_model = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
    tokenizer="google/flan-t5-small",
)


Device set to use mps:0


In [32]:
# -------------------------------
# 6️⃣ RAG Query Function
# -------------------------------
def rag_query(
    question: str,
    top_k: int = 3,
    *,
    max_context_chars: int = 3000,
    max_new_tokens: int = 600,
    temperature: float = 0.7,
) -> str:
    """Answer ``question`` from the documents retrieved out of the vector store.

    The question is embedded with the module-level ``embedder``, the ``top_k`` closest
    documents are fetched from the module-level ``collection``, and their text is
    joined into a context that is placed in a prompt for the module-level ``qa_model``.
    Generation uses sampling, so repeated calls may return different answers.

    Args:
        question: The question to answer.
        top_k: Number of documents to request from the collection.
        max_context_chars: Maximum number of characters of joined context to keep;
            anything beyond that is cut off.
        max_new_tokens: Passed through to the generation call.
        temperature: Sampling temperature passed through to the generation call.

    Returns:
        The generated text of the first result, with surrounding whitespace removed.

    Raises:
        ValueError: If ``max_context_chars`` is negative.
    """
    # A negative cap would slice from the end of the context instead of limiting it.
    if max_context_chars < 0:
        raise ValueError("max_context_chars must be non-negative")

    query_vec = embedder.encode([question])
    results = collection.query(query_embeddings=query_vec.tolist(), n_results=top_k)
    # One query was sent, so the matches for it are the first (only) entry.
    top_docs = results["documents"][0]

    # Limit for long contexts: join the documents and keep only the leading characters.
    context = " ".join(top_docs)[:max_context_chars]

    prompt = f"""
You are an AI expert. Answer the question using the following context.
Provide a detailed explanation in 3-5 sentences.

Context: {context}

Question: {question}
Answer:
"""
    answer = qa_model(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )[0]["generated_text"]
    return answer.strip()
