In [None]:
!pip install datasets sentence-transformers faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-no

In [None]:
from datasets import load_dataset
from itertools import islice
import random

# Stream the dump so that only the records we actually iterate over are fetched
# (dump date: November 2023, English).
dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train", streaming=True)

# Optional: shuffle the stream (streaming datasets expose a shuffle method)
#dataset = dataset.shuffle(seed=42) # to make it reproducible

# Materialise the first 1000 articles of the stream as a list
dataset_small = [*islice(dataset, 1000)]

# Sanity check: article count, record schema, start of the first article
print(len(dataset_small))
print(dataset_small[0].keys(), dataset_small[0]["text"][:500], sep="\n")

README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

1000
dict_keys(['id', 'url', 'title', 'text'])
Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as


In [None]:
import nltk, uuid

nltk.download("punkt")
nltk.download("punkt_tab")  # yeni eklenen paket
from nltk.tokenize import sent_tokenize

def chunk_text(text, max_tokens=250):
    """Split ``text`` into chunks of whole sentences of at most ``max_tokens`` words.

    Sentences come from ``sent_tokenize``; the size of a sentence is its number
    of whitespace-separated words. Sentences are packed greedily, in order, and
    the sentences of one chunk are joined with single spaces.

    Args:
        text: The document text to split.
        max_tokens: Word budget per chunk. A single sentence longer than the
            budget is never split; it ends up in a chunk of its own.

    Returns:
        A list of chunk strings in document order (empty when
        ``sent_tokenize`` yields no sentences).
    """
    chunks = []
    current_chunk, current_tokens = [], 0

    for sent in sent_tokenize(text):
        n_tokens = len(sent.split())
        # Flush only when something has been collected. Without this guard an
        # over-long first sentence hit an empty buffer and emitted "" as a chunk.
        if current_chunk and current_tokens + n_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_tokens = [], 0
        current_chunk.append(sent)
        current_tokens += n_tokens

    # Whatever is left in the buffer forms the final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# One metadata record per chunk; chunk_idx is the chunk's position within its article
chunked_data = [
    dict(
        chunk_id=str(uuid.uuid4()),
        doc_id=article["id"],
        title=article["title"],
        text=piece,
        source=article["url"],
        chunk_idx=position,
    )
    for article in dataset_small
    for position, piece in enumerate(chunk_text(article["text"]))
]

print(f"Toplam chunk sayısı: {len(chunked_data)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Toplam chunk sayısı: 15502


In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    device=device,
)

# Embed the text of every chunk; normalize_embeddings=True requests unit-length vectors.
# NOTE(review): this model is believed to truncate inputs at 128 word pieces by
# default, so 250-word chunks may be only partially embedded — confirm via
# model.max_seq_length.
texts = list(map(lambda record: record["text"], chunked_data))
embeddings = model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
    device=device,
)


Using device: cuda


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/243 [00:00<?, ?it/s]

In [None]:
import faiss
import numpy as np
import json

# Flat inner-product index; the embeddings were requested normalised, so the
# inner product is intended to act as cosine similarity.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Persist the index and the chunk records next to each other
faiss.write_index(index, "faiss_index.bin")
with open("chunks.json", "w") as f:
    f.write(json.dumps(chunked_data))

In [None]:
# Sample query
query = "What is the date of the Apollo 11 moon landing?"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits, one four-line record each
for score, idx in zip(scores[0], idxs[0]):
    hit = chunks[idx]
    print(
        f"Score: {score:.4f}",
        f"Title: {hit['title']}",
        f"Text: {hit['text'][:200]}...",
        f"Source: {hit['source']}\n",
        sep="\n",
    )

Score: 0.7764
Title: Apollo 11
Text: Apollo 11 (July 16–24, 1969) was the American spaceflight that first landed humans on the Moon. Commander Neil Armstrong and Lunar Module Pilot Buzz Aldrin landed the Apollo Lunar Module Eagle on July...
Source: https://en.wikipedia.org/wiki/Apollo%2011

Score: 0.7687
Title: Apollo 12
Text: Apollo 12 (November 14–24, 1969) was the sixth crewed flight in the United States Apollo program and the second to land on the Moon. It was launched on November 14, 1969, by NASA from the Kennedy Spac...
Source: https://en.wikipedia.org/wiki/Apollo%2012

Score: 0.7594
Title: Apollo 14
Text: Apollo 14 (January 31February 9, 1971) was the eighth crewed mission in the United States Apollo program, the third to land on the Moon, and the first to land in the lunar highlands. It was the last o...
Source: https://en.wikipedia.org/wiki/Apollo%2014

Score: 0.7317
Title: Apollo 15
Text: Apollo 15 (July 26August 7, 1971) was the ninth crewed mission in the United States' Ap

###all-MiniLM-L6-v2


Score: 0.6106
Title: Apollo 11
Text: Apollo 11 (July 16–24, 1969) was the American spaceflight that first landed humans on the Moon. Commander Neil Armstrong and Lunar Module Pilot Buzz Aldrin landed the Apollo Lunar Module Eagle on July...
Source: https://en.wikipedia.org/wiki/Apollo%2011

Score: 0.5877
Title: Apollo 11
Text: Sources

External links

 "Apollo 11 transcripts" at Spacelog
 Apollo 11 in real time
 Apollo 11 Press Conference filmed by KPRC-TV at Texas Archive of the Moving Image
 Apollo 11 and 13 Checklists a...
Source: https://en.wikipedia.org/wiki/Apollo%2011

Score: 0.5851
Title: Apollo 14
Text: Apollo 14 (January 31February 9, 1971) was the eighth crewed mission in the United States Apollo program, the third to land on the Moon, and the first to land in the lunar highlands. It was the last o...
Source: https://en.wikipedia.org/wiki/Apollo%2014

Score: 0.5675
Title: Apollo 11
Text: After being sent to the Moon by the Saturn V's third stage, the astronauts separated the spacecraft from it and traveled for three days until they entered lunar orbit. Armstrong and Aldrin then moved ...
Source: https://en.wikipedia.org/wiki/Apollo%2011

Score: 0.5656
Title: Apollo 11
Text: Films and documentaries
 Footprints on the Moon, a 1969 documentary film by Bill Gibson and Barry Coe, about the Apollo 11 mission
 Moonwalk One, a 1971 documentary film by Theo Kamecke
 Apollo 11: A...
Source: https://en.wikipedia.org/wiki/Apollo%2011

In [None]:
# Sample query (Turkish phrasing of the Apollo 11 landing-date question)
query = "Apollo 11’in Ay’a iniş tarihi nedir"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits
for score, idx in zip(scores[0], idxs[0]):
    record = chunks[idx]
    print(
        f"Score: {score:.4f}\n"
        f"Title: {record['title']}\n"
        f"Text: {record['text'][:200]}...\n"
        f"Source: {record['source']}\n"
    )

Score: 0.7996
Title: Apollo 11
Text: Apollo 11 (July 16–24, 1969) was the American spaceflight that first landed humans on the Moon. Commander Neil Armstrong and Lunar Module Pilot Buzz Aldrin landed the Apollo Lunar Module Eagle on July...
Source: https://en.wikipedia.org/wiki/Apollo%2011

Score: 0.7766
Title: Apollo 14
Text: Apollo 14 (January 31February 9, 1971) was the eighth crewed mission in the United States Apollo program, the third to land on the Moon, and the first to land in the lunar highlands. It was the last o...
Source: https://en.wikipedia.org/wiki/Apollo%2014

Score: 0.7672
Title: Apollo 12
Text: Apollo 12 (November 14–24, 1969) was the sixth crewed flight in the United States Apollo program and the second to land on the Moon. It was launched on November 14, 1969, by NASA from the Kennedy Spac...
Source: https://en.wikipedia.org/wiki/Apollo%2012

Score: 0.7619
Title: Apollo 8
Text: Apollo 8 (December 21–27, 1968) was the first crewed spacecraft to leave low Earth orbit

In [None]:
# Sample query (Turkish)
query = "Aristoteles'in mantık çalışmaları nelerdir?"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits
for score, idx in zip(scores[0], idxs[0]):
    entry = chunks[idx]
    lines = [
        f"Score: {score:.4f}",
        f"Title: {entry['title']}",
        f"Text: {entry['text'][:200]}...",
        f"Source: {entry['source']}",
    ]
    print("\n".join(lines) + "\n")

Score: 0.7743
Title: Aristotle
Text: Most of Aristotle's work is probably not in its original form, because it was most likely edited by students and later lecturers. The logical works of Aristotle were compiled into a set of six books c...
Source: https://en.wikipedia.org/wiki/Aristotle

Score: 0.7506
Title: Aristotle
Text: Aristotle (;  Aristotélēs, ; 384–322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, ...
Source: https://en.wikipedia.org/wiki/Aristotle

Score: 0.7383
Title: Aristotle
Text: Aristotle studied and made significant contributions to "logic, metaphysics, mathematics, physics, biology, botany, ethics, politics, agriculture, medicine, dance, and theatre." Near the end of his li...
Source: https://en.wikipedia.org/wiki/Aristotle

Score: 0.7140
Title: Aristotle
Text: Aristotle's Empiricism: Experience and Mechanics in the 4th century BC, Parmenides Publishing

In [None]:
# Sample query
query = "What are Aristotle's logical works?"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits
for score, idx in zip(scores[0], idxs[0]):
    title, text, source = (chunks[idx][key] for key in ("title", "text", "source"))
    print(f"Score: {score:.4f}")
    print(f"Title: {title}")
    print(f"Text: {text[:200]}...")
    print(f"Source: {source}\n")

Score: 0.7802
Title: Aristotle
Text: Most of Aristotle's work is probably not in its original form, because it was most likely edited by students and later lecturers. The logical works of Aristotle were compiled into a set of six books c...
Source: https://en.wikipedia.org/wiki/Aristotle

Score: 0.7282
Title: Aristotle
Text: Aristotle studied and made significant contributions to "logic, metaphysics, mathematics, physics, biology, botany, ethics, politics, agriculture, medicine, dance, and theatre." Near the end of his li...
Source: https://en.wikipedia.org/wiki/Aristotle

Score: 0.7273
Title: Aristotle
Text: Aristotle (;  Aristotélēs, ; 384–322 BC) was an Ancient Greek philosopher and polymath. His writings cover a broad range of subjects spanning the natural sciences, philosophy, linguistics, economics, ...
Source: https://en.wikipedia.org/wiki/Aristotle

Score: 0.6943
Title: Aristotle
Text: During Aristotle's time in the Macedonian court, he gave lessons not only to Alexander but al

In [None]:
# Sample query
query = "What is an AVL tree?"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits, one four-line record each
for score, idx in zip(scores[0], idxs[0]):
    hit = chunks[idx]
    print(
        f"Score: {score:.4f}",
        f"Title: {hit['title']}",
        f"Text: {hit['text'][:200]}...",
        f"Source: {hit['source']}\n",
        sep="\n",
    )

Score: 0.8093
Title: AVL tree
Text: In computer science, an AVL tree (named after inventors Adelson-Velsky and Landis) is a self-balancing binary search tree. In an AVL tree, the heights of the two child subtrees of any node differ by a...
Source: https://en.wikipedia.org/wiki/AVL%20tree

Score: 0.5797
Title: AVL tree
Text: A binary tree is defined to be an AVL tree if the invariant

holds for every node X in the tree. A node X with  is called "left-heavy", one with  is called "right-heavy", and one with  is sometimes si...
Source: https://en.wikipedia.org/wiki/AVL%20tree

Score: 0.5288
Title: Alder
Text: Alders are trees comprising the genus Alnus in the birch family Betulaceae. The genus comprises about 35 species of monoecious trees and shrubs, a few reaching a large size, distributed throughout the...
Source: https://en.wikipedia.org/wiki/Alder

Score: 0.5055
Title: Ailanthus
Text: Ailanthus (; derived from ailanto, an Ambonese word probably meaning "tree of the gods" or "tree of h

In [None]:
# Sample query (names Amdahl's law explicitly)
query = "Why does Amdahl’s law limit parallel processing? "
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits
for score, idx in zip(scores[0], idxs[0]):
    record = chunks[idx]
    print(
        f"Score: {score:.4f}\n"
        f"Title: {record['title']}\n"
        f"Text: {record['text'][:200]}...\n"
        f"Source: {record['source']}\n"
    )

Score: 0.7130
Title: Amdahl's law
Text: Then

 

It follows from Amdahl's law that the speedup due to parallelism is given by
 

 Relation to the law of diminishing returns 

Amdahl's law is often conflated with the law of diminishing retur...
Source: https://en.wikipedia.org/wiki/Amdahl%27s%20law

Score: 0.6624
Title: Amdahl's law
Text: An implication of Amdahl's law is that to speed up real applications which have both serial and parallel portions, heterogeneous computing techniques are required. There are novel speedup and energy c...
Source: https://en.wikipedia.org/wiki/Amdahl%27s%20law

Score: 0.6098
Title: Amdahl's law
Text: In computer architecture, Amdahl's law (or Amdahl's argument) is a formula which gives the theoretical speedup in latency of the execution of a task at fixed workload that can be expected of a system ...
Source: https://en.wikipedia.org/wiki/Amdahl%27s%20law

Score: 0.5859
Title: Amdahl's law
Text: Furthermore,

 

shows that the theoretical speedup of the e

In [None]:
# Sample query (describes the principle without naming it)
query = "What principle explains why adding more processors doesn’t always speed up a program proportionally?"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits
for score, idx in zip(scores[0], idxs[0]):
    entry = chunks[idx]
    lines = [
        f"Score: {score:.4f}",
        f"Title: {entry['title']}",
        f"Text: {entry['text'][:200]}...",
        f"Source: {entry['source']}",
    ]
    print("\n".join(lines) + "\n")

Score: 0.7684
Title: Amdahl's law
Text: Furthermore,

 

shows that the theoretical speedup of the execution of the whole task increases with the improvement of the resources of the system and that regardless of the magnitude of the improve...
Source: https://en.wikipedia.org/wiki/Amdahl%27s%20law

Score: 0.7226
Title: Assembly language
Text: In the case of speed optimization, modern optimizing compilers are claimed to render high-level languages into code that can run as fast as hand-written assembly, despite the counter-examples that can...
Source: https://en.wikipedia.org/wiki/Assembly%20language

Score: 0.6840
Title: Amdahl's law
Text: For example, with a serial program in two parts A and B for which  and ,
 if part B is made to run 5 times faster, that is  and , then 
if part A is made to run 2 times faster, that is  and , then 

T...
Source: https://en.wikipedia.org/wiki/Amdahl%27s%20law

Score: 0.6404
Title: Amdahl's law
Text: Then we are told that the 1st part is not sped up, s

In [None]:
# Sample query
query = "Which group does the element actinium belong to?"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits
for score, idx in zip(scores[0], idxs[0]):
    title, text, source = (chunks[idx][key] for key in ("title", "text", "source"))
    print(f"Score: {score:.4f}")
    print(f"Title: {title}")
    print(f"Text: {text[:200]}...")
    print(f"Source: {source}\n")

Score: 0.7325
Title: Actinide
Text: The actinide () or actinoid () series encompasses the 14 metallic chemical elements with atomic numbers from 89 to 102, actinium through nobelium. The actinide series derives its name from the first e...
Source: https://en.wikipedia.org/wiki/Actinide

Score: 0.6954
Title: Actinium
Text: Actinium is a chemical element with the symbol Ac and atomic number 89. It was first isolated by Friedrich Oskar Giesel in 1902, who gave it the name emanium; the element got its name by being wrongly...
Source: https://en.wikipedia.org/wiki/Actinium

Score: 0.6498
Title: Actinide
Text: Bk(OH)3 and Cf(OH)3 are also known, as are tetravalent hydroxides for Np, Pu and Am and pentavalent for Np and Am. The strongest base is of actinium. All compounds of actinium are colorless, except fo...
Source: https://en.wikipedia.org/wiki/Actinide

Score: 0.6248
Title: Actinide
Text: Actinides, especially those with a small number of 5f-electrons, are prone to hybridization. This is

In [None]:
# Sample query (refers to the element only by its atomic number)
query = "Which chemical series includes the element with atomic number 89?"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits, one four-line record each
for score, idx in zip(scores[0], idxs[0]):
    hit = chunks[idx]
    print(
        f"Score: {score:.4f}",
        f"Title: {hit['title']}",
        f"Text: {hit['text'][:200]}...",
        f"Source: {hit['source']}\n",
        sep="\n",
    )

Score: 0.6677
Title: Alkane
Text: The first eight members of the series (in terms of number of carbon atoms) are named as follows:
 methane CH4 – one carbon and 4 hydrogen
 ethane  C2H6 – two carbon and 6 hydrogen
 propane C3H8 – thre...
Source: https://en.wikipedia.org/wiki/Alkane

Score: 0.6502
Title: Atomic number
Text: the element number Z. Among other things, Moseley demonstrated that the lanthanide series (from lanthanum to lutetium inclusive) must have 15 members—no fewer and no more—which was far from obvious fr...
Source: https://en.wikipedia.org/wiki/Atomic%20number

Score: 0.6374
Title: Alkali metal
Text: Although a simple extrapolation of the periodic table (by the aufbau principle) would put element 169, unhexennium, under ununennium, Dirac-Fock calculations predict that the next element after ununen...
Source: https://en.wikipedia.org/wiki/Alkali%20metal

Score: 0.6306
Title: Alkali metal
Text: Soon afterward, a majority of chemists chose to classify these elements in gro

In [None]:
# Sample query
query = "How do antibodies protect the body?"
chunks = chunked_data
q_emb = model.encode([query], normalize_embeddings=True)

# FAISS search: five nearest chunks
scores, idxs = index.search(q_emb, 5)

# Print the hits
for score, idx in zip(scores[0], idxs[0]):
    record = chunks[idx]
    print(
        f"Score: {score:.4f}\n"
        f"Title: {record['title']}\n"
        f"Text: {record['text'][:200]}...\n"
        f"Source: {record['source']}\n"
    )


Score: 0.6760
Title: Antibody
Text: Research applications

Specific antibodies are produced by injecting an antigen into a mammal, such as a mouse, rat, rabbit, goat, sheep, or horse for large quantities of antibody. Blood isolated from...
Source: https://en.wikipedia.org/wiki/Antibody

Score: 0.6708
Title: Antibody
Text: Since an antibody has at least two paratopes, it can bind more than one antigen by binding identical epitopes carried on the surfaces of these antigens. By coating the pathogen, antibodies stimulate e...
Source: https://en.wikipedia.org/wiki/Antibody

Score: 0.6663
Title: Antibody
Text: Together with B and T cells, antibodies comprise the most important part of the adaptive immune system. They occur in two forms: one that is attached to a B cell, and the other, a soluble form, that i...
Source: https://en.wikipedia.org/wiki/Antibody

Score: 0.6478
Title: Antibody
Text: Other researchers believed that antibodies existed freely in the blood and, in 1904, Almroth Wright 

In [None]:
def faiss_search(query, top_k=5):
    """Print the ``top_k`` chunks most similar to ``query`` according to the FAISS index.

    Reads the notebook-level ``model``, ``index`` and ``chunked_data``.

    Args:
        query: Query text to embed.
        top_k: Number of neighbours requested from the index. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        None. Every hit is printed as score, title, the first 200 characters
        of the chunk text, and source URL.
    """
    q_emb = model.encode([query], normalize_embeddings=True)
    scores, idxs = index.search(q_emb, top_k)

    for score, idx in zip(scores[0], idxs[0]):
        # FAISS pads the result with label -1 when it cannot return top_k
        # neighbours; indexing the list with -1 would print the last chunk.
        if idx < 0:
            continue
        chunk = chunked_data[idx]
        print(f"Score: {score:.4f}")
        print(f"Title: {chunk['title']}")
        print(f"Text: {chunk['text'][:200]}...")
        print(f"Source: {chunk['source']}\n")

In [None]:
# Same antibody question as the inline query cell above, now run through the faiss_search helper
faiss_search("How do antibodies protect the body?")


Score: 0.6760
Title: Antibody
Text: Research applications

Specific antibodies are produced by injecting an antigen into a mammal, such as a mouse, rat, rabbit, goat, sheep, or horse for large quantities of antibody. Blood isolated from...
Source: https://en.wikipedia.org/wiki/Antibody

Score: 0.6708
Title: Antibody
Text: Since an antibody has at least two paratopes, it can bind more than one antigen by binding identical epitopes carried on the surfaces of these antigens. By coating the pathogen, antibodies stimulate e...
Source: https://en.wikipedia.org/wiki/Antibody

Score: 0.6663
Title: Antibody
Text: Together with B and T cells, antibodies comprise the most important part of the adaptive immune system. They occur in two forms: one that is attached to a B cell, and the other, a soluble form, that i...
Source: https://en.wikipedia.org/wiki/Antibody

Score: 0.6478
Title: Antibody
Text: Other researchers believed that antibodies existed freely in the blood and, in 1904, Almroth Wright 

In [None]:
# Vague superlative query — presumably included to see how retrieval behaves without an obvious target article
faiss_search("who is the biggest king")


Score: 0.5957
Title: Aimery of Cyprus
Text: The leper King died in April or May 1185, his nephew in late summer of 1186. Ignoring Baldwin IV's decree, Sybilla was proclaimed queen by her supporters and she crowned her husband, Guy, king. Aimery...
Source: https://en.wikipedia.org/wiki/Aimery%20of%20Cyprus

Score: 0.5876
Title: Alexander Jannaeus
Text: Alexander Jannaeus ( ;  Yannaʾy; born Jonathan ) was the second king of the Hasmonean dynasty, who ruled over an expanding kingdom of Judaea from 103 to 76 BCE. A son of John Hyrcanus, he inherited th...
Source: https://en.wikipedia.org/wiki/Alexander%20Jannaeus

Score: 0.5756
Title: Ancient Egypt
Text: The strong institution of kingship developed by the kings served to legitimize state control over the land, labor, and resources that were essential to the survival and growth of ancient Egyptian civi...
Source: https://en.wikipedia.org/wiki/Ancient%20Egypt

Score: 0.5709
Title: Ammon
Text: Other kings attested to in contemporary sources a

In [None]:
# Counterpart of the "biggest king" query with the opposite adjective — presumably to compare the rankings
faiss_search("who is the worst king")


Score: 0.5306
Title: American Revolution
Text: Lord North's cabinet ministers, the Earl of Sandwich, First Lord of the Admiralty, and Lord George Germain, Secretary of State for the Colonies, however, proved to lack leadership skills suited for th...
Source: https://en.wikipedia.org/wiki/American%20Revolution

Score: 0.5182
Title: Aimery of Cyprus
Text: The leper King died in April or May 1185, his nephew in late summer of 1186. Ignoring Baldwin IV's decree, Sybilla was proclaimed queen by her supporters and she crowned her husband, Guy, king. Aimery...
Source: https://en.wikipedia.org/wiki/Aimery%20of%20Cyprus

Score: 0.5067
Title: Alternate history
Text: Actual historical figures are seen in a much different light: Ben Franklin is revered as the continent's finest "maker", George Washington was executed after being captured, and "Tom" Jefferson is the...
Source: https://en.wikipedia.org/wiki/Alternate%20history

Score: 0.4981
Title: Alexander Jannaeus
Text: Alexander Jannaeus ( ;  Ya

In [None]:
# List-style request about the actinide series (compare with the actinium queries earlier in the notebook)
faiss_search("List examples of elements in the actinide series?")


Score: 0.7132
Title: Actinide
Text: The actinide () or actinoid () series encompasses the 14 metallic chemical elements with atomic numbers from 89 to 102, actinium through nobelium. The actinide series derives its name from the first e...
Source: https://en.wikipedia.org/wiki/Actinide

Score: 0.6018
Title: Actinium
Text: Actinium is a chemical element with the symbol Ac and atomic number 89. It was first isolated by Friedrich Oskar Giesel in 1902, who gave it the name emanium; the element got its name by being wrongly...
Source: https://en.wikipedia.org/wiki/Actinium

Score: 0.5433
Title: Actinide
Text: Actinides, especially those with a small number of 5f-electrons, are prone to hybridization. This is explained by the similarity of the electron energies at the 5f, 7s and 6d shells. Most actinides ex...
Source: https://en.wikipedia.org/wiki/Actinide

Score: 0.5327
Title: Actinide
Text: Nuclear properties

See also 
 Actinides in the environment
 Lanthanides
 Major actinides
 Minor act

In [None]:
# Topic-style query that is not phrased around a single article title
faiss_search("programming languages that support object-oriented programming?")


Score: 0.6048
Title: Abstract data type
Text: Built-in abstract data types
The specification of some programming languages is intentionally vague about the representation of certain built-in data types, defining only the operations that can be do...
Source: https://en.wikipedia.org/wiki/Abstract%20data%20type

Score: 0.5377
Title: Assembly language
Text: Programs using such facilities can then construct abstractions using different assembly language on each hardware platform. The system's portable code can then use these processor-specific components ...
Source: https://en.wikipedia.org/wiki/Assembly%20language

Score: 0.5352
Title: Ada (programming language)
Text: Also, the language provides for accessibility checks, both at compile time and at run time, that ensures that an access value cannot outlive the type of the object it points to. Though the semantics o...
Source: https://en.wikipedia.org/wiki/Ada%20%28programming%20language%29

Score: 0.5344
Title: Abstract data type
Text: Ab

In [None]:
# The country name is misspelled here — presumably on purpose, to compare with the correctly spelled query in the next cell
faiss_search("where is Azeribeijan")


Score: 0.5856
Title: Azincourt
Text: Azincourt (), historically known in English as Agincourt ( ), is a commune in the Pas-de-Calais department in northern France. It is situated  north-west of Saint-Pol-sur-Ternoise on the D71 road betw...
Source: https://en.wikipedia.org/wiki/Azincourt

Score: 0.5720
Title: AZ Alkmaar
Text: Alkmaar Zaanstreek (), better known internationally as AZ Alkmaar, or simply and most commonly as AZ () in the Netherlands, is a Dutch professional football club from Alkmaar and the Zaan district. Th...
Source: https://en.wikipedia.org/wiki/AZ%20Alkmaar

Score: 0.5402
Title: Abadan, Iran
Text: Abadan ( Ābādān, ) is a city in the Central District of Abadan County, Khuzestan province, Iran, and serves as both capital of the county and of the district. The city is in the southwest of the count...
Source: https://en.wikipedia.org/wiki/Abadan%2C%20Iran

Score: 0.5314
Title: Geography of Azerbaijan
Text: Azerbaijan is a country in the Caucasus region, situated at the j

In [None]:
# Correctly spelled version of the misspelled "Azeribeijan" query from the previous cell
faiss_search("Where is Azerbaijan")


Score: 0.8869
Title: Azerbaijan
Text: Azerbaijan (, ; , ), officially the Republic of Azerbaijan, is a transcontinental country located at the boundary of Eastern Europe and West Asia. It is a part of the South Caucasus region and is boun...
Source: https://en.wikipedia.org/wiki/Azerbaijan

Score: 0.8471
Title: Geography of Azerbaijan
Text: Azerbaijan is a country in the Caucasus region, situated at the juncture of Eastern Europe and West Asia. Three physical features dominate Azerbaijan: the Caspian Sea, whose shoreline forms a natural ...
Source: https://en.wikipedia.org/wiki/Geography%20of%20Azerbaijan

Score: 0.7717
Title: Azerbaijan
Text: The Azerbaijani diaspora is found in 42 countries and in turn there are many centers for ethnic minorities inside Azerbaijan, including the German cultural society "Karelhaus", Slavic cultural center,...
Source: https://en.wikipedia.org/wiki/Azerbaijan

Score: 0.7380
Title: Foreign relations of Azerbaijan
Text: The Republic of Azerbaijan is a mem

In [None]:
# Keyword-style query (no question phrasing)
faiss_search("Capital of Turkey")


Score: 0.7791
Title: Ankara
Text: Ankara became the new Turkish capital upon the establishment of the Republic on 29 October 1923, succeeding in this role as the former Turkish capital Istanbul following the fall of the Ottoman Empire...
Source: https://en.wikipedia.org/wiki/Ankara

Score: 0.7668
Title: Ankara
Text: Ankara ( ,  ; ), historically known as Ancyra and Angora, is the capital of Turkey. Located in the central part of Anatolia, the city has a population of 5.1 million in its urban center and 5.7 millio...
Source: https://en.wikipedia.org/wiki/Ankara

Score: 0.6822
Title: Ankara
Text: Prior to World War I, the town had a British consulate and a population of around 28,000, roughly  of whom were Christian. Turkish republican capital 

Following the Ottoman defeat in World War I, the...
Source: https://en.wikipedia.org/wiki/Ankara

Score: 0.6585
Title: Ankara
Text: Ankara has experienced a phenomenal growth since it was made Turkey's capital in 1923, when it was "a small town o

In [None]:
# Same information need as "Capital of Turkey", phrased as a full question
faiss_search("what is capital of Turkey")


Score: 0.8000
Title: Ankara
Text: Ankara became the new Turkish capital upon the establishment of the Republic on 29 October 1923, succeeding in this role as the former Turkish capital Istanbul following the fall of the Ottoman Empire...
Source: https://en.wikipedia.org/wiki/Ankara

Score: 0.7852
Title: Ankara
Text: Ankara ( ,  ; ), historically known as Ancyra and Angora, is the capital of Turkey. Located in the central part of Anatolia, the city has a population of 5.1 million in its urban center and 5.7 millio...
Source: https://en.wikipedia.org/wiki/Ankara

Score: 0.6806
Title: Anatolia
Text: In 1941, with the First Geography Congress which divided Turkey into seven geographical regions based on differences in climate and landscape, the eastern provinces of Turkey were placed into the East...
Source: https://en.wikipedia.org/wiki/Anatolia

Score: 0.6775
Title: Ankara
Text: Ankara has experienced a phenomenal growth since it was made Turkey's capital in 1923, when it was "a small to

In [None]:
# Descriptive query that does not name the field it asks about (presumably number theory)
faiss_search("Which field focuses on prime numbers and divisibility rules?")


Score: 0.5794
Title: Absolute value
Text: When the real numbers  are considered as the one-dimensional vector space , the absolute value is a norm, and is the -norm (see Lp space) for any . In fact the absolute value is the "only" norm on , i...
Source: https://en.wikipedia.org/wiki/Absolute%20value

Score: 0.5575
Title: Axiom
Text: The real numbers are uniquely picked out (up to isomorphism) by the properties of a Dedekind complete ordered field, meaning that any nonempty set of real numbers with an upper bound has a least upper...
Source: https://en.wikipedia.org/wiki/Axiom

Score: 0.5522
Title: Axiom
Text: There are many examples of fields; field theory gives correct knowledge about them all. It is not correct to say that the axioms of field theory are "propositions that are regarded as true without pro...
Source: https://en.wikipedia.org/wiki/Axiom

Score: 0.5481
Title: Algebraic geometry
Text: Nevertheless, the two fields remain distinct, as the methods of proof are quite differen

In [None]:
# Two-word keyword query with a trailing period
faiss_search("analgesic drugs.")


Score: 0.7612
Title: Analgesic
Text: These drugs are used along with analgesics to modulate and/or modify the action of opioids when used against pain, especially of neuropathic origin. Dextromethorphan has been noted to slow the develop...
Source: https://en.wikipedia.org/wiki/Analgesic

Score: 0.7520
Title: Analgesic
Text: An analgesic drug, also called simply an analgesic, pain reliever, or painkiller, is any member of the group of drugs used to achieve relief from pain (that is, analgesia or pain management). Analgesi...
Source: https://en.wikipedia.org/wiki/Analgesic

Score: 0.6741
Title: Alkaloid
Text: Mescaline and many indole alkaloids (such as psilocybin, dimethyltryptamine and ibogaine) have hallucinogenic effect. Morphine and codeine are strong narcotic pain killers. There are alkaloids that do...
Source: https://en.wikipedia.org/wiki/Alkaloid

Score: 0.6248
Title: Analgesic
Text: It also functions as an analgesic to a lesser degree by increasing the activity of the primary 

In [None]:
# Repeat of the "Capital of Turkey" query from an earlier cell
faiss_search("Capital of Turkey")


In [None]:
# Repeat of the "Capital of Turkey" query from an earlier cell
faiss_search("Capital of Turkey")


In [None]:
# Repeat of the "Capital of Turkey" query from an earlier cell
faiss_search("Capital of Turkey")


In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
from rank_bm25 import BM25Okapi
import json
import re

# Load the chunks from the previously saved file.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Lowercase the text, then split it word by word with a simple \w+ tokenizer.
def tokenize(text):
    """Return the lowercased word tokens of ``text`` in order of appearance."""
    lowered = text.lower()
    return [match.group() for match in re.finditer(r"\w+", lowered)]

# BM25 corpus: every chunk contributes one token list built from "<title> <text>".
documents = [
    tokenize(" ".join((entry["title"], entry["text"])))
    for entry in chunks
]

# Build the BM25 index over the tokenized corpus.
bm25 = BM25Okapi(documents)

In [None]:
def search_bm25(query, top_k=5):
    """Return the best BM25 matches for ``query``.

    Args:
        query: Free-text query; it is tokenized with ``tokenize`` before scoring.
        top_k: Maximum number of hits to return. Zero or a negative value
            yields an empty list.

    Returns:
        A list of dicts ordered by descending score. Each dict has the keys
        ``score`` (a plain ``float``), ``title``, ``text`` (the first 200
        characters of the chunk) and ``source`` (an empty string when the
        chunk has no ``source`` key).
    """
    # A negative top_k would otherwise slice from the end of the ranking and
    # return "all but the last k" chunks instead of nothing.
    if top_k is not None and top_k <= 0:
        return []

    scores = bm25.get_scores(tokenize(query))
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    results = []
    for i in ranked[:top_k]:
        chunk = chunks[i]
        results.append({
            # Cast to a built-in float, as search_faiss does, so hits from both
            # searchers have the same score type.
            "score": float(scores[i]),
            "title": chunk["title"],
            "text": chunk["text"][:200],  # first 200 characters
            "source": chunk.get("source", ""),
        })
    return results

In [None]:
# Example query against the BM25 index.
query = "history of artificial intelligence"
results = search_bm25(query)

# Print every hit with its 1-based rank, one field per line and a blank line after each hit.
for rank, hit in enumerate(results, start=1):
    print(
        f"#{rank} | Score: {hit['score']:.4f}",
        f"Title: {hit['title']}",
        f"Text: {hit['text']}...",
        f"Source: {hit['source']}\n",
        sep="\n",
    )

#1 | Score: 24.4990
Title: Artificial intelligence
Text: Several works use AI to force us to confront the fundamental question of what makes us human, showing us artificial beings that have the ability to feel, and thus to suffer. This appears in Karel Čape...
Source: https://en.wikipedia.org/wiki/Artificial%20intelligence

#2 | Score: 23.5149
Title: Artificial intelligence
Text: However, they are critical that the test compares machines to people. "Aeronautical engineering texts," they wrote, "do not define the goal of their field as making 'machines that fly so exactly like ...
Source: https://en.wikipedia.org/wiki/Artificial%20intelligence

#3 | Score: 22.3166
Title: Artificial intelligence
Text: Johnston, John (2008) The Allure of Machinic Life: Cybernetics, Artificial Life, and the New AI, MIT Press. Gary Marcus, "Artificial Confidence: Even the newest, buzziest systems of artificial general...
Source: https://en.wikipedia.org/wiki/Artificial%20intelligence

#4 | Score: 22.0408
Ti

In [None]:


# Example query
query = "history of artificial intelligence"
q_emb = model.encode([query], normalize_embeddings=True)
# NOTE(review): this rebinds the global `chunks` that search_bm25 also reads;
# confirm chunked_data holds the same records as chunks.json.
chunks = chunked_data
# FAISS search: the 5 nearest chunks for the query embedding
scores, idxs = index.search(q_emb, 5)

# Print the hits
for score, idx in zip(scores[0], idxs[0]):
    if idx < 0:
        # FAISS pads missing results with -1; indexing with it would silently
        # print the last chunk as if it were a hit.
        continue
    print(f"Score: {score:.4f}")
    print(f"Title: {chunks[idx]['title']}")
    # Print a 200-character preview so the trailing "..." really marks a
    # truncation and the cell output stays short.
    print(f"Text: {chunks[idx]['text'][:200]}...")
    print(f"Source: {chunks[idx]['source']}\n")

Score: 0.7406
Title: Artificial intelligence
Text: Artificial intelligence (AI) is the intelligence of machines or software, as opposed to the intelligence of humans or animals. It is also the field of study in computer science that develops and studies intelligent machines. "AI" may also refer to the machines themselves. AI technology is widely used throughout industry, government and science. Some high-profile applications are: advanced web search engines (e.g., Google Search), recommendation systems (used by YouTube, Amazon, and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Waymo), generative or creative tools (ChatGPT and AI art), and competing at the highest level in strategic games (such as chess and Go). Artificial intelligence was founded as an academic discipline in 1956. The field went through multiple cycles of optimism followed by disappointment and loss of funding, but after 2012, when deep learning surpassed all previous AI techni

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import json

# ---- Setup (running this once is enough) ----

# NOTE(review): search_faiss below also needs a global `model`; this cell imports
# SentenceTransformer but never instantiates it, so `model` must already exist
# from an earlier cell.
index = faiss.read_index("faiss_index.bin")

# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("chunks.json", "r", encoding="utf-8") as f:
    chunks = json.load(f)

# ---- Function: FAISS + semantic search ----
def search_faiss(query, top_k=5):
    """Return the nearest chunks to ``query`` according to the FAISS index.

    The query is encoded with the global ``model`` (normalized embeddings) and
    searched in the global ``index``; hits are looked up in the global ``chunks``.

    Args:
        query: Free-text query string.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of dicts in the order returned by the index. Each dict has the
        keys ``score`` (``float``), ``title``, ``text`` (the first 200
        characters of the chunk) and ``source`` (an empty string when the
        chunk has no ``source`` key). The list can be shorter than ``top_k``
        when the index returns fewer real neighbours.
    """
    query_vec = model.encode([query], normalize_embeddings=True)
    scores, idxs = index.search(query_vec, top_k)

    results = []
    for score, idx in zip(scores[0], idxs[0]):
        # FAISS fills unused result slots with the label -1. Used as a list
        # index it would silently return the last chunk as a bogus hit.
        if idx < 0:
            continue
        chunk = chunks[idx]
        results.append({
            "score": float(score),
            "title": chunk["title"],
            "text": chunk["text"][:200],
            "source": chunk.get("source", ""),
        })
    return results