In [25]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np

In [28]:
import re

sayi = 0

for file in os.listdir("dataset"):
    with open(f'dataset/{file}') as f:
        text = f.read()
        characters = len(re.findall('\S', text))
    sayi += characters

average = sayi / 281
print(f"The average number of these documents: {average}")

  characters = len(re.findall('\S', text))


The average number of these documents: 4623.135231316726


In [4]:
MODEL = "gpt-4o-mini"
db_name = "C:/Users/aalperen.arda/Documents/GitHub/LLM-Biography-Analysis/main/json_output"

In [5]:
load_dotenv(override=True)
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [6]:
text_loader_kwargs = {'encoding': 'utf-8'}
documents = []

for file in os.listdir(db_name):
    path = os.path.join(db_name, file)
    loader = TextLoader(path, **text_loader_kwargs)
    docs = loader.load()
    for doc in docs:
        doc.metadata["doc_type"] = os.path.splitext(file)[0]  # dosya adı
        documents.append(doc)

In [31]:
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=70)
chunks = text_splitter.split_documents(documents)

In [32]:
len(chunks)

281

In [34]:
embeddings = OpenAIEmbeddings()

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 281 documents


In [49]:
# Vectorstore'un çalışıp çalışmadığını test edin
query = "alperen gökmen"
results = vectorstore.similarity_search(query, k=1)
for result in results:
    print(result.page_content[:])

{
  "ad": "Alperen Gökmen",
  "dogum_yeri": "Konya",
  "dogum_tarihi": "12.05.1985",
  "ilkokul": null,
  "lise": "Konya Anadolu Lisesi",
  "universite": "Orta Doğu Teknik Üniversitesi",
  "bolum": "Elektrik-Elektronik Mühendisliği",
  "yuksek_lisans": null,
  "doktora": null,
  "calistigi_kurumlar": ["TechNest"],
  "kurdugu_girisim_ve_dernekler": ["TechNest", "Genç Mühendisler İçin Teknoloji Atölyesi"],
  "yasadigi_sehir": "Konya",
  "hobiler": ["robotik projeler", "doğa yürüyüşleri", "kitap okumak", "bilim projeleri geliştirmek"],
  "es": "Selin Çelik",
  "cocuklar": ["Oğul", "Kız"],
  "akademik_yayinlar": ["Enerji Verimliliği İçin Akıllı Sistemler", "Akıllı Şehirler İçin Enerji Yönetim Sistemleri"],
  "dosya_adi": "alperen_gokmen.md"
}
