In [1]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [4]:
# Disabled scratch snippet, kept as a bare string expression so it never runs;
# the cell merely displays the string.
# The literal is a raw string because the embedded regex '\S' is not a valid
# escape sequence in a normal string literal and triggers an invalid-escape
# warning. The r-prefix leaves the string's value unchanged.
# NOTE(review): the snippet divides by a hard-coded 281 instead of counting the
# files it actually reads, and imports `re` outside the import cell -- fix both
# before re-enabling it.
r'''# fancy code

import re

sayi = 0

for file in os.listdir("dataset"):
    with open(f'dataset/{file}') as f:
        text = f.read()
        characters = len(re.findall('\S', text))
    sayi += characters

average = sayi / 281
print(f"The average number of these documents: {average}")'''

  characters = len(re.findall('\S', text))


'# fancy code\n\nimport re\n\nsayi = 0\n\nfor file in os.listdir("dataset"):\n    with open(f\'dataset/{file}\') as f:\n        text = f.read()\n        characters = len(re.findall(\'\\S\', text))\n    sayi += characters\n\naverage = sayi / 281\nprint(f"The average number of these documents: {average}")'

In [5]:
# Chat model name handed to ChatOpenAI when the conversation chain is built.
MODEL = "gpt-4o-mini"

# Directory that is listed for source documents and is also passed to Chroma
# as persist_directory. The original machine-specific path remains the
# default; set the BIOGRAPHY_DB_DIR environment variable to point the notebook
# at another location without editing this cell.
db_name = os.environ.get(
    "BIOGRAPHY_DB_DIR",
    "C:/Users/aalperen.arda/Documents/GitHub/LLM-Biography-Analysis/main/json_output",
)

In [6]:
# Load every regular file in db_name as text and tag each resulting Document
# with the file's base name.
text_loader_kwargs = {'encoding': 'utf-8'}
documents = []

# sorted() makes the load order deterministic; os.listdir() order is arbitrary.
for file in sorted(os.listdir(db_name)):
    path = os.path.join(db_name, file)
    # db_name doubles as Chroma's persist_directory, so after the vector store
    # has been built once the listing also contains Chroma's sqlite database
    # and its segment subdirectories. TextLoader cannot read those, so skip
    # anything that is not a regular file as well as the sqlite file.
    if not os.path.isfile(path) or file.endswith(".sqlite3"):
        continue
    loader = TextLoader(path, **text_loader_kwargs)
    for doc in loader.load():
        doc.metadata["doc_type"] = os.path.splitext(file)[0]  # file name without extension
        documents.append(doc)

In [7]:
# Cut the loaded documents into overlapping character-based chunks.
splitter_settings = {"chunk_size": 500, "chunk_overlap": 70}
text_splitter = CharacterTextSplitter(**splitter_settings)
chunks = text_splitter.split_documents(documents)

In [8]:
# Number of chunks produced by the splitter; shown as this cell's output.
len(chunks)

281

In [9]:
# Embed the chunks with OpenAI embeddings and persist them in a Chroma store.
embeddings = OpenAIEmbeddings()

# Chroma.from_documents adds to whatever collection is already persisted in
# the target directory, so re-running this cell would store every chunk again.
# Drop the previously persisted collection first so the store holds each
# chunk exactly once.
Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# NOTE(review): db_name is also the directory the source documents are read
# from, so Chroma's files end up next to the inputs -- consider a dedicated
# persist directory.
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 281 documents


In [10]:
# Retriever view over the vector store; supplies context documents during RAG.
retriever = vectorstore.as_retriever()

# Buffer memory that keeps the running chat history under 'chat_history'.
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# OpenAI chat model that generates the answers.
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Wire the model, the retriever and the memory into one conversational chain.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

  memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)


In [30]:
# Ask the chain a question (Turkish: "so tell me how many alperens there are
# then") and print the answer text from the response.
# NOTE(review): the wording looks like a follow-up to earlier turns held in the
# chain's memory -- confirm it makes sense on a fresh kernel.
soru = "kaç tane alperen var o zaman söyle"
payload = {"question": soru}
yanit = conversation_chain.invoke(payload)
answer_text = yanit["answer"]
print(answer_text)

İki kişi "Alperen" ismine sahip: Alperen Toprak ve Alperen.
