In [1]:
from PyPDF2 import PdfReader
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_groq import ChatGroq
from py2neo import Graph

# Initialize the LLM model.
# The Groq API key is read from the GROQ_API_KEY environment variable so that
# no credential is stored in (or committed with) the notebook.
import os

GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
if not GROQ_API_KEY:
    # Fail here with an actionable message instead of at the first API request.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment before starting the kernel."
    )

# NOTE(review): temperature=0.7 and a blank-line stop sequence are kept exactly
# as they were; both may be unsuitable for structured graph extraction
# (non-deterministic output, early cut-off) - confirm before relying on results.
llm = ChatGroq(
    groq_api_key=GROQ_API_KEY,
    model_name="Gemma2-9b-It",
    temperature=0.7,            # Controls the randomness of the output
    max_tokens=8000,            # Maximum number of tokens in the response
    stop=["\n\n"],              # Sequences at which generation stops (here: a blank line)
    model_kwargs={
        "top_p": 0.9,                 # Controls diversity via nucleus sampling
        "frequency_penalty": 0.0,     # 0.0 = no penalty applied for repeated tokens
        "presence_penalty": 0.0       # 0.0 = no penalty applied for already-present tokens
    }
)



In [2]:
def pdf_to_text_chunks(pdf_path, chunk_size=1000):
    """Extract all text from a PDF and split it into fixed-size character chunks.

    Page texts are concatenated directly, with no separator inserted between
    pages, and the resulting string is sliced every ``chunk_size`` characters.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``PdfReader``.
        chunk_size: Number of characters per chunk. Must be at least 1.

    Returns:
        A list of strings. Every chunk holds ``chunk_size`` characters except
        possibly the last, which holds the remainder. The list is empty when
        no text could be extracted.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # Validate before opening the file: a zero step makes range() raise an
    # opaque error, and a negative step silently produces no chunks at all.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    reader = PdfReader(pdf_path)

    # `or ""` guards against pages whose extract_text() result is falsy
    # (e.g. None). Joining once avoids repeated string concatenation.
    full_text = "".join(page.extract_text() or "" for page in reader.pages)

    # Split the text into chunks of the specified size
    return [
        full_text[start:start + chunk_size]
        for start in range(0, len(full_text), chunk_size)
    ]

# Location of the source PDF (a relative path, resolved against the working directory)
pdf_path = "Rajaji-Mahabharata.pdf"
# Read the PDF and slice its text using the function's default chunk size
chunks = pdf_to_text_chunks(pdf_path=pdf_path)


In [3]:
# Wrap every text chunk in a LangChain Document, preserving chunk order
documents = list(map(lambda text: Document(page_content=text), chunks))



In [None]:
# Quick sanity check: show the container type of `documents`.
print(type(documents))

In [6]:
import os

# Set folder name where graph documents will be stored
output_folder = "df"

# Create the folder if it doesn't exist. exist_ok=True does this in a single
# call (no check-then-create race) and keeps the cell safe to re-run.
os.makedirs(output_folder, exist_ok=True)

llm_transformer = LLMGraphTransformer(llm=llm)
graph_documents = []

# Documents are converted one at a time so that a failure on a single chunk
# is reported and skipped instead of aborting the whole run.
for idx, document in enumerate(documents):
    try:
        graph_doc = llm_transformer.convert_to_graph_documents([document])
        graph_documents.extend(graph_doc)

        # Save each graph document as a file. The encoding is pinned to UTF-8
        # so non-ASCII characters in the extracted text cannot trigger a
        # platform-dependent encoding error during the write.
        graph_file = os.path.join(output_folder, f"graph_document_{idx}.txt")
        with open(graph_file, 'w', encoding="utf-8") as f:
            f.write(str(graph_doc))

        print(f"Graph document {idx} successfully created and saved.")

    except Exception as e:
        # Deliberate best-effort: log the first 50 characters of the chunk and move on.
        print(f"Error processing document: {document.page_content[:50]}... - Skipping. Error: {e}")


Graph document 0 successfully created and saved.
Graph document 1 successfully created and saved.
Graph document 2 successfully created and saved.
Graph document 3 successfully created and saved.
Graph document 4 successfully created and saved.
Graph document 5 successfully created and saved.
Graph document 6 successfully created and saved.
Graph document 7 successfully created and saved.
Graph document 8 successfully created and saved.
Graph document 9 successfully created and saved.
Graph document 10 successfully created and saved.
Graph document 11 successfully created and saved.
Graph document 12 successfully created and saved.
Graph document 13 successfully created and saved.
Graph document 14 successfully created and saved.
Graph document 15 successfully created and saved.
Graph document 16 successfully created and saved.
Graph document 17 successfully created and saved.
Graph document 18 successfully created and saved.
Graph document 19 successfully created and saved.
Graph docu

In [10]:
# Print the relationship list of every graph document, flagging any object
# that does not expose a `relationships` attribute.
for graph_document in graph_documents:
    if not hasattr(graph_document, "relationships"):
        print("This document has no relationships attribute.")
        continue
    print(graph_document.relationships)


[Relationship(source=Node(id='Mahabharata', type='Book'), target=Node(id='C. Rajagopalachari', type='Person'), type='_RETOLD_BY_'), Relationship(source=Node(id='Mahabharata', type='Book'), target=Node(id='Jay Mazo', type='Person'), type='EDITED_BY'), Relationship(source=Node(id='Mahabharata', type='Book'), target=Node(id='International Gita Society', type='Organization'), type='PUBLISHED_BY')]
[]
[Relationship(source=Node(id='Yudhishthira', type='Person'), target=Node(id='Karna', type='Person'), type='RELATIONSHIP')]
[Relationship(source=Node(id='Ramayana', type='Person'), target=Node(id='Sita', type='Person'), type='RELATED_TO'), Relationship(source=Node(id='Ramayana', type='Person'), target=Node(id='Rama', type='Person'), type='RELATED_TO'), Relationship(source=Node(id='Mahabharata', type='Person'), target=Node(id='Draupadi', type='Person'), type='RELATED_TO'), Relationship(source=Node(id='Mahabharata', type='Person'), target=Node(id='Arjuna', type='Person'), type='RELATED_TO'), Rela