# Setup

This notebook sets up the development environment for lexical-graph examples.

## Check Development Mode

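First, let's check whether we're running in development mode, where the lexical-graph source is mounted into the container so that code changes take effect without reinstalling the package (hot-code-injection):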

In [None]:
import os
import sys

# Directory where the lexical-graph source is expected to be mounted
lexical_graph_path = '/home/jovyan/lexical-graph-src'

# Development mode is inferred purely from the presence of that directory;
# later cells read both `lexical_graph_path` and `dev_mode`.
dev_mode = os.path.exists(lexical_graph_path)

if not dev_mode:
    print('Standard mode - using installed packages')
    print('To enable development mode, restart with: ./start-containers.sh --dev')
else:
    print('Development mode detected!')
    print(f'Lexical-graph source mounted at: {lexical_graph_path}')

    # Put the mounted source at the front of the import path, but only once
    # so re-running the cell does not stack duplicate entries.
    if sys.path.count(lexical_graph_path) == 0:
        sys.path.insert(0, lexical_graph_path)
        print('Added lexical-graph to Python path for hot-code-injection')

## Install Dependencies

Install required packages:

In [None]:
if not dev_mode:
    print('Installing lexical-graph package...')
    !pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.11.0.zip#subdirectory=lexical-graph
else:
    print('Development mode - will install from mounted source')

## Fix NLTK Data

Download required NLTK data to prevent processing errors:

In [None]:
import nltk
import ssl

# NLTK packages this notebook downloads
NLTK_PACKAGES = ('punkt', 'stopwords')

# Handle SSL certificate issues: some environments cannot validate the
# certificate of the NLTK download server. Verification is relaxed only while
# the downloads run; the original context factory is restored afterwards so
# the rest of the kernel session keeps normal HTTPS certificate checks.
_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
_original_https_context = getattr(ssl, '_create_default_https_context', None)

if _unverified_https_context is not None:
    ssl._create_default_https_context = _unverified_https_context

try:
    # nltk.download returns False when a package could not be installed, so
    # collect the failures instead of assuming success.
    failed_packages = [
        package for package in NLTK_PACKAGES
        if not nltk.download(package, quiet=True)
    ]
finally:
    if _unverified_https_context is not None and _original_https_context is not None:
        ssl._create_default_https_context = _original_https_context

if failed_packages:
    print(f'Failed to download NLTK data: {", ".join(failed_packages)}')
else:
    print('NLTK data downloaded successfully')

## Hot-Reload Setup (Development Mode)

If in development mode, set up hot-reloading for lexical-graph modules:

**IMPORTANT**: After running this cell in development mode, you must restart the kernel (Kernel → Restart Kernel) before continuing to the next cells.

**NOTE**: This installation process can sometimes fail or hang. If it doesn't complete within 2-3 minutes, interrupt the kernel and try running this cell again.

In [None]:
if dev_mode:
    print('Setting up hot-reload for lexical-graph modules...')
    
    # Install lexical-graph in editable mode from mounted source
    try:
        import subprocess
        import time
        print('Installing lexical-graph in editable mode (this may take 60-90 seconds)...')
        print('Please wait - installation in progress', end='', flush=True)
        
        # Run pip install with real-time feedback
        process = subprocess.Popen(['pip', 'install', '-e', '/home/jovyan/lexical-graph-src'], 
                                 stdout=subprocess.PIPE, stderr=subprocess.STDOUT, 
                                 universal_newlines=True, bufsize=1)
        
        # Show progress with timeout
        import threading
        progress_active = True
        
        def show_progress():
            count = 0
            while progress_active and count < 60:  # Max 60 seconds (30 * 2)
                print('.', end='', flush=True)
                time.sleep(2)
                count += 1
            if count >= 60:
                print('\nInstallation taking longer than expected, but still running...')
        
        progress_thread = threading.Thread(target=show_progress)
        progress_thread.start()
        
        # Wait for completion
        stdout, _ = process.communicate()
        progress_active = False
        progress_thread.join()
        print()  # New line after dots
        
        if process.returncode == 0:
            print('Installed lexical-graph in editable mode')
        else:
            print('Could not install in editable mode, using Python path instead')
    except Exception as e:
        print(f'Installation failed: {e}')
    
    # Enable autoreload
    %load_ext autoreload
    %autoreload 2
    
    print('Hot-reload configured - modules will auto-reload on changes!')
    print('Tip: Use %autoreload 2 in cells where you want fresh imports')
    print('')
    print('IMPORTANT: You must restart the kernel now for the editable installation to take effect.')
    print('   Go to Kernel → Restart Kernel, then continue with the remaining cells.')
else:
    print('Hot-reload not available in standard mode')

## Environment Setup

In [None]:
%reload_ext dotenv
%dotenv

import os
import sys

# Re-check dev mode after kernel restart
lexical_graph_path = '/home/jovyan/lexical-graph-src'
dev_mode = os.path.exists(lexical_graph_path)

from graphrag_toolkit.lexical_graph.storage.graph.neo4j_graph_store_factory import Neo4jGraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph import set_logging_config

set_logging_config('INFO')

# Register the Neo4j backend with the factory
GraphStoreFactory.register(Neo4jGraphStoreFactory)

print('Development environment setup complete!')
print(f'Graph Store: {os.environ.get("GRAPH_STORE", "Not configured")}')
print(f'Vector Store: {os.environ.get("VECTOR_STORE", "Not configured")}')

if dev_mode:
    print('\nHot-code-injection enabled - changes to lexical-graph source will be reflected immediately!')
    print('Auto-reload is active - modules will refresh automatically')

## Verify AWS Access

In [None]:
import boto3
import os

print('Testing AWS access...')

# Region used for the Bedrock check; falls back to us-east-1 when AWS_REGION is unset
bedrock_region = os.environ.get('AWS_REGION', 'us-east-1')

try:
    # Session creation is inside the try block because an unknown profile
    # name raises here, and that should produce the same friendly message as
    # any other credential problem rather than a raw traceback.
    aws_profile = os.environ.get('AWS_PROFILE')
    if aws_profile:
        print(f'Using AWS profile: {aws_profile}')
        session = boto3.Session(profile_name=aws_profile)
    else:
        print('Using default AWS credentials')
        session = boto3.Session()

    # Test STS access to get caller identity
    sts = session.client('sts')
    identity = sts.get_caller_identity()

    print('AWS access successful!')
    print(f'Account: {identity["Account"]}')
    print(f'User/Role: {identity["Arn"]}')
except Exception as e:
    print(f'AWS access failed: {e}')
    print('Check your AWS credentials and profile configuration')
else:
    # Bedrock is checked separately so that a Bedrock-specific problem is not
    # reported as a general credential failure after STS already succeeded.
    try:
        bedrock = session.client('bedrock', region_name=bedrock_region)
        models = bedrock.list_foundation_models()

        print('Bedrock access successful!')
        print(f'Available models: {len(models["modelSummaries"])}')
    except Exception as e:
        print(f'Bedrock access failed: {e}')
        print(f'Check Bedrock permissions and availability in region {bedrock_region}')

## Readers Support

In [None]:
# Install PDF processing dependencies
!pip install llama-index-readers-file pymupdf

print('PDF processing dependencies installed!')

In [None]:
# Install YouTube processing dependencies
!pip install youtube-transcript-api

print('YouTube processing dependencies installed!')

In [None]:
# Install DOCX processing dependencies
!pip install docx2txt

print('DOCX processing dependencies installed!')

In [None]:
# Install GitHub processing dependencies
!pip install PyGithub
!pip install llama-index-readers-github
!pip install nest_asyncio

# Add this at the top of your GitHub reader cell
import nest_asyncio
nest_asyncio.apply()

print('GitHub processing dependencies installed!')

In [None]:
# Install PPTX processing dependencies
!pip install torch "transformers<=4.50" python-pptx Pillow

print('PPTX processing dependencies installed!')

In [None]:
# Install JSON processing dependencies
!pip install llama-index-readers-json

print('JSON processing dependencies installed!')

In [None]:
# Install Structured-Data processing dependencies
!pip install llama-index-readers-structured-data pandas openpyxl
print('Structured-Data processing dependencies installed!')

In [None]:
# Support for S3 Directory Reader
!pip install boto3
!pip install llama-index-readers-s3

print('S3 Directory Reader installed!')