In [None]:
# Cell 1: GitHub Setup and Auto-Logging

import os
import sys
import importlib.util
import psycopg2

# GitHub credentials - use Kaggle secrets for security
github_token = os.environ.get("GITHUB_TOKEN")
repo_url = f"https://{github_token}@github.com/amiralpert/SmartReach.git"
local_path = "/kaggle/working/SmartReach"

# Clone or update repo
if os.path.exists(local_path):
    !cd {local_path} && git pull
else:
    !git clone {repo_url} {local_path}

# Add to Python path for regular imports
sys.path.insert(0, f'{local_path}/BizIntel')

# Direct import of pipeline
pipeline_path = f"{local_path}/BizIntel/Modules/SystemUno/Patents/patentlens_pipeline_v3.py"

spec = importlib.util.spec_from_file_location("patentlens_pipeline_v3", pipeline_path)
pipeline_module = importlib.util.module_from_spec(spec)
sys.modules["patentlens_pipeline_v3"] = pipeline_module
spec.loader.exec_module(pipeline_module)

PatentLensPipeline = pipeline_module.PatentLensPipeline
PatentData = pipeline_module.PatentData
KeywordManager = pipeline_module.KeywordManager

print("✓ Pipeline module imported from GitHub!")

# Set up database configuration
NEON_CONFIG = {
    'host': 'ep-royal-star-ad1gn0d4-pooler.c-2.us-east-1.aws.neon.tech',
    'database': 'BizIntelSmartReach',
    'user': 'neondb_owner',
    'password': 'npg_aTFt6Pug3Kpy',
    'sslmode': 'require'
}

# Try to set up logger, but don't fail if there are issues
try:
    # Create separate connection for logger
    logger_conn = psycopg2.connect(**NEON_CONFIG)
    print("✓ Database connected for logger")

    # Import auto-logger using direct file import
    logger_module_path = f"{local_path}/BizIntel/Scripts/KaggleLogger/auto_logger.py"
    if os.path.exists(logger_module_path):
        spec = importlib.util.spec_from_file_location("auto_logger", logger_module_path)
        auto_logger_module = importlib.util.module_from_spec(spec)
        sys.modules["auto_logger"] = auto_logger_module
        spec.loader.exec_module(auto_logger_module)

        setup_auto_logging = auto_logger_module.setup_auto_logging
        logger = setup_auto_logging(logger_conn, "PatentLens")
        print("✓ Auto-logging enabled!")
    else:
        print(f"✗ Auto-logger not found at {logger_module_path}")
        logger = None
except Exception as e:
    print(f"⚠️ Logger setup failed: {e}")
    print("  Continuing without auto-logging...")
    logger = None

print("\n✅ Setup complete. Pipeline ready to use.")

In [4]:
# Install required packages
!pip install -q transformers accelerate bitsandbytes
!pip install -q sentence-transformers psycopg2-binary
!pip install -q torch numpy pandas tqdm

import os
import sys
import importlib.util
import json
import psycopg2
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

print("✓ Packages installed and imported")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

2025-08-27 19:17:33.605841: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756322253.979276      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756322254.083044      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
# Neon Database Configuration
NEON_CONFIG = {
    'host': 'ep-royal-star-ad1gn0d4-pooler.c-2.us-east-1.aws.neon.tech',
    'database': 'BizIntelSmartReach',
    'user': 'neondb_owner',
    'password': 'npg_aTFt6Pug3Kpy',
    'sslmode': 'require'
}

# Test database connection
def test_database_connection():
    try:
        conn = psycopg2.connect(**NEON_CONFIG)
        cursor = conn.cursor()
        
        # Check tables
        cursor.execute('''
            SELECT 
                (SELECT COUNT(*) FROM raw_data.patents_full_text) as patents,
                (SELECT COUNT(*) FROM core.companies) as companies,
                (SELECT COUNT(*) FROM system_uno.patents_processing_status) as status_records,
                (SELECT COUNT(*) FROM system_uno.patents_keywords) as keywords,
                (SELECT COUNT(*) FROM system_uno.patents_extracted_knowledge) as extractions
        ''')
        
        counts = cursor.fetchone()
        print("✓ Database connected successfully!")
        print(f"  Patents: {counts[0]}")
        print(f"  Companies: {counts[1]}")
        print(f"  Status records: {counts[2]}")
        print(f"  Keywords: {counts[3]}")
        print(f"  Extractions: {counts[4]}")
        
        cursor.close()
        conn.close()
        return True
        
    except Exception as e:
        print(f"✗ Database connection failed: {e}")
        return False

# Test connection
test_database_connection()

True

In [None]:
# Load Llama-3.1-8B-Instruct with 4-bit quantization
print("Logging in to HuggingFace...")

from huggingface_hub import login

# Define model name FIRST
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Get HuggingFace token from Kaggle secrets
hf_token = os.environ.get('HUGGINGFACE_TOKEN')

if hf_token:
    login(token=hf_token)
    print("✓ Logged in to HuggingFace")
else:
    print("✗ No HuggingFace token found in secrets")

print("\nLoading Llama-3.1-8B-Instruct...")

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.chat_template = tokenizer.default_chat_template
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    print("✓ Llama model loaded successfully")
    
    # Test generation with chat format
    test_messages = [
        {"role": "user", "content": "What is cancer? Answer in one sentence."}
    ]
    test_input = tokenizer.apply_chat_template(test_messages, return_tensors="pt", tokenize=True)
    
    with torch.no_grad():
        outputs = model.generate(test_input, max_new_tokens=50, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Test response: {response}")
    
except Exception as e:
    print(f"✗ Error loading Llama: {e}")
    print("Note: Make sure HUGGINGFACE_TOKEN is added to Kaggle Secrets")
    print("      and you have access to Llama-3.1-8B-Instruct model")
    model = None
    tokenizer = None

# Load Sentence Transformer for embeddings
print("\nLoading Sentence Transformer...")
try:
    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    print("✓ Sentence Transformer loaded")

    # Test embedding
    test_embedding = embedder.encode(["test sentence"])
    print(f"Embedding dimension: {test_embedding.shape[1]}")

except Exception as e:
    print(f"✗ Error loading embedder: {e}")
    embedder = None

In [12]:
# Initialize the PatentLens Pipeline
"""
This cell initializes the pipeline with the loaded models and database connection.
The pipeline needs to be created AFTER the models are loaded but BEFORE using it in 
cells 5-8.
"""

# Check if all required components are loaded
components_status = {
  'Database': NEON_CONFIG is not None,
  'LLM Model': 'model' in locals() and model is not None,
  'Tokenizer': 'tokenizer' in locals() and tokenizer is not None,
  'Embedder': 'embedder' in locals() and embedder is not None,
  'Pipeline Class': 'PatentLensPipeline' in locals()
}

print("Component Status:")
for component, status in components_status.items():
    print(f"  {component}: {'✓ Ready' if status else '✗ Not loaded'}")

# Initialize pipeline if all components are ready
if all([components_status['Database'], components_status['Pipeline Class']]):
    try:
        pipeline = PatentLensPipeline(
          db_config=NEON_CONFIG,
          llm_model=model if components_status['LLM Model'] else None,
          tokenizer=tokenizer if components_status['Tokenizer'] else None,
          embedder=embedder if components_status['Embedder'] else None
        )
        print("\n✅ Pipeline initialized successfully!")
    
        # Show pipeline configuration
        version_info = pipeline.get_version_info()
        print(f"Pipeline version: {version_info.get('version', 'Unknown')}")
        print(f"LLM loaded: {version_info.get('llm_loaded', 'No')}")
        print(f"Embedder loaded: {version_info.get('embedder_loaded', 'No')}")

    except Exception as e:
        print(f"\n✗ Error initializing pipeline: {e}")
        pipeline = None
else:
    print("\n⚠️ Cannot initialize pipeline - missing required components")
    pipeline = None

In [13]:
"""Test Single Patent """

# Test processing a single patent
if pipeline:
    print("Fetching one patent for testing...")
    
    # Get one patent - CORRECT METHOD NAME
    test_patents = pipeline.get_patents_to_process(limit=1)
    
    if test_patents:
        test_patent = test_patents[0]
        
        print(f"\nTesting with patent: {test_patent.patent_number}")
        print(f"Company: {test_patent.company_domain}")
        print(f"Abstract length: {len(test_patent.abstract)} chars")
        print(f"Background length: {len(test_patent.background_text)} chars")
        print(f"Description length: {len(test_patent.description_text)} chars")
        print(f"CPC codes: {', '.join(test_patent.cpc_codes[:3])}...")
        
        print("\nProcessing patent through pipeline...")
        result = pipeline.process_patent(test_patent)
        
        if result:
            print("\n✓ Patent processed successfully!")
            print("\nExtracted Information:")
            print(f"Field: {result['field_description'][:150]}...")
            print(f"Field keywords: {len(result.get('field_keywords', []))} keyword IDs")
            
            print(f"\nTechnical Problem: {result['technical_problem'][:150]}...")
            print(f"Clinical Problem: {result['clinical_problem'][:150]}...")
            
            print(f"\nSolution: {result['solution_approach'][:150]}...")
            
            print(f"\nCitations:")
            print(f"  Patents cited: {len(result.get('cited_patents', []))}")
            if result.get('cited_patents'):
                print(f"    Examples: {', '.join(result['cited_patents'][:3])}")
            print(f"  Papers cited: {len(result.get('cited_papers', []))}")
            if result.get('cited_papers'):
                print(f"    Examples: {', '.join(result['cited_papers'][:3])}")
            
            # Check embeddings
            has_embeddings = any(
                result.get(f"{field}_embedding") is not None 
                for field in ['field', 'technical_problem', 'clinical_problem', 'solution', 'claims']
            )
            print(f"\nEmbeddings generated: {has_embeddings}")
            
            # Check keywords in database
            conn = psycopg2.connect(**NEON_CONFIG)
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM patents.keywords")
            keyword_count = cursor.fetchone()[0]
            print(f"\nTotal keywords in taxonomy: {keyword_count}")
            cursor.close()
            conn.close()
        else:
            print("✗ Patent processing failed - check logs for error details")
    else:
        print("No patents available for testing")
else:
    print("Pipeline not initialized")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

UndefinedTable: relation "patents.keywords" does not exist
LINE 1: SELECT COUNT(*) FROM patents.keywords
                             ^


In [None]:
# Process multiple patents
if pipeline:
    print("Starting batch processing...")
    
    # Get processing statistics before
    stats_before = pipeline.get_processing_stats()
    pending_count = stats_before.get('pending', 0)
    
    print(f"Patents pending: {pending_count}")
    
    if pending_count > 0:
        # Process a batch
        batch_size = min(10, pending_count)  # Process up to 10 patents
        print(f"\\nProcessing {batch_size} patents...")
        
        results = []
        patents = pipeline.fetch_patents_batch(limit=batch_size)
        
        # Process with progress bar
        for patent in tqdm(patents, desc="Processing patents"):
            result = pipeline.process_patent(patent)
            results.append({
                'patent_number': patent.patent_number,
                'success': result is not None,
                'field': result['field_description'][:50] if result else None
            })
            
            # Clear GPU cache periodically
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        
        # Show results
        df_results = pd.DataFrame(results)
        print(f"\\nProcessing complete!")
        print(f"Success rate: {df_results['success'].mean():.1%}")
        
        # Get updated statistics
        stats_after = pipeline.get_processing_stats()
        print("\\nUpdated statistics:")
        for status, count in stats_after.items():
            before = stats_before.get(status, 0)
            change = count - before
            if change != 0:
                print(f"  {status}: {count} ({change:+d})")
            else:
                print(f"  {status}: {count}")
        
        # Show sample results
        print("\\nSample results:")
        print(df_results.head())
    else:
        print("No pending patents to process")
else:
    print("Pipeline not initialized")

In [None]:
# Analyze extraction results and keyword taxonomy
conn = psycopg2.connect(**NEON_CONFIG)

# 1. Keyword Taxonomy Analysis
print("=== KEYWORD TAXONOMY ANALYSIS ===\\n")

query_keywords = '''
    SELECT 
        category,
        COUNT(*) as keyword_count,
        AVG(use_count) as avg_usage,
        MAX(use_count) as max_usage
    FROM patents.keywords
    GROUP BY category
    ORDER BY keyword_count DESC
'''

df_keywords = pd.read_sql(query_keywords, conn)
print("Keywords by category:")
print(df_keywords)

# Top keywords by usage
print("\\nTop 10 most used keywords:")
query_top = '''
    SELECT keyword_text, category, use_count
    FROM patents.keywords
    ORDER BY use_count DESC
    LIMIT 10
'''
df_top_keywords = pd.read_sql(query_top, conn)
print(df_top_keywords)

# 2. Extraction Results Analysis
print("\\n=== EXTRACTION RESULTS ===\\n")

query_extractions = '''
    SELECT 
        COUNT(*) as total_extractions,
        COUNT(DISTINCT patent_number) as unique_patents,
        COUNT(field_description) as has_field,
        COUNT(technical_problem) as has_technical,
        COUNT(clinical_problem) as has_clinical,
        COUNT(solution_approach) as has_solution,
        AVG(array_length(cited_patents, 1)) as avg_patent_citations,
        AVG(array_length(cited_papers, 1)) as avg_paper_citations
    FROM patents.extracted_knowledge
'''

df_extraction_stats = pd.read_sql(query_extractions, conn)
print("Extraction statistics:")
for col in df_extraction_stats.columns:
    val = df_extraction_stats[col].iloc[0]
    if pd.notna(val):
        if 'avg' in col:
            print(f"  {col}: {val:.2f}")
        else:
            print(f"  {col}: {int(val)}")

# 3. Sample Extractions
print("\\n=== SAMPLE EXTRACTIONS ===\\n")

query_samples = '''
    SELECT 
        ek.patent_number,
        c.name as company,
        LEFT(ek.field_description, 100) as field_preview,
        array_length(ek.cited_patents, 1) as patent_citations,
        array_length(ek.cited_papers, 1) as paper_citations
    FROM patents.extracted_knowledge ek
    JOIN patents.patent_full_text p ON ek.patent_id = p.id
    JOIN core.companies c ON p.company_domain = c.domain
    LIMIT 5
'''

df_samples = pd.read_sql(query_samples, conn)
print("Sample extracted patents:")
print(df_samples.to_string())

# 4. Processing Status Overview
print("\\n=== PROCESSING STATUS ===\\n")

query_status = '''
    SELECT 
        overall_status,
        COUNT(*) as count,
        ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 1) as percentage
    FROM patents.processing_status
    GROUP BY overall_status
    ORDER BY count DESC
'''

df_status = pd.read_sql(query_status, conn)
print("Overall processing status:")
print(df_status.to_string(index=False))

# 5. Citations Analysis
print("\\n=== CITATIONS ANALYSIS ===\\n")

query_citations = '''
    SELECT 
        patent_number,
        cited_patents,
        cited_papers
    FROM patents.extracted_knowledge
    WHERE array_length(cited_patents, 1) > 0 OR array_length(cited_papers, 1) > 0
    LIMIT 3
'''

df_citations = pd.read_sql(query_citations, conn)
if not df_citations.empty:
    print("Patents with citations:")
    for _, row in df_citations.iterrows():
        print(f"\\nPatent {row['patent_number']}:")
        if row['cited_patents']:
            print(f"  Cites patents: {', '.join(row['cited_patents'][:3])}...")
        if row['cited_papers']:
            print(f"  Cites papers: {', '.join(row['cited_papers'][:3])}...")
else:
    print("No citations extracted yet")

conn.close()

# Visualization (if matplotlib available)
try:
    import matplotlib.pyplot as plt
    
    if not df_keywords.empty:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
        
        # Keywords by category
        ax1.bar(df_keywords['category'], df_keywords['keyword_count'])
        ax1.set_title('Keywords by Category')
        ax1.set_xlabel('Category')
        ax1.set_ylabel('Count')
        ax1.tick_params(axis='x', rotation=45)
        
        # Processing status pie chart
        if not df_status.empty:
            ax2.pie(df_status['count'], labels=df_status['overall_status'], autopct='%1.1f%%')
            ax2.set_title('Processing Status')
        
        plt.tight_layout()
        plt.show()
        
except ImportError:
    print("\\nMatplotlib not available for visualization")

In [None]:
# Export results to CSV for download
conn = psycopg2.connect(**NEON_CONFIG)

# Export extracted knowledge with company info
query_export = '''
    SELECT 
        ek.patent_number,
        c.name as company_name,
        c.domain as company_domain,
        ek.field_description,
        ek.technical_problem,
        ek.clinical_problem,
        ek.solution_approach,
        array_to_string(ek.cited_patents, '; ') as cited_patents,
        array_to_string(ek.cited_papers, '; ') as cited_papers,
        ek.extraction_date
    FROM patents.extracted_knowledge ek
    JOIN patents.patent_full_text p ON ek.patent_id = p.id
    JOIN core.companies c ON p.company_domain = c.domain
    ORDER BY ek.extraction_date DESC
'''

df_export = pd.read_sql(query_export, conn)

# Save to CSV
output_file = '/kaggle/working/patentlens_extractions.csv'
df_export.to_csv(output_file, index=False)
print(f"✓ Exported {len(df_export)} patent extractions to {output_file}")

# Export keyword taxonomy
query_keywords_export = '''
    SELECT 
        keyword_text,
        category,
        use_count
    FROM patents.keywords
    ORDER BY use_count DESC, category, keyword_text
'''

df_keywords_export = pd.read_sql(query_keywords_export, conn)
keywords_file = '/kaggle/working/patentlens_keywords.csv'
df_keywords_export.to_csv(keywords_file, index=False)
print(f"✓ Exported {len(df_keywords_export)} keywords to {keywords_file}")

conn.close()

print("\\nFiles ready for download from Kaggle output")