# 🔬 Enhanced Semantic Indexing for Scopus Research Chatbot

## 📊 Multi-Index FAISS System for Scientific Literature Search

This notebook creates **multiple specialized FAISS indexes** for different search strategies on 4,000+ scientific articles from Scopus (2018-2025). The enhanced system supports:

- **Content Index**: Title + Abstract (primary semantic search)
- **Metadata Index**: Content + Keywords + Authors  
- **Institution Index**: Institution names + Countries
- **Full Index**: All available text fields combined

### 🎯 Key Features:
- **SPECTER embeddings** optimized for scientific papers
- **Multi-strategy search** combining semantic similarity with metadata-aware indexes (keywords, authors, institutions, countries)
- **Production-ready** for Hugging Face Spaces deployment
- **4,254+ articles** across 15 scientific domains

## 1. Install Required Packages

Install the necessary packages for semantic indexing.

In [None]:
# Install required packages (notebook shell escape: runs pip as a shell command)
!pip install sentence-transformers faiss-cpu transformers torch

# Check if GPU is available
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # If GPU is available, install faiss-gpu for better performance
    # NOTE(review): faiss-cpu was already installed by the command above, so both
    # packages end up installed in this branch — confirm which build `import faiss`
    # actually resolves to, or install only one of them.
    %pip install faiss-gpu

# Enhanced Semantic Indexing System
import sqlite3
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import os
import pandas as pd
from tqdm import tqdm

# File name of the SQLite database that holds the Scopus articles
DATABASE_NAME = 'scopus_database.db'

# Registry of the specialized FAISS indexes. Every entry follows the same naming
# scheme (scopus_<name>_index.faiss / scopus_<name>_ids.json), so the mapping is
# derived from the (name, description) pairs below, in creation order.
INDEXES = {
    name: {
        'faiss_file': f'scopus_{name}_index.faiss',
        'ids_file': f'scopus_{name}_ids.json',
        'description': description,
    }
    for name, description in (
        ('content', 'Title + Abstract (primary semantic search)'),
        ('metadata', 'Title + Abstract + Keywords + Authors'),
        ('institution', 'Institution names and countries'),
        ('full', 'All available text fields combined'),
    )
}

print("✅ Enhanced semantic indexing system loaded")
print(f"📊 Will create {len(INDEXES)} specialized FAISS indexes")

## 2. Import Libraries and Setup

In [None]:
import sqlite3
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import os
from pathlib import Path
import gc
import torch

# Kaggle directory layout: datasets are mounted read-only under the input
# directory, and anything written to the working directory is kept as output.
INPUT_DIR = '/kaggle/input'
OUTPUT_DIR = '/kaggle/working'

# Search the input tree recursively for SQLite files and take the first hit;
# DATABASE_PATH stays None when nothing is found so later cells can bail out.
db_files = [*Path(INPUT_DIR).rglob('*.db')]
DATABASE_PATH = str(db_files[0]) if db_files else None
if DATABASE_PATH is None:
    print("❌ No .db file found in input. Please upload your scopus_database.db file.")
else:
    print(f"Found database: {DATABASE_PATH}")

# Destination paths for the single combined index and its ID mapping
FAISS_INDEX_FILE = os.path.join(OUTPUT_DIR, "scopus_combined_metadata_index.faiss")
ARTICLE_IDS_MAP_FILE = os.path.join(OUTPUT_DIR, "scopus_article_ids_for_index.json")

print(
    "Output files will be saved to:",
    f"- FAISS Index: {FAISS_INDEX_FILE}",
    f"- Article IDs: {ARTICLE_IDS_MAP_FILE}",
    sep="\n",
)

def get_article_data_with_affiliations():
    """Fetch every article that has an abstract, joined with author and affiliation data.

    Reads the SQLite database located at the module-level ``DATABASE_PATH``.

    Returns:
        list[tuple]: One row per article, ordered by ``scopus_id``, laid out as
        ``(scopus_id, title, abstract, cover_date, keywords, authors_list,
        affiliations_list, countries_list)``. The three ``*_list`` fields are
        ``'; '``-joined strings, or ``None`` when no joined rows exist.

    Raises:
        ValueError: If ``DATABASE_PATH`` is unset or empty.
        sqlite3.Error: If connecting or running the query fails.
    """
    from contextlib import closing

    if not DATABASE_PATH:
        raise ValueError("DATABASE_PATH is not set; cannot load article data.")

    print("📊 Connecting to database and fetching article data...")

    # Extended query to include affiliations and countries for enhanced search.
    # The chained LEFT JOINs produce one joined row per (author, affiliation)
    # pair, so the concatenated lists can contain repeated names.
    query = '''
        SELECT 
            A.scopus_id, 
            A.title, 
            A.abstract, 
            A.cover_date, 
            A.keywords,
            GROUP_CONCAT(Auth.full_name, '; ') AS authors_list,
            GROUP_CONCAT(Aff.institution_name, '; ') AS affiliations_list,
            GROUP_CONCAT(Aff.country, '; ') AS countries_list
        FROM articles AS A
        LEFT JOIN article_authors AS AA ON A.scopus_id = AA.article_scopus_id
        LEFT JOIN authors AS Auth ON AA.author_id = Auth.author_id
        LEFT JOIN author_affiliations AS AuthAff ON Auth.author_id = AuthAff.author_id
        LEFT JOIN affiliations AS Aff ON AuthAff.affiliation_id = Aff.affiliation_id
        WHERE A.abstract IS NOT NULL AND A.abstract != '' 
        GROUP BY A.scopus_id, A.title, A.abstract, A.cover_date, A.keywords
        ORDER BY A.scopus_id
    '''

    # closing() guarantees the connection is released even if the query raises.
    with closing(sqlite3.connect(DATABASE_PATH)) as conn:
        cursor = conn.cursor()
        cursor.execute(query)
        articles_data = cursor.fetchall()

    print(f"✅ Retrieved {len(articles_data):,} articles with complete metadata")
    return articles_data

# Load the data
articles_data = get_article_data_with_affiliations()

## 3. Load and Explore Database

In [None]:
import sqlite3
from tqdm import tqdm

# Keep the database path discovered by the setup cell. The placeholder is only a
# fallback for running this cell standalone, when no earlier cell has defined
# DATABASE_PATH at all (a discovered value of None is left untouched so that the
# "cannot proceed" guards in later cells still trigger).
if 'DATABASE_PATH' not in globals():
    DATABASE_PATH = 'path_to_your_database.db'  # Update this path

def _dedupe_joined(value, separator='; '):
    """Collapse repeated entries of a separator-joined string, keeping first-seen order."""
    if not value or value == 'None':
        return ''
    unique_parts = dict.fromkeys(part.strip() for part in str(value).split(separator))
    return separator.join(part for part in unique_parts if part)


def create_embeddings_for_index_type(articles_data, index_type):
    """Build the texts to embed, and the matching article IDs, for one index type.

    Despite its name this function does not compute embeddings; it only
    assembles the input strings.

    Args:
        articles_data: Iterable of 8-field rows laid out as ``(scopus_id, title,
            abstract, cover_date, keywords, authors_list, affiliations_list,
            countries_list)``.
        index_type: One of ``'content'``, ``'metadata'``, ``'institution'`` or
            ``'full'``.

    Returns:
        tuple[list[str], list]: ``(texts_to_embed, article_ids)``, aligned by
        position. Articles that yield no text are omitted, and for the
        ``'institution'`` index so are articles without institution or country data.

    Raises:
        ValueError: If ``index_type`` is unknown, or a row does not have
            exactly eight fields.
    """
    valid_index_types = ('content', 'metadata', 'institution', 'full')
    if index_type not in valid_index_types:
        raise ValueError(
            f"Unknown index_type {index_type!r}; expected one of {valid_index_types}"
        )

    print(f"🔍 Creating embeddings for {index_type} index...")

    texts_to_embed = []
    article_ids = []

    for row in tqdm(articles_data, desc=f"Processing {index_type}"):
        (scopus_id, title, abstract, _cover_date, keywords,
         authors_list, affiliations_list, countries_list) = row

        # The joined lists can repeat the same name many times; duplicates add
        # no information and only lengthen the text handed to the encoder.
        authors = _dedupe_joined(authors_list)
        institutions = _dedupe_joined(affiliations_list)
        countries = _dedupe_joined(countries_list)

        parts = []
        if index_type == 'institution':
            # Without any institution or country data the entry would be the
            # title alone, so the article is left out of this index.
            if not (institutions or countries):
                continue
            if institutions:
                parts.append(f"Institutions: {institutions}. ")
            if countries:
                parts.append(f"Countries: {countries}. ")
            if title:
                parts.append(f"Title: {title}")
        else:
            if title:
                parts.append(f"{title}. ")
            if abstract:
                # The content index ends with the bare abstract; the richer
                # indexes append further sentences after it.
                parts.append(f"{abstract}" if index_type == 'content' else f"{abstract}. ")
            if index_type in ('metadata', 'full'):
                if keywords:
                    parts.append(f"Keywords: {keywords}. ")
                if authors:
                    parts.append(f"Authors: {authors}. ")
            if index_type == 'full':
                if institutions:
                    parts.append(f"Institutions: {institutions}. ")
                if countries:
                    parts.append(f"Countries: {countries}. ")

        text = "".join(parts).strip()
        if text:  # Only add if we have text
            texts_to_embed.append(text)
            article_ids.append(scopus_id)

    print(f"✅ Created {len(texts_to_embed):,} text entries for {index_type} index")
    return texts_to_embed, article_ids

if DATABASE_PATH:
    # Connect to database and explore
    conn = sqlite3.connect(DATABASE_PATH)
    try:
        cursor = conn.cursor()

        # Check table structure
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        print("Available tables:", [table[0] for table in tables])

        # Count articles with abstracts
        cursor.execute("SELECT COUNT(*) FROM articles WHERE abstract IS NOT NULL AND abstract != ''")
        count = cursor.fetchone()[0]
        print(f"\nArticles with abstracts: {count:,}")

        # Sample data
        cursor.execute("SELECT title, abstract FROM articles WHERE abstract IS NOT NULL LIMIT 3")
        samples = cursor.fetchall()

        print("\n📄 Sample articles:")
        for i, (title, abstract) in enumerate(samples, 1):
            # The query only filters on abstract, so title may be NULL.
            print(f"\n{i}. {(title or '')[:100]}...")
            print(f"   Abstract: {abstract[:200]}...")
    finally:
        # Release the connection even if one of the exploratory queries fails.
        conn.close()

    # Example usage of the text-preparation function. It expects the 8-field
    # joined rows already held in `articles_data` (scopus_id ... countries_list);
    # raw `SELECT * FROM articles` rows do not have that layout, so they are not
    # fetched here and `articles_data` is left untouched.
    for index_type in ['content', 'metadata', 'institution', 'full']:
        create_embeddings_for_index_type(articles_data, index_type)
else:
    print("Cannot proceed without database file.")

## 4. Load SPECTER Model

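SPECTER is trained specifically on scientific papers, which makes its embeddings a better fit for academic content than general-purpose sentence encoders. If SPECTER cannot be loaded, the cell below falls back first to SciBERT and then to the general-purpose MiniLM model.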

In [None]:
print("🧬 Loading scientific text embedding model...")

# Encoders to try, in order of preference:
# (model identifier, short name, message printed once it has loaded)
model_candidates = [
    ('allenai/specter', "SPECTER", "✅ SPECTER model loaded successfully!"),
    ('allenai/scibert_scivocab_uncased', "SciBERT", "✅ Using SciBERT as fallback"),
    ('all-MiniLM-L6-v2', "MiniLM", "✅ Using MiniLM as last resort"),
]

for position, (model_id, short_name, loaded_message) in enumerate(model_candidates):
    try:
        model = SentenceTransformer(model_id)
    except Exception as load_error:
        # Nothing is left to fall back to after the last candidate, so its
        # failure is allowed to propagate.
        if position == len(model_candidates) - 1:
            raise
        print(f"⚠️ {short_name} loading failed: {load_error}")
        continue
    print(loaded_message)
    model_name = short_name
    break

print(f"📝 Using model: {model_name}")
print(f"📐 Embedding dimension: {model.get_sentence_embedding_dimension()}")

# Report where the model lives when a GPU is present
if torch.cuda.is_available():
    device = model.device
    print(f"🔧 Model device: {device}")

## 5. Extract and Prepare Article Data

In [None]:
def build_faiss_index(texts, model, index_type, batch_size=8):
    """Embed ``texts`` and build an inner-product FAISS index over them.

    The embeddings are L2-normalized before being added to a
    ``faiss.IndexFlatIP`` index, so inner-product search scores on this index
    are cosine similarities.

    Defined at the top level of the cell so that later cells can call it even
    when the extraction below finds no data.

    Args:
        texts: Non-empty list of strings to embed.
        model: Encoder exposing a SentenceTransformer-style ``encode`` method.
        index_type: Label used only in progress messages.
        batch_size: Number of texts encoded per batch (defaults to 8).

    Returns:
        tuple: ``(index, embeddings)`` where ``embeddings`` is the normalized
        float32 array that was added to ``index``.

    Raises:
        ValueError: If ``texts`` is empty.
    """
    if not texts:
        raise ValueError(f"No texts supplied for {index_type} index")

    print(f"🔥 Generating embeddings for {len(texts):,} texts ({index_type})...")

    # Generate embeddings in batches for memory efficiency
    embeddings = model.encode(texts,
                              batch_size=batch_size,
                              show_progress_bar=True,
                              convert_to_numpy=True,
                              normalize_embeddings=False)  # We'll normalize manually

    embeddings = embeddings.astype('float32')
    print(f"📐 Embeddings shape: {embeddings.shape}")

    # Build FAISS index with Inner Product for cosine similarity
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner Product for cosine similarity

    # Normalize embeddings (in place) for cosine similarity
    faiss.normalize_L2(embeddings)
    print("🔧 Normalized embeddings for cosine similarity")

    # Add embeddings to index
    index.add(embeddings)
    print(f"✅ FAISS index built: {index.ntotal:,} vectors, {dimension} dimensions")

    return index, embeddings


if DATABASE_PATH and model:
    print("📚 Extracting article data from database...")

    conn = sqlite3.connect(DATABASE_PATH)
    try:
        cursor = conn.cursor()

        # First, let's check what date range we actually have
        print("🔍 Analyzing date distribution in database...")
        cursor.execute('''
            SELECT 
                substr(cover_date, 1, 4) as year,
                COUNT(*) as count
            FROM articles 
            WHERE cover_date IS NOT NULL AND cover_date != ''
            AND abstract IS NOT NULL AND abstract != ''
            GROUP BY substr(cover_date, 1, 4)
            ORDER BY year
        ''')

        year_distribution = cursor.fetchall()
        print("📅 Articles by year:")
        for year, count in year_distribution:
            print(f"   {year}: {count:,} articles")

        # Extract articles with enhanced metadata including affiliations and countries
        print("\n📋 Fetching articles with complete metadata including affiliations...")
        cursor.execute('''
            SELECT 
                A.scopus_id, 
                A.title, 
                A.abstract, 
                A.cover_date, 
                A.keywords,
                GROUP_CONCAT(Auth.full_name, '; ') AS authors_list,
                GROUP_CONCAT(Aff.institution_name, '; ') AS affiliations_list,
                GROUP_CONCAT(Aff.country, '; ') AS countries_list
            FROM articles AS A
            LEFT JOIN article_authors AS AA ON A.scopus_id = AA.article_scopus_id
            LEFT JOIN authors AS Auth ON AA.author_id = Auth.author_id
            LEFT JOIN author_affiliations AS AuthAff ON Auth.author_id = AuthAff.author_id
            LEFT JOIN affiliations AS Aff ON AuthAff.affiliation_id = Aff.affiliation_id
            WHERE A.abstract IS NOT NULL AND A.abstract != '' 
            GROUP BY A.scopus_id, A.title, A.abstract, A.cover_date, A.keywords
            ORDER BY A.scopus_id
        ''')

        articles_data = cursor.fetchall()
    finally:
        # Release the connection even if one of the queries fails.
        conn.close()

    print(f"✅ Extracted {len(articles_data):,} articles with complete metadata")

    if not articles_data:
        print("❌ No articles with abstracts found in the database.")
    else:
        # Check the structure of the data
        print(f"📊 Data structure: {len(articles_data[0])} fields per article")
        sample = articles_data[0]
        print(f"📄 Sample data fields:")
        print(f"   1. Scopus ID: {sample[0]}")
        # Only the abstract is guaranteed non-NULL by the query; title may be None.
        print(f"   2. Title: {(sample[1] or '')[:50]}...")
        print(f"   3. Abstract: {sample[2][:50]}...")
        print(f"   4. Cover Date: {sample[3]}")
        print(f"   5. Keywords: {sample[4]}")
        print(f"   6. Authors: {sample[5][:50] if sample[5] else 'None'}...")
        print(f"   7. Affiliations: {sample[6][:50] if sample[6] else 'None'}...")
        print(f"   8. Countries: {sample[7][:50] if sample[7] else 'None'}...")

        # Show year distribution of fetched data
        from collections import Counter
        years = [row[3][:4] for row in articles_data if row[3]]
        year_counts = Counter(years)
        print(f"\n📊 Final dataset year distribution:")
        for year in sorted(year_counts.keys()):
            print(f"   {year}: {year_counts[year]:,} articles")

        # Check for any 2025 articles that shouldn't be there
        # NOTE(review): the notebook introduction describes the dataset as
        # covering 2018-2025, so this warning may be outdated — confirm intent.
        articles_2025 = [row for row in articles_data if row[3] and row[3].startswith('2025')]
        if articles_2025:
            print(f"\n⚠️ WARNING: Found {len(articles_2025)} articles from 2025!")
            print("Sample 2025 articles:")
            for i, article in enumerate(articles_2025[:3], 1):
                print(f"   {i}. ID: {article[0]}, Date: {article[3]}, Title: {(article[1] or '')[:50]}...")
        else:
            print(f"\n✅ Good: No 2025 articles found in database")

        # Prepare simplified data for basic embedding (title + abstract)
        article_ids = []
        texts_to_embed = []

        print("\n🔧 Preparing texts for basic embedding...")
        for row in articles_data:
            scopus_id, title, abstract, cover_date, keywords, authors_list, affiliations_list, countries_list = row

            # Optimize format for SPECTER: title + abstract
            combined_text = ""
            if title:
                combined_text += f"{title}. "
            if abstract:
                combined_text += f"{abstract}"

            article_ids.append(scopus_id)
            texts_to_embed.append(combined_text.strip())

        print(f"✅ Prepared {len(texts_to_embed):,} texts for embedding")
        print(f"📄 Average text length: {np.mean([len(text) for text in texts_to_embed]):.0f} characters")
else:
    print("❌ Cannot proceed without database and model.")

## 6. Generate Embeddings

This is the most computationally intensive step. We'll process articles in batches for memory efficiency.

In [None]:
if 'texts_to_embed' not in locals() or len(texts_to_embed) == 0:
    print("❌ No texts to embed.")
else:
    print(f"🚀 Generating {model_name} embeddings for {len(texts_to_embed):,} articles...")
    print("This may take several minutes depending on the number of articles and hardware.")

    # Batch sizes as (with GPU, without GPU): SPECTER gets the smaller pair,
    # every other model the larger one.
    gpu_batch, cpu_batch = (8, 4) if model_name == "SPECTER" else (32, 16)
    batch_size = gpu_batch if torch.cuda.is_available() else cpu_batch

    print(f"📦 Using batch size: {batch_size}")

    # Encode all texts on the model's own device and convert the result to
    # float32, the dtype FAISS works with.
    embeddings = model.encode(
        texts_to_embed,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        device=model.device,
    ).astype('float32')

    print(f"✅ Generated embeddings!")
    print(f"📏 Embedding shape: {embeddings.shape}")
    print(f"💾 Memory usage: {embeddings.nbytes / 1024**2:.1f} MB")

    # The raw texts are no longer needed; free them and any cached GPU memory.
    del texts_to_embed
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# 🚀 Enhanced Multi-Index Creation Process
print("🚀 Starting Enhanced Semantic Indexing Process...")
print("=" * 60)

# Create each specialized index.
# The loop uses its own variable names on purpose: the notebook-level
# `article_ids`, `embeddings` and `index` belong to the single combined index
# handled by the neighbouring cells and must not be rebound here.
for index_type, config in INDEXES.items():
    print(f"\n🔍 Creating {index_type} index: {config['description']}")
    print("-" * 50)

    # Prepare texts for this index type
    index_texts, index_article_ids = create_embeddings_for_index_type(articles_data, index_type)

    if not index_texts:
        print(f"⚠️ No texts found for {index_type} index")
        continue

    print(f"📄 Processing {len(index_texts):,} texts for {index_type} index")

    # Build FAISS index
    specialized_index, index_embeddings = build_faiss_index(index_texts, model, index_type)

    # Save FAISS index
    faiss.write_index(specialized_index, config['faiss_file'])
    print(f"💾 Saved FAISS index: {config['faiss_file']}")

    # Save article IDs mapping (position i in the list is vector i in the index)
    with open(config['ids_file'], 'w', encoding='utf-8') as f:
        json.dump(index_article_ids, f)
    print(f"💾 Saved article IDs: {config['ids_file']}")

    # Check file sizes
    faiss_size = os.path.getsize(config['faiss_file']) / (1024*1024)
    ids_size = os.path.getsize(config['ids_file']) / (1024*1024)
    print(f"📊 Files created: {config['faiss_file']} ({faiss_size:.1f} MB), {config['ids_file']} ({ids_size:.2f} MB)")

    print(f"✅ {index_type} index complete: {len(index_article_ids):,} articles, {index_embeddings.shape[1]} dimensions")

print("\n🎉 Enhanced semantic indexing complete!")
print("=" * 60)

## 7. Build FAISS Index

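Build an exact inner-product FAISS index (`IndexFlatIP`) over L2-normalized embeddings, so that the scores returned by a search are cosine similarities (higher means more similar).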

In [None]:
if 'embeddings' in locals():
    print("🏗️ Building FAISS index...")

    dimension = embeddings.shape[1]
    print(f"📏 Vector dimension: {dimension}")

    # Use Inner Product index for cosine similarity (recommended for SPECTER)
    index = faiss.IndexFlatIP(dimension)

    # Normalize embeddings (in place) for proper cosine similarity
    print("🔧 Normalizing embeddings for cosine similarity...")
    faiss.normalize_L2(embeddings)

    # Add embeddings to index
    print("📥 Adding embeddings to FAISS index...")
    index.add(embeddings)

    print(f"✅ FAISS index built successfully!")
    print(f"📊 Index contains {index.ntotal:,} vectors")
    print(f"🎯 Index type: {type(index).__name__} (Inner Product for cosine similarity)")

    # Test the index with a sample query
    print("\n🧪 Testing index with sample query...")
    test_query = "machine learning artificial intelligence"
    test_embedding = model.encode([test_query], convert_to_numpy=True).astype('float32')
    faiss.normalize_L2(test_embedding)

    similarities, indices = index.search(test_embedding, 3)
    print(f"Sample query: '{test_query}'")
    print(f"Top 3 results: indices {indices[0]}, similarities {similarities[0]}")

    # 🔍 Validate Created Indexes
    print("🔍 Validating created indexes...")
    print("=" * 40)

    total_indexes_created = 0
    total_size_mb = 0

    for index_type, config in INDEXES.items():
        faiss_file = config['faiss_file']
        ids_file = config['ids_file']

        if os.path.exists(faiss_file) and os.path.exists(ids_file):
            # Load and test index
            test_index = faiss.read_index(faiss_file)
            with open(ids_file, 'r', encoding='utf-8') as f:
                test_ids = json.load(f)

            faiss_size = os.path.getsize(faiss_file) / (1024*1024)
            ids_size = os.path.getsize(ids_file) / (1024*1024)
            total_size_mb += faiss_size + ids_size

            print(f"✅ {index_type}: {test_index.ntotal:,} vectors, {len(test_ids):,} IDs ({faiss_size:.1f} MB)")
            total_indexes_created += 1

            # Result positions are mapped to article IDs by list position, so
            # the two counts have to agree.
            if test_index.ntotal != len(test_ids):
                print(f"   ⚠️ Vector count and ID count differ for {index_type} index")

            # Quick test search, reusing the query embedding computed above.
            if test_index.ntotal > 0:
                test_scores, _ = test_index.search(test_embedding, 3)
                # Inner product over normalized vectors is already the cosine
                # similarity; it must not be subtracted from 1.
                print(f"   Test search successful: top similarity = {test_scores[0][0]:.3f}")
        else:
            print(f"❌ {index_type}: Files missing")

    print(f"\n📊 Summary: {total_indexes_created}/{len(INDEXES)} indexes created successfully")
    print(f"💾 Total size: {total_size_mb:.1f} MB")
    print(f"🎯 Ready for deployment to Hugging Face Spaces!")
else:
    print("❌ No embeddings available to build index.")

## 8. Save Index and Metadata

Save the FAISS index and article ID mapping for use in the chatbot.

In [None]:
if 'index' not in locals() or 'article_ids' not in locals():
    print("❌ Cannot save files - index or article_ids not available.")
else:
    print("💾 Saving FAISS index and metadata...")

    # Persist the vector index
    faiss.write_index(index, FAISS_INDEX_FILE)
    print(f"✅ FAISS index saved: {FAISS_INDEX_FILE}")

    # Persist the ID list; list position i corresponds to vector i in the index
    ids_json = json.dumps(article_ids, ensure_ascii=False, indent=2)
    with open(ARTICLE_IDS_MAP_FILE, "w", encoding="utf-8") as f:
        f.write(ids_json)
    print(f"✅ Article IDs saved: {ARTICLE_IDS_MAP_FILE}")

    # On-disk sizes in MB
    index_size = os.path.getsize(FAISS_INDEX_FILE) / 1024**2
    ids_size = os.path.getsize(ARTICLE_IDS_MAP_FILE) / 1024**2

    print(
        f"\n📁 File sizes:",
        f"   - FAISS index: {index_size:.1f} MB",
        f"   - Article IDs: {ids_size:.1f} MB",
        f"   - Total: {index_size + ids_size:.1f} MB",
        sep="\n",
    )

    # Closing summary of what was indexed
    print(f"\n🎉 Semantic indexing completed successfully!")
    print(f"📊 Statistics:")
    print(f"   - Model used: {model_name}")
    print(f"   - Articles indexed: {len(article_ids):,}")
    print(f"   - Vector dimension: {dimension}")
    print(f"   - Index type: Inner Product (cosine similarity)")

# 🔄 Create Compatibility with Existing Single Index System
import shutil

print("🔄 Creating compatibility with existing system...")

# The main chatbot (app_intelligent.py) currently uses these files:
# NOTE(review): these relative names are the same file names the save cell
# writes under OUTPUT_DIR. When the working directory is OUTPUT_DIR, the copy
# below replaces those files — confirm that this is intended.
MAIN_FAISS_FILE = "scopus_combined_metadata_index.faiss"
MAIN_IDS_FILE = "scopus_article_ids_for_index.json"

# Use the 'metadata' index as the main index (best balance of content and metadata)
metadata_config = INDEXES['metadata']
if os.path.exists(metadata_config['faiss_file']) and os.path.exists(metadata_config['ids_file']):

    # Copy metadata index as main index for compatibility
    shutil.copy2(metadata_config['faiss_file'], MAIN_FAISS_FILE)
    shutil.copy2(metadata_config['ids_file'], MAIN_IDS_FILE)

    # Verify the copy: both files must load, and every vector needs exactly one ID
    main_index = faiss.read_index(MAIN_FAISS_FILE)
    with open(MAIN_IDS_FILE, 'r', encoding='utf-8') as f:
        main_ids = json.load(f)

    main_size = os.path.getsize(MAIN_FAISS_FILE) / (1024*1024)

    print(f"✅ Main index created: {MAIN_FAISS_FILE}")
    print(f"📊 {main_index.ntotal:,} vectors, {len(main_ids):,} article IDs ({main_size:.1f} MB)")
    if main_index.ntotal == len(main_ids):
        print(f"🎯 Compatible with existing app_intelligent.py chatbot")
    else:
        print("⚠️ Vector count and article ID count differ - the main index is not usable as-is")

else:
    print("❌ Metadata index not found - cannot create main compatibility index")

print("\n🎉 Enhanced semantic indexing system ready!")
print("📁 Created files can be uploaded to Hugging Face Spaces for deployment")

## 9. Verification and Testing

Verify that the saved files can be loaded correctly.

In [None]:
# Verify saved files: both must exist AND load before success is reported.
if os.path.exists(FAISS_INDEX_FILE) and os.path.exists(ARTICLE_IDS_MAP_FILE):
    print("🔍 Verifying saved files...")

    index_ok = False
    ids_ok = False

    # Load and test FAISS index
    try:
        test_index = faiss.read_index(FAISS_INDEX_FILE)
    except Exception as e:
        print(f"❌ Error loading FAISS index: {e}")
    else:
        index_ok = True
        print(f"✅ FAISS index loaded successfully: {test_index.ntotal:,} vectors")

    # Load and test article IDs
    try:
        with open(ARTICLE_IDS_MAP_FILE, "r", encoding="utf-8") as f:
            test_article_ids = json.load(f)
    except Exception as e:
        print(f"❌ Error loading article IDs: {e}")
    else:
        ids_ok = True
        print(f"✅ Article IDs loaded successfully: {len(test_article_ids):,} IDs")

        # Show sample IDs
        print(f"📄 Sample article IDs: {test_article_ids[:5]}")

    if index_ok and ids_ok:
        print("\n✅ All files verified successfully!")
        print("\n📥 Next steps:")
        print("1. Download both files from Kaggle output:")
        print(f"   - {os.path.basename(FAISS_INDEX_FILE)}")
        print(f"   - {os.path.basename(ARTICLE_IDS_MAP_FILE)}")
        print("2. Place them in your local Scopus ChatBot directory")
        print("3. Run your chatbot application!")
    else:
        # Do not claim success when either file could not be loaded.
        print("\n❌ Verification failed. See the errors above.")
else:
    print("❌ Output files not found. Check the previous steps for errors.")

# 🧪 Advanced Search Testing
print("🧪 Testing different search strategies...")

def test_search_strategy(query, index_type, top_k=5):
    """Run ``query`` against one saved index and print the top matches.

    Args:
        query: Free-text search query.
        index_type: Key into ``INDEXES`` selecting which saved index to load.
        top_k: Maximum number of results to request.

    Returns:
        None. Results are printed. If either file of the index is missing, a
        message is printed and the function returns early.

    Raises:
        KeyError: If ``index_type`` is not a key of ``INDEXES``.
    """
    config = INDEXES[index_type]

    # Both files are opened below, so both have to exist.
    if not (os.path.exists(config['faiss_file']) and os.path.exists(config['ids_file'])):
        print(f"❌ {index_type} index not found")
        return

    # Load index
    index = faiss.read_index(config['faiss_file'])
    with open(config['ids_file'], 'r', encoding='utf-8') as f:
        article_ids = json.load(f)

    # Encode query
    query_embedding = model.encode([query])
    query_embedding = query_embedding.astype('float32')
    faiss.normalize_L2(query_embedding)

    # Search
    scores, indices = index.search(query_embedding, top_k)

    print(f"\n🔍 Query: '{query}' on {index_type} index")
    print(f"📊 {config['description']}")

    for i, (score, idx) in enumerate(zip(scores[0], indices[0])):
        # FAISS pads missing results with label -1; a negative label would
        # otherwise silently pick an entry from the end of the ID list.
        if idx < 0 or idx >= len(article_ids):
            continue
        # The saved indexes are inner-product indexes over normalized vectors,
        # so the returned score already is the cosine similarity.
        # str() keeps the slice valid when IDs were stored as numbers.
        print(f"  {i+1}. ID: {str(article_ids[idx])[:20]}... | Similarity: {score:.3f}")

# Test different types of queries
test_queries = [
    ("machine learning artificial intelligence", "content"),
    ("machine learning neural networks", "metadata"), 
    ("research from China university", "institution"),
    ("COVID-19 treatment drug discovery", "full")
]

for query, best_index in test_queries:
    test_search_strategy(query, best_index, top_k=3)

## 10. Performance Analysis

Analyze the performance and quality of the generated embeddings.

In [None]:
if 'embeddings' in locals() and 'model' in locals():
    print("📊 Performance Analysis")
    print("=" * 50)

    # Test with sample scientific queries
    test_queries = [
        "machine learning applications in healthcare",
        "deep neural networks for image recognition",
        "COVID-19 vaccine development",
        "artificial intelligence natural language processing",
        "climate change environmental impact"
    ]

    print(f"🧪 Testing with {len(test_queries)} sample queries...\n")

    for i, query in enumerate(test_queries, 1):
        # Generate query embedding
        query_emb = model.encode([query], convert_to_numpy=True).astype('float32')
        faiss.normalize_L2(query_emb)

        # Search top 3 results
        similarities, indices = index.search(query_emb, 3)

        # FAISS pads missing results with label -1; keep only real hits so that
        # a short result list neither crashes nor maps to the wrong article.
        hits = [
            (score, idx)
            for score, idx in zip(similarities[0], indices[0])
            if 0 <= idx < len(article_ids)
        ]

        print(f"{i}. Query: '{query}'")
        print(f"   Top similarities: {', '.join(f'{score:.3f}' for score, _ in hits)}")

        # Show article IDs of top results
        top_article_ids = [article_ids[idx] for _, idx in hits]
        print(f"   Top article IDs: {top_article_ids}")
        print()

    # Embedding statistics
    # NOTE(review): if `embeddings` was already L2-normalized in place by the
    # index-building cell, the norms reported here are all approximately 1.
    embedding_norms = np.linalg.norm(embeddings, axis=1)
    print("📈 Embedding Statistics:")
    print(f"   - Total vectors: {embeddings.shape[0]:,}")
    print(f"   - Dimensions: {embeddings.shape[1]}")
    print(f"   - Mean norm: {np.mean(embedding_norms):.3f}")
    print(f"   - Std norm: {np.std(embedding_norms):.3f}")

    print("\n🎯 The semantic index is ready for production use!")

    # 📋 Final Summary and Deployment Guide

    print("📋 ENHANCED SEMANTIC INDEXING COMPLETE")
    print("=" * 50)

    # List all created files
    created_files = []
    total_size = 0
    indexes_created = 0  # specialized indexes only, not the compatibility copy

    print("📁 Created Index Files:")
    for index_type, config in INDEXES.items():
        if os.path.exists(config['faiss_file']) and os.path.exists(config['ids_file']):
            faiss_size = os.path.getsize(config['faiss_file']) / (1024*1024)
            ids_size = os.path.getsize(config['ids_file']) / (1024*1024)
            total_size += faiss_size + ids_size
            indexes_created += 1

            print(f"  ✅ {index_type}:")
            print(f"     • {config['faiss_file']} ({faiss_size:.1f} MB)")
            print(f"     • {config['ids_file']} ({ids_size:.2f} MB)")

            created_files.extend([config['faiss_file'], config['ids_file']])

    # Main compatibility files: each file is listed (and counted towards the
    # total size) only if it actually exists.
    if os.path.exists(MAIN_FAISS_FILE):
        main_size = os.path.getsize(MAIN_FAISS_FILE) / (1024*1024)
        total_size += main_size
        print(f"  ✅ Main Compatibility:")
        print(f"     • {MAIN_FAISS_FILE} ({main_size:.1f} MB)")
        created_files.append(MAIN_FAISS_FILE)
        if os.path.exists(MAIN_IDS_FILE):
            main_ids_size = os.path.getsize(MAIN_IDS_FILE) / (1024*1024)
            total_size += main_ids_size
            print(f"     • {MAIN_IDS_FILE} ({main_ids_size:.2f} MB)")
            created_files.append(MAIN_IDS_FILE)
        else:
            print(f"     ⚠️ {MAIN_IDS_FILE} is missing")

    print(f"\n💾 Total Files: {len(created_files)} | Total Size: {total_size:.1f} MB")

    print(f"\n🚀 DEPLOYMENT READY!")
    print("📤 Upload these files to Hugging Face Spaces along with:")
    print("   • app_intelligent.py (main chatbot)")
    print("   • scopus_database.db (database)")
    print("   • requirements.txt (dependencies)")

    print(f"\n🎯 Your chatbot supports:")
    print("   ✅ Natural language queries: 'machine learning papers from 2023'")
    print("   ✅ Author searches: 'research by Smith'")
    print("   ✅ Geographic searches: 'articles from China'")
    print("   ✅ Institution searches: 'Harvard research on AI'")
    print("   ✅ Semantic searches: 'deep learning neural networks'")

    print(f"\n📊 Dataset: 4,000+ scientific articles from Scopus (2018-2025)")
    print(f"🧠 Technology: FAISS + {model_name} embeddings + Intelligent query parsing")
    print(f"🎉 Ready for production deployment!")

    # Save creation log
    with open('indexing_log.txt', 'w', encoding='utf-8') as f:
        f.write(f"Enhanced Semantic Indexing Completed\n")
        f.write(f"Model: {model_name}\n")
        f.write(f"Articles processed: {len(articles_data):,}\n")
        f.write(f"Indexes created: {indexes_created}\n")
        f.write(f"Total size: {total_size:.1f} MB\n")
        f.write(f"Files: {', '.join(created_files)}\n")

    print("📝 Indexing log saved to 'indexing_log.txt'")
else:
    print("❌ Cannot perform analysis - embeddings not available.")