# ResearchMate - Advanced AI Research Assistant

**Powered by Groq Llama 3.3 70B**

A comprehensive AI research assistant that combines:
- Multi-source paper collection (arXiv, Semantic Scholar, CrossRef, PubMed)
- Advanced PDF processing and citation analysis
- Real-time trend monitoring and gap identification
- Research project management and literature review generation
- RAG-powered question answering with LangChain

## Quick Start
1. Run all cells in order
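2. Set your Groq API key before running the configuration cell, e.g. `os.environ['GROQ_API_KEY'] = 'your_api_key_here'` (the notebook reads the key from the environment; it does not prompt for it)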
3. Use `research_mate.demo()` to see everything in action

In [5]:
# ============================================================================
# PACKAGE INSTALLATION
# ============================================================================

import subprocess
import sys
import os

def install_package(package_name, import_name=None):
    """Ensure a package is importable, installing it with pip when missing.

    Args:
        package_name: Distribution name handed to ``pip install``.
        import_name: Module name used to import the package. Defaults to
            ``package_name`` when omitted.

    Returns:
        True if the module can be imported (either already, or after a
        successful installation); False if pip failed or the module still
        cannot be imported afterwards.
    """
    import importlib  # local import keeps this helper self-contained

    if import_name is None:
        import_name = package_name

    try:
        importlib.import_module(import_name)
    except ImportError:
        print(f"📦 Installing {package_name}...")
    else:
        print(f"✅ {package_name} already installed")
        return True

    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to install {package_name}: {e}")
        return False

    # A zero exit code from pip does not prove the module is usable (wrong
    # import name, broken native dependency, ...), so verify it explicitly.
    # The finder caches must be refreshed to see freshly installed files.
    importlib.invalidate_caches()
    try:
        importlib.import_module(import_name)
    except Exception as e:  # any import-time failure means "not usable"
        print(f"❌ {package_name} was installed but cannot be imported as '{import_name}': {e}")
        return False

    print(f"✅ {package_name} installed successfully")
    return True

# Dependencies of ResearchMate as (pip distribution name, importable module name)
packages = [
    ("groq", "groq"),
    ("langchain", "langchain"),
    ("langchain-community", "langchain_community"),
    ("chromadb", "chromadb"),
    ("sentence-transformers", "sentence_transformers"),
    ("arxiv", "arxiv"),
    ("requests", "requests"),
    ("PyPDF2", "PyPDF2"),
    ("pdfplumber", "pdfplumber"),
    ("PyMuPDF", "fitz"),
    ("networkx", "networkx"),
    ("matplotlib", "matplotlib"),
    ("pandas", "pandas"),
    ("numpy", "numpy"),
    ("python-dotenv", "dotenv"),
    ("beautifulsoup4", "bs4"),
    ("plotly", "plotly"),
    ("wordcloud", "wordcloud")
]

print("🚀 Installing ResearchMate dependencies...")
print("=" * 50)

# Try every package in order and remember the ones that could not be set up.
failed_packages = []
for package_name, import_name in packages:
    installed = install_package(package_name, import_name)
    if installed:
        continue
    failed_packages.append(package_name)

print("\n" + "=" * 50)
summary = (
    f"❌ Failed to install: {', '.join(failed_packages)}"
    if failed_packages
    else "✅ All packages installed successfully!"
)
print(summary)

# Remind the user how to provide the Groq API key.
for hint in (
    "\n🔑 Set your Groq API key:",
    "   os.environ['GROQ_API_KEY'] = 'your_api_key_here'",
    "   Get your key from: https://console.groq.com/keys",
):
    print(hint)

🚀 Installing ResearchMate dependencies...
📦 Installing groq...
✅ groq installed successfully
✅ langchain already installed
📦 Installing langchain-community...
✅ langchain-community installed successfully
📦 Installing chromadb...
✅ chromadb installed successfully
✅ sentence-transformers already installed
📦 Installing arxiv...
✅ arxiv installed successfully
✅ requests already installed
📦 Installing PyPDF2...
✅ PyPDF2 installed successfully
📦 Installing pdfplumber...
✅ pdfplumber installed successfully
📦 Installing PyMuPDF...
✅ PyMuPDF installed successfully
✅ networkx already installed
✅ matplotlib already installed
✅ pandas already installed
✅ numpy already installed
✅ python-dotenv already installed
✅ beautifulsoup4 already installed
✅ plotly already installed
✅ wordcloud already installed

✅ All packages installed successfully!

🔑 Set your Groq API key:
   os.environ['GROQ_API_KEY'] = 'your_api_key_here'
   Get your key from: https://console.groq.com/keys


In [None]:
# ============================================================================
# IMPORTS AND CONFIGURATION
# ============================================================================

import os
import re
import json
import time
import warnings
from typing import List, Dict, Optional, Tuple, Any
from datetime import datetime, timedelta
from collections import defaultdict

# Data processing
import pandas as pd
import numpy as np
import requests

# PDF processing
import PyPDF2
try:
    import pdfplumber
    import fitz  # PyMuPDF
    PDF_ENHANCED = True
except ImportError:
    PDF_ENHANCED = False

# AI and ML
from groq import Groq
from sentence_transformers import SentenceTransformer
import torch

# LangChain
from langchain.llms.base import LLM
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pydantic import Field

# Data sources
import arxiv

# Analysis and visualization
import networkx as nx
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Web scraping
try:
    from bs4 import BeautifulSoup
    WEB_SCRAPING_AVAILABLE = True
except ImportError:
    WEB_SCRAPING_AVAILABLE = False

warnings.filterwarnings('ignore')

class Config:
    """Central settings for ResearchMate.

    All values are class attributes. Creating an instance makes sure the
    database directories exist and prints whether a Groq API key was found.
    """

    # --- Groq Llama 3.3 70B ---
    LLAMA_MODEL = "llama-3.3-70b-versatile"
    # Read from the environment once, when this class body is executed.
    GROQ_API_KEY = os.getenv('GROQ_API_KEY')
    MAX_INPUT_TOKENS = 128000
    MAX_OUTPUT_TOKENS = 8000
    TEMPERATURE = 0.7
    TOP_P = 0.9

    # --- Embeddings and text chunking ---
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    CHUNK_SIZE = 2000
    CHUNK_OVERLAP = 400

    # --- Vector database ---
    CHROMA_DB_PATH = "./chroma_db"
    COLLECTION_NAME = "research_papers"
    PERSIST_DIRECTORY = "./chroma_persist"

    # --- Search and size limits ---
    TOP_K_SIMILAR = 5
    MAX_PAPER_LENGTH = 100000
    MAX_SUMMARY_LENGTH = 2000

    def __init__(self):
        """Create the storage directories and report the API-key status."""
        for directory in (self.CHROMA_DB_PATH, self.PERSIST_DIRECTORY):
            os.makedirs(directory, exist_ok=True)

        if self.GROQ_API_KEY:
            print("✅ Groq API key configured!")
            return

        print("⚠️  GROQ_API_KEY not found in environment variables!")
        print("💡 Set it with: os.environ['GROQ_API_KEY'] = 'your_key_here'")
        print("   Get your key from: https://console.groq.com/keys")

config = Config()

def setup_device():
    """Return the torch device to use: CUDA when available, otherwise CPU."""
    if not torch.cuda.is_available():
        print("✅ Using CPU")
        return torch.device("cpu")

    gpu_name = torch.cuda.get_device_name(0)
    print(f"✅ GPU Available: {gpu_name}")
    return torch.device("cuda")

device = setup_device()
print("✅ Configuration loaded successfully!")

✅ Groq API key configured!
✅ Using CPU
✅ Configuration loaded successfully!


In [7]:
# ============================================================================
# GROQ LLAMA 3.3 70B INTEGRATION
# ============================================================================

class GroqLlamaLLM(LLM):
    """LangChain-compatible wrapper for Groq Llama 3.3 70B.

    Sends each prompt to the Groq chat-completions endpoint as a single user
    message and returns the text of the first choice.
    """

    # Groq SDK client; created in __init__ from the supplied API key.
    groq_client: Any = Field(default=None)
    # Sampling / model settings forwarded to every completion request.
    model_name: str = Field(default="llama-3.3-70b-versatile")
    temperature: float = Field(default=0.7)
    max_tokens: int = Field(default=2000)
    top_p: float = Field(default=0.9)

    def __init__(self, api_key: str, **kwargs):
        """Create the Groq client and initialise the declared fields.

        Args:
            api_key: Groq API key used to build the client.
            **kwargs: Field overrides (model_name, temperature, ...).
        """
        groq_client = Groq(api_key=api_key)
        super().__init__(groq_client=groq_client, **kwargs)

    class Config:
        # Presumably needed so the Groq client object is accepted as a field
        # value by the pydantic-based base class.
        arbitrary_types_allowed = True

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM implementation."""
        return "groq_llama"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Run one completion for ``prompt``.

        Args:
            prompt: Text sent as the single user message.
            stop: Optional stop sequences forwarded to Groq.
            run_manager: Callback manager that LangChain may pass in;
                accepted for interface compatibility and not used here.
            **kwargs: Extra call-time options LangChain may forward;
                accepted so such calls do not raise TypeError, and ignored.

        Returns:
            The generated text, or a string starting with ``"Error: "`` when
            the request fails (errors are reported in-band, not raised).
        """
        try:
            response = self.groq_client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                top_p=self.top_p,
                stop=stop
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error: {str(e)}"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Parameters that identify this LLM configuration."""
        return {
            "model_name": self.model_name,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens,
            "top_p": self.top_p
        }

class GroqProcessor:
    """Groq Llama helper for research tasks.

    Wraps a Groq client (and a LangChain-compatible ``GroqLlamaLLM``) and
    offers paper summarisation, trend analysis and simple keyword extraction.
    LLM failures are reported in-band as strings starting with ``"Error:"``.
    """

    # Number of characters of paper content embedded in the summary prompt.
    _PROMPT_CONTENT_CHARS = 8000

    # Words ignored by the frequency-based keyword extraction.
    _STOP_WORDS = frozenset(
        {'the', 'and', 'for', 'are', 'with', 'this', 'that', 'from', 'they', 'have'}
    )

    # Keys of the structured summary that are filled from the LLM response.
    _SECTION_KEYS = ('summary', 'contributions', 'methodology', 'findings', 'limitations')

    # A section header is a known title at the START of a line, optionally
    # preceded by markdown decoration ("##", "**", "1.", "**1.").  Anchoring
    # at the start keeps ordinary sentences that merely contain "1." (as in
    # "91.5%") or "methodology" from being mistaken for headers.
    _HEADER_RE = re.compile(
        r'^(?P<deco>(?:#{1,6}\s*)?(?:\*\*\s*)?(?:\d+[.)]\s*)?(?:\*\*\s*)?)'
        r'(?:(?P<summary>(?:main\s+)?summary)'
        r'|(?P<contributions>(?:key\s+)?contributions?)'
        r'|(?P<methodology>methodology|methods?)'
        r'|(?P<findings>(?:key\s+)?findings?)'
        r'|(?P<limitations>limitations?))\b'
        r'(?P<tail>.*)$',
        re.IGNORECASE,
    )

    # Matches "<closing bold>: text" after a header title.  A "**" directly
    # after the colon is only consumed when it closes the header's bold
    # (i.e. is followed by whitespace), not when it opens bold content.
    _INLINE_RE = re.compile(r'^\s*(?:\*\*)?\s*:\s*(?:\*\*(?=\s|$))?\s*(?P<text>.*)$')

    def __init__(self):
        """Create the Groq client and LangChain LLM from the global config.

        Raises:
            ValueError: If ``config.GROQ_API_KEY`` is not set.
        """
        if not config.GROQ_API_KEY:
            raise ValueError("Groq API key not found! Please set GROQ_API_KEY environment variable.")

        self.groq_client = Groq(api_key=config.GROQ_API_KEY)
        self.llm = GroqLlamaLLM(
            api_key=config.GROQ_API_KEY,
            model_name=config.LLAMA_MODEL,
            temperature=config.TEMPERATURE,
            max_tokens=config.MAX_OUTPUT_TOKENS,
            top_p=config.TOP_P
        )
        print("✅ Groq Llama 3.3 70B initialized successfully!")

    def generate_response(self, prompt: str, max_tokens: int = 2000) -> str:
        """Send ``prompt`` to Groq and return the stripped completion text.

        Args:
            prompt: Text sent as the single user message.
            max_tokens: Upper bound on generated tokens for this request.

        Returns:
            The completion text, or ``"Error: <message>"`` if the call fails.
        """
        try:
            response = self.groq_client.chat.completions.create(
                model=config.LLAMA_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=config.TEMPERATURE,
                max_tokens=max_tokens,
                top_p=config.TOP_P
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            return f"Error: {str(e)}"

    def summarize_paper(self, title: str, abstract: str, content: str) -> Dict[str, str]:
        """Ask the LLM for a structured summary of a paper.

        Args:
            title: Paper title (echoed back in the result).
            abstract: Paper abstract (echoed back in the result).
            content: Paper text; only a leading excerpt is sent to the LLM.

        Returns:
            Dict with keys 'summary', 'contributions', 'methodology',
            'findings', 'limitations', 'title' and 'abstract'.  If building
            the summary raises, 'summary' carries the error message and the
            other sections are 'N/A'.
        """
        try:
            content = content or ''  # tolerate a missing body
            if len(content) > config.MAX_PAPER_LENGTH:
                content = content[:config.MAX_PAPER_LENGTH] + "..."

            prompt = f"""Analyze this research paper and provide a structured summary:

Title: {title}
Abstract: {abstract}
Content: {content[:self._PROMPT_CONTENT_CHARS]}

Provide a comprehensive summary with these sections:
1. **MAIN SUMMARY** (2-3 sentences)
2. **KEY CONTRIBUTIONS** (3-5 bullet points)
3. **METHODOLOGY** (brief description)
4. **KEY FINDINGS** (3-5 bullet points)
5. **LIMITATIONS** (if mentioned)

Format your response clearly with section headers."""

            response = self.generate_response(prompt, max_tokens=config.MAX_SUMMARY_LENGTH)
            return self._parse_summary_response(response, title, abstract)
        except Exception as e:
            return {
                'summary': f'Error generating summary: {str(e)}',
                'contributions': 'N/A',
                'methodology': 'N/A',
                'findings': 'N/A',
                'limitations': 'N/A',
                'title': title,
                'abstract': abstract
            }

    def _match_section_header(self, line: str) -> Optional[Tuple[str, str]]:
        """Classify a stripped line as a section header.

        Returns:
            ``(section_key, inline_text)`` when the line is a header, where
            ``inline_text`` is any content after "Title:" on the same line
            (possibly empty); ``None`` when the line is ordinary content.
        """
        match = self._HEADER_RE.match(line)
        if match is None:
            return None

        section = next(key for key in self._SECTION_KEYS if match.group(key))
        tail = match.group('tail')

        inline = self._INLINE_RE.match(tail)
        if inline is not None:
            # "Title: text" - a header carrying content on the same line.
            return section, inline.group('text').strip()

        # Without a colon, require markdown decoration or a bare title so a
        # sentence like "Findings suggest that ..." stays content.
        if match.group('deco') or not tail.strip(' *'):
            return section, ''
        return None

    def _parse_summary_response(self, response: str, title: str, abstract: str) -> Dict[str, str]:
        """Split the LLM response into the structured summary sections.

        Text before the first recognised header goes to 'summary'.  Lines of
        one section are joined with single spaces.

        Args:
            response: Raw text from ``generate_response``.
            title: Paper title copied into the result.
            abstract: Paper abstract copied into the result.

        Returns:
            Dict with the five section keys plus 'title' and 'abstract'.
            All sections are empty strings when ``response`` is empty or is
            an in-band error (starts with ``"Error:"``).
        """
        sections = {
            'summary': '',
            'contributions': '',
            'methodology': '',
            'findings': '',
            'limitations': '',
            'title': title,
            'abstract': abstract
        }

        # generate_response reports failures as "Error: ..." at the very
        # start; a summary that merely mentions "Error:" is still valid.
        if not response or response.startswith("Error:"):
            return sections

        collected = {key: [] for key in self._SECTION_KEYS}
        current_section = 'summary'

        for raw_line in response.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            header = self._match_section_header(line)
            if header is not None:
                current_section, inline_text = header
                if inline_text:
                    collected[current_section].append(inline_text)
                continue

            if line.startswith('#'):
                # Markdown heading that is not one of the known sections.
                continue

            collected[current_section].append(line)

        for key, parts in collected.items():
            sections[key] = ' '.join(parts)

        return sections

    def analyze_trends(self, texts: List[str]) -> Dict:
        """Ask the LLM to describe research trends across several texts.

        Only the first 10 texts are combined, and only the first 5000
        characters of the combination are sent to the LLM.

        Args:
            texts: Texts (e.g. abstracts) to analyse.

        Returns:
            Dict with 'trend_analysis' (LLM output or error text),
            'texts_analyzed' (``len(texts)``), 'analysis_date' (ISO string)
            and 'keywords' (most frequent words of the combined text).
        """
        try:
            combined_text = ' '.join(texts[:10])  # Limit to avoid token limits

            prompt = f"""Analyze research trends in this collection of texts:

{combined_text[:5000]}

Identify:
1. Key research themes and topics
2. Emerging trends and directions
3. Frequently mentioned technologies/methods
4. Research gaps or opportunities

Provide analysis as structured points."""

            response = self.generate_response(prompt, max_tokens=1500)

            return {
                'trend_analysis': response,
                'texts_analyzed': len(texts),
                'analysis_date': datetime.now().isoformat(),
                'keywords': self._extract_keywords(combined_text)
            }
        except Exception as e:
            return {
                'trend_analysis': f'Error: {str(e)}',
                'texts_analyzed': 0,
                'analysis_date': datetime.now().isoformat(),
                'keywords': []
            }

    def _extract_keywords(self, text: str) -> List[str]:
        """Return up to 20 most frequent words of ``text``.

        Words are lower-cased ASCII-letter runs longer than three characters
        that are not in the stop-word list.  Ties keep first-seen order.
        """
        from collections import Counter  # local import: only needed here

        if not text:
            return []

        words = re.findall(r'\b[a-zA-Z]+\b', text.lower())
        counts = Counter(
            word for word in words
            if len(word) > 3 and word not in self._STOP_WORDS
        )
        return [word for word, _ in counts.most_common(20)]

# Initialize Groq processor
try:
    groq_processor = GroqProcessor()
    print("✅ Groq Llama 3.3 70B ready for research tasks!")
except Exception as e:
    # Bind the name even when initialisation fails (same convention as
    # rag_system), so later cells can test for None instead of hitting a
    # NameError on an undefined global.
    groq_processor = None
    print(f"❌ Failed to initialize Groq processor: {e}")
    print("💡 Make sure you have set the GROQ_API_KEY environment variable")

✅ Groq Llama 3.3 70B initialized successfully!
✅ Groq Llama 3.3 70B ready for research tasks!


In [8]:
# ============================================================================
# LANGCHAIN RAG SYSTEM
# ============================================================================

class RAGSystem:
    """Retrieval-Augmented Generation system using LangChain + Groq Llama.

    Documents are split into chunks, embedded with the configured
    HuggingFace model and stored in a Chroma vectorstore; questions are
    answered by a ``RetrievalQA`` chain backed by ``groq_processor.llm``.
    Failures are reported via printed messages / error dicts, not raised.
    """

    def __init__(self):
        """Build embeddings, splitter, memory, vectorstore and QA chain."""
        self.embeddings = HuggingFaceEmbeddings(model_name=config.EMBEDDING_MODEL)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP
        )
        self.vectorstore = None
        self.qa_chain = None
        self.retriever = None
        self.memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True
        )

        # Initialize RAG system
        self._setup_vectorstore()
        self._setup_qa_chain()

        print("✅ RAG System initialized successfully!")

    def _setup_vectorstore(self):
        """Create the Chroma vectorstore, falling back to an in-memory one."""
        try:
            self.vectorstore = Chroma(
                persist_directory=config.PERSIST_DIRECTORY,
                embedding_function=self.embeddings,
                collection_name=config.COLLECTION_NAME
            )
            print("✅ ChromaDB vectorstore ready!")
        except Exception as e:
            print(f"❌ Failed to setup vectorstore: {e}")
            # Fallback to in-memory vectorstore
            self.vectorstore = Chroma(
                embedding_function=self.embeddings,
                collection_name=config.COLLECTION_NAME
            )
            print("✅ Fallback to in-memory vectorstore")

    def _setup_qa_chain(self):
        """(Re)build the retriever and QA chain on the current vectorstore.

        On failure the QA chain is reset to ``None`` so ``query`` reports
        "not ready" instead of using a chain tied to an outdated store.
        """
        try:
            self.retriever = self.vectorstore.as_retriever(
                search_kwargs={"k": config.TOP_K_SIMILAR}
            )

            # Create QA chain with Groq Llama
            self.qa_chain = RetrievalQA.from_chain_type(
                llm=groq_processor.llm,
                chain_type="stuff",
                retriever=self.retriever,
                return_source_documents=True
            )

            print("✅ QA chain ready!")
        except Exception as e:
            self.qa_chain = None
            print(f"❌ Failed to setup QA chain: {e}")

    def add_documents(self, documents: List[Dict]):
        """Chunk the given documents and add them to the vectorstore.

        Args:
            documents: Dicts that may carry 'content' (or 'abstract' as a
                fallback), 'title', 'authors', 'source', 'url' and 'year'.
                Documents with neither content nor abstract are skipped.
        """
        try:
            doc_objects = []
            for doc in documents:
                content = doc.get('content', '') or doc.get('abstract', '')
                if not content:
                    continue

                # Metadata is identical for every chunk of a document, so
                # build it once.  All values are stored as strings.
                authors = doc.get('authors', 'Unknown')
                if isinstance(authors, list):
                    authors = ', '.join(str(author) for author in authors)

                year = doc.get('year', 'Unknown')
                year = 'Unknown' if year is None else str(year)

                metadata = {
                    'title': str(doc.get('title', 'Unknown')),
                    'authors': str(authors),
                    'source': str(doc.get('source', 'Unknown')),
                    'url': str(doc.get('url', '')),
                    'year': year
                }

                for chunk in self.text_splitter.split_text(content):
                    # Each chunk gets its own copy of the metadata dict.
                    doc_objects.append(Document(
                        page_content=chunk,
                        metadata=dict(metadata)
                    ))

            if doc_objects:
                self.vectorstore.add_documents(doc_objects)
                print(f"✅ Added {len(doc_objects)} document chunks to vectorstore")
            else:
                print("⚠️  No valid documents to add")

        except Exception as e:
            print(f"❌ Failed to add documents: {e}")

    def query(self, question: str, include_sources: bool = True) -> Dict:
        """Answer ``question`` with the QA chain.

        Args:
            question: Natural-language question.
            include_sources: Whether to list the retrieved source chunks.

        Returns:
            Dict with 'answer' and 'sources'; on success also 'query' and
            'timestamp'.  When something goes wrong an 'error' key is set
            and 'answer' carries a human-readable message.
        """
        try:
            if not self.qa_chain:
                return {
                    'answer': 'RAG system not properly initialized',
                    'sources': [],
                    'error': 'System not ready'
                }

            # Get response from QA chain
            response = self.qa_chain({"query": question})

            result = {
                'answer': response['result'],
                'sources': [],
                'query': question,
                'timestamp': datetime.now().isoformat()
            }

            # Add source documents if requested
            if include_sources and 'source_documents' in response:
                for doc in response['source_documents']:
                    result['sources'].append({
                        'title': doc.metadata.get('title', 'Unknown'),
                        'authors': doc.metadata.get('authors', 'Unknown'),
                        'source': doc.metadata.get('source', 'Unknown'),
                        'url': doc.metadata.get('url', ''),
                        # Snippets are capped at 200 characters.
                        'content_snippet': doc.page_content[:200] + '...' if len(doc.page_content) > 200 else doc.page_content
                    })

            return result

        except Exception as e:
            return {
                'answer': f'Error processing query: {str(e)}',
                'sources': [],
                'query': question,
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

    def get_stats(self) -> Dict:
        """Return chunk count and configuration of the vectorstore.

        Returns:
            Dict with 'total_documents' and 'status'; when the count could
            be read it also carries the embedding/chunking settings.
        """
        try:
            collection = self.vectorstore._collection
            doc_count = collection.count()
            return {
                'total_documents': doc_count,
                'embedding_model': config.EMBEDDING_MODEL,
                'chunk_size': config.CHUNK_SIZE,
                'chunk_overlap': config.CHUNK_OVERLAP,
                'vectorstore_type': 'ChromaDB',
                'status': 'Ready'
            }
        except Exception as e:
            return {
                'total_documents': 0,
                'status': f'Error: {str(e)}'
            }

    def clear_database(self):
        """Delete the collection and rebuild vectorstore, retriever and chain."""
        try:
            self.vectorstore.delete_collection()
            self._setup_vectorstore()
            # The retriever and QA chain were built on the deleted store;
            # rebuild them so later queries use the fresh one.
            self._setup_qa_chain()
            print("✅ Database cleared successfully!")
        except Exception as e:
            print(f"❌ Failed to clear database: {e}")

    def persist(self):
        """Persist the vectorstore to disk."""
        try:
            self.vectorstore.persist()
            print("✅ Vectorstore persisted to disk!")
        except Exception as e:
            print(f"❌ Failed to persist vectorstore: {e}")

# Build the module-level RAG system; on any failure rag_system is None
try:
    rag_system = RAGSystem()
except Exception as e:
    print(f"❌ Failed to initialize RAG system: {e}")
    rag_system = None
else:
    print("✅ RAG System ready for research queries!")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ ChromaDB vectorstore ready!
✅ QA chain ready!
✅ RAG System initialized successfully!
✅ RAG System ready for research queries!


In [9]:
# ============================================================================
# ARXIV INTEGRATION
# ============================================================================

class ArxivFetcher:
    """arXiv paper fetcher.

    Network and parsing failures are reported with a printed message and an
    empty result (``[]`` or ``None``) instead of an exception.
    """

    def __init__(self):
        """Create the arXiv API client (page size 100, 1s delay, 3 retries)."""
        self.client = arxiv.Client(
            page_size=100,
            delay_seconds=1.0,
            num_retries=3
        )
        print("✅ arXiv fetcher initialized!")

    @staticmethod
    def _paper_to_dict(paper) -> Dict:
        """Convert an arXiv result object into the plain dict used here.

        The abstract doubles as 'content' because the full text is not
        fetched at this point.
        """
        published = paper.published
        updated = paper.updated
        return {
            'title': paper.title,
            'authors': [author.name for author in paper.authors],
            'abstract': paper.summary,
            'url': paper.entry_id,
            'pdf_url': paper.pdf_url,
            'published': published.isoformat() if published else None,
            'updated': updated.isoformat() if updated else None,
            'categories': paper.categories,
            'primary_category': paper.primary_category,
            'source': 'arXiv',
            'year': published.year if published else None,
            'content': paper.summary  # Use abstract as content for now
        }

    @staticmethod
    def _resolve_sort_criterion(sort_by: str):
        """Map a user string onto ``arxiv.SortCriterion``.

        Matching ignores case and non-letter characters, so "relevance",
        "submittedDate" and "submitted_date" all resolve.  Unknown values
        fall back to ``Relevance``.
        """
        wanted = ''.join(ch for ch in str(sort_by).lower() if ch.isalpha())
        for criterion in arxiv.SortCriterion:
            if criterion.name.lower() == wanted:
                return criterion
        return arxiv.SortCriterion.Relevance

    @staticmethod
    def _extract_arxiv_id(paper_url: str) -> str:
        """Extract the arXiv identifier from a URL or bare ID.

        Handles ".../abs/<id>" and ".../pdf/<id>[.pdf]" URLs, keeps the
        category part of old-style IDs such as "hep-th/9901001", and returns
        bare IDs unchanged.
        """
        identifier = paper_url.strip().split('?', 1)[0].split('#', 1)[0].rstrip('/')

        for marker in ('/abs/', '/pdf/'):
            _, found, tail = identifier.partition(marker)
            if found:
                identifier = tail
                break
        else:
            if '://' in identifier:
                # Unknown URL shape: best effort, use the last path segment.
                identifier = identifier.rsplit('/', 1)[-1]
            elif identifier.startswith(('abs/', 'pdf/')):
                identifier = identifier[4:]

        if identifier.endswith('.pdf'):
            identifier = identifier[:-4]
        return identifier

    def search_papers(self, query: str, max_results: int = 10, sort_by: str = "relevance") -> List[Dict]:
        """Search for papers on arXiv.

        Args:
            query: arXiv search query.
            max_results: Maximum number of papers to return.
            sort_by: Sort criterion name ("relevance", "submittedDate",
                "lastUpdatedDate"; case/underscores ignored).  Unknown
                values sort by relevance.

        Returns:
            List of paper dicts; empty if the search fails.
        """
        try:
            # Build search query
            search = arxiv.Search(
                query=query,
                max_results=max_results,
                sort_by=self._resolve_sort_criterion(sort_by)
            )

            papers = []
            for paper in self.client.results(search):
                try:
                    papers.append(self._paper_to_dict(paper))
                except Exception as e:
                    # One malformed entry must not abort the whole search.
                    print(f"⚠️  Error processing paper {getattr(paper, 'title', 'Unknown')}: {e}")
                    continue

            print(f"✅ Found {len(papers)} papers on arXiv")
            return papers

        except Exception as e:
            print(f"❌ Error searching arXiv: {e}")
            return []

    def get_paper_details(self, arxiv_id: str) -> Optional[Dict]:
        """Get detailed information about a specific paper.

        Args:
            arxiv_id: arXiv identifier of the paper.

        Returns:
            Paper dict extended with 'doi', 'journal_ref' and 'comment', or
            ``None`` if the paper is not found or the request fails.
        """
        try:
            search = arxiv.Search(id_list=[arxiv_id])
            paper = next(self.client.results(search), None)
            if paper is None:
                print(f"❌ Error fetching arXiv paper {arxiv_id}: no paper found")
                return None

            details = self._paper_to_dict(paper)
            details.update({
                'doi': paper.doi,
                'journal_ref': paper.journal_ref,
                'comment': paper.comment
            })
            return details

        except Exception as e:
            print(f"❌ Error fetching arXiv paper {arxiv_id}: {e}")
            return None

    def download_pdf(self, paper_url: str, filename: Optional[str] = None) -> Optional[str]:
        """Download a paper's PDF into the current directory.

        Args:
            paper_url: arXiv abs/pdf URL or bare arXiv ID.
            filename: Target file name; defaults to ``arxiv_<unix time>.pdf``.

        Returns:
            The file name on success, ``None`` on failure.
        """
        try:
            if not filename:
                filename = f"arxiv_{int(time.time())}.pdf"

            # Get paper ID from URL
            paper_id = self._extract_arxiv_id(paper_url)

            search = arxiv.Search(id_list=[paper_id])
            paper = next(self.client.results(search), None)
            if paper is None:
                print(f"❌ Error downloading PDF: no arXiv paper found for '{paper_id}'")
                return None

            # Download PDF
            paper.download_pdf(dirpath="./", filename=filename)

            print(f"✅ Downloaded PDF: {filename}")
            return filename

        except Exception as e:
            print(f"❌ Error downloading PDF: {e}")
            return None

    def get_trending_papers(self, category: str = "cs.AI", days: int = 7) -> List[Dict]:
        """Get the most recently submitted papers of a category.

        "Trending" here means: up to 50 papers submitted within the last
        ``days`` days, newest first.

        Args:
            category: arXiv category code, e.g. "cs.AI".
            days: Size of the look-back window in days.

        Returns:
            List of paper dicts; empty if the request fails.
        """
        try:
            # Calculate date range
            end_date = datetime.now()
            start_date = end_date - timedelta(days=days)

            # Search for recent papers
            query = f"cat:{category} AND submittedDate:[{start_date.strftime('%Y%m%d')}* TO {end_date.strftime('%Y%m%d')}*]"

            search = arxiv.Search(
                query=query,
                max_results=50,
                sort_by=arxiv.SortCriterion.SubmittedDate,
                sort_order=arxiv.SortOrder.Descending
            )

            papers = []
            for paper in self.client.results(search):
                try:
                    papers.append(self._paper_to_dict(paper))
                except Exception as e:
                    # Skip the broken entry, but say so (as search_papers does).
                    print(f"⚠️  Error processing paper {getattr(paper, 'title', 'Unknown')}: {e}")
                    continue

            print(f"✅ Found {len(papers)} trending papers in {category}")
            return papers

        except Exception as e:
            print(f"❌ Error getting trending papers: {e}")
            return []

# Module-level arXiv fetcher instance; MultiSourceCollector.__init__ reuses
# this object rather than creating its own.
arxiv_fetcher = ArxivFetcher()
print("✅ arXiv integration ready!")

✅ arXiv fetcher initialized!
✅ arXiv integration ready!


In [23]:
# ============================================================================
# MULTI-SOURCE DATA COLLECTOR
# ============================================================================

class MultiSourceCollector:
    """Unified data collector for multiple research sources"""

    def __init__(self):
        """Attach the shared arXiv fetcher and build a pre-configured HTTP session.

        The session carries a ResearchMate ``User-Agent`` and asks for JSON
        responses; it is reused by every request this collector makes.
        """
        default_headers = {
            'User-Agent': 'ResearchMate/1.0 (Research Assistant)',
            'Accept': 'application/json'
        }

        # Module-level ArxivFetcher instance created earlier in the file
        self.arxiv_fetcher = arxiv_fetcher

        http_session = requests.Session()
        http_session.headers.update(default_headers)
        self.session = http_session

        print("✅ Multi-source collector initialized!")

    def safe_get(self, url: str, params: Dict = None, timeout: int = 30) -> Optional[Dict]:
        """Safely make HTTP requests with error handling"""
        try:
            response = self.session.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"⚠️  HTTP error for {url}: {e}")
            return None
        except json.JSONDecodeError as e:
            print(f"⚠️  JSON decode error for {url}: {e}")
            return None
        except Exception as e:
            print(f"⚠️  Unexpected error for {url}: {e}")
            return None

    def safe_string(self, value: Any) -> str:
        """Safely convert any value to string"""
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(v) for v in value if v is not None)
        return str(value)

    def search_semantic_scholar(self, query: str, limit: int = 10) -> List[Dict]:
        """Search Semantic Scholar API with retry logic"""
        try:
            url = "https://api.semanticscholar.org/graph/v1/paper/search"
            params = {
                'query': query,
                'limit': limit,
                'fields': 'title,authors,abstract,year,url,citationCount,publicationDate,venue,externalIds'
            }

            # Retry logic for rate limiting
            max_retries = 3
            for attempt in range(max_retries):
                data = self.safe_get(url, params)
                time.sleep((attempt + 1) * 5)  # Exponential backoff
                if data and 'data' in data:
                    break
                elif attempt < max_retries - 1:
                    print(f"⚠️  Semantic Scholar rate limited, waiting {(attempt + 1) * 5} seconds...")
                else:
                    print("⚠️  Semantic Scholar API unavailable after retries")
                    return []

            if not data or 'data' not in data:
                return []

            papers = []
            for paper in data['data']:
                try:
                    paper_data = {
                        'title': self.safe_string(paper.get('title', '')),
                        'authors': [author.get('name', '') for author in paper.get('authors', [])],
                        'abstract': self.safe_string(paper.get('abstract', '')),
                        'year': paper.get('year'),
                        'url': paper.get('url', ''),
                        'source': 'Semantic Scholar',
                        'citation_count': paper.get('citationCount', 0),
                        'venue': self.safe_string(paper.get('venue', '')),
                        'publication_date': self.safe_string(paper.get('publicationDate', '')),
                        'content': self.safe_string(paper.get('abstract', '')),
                        'external_ids': paper.get('externalIds', {})
                    }
                    papers.append(paper_data)
                except Exception as e:
                    print(f"⚠️  Error processing Semantic Scholar paper: {e}")
                    continue

            print(f"✅ Found {len(papers)} papers on Semantic Scholar")
            return papers

        except Exception as e:
            print(f"❌ Error searching Semantic Scholar: {e}")
            return []

    def search_crossref(self, query: str, limit: int = 10) -> List[Dict]:
        """Search CrossRef API"""
        try:
            url = "https://api.crossref.org/works"
            params = {
                'query': query,
                'rows': limit,
                'select': 'title,author,abstract,published-print,published-online,URL,DOI,publisher,container-title'
            }

            data = self.safe_get(url, params)
            if not data or 'message' not in data or 'items' not in data['message']:
                return []

            papers = []
            for item in data['message']['items']:
                try:
                    # Extract authors
                    authors = []
                    for author in item.get('author', []):
                        if 'given' in author and 'family' in author:
                            authors.append(f"{author['given']} {author['family']}")
                        elif 'name' in author:
                            authors.append(author['name'])

                    # Extract publication year
                    year = None
                    if 'published-print' in item:
                        year = item['published-print']['date-parts'][0][0]
                    elif 'published-online' in item:
                        year = item['published-online']['date-parts'][0][0]

                    paper_data = {
                        'title': self.safe_string(item.get('title', [''])[0]),
                        'authors': authors,
                        'abstract': self.safe_string(item.get('abstract', '')),
                        'year': year,
                        'url': item.get('URL', ''),
                        'doi': item.get('DOI', ''),
                        'publisher': self.safe_string(item.get('publisher', '')),
                        'journal': self.safe_string(item.get('container-title', [''])[0]),
                        'source': 'CrossRef',
                        'content': self.safe_string(item.get('abstract', ''))
                    }
                    papers.append(paper_data)
                except Exception as e:
                    print(f"⚠️  Error processing CrossRef paper: {e}")
                    continue

            print(f"✅ Found {len(papers)} papers on CrossRef")
            return papers

        except Exception as e:
            print(f"❌ Error searching CrossRef: {e}")
            return []

    def search_pubmed(self, query: str, limit: int = 10) -> List[Dict]:
        """Search PubMed through the NCBI E-utilities (esearch + efetch).

        First resolves the query to a list of PMIDs via ``esearch`` (JSON),
        then downloads the matching records via ``efetch`` (XML) and parses
        them with BeautifulSoup.

        Args:
            query: PubMed search term.
            limit: Maximum number of records to request (sent as ``retmax``).

        Returns:
            A list of paper dicts (``title``, ``authors``, ``abstract``,
            ``year``, ``url``, ``pmid``, ``source``, ``content``). Empty when
            BeautifulSoup is unavailable, a request fails, or nothing matches.
        """
        try:
            # Check the parser up front: without it the XML cannot be read, so
            # there is no point spending two network round-trips first.
            if not WEB_SCRAPING_AVAILABLE:
                print("⚠️  BeautifulSoup not available for PubMed XML parsing")
                return []

            # Search for paper IDs
            search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
            search_params = {
                'db': 'pubmed',
                'term': query,
                'retmax': limit,
                'retmode': 'json'
            }

            search_data = self.safe_get(search_url, search_params)
            if not search_data or 'esearchresult' not in search_data:
                return []

            ids = search_data['esearchresult'].get('idlist', [])
            if not ids:
                return []

            # Fetch paper details
            fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
            fetch_params = {
                'db': 'pubmed',
                'id': ','.join(ids),
                'retmode': 'xml'
            }

            # Explicit timeout (same value as safe_get's default) so a stalled
            # connection cannot block the caller indefinitely.
            response = self.session.get(fetch_url, params=fetch_params, timeout=30)
            if response.status_code != 200:
                return []

            # Parse XML
            soup = BeautifulSoup(response.content, 'xml')
            papers = []

            for article in soup.find_all('PubmedArticle'):
                try:
                    medline = article.find('MedlineCitation')
                    if medline is None:
                        continue

                    # Extract basic info
                    title_elem = medline.find('ArticleTitle')
                    title = title_elem.text if title_elem is not None else 'Unknown Title'

                    # An abstract may be split into several AbstractText parts
                    abstract = ''
                    abstract_elem = medline.find('Abstract')
                    if abstract_elem is not None:
                        abstract = ' '.join(t.text for t in abstract_elem.find_all('AbstractText'))

                    # Extract authors (only those with both name parts)
                    authors = []
                    author_list = medline.find('AuthorList')
                    if author_list is not None:
                        for author in author_list.find_all('Author'):
                            first_name = author.find('ForeName')
                            last_name = author.find('LastName')
                            if first_name is not None and last_name is not None:
                                authors.append(f"{first_name.text} {last_name.text}")

                    # Extract publication year; a non-numeric value leaves the
                    # year unset instead of discarding the whole record.
                    year = None
                    pub_date = medline.find('PubDate')
                    if pub_date is not None:
                        year_elem = pub_date.find('Year')
                        year_text = year_elem.text.strip() if year_elem is not None else ''
                        if year_text.isdigit():
                            year = int(year_text)

                    # Extract PMID
                    pmid_elem = medline.find('PMID')
                    pmid = pmid_elem.text if pmid_elem is not None else ''

                    paper_data = {
                        'title': title,
                        'authors': authors,
                        'abstract': abstract,
                        'year': year,
                        'url': f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" if pmid else '',
                        'pmid': pmid,
                        'source': 'PubMed',
                        'content': abstract
                    }
                    papers.append(paper_data)

                except Exception as e:
                    print(f"⚠️  Error processing PubMed paper: {e}")
                    continue

            print(f"✅ Found {len(papers)} papers on PubMed")
            return papers

        except Exception as e:
            print(f"❌ Error searching PubMed: {e}")
            return []

    def search_all_sources(self, query: str, max_per_source: int = 5) -> List[Dict]:
        """Search all available sources"""
        print(f"🔍 Searching all sources for: '{query}'")

        all_papers = []

        # Search each source
        sources = [
            ("arXiv", lambda: self.arxiv_fetcher.search_papers(query, max_per_source)),
            ("Semantic Scholar", lambda: self.search_semantic_scholar(query, max_per_source)),
            ("CrossRef", lambda: self.search_crossref(query, max_per_source)),
            ("PubMed", lambda: self.search_pubmed(query, max_per_source))
        ]

        for source_name, search_func in sources:
            try:
                print(f"🔍 Searching {source_name}...")
                papers = search_func()
                all_papers.extend(papers)
                # Longer delay for Semantic Scholar to avoid rate limiting
                if source_name == "Semantic Scholar":
                    time.sleep(3)  # Longer delay for Semantic Scholar
                else:
                    time.sleep(1)  # Standard delay for other APIs
            except Exception as e:
                print(f"⚠️  Error searching {source_name}: {e}")
                continue

        # Deduplicate based on title similarity
        unique_papers = self.deduplicate_papers(all_papers)

        print(f"✅ Total unique papers found: {len(unique_papers)}")
        return unique_papers

    def deduplicate_papers(self, papers: List[Dict]) -> List[Dict]:
        """Remove duplicate papers based on title similarity"""
        if not papers:
            return papers

        unique_papers = []
        seen_titles = set()

        for paper in papers:
            title = paper.get('title', '').lower().strip()
            if not title:
                continue

            # Simple deduplication based on title
            title_words = set(title.split())
            is_duplicate = False

            for seen_title in seen_titles:
                seen_words = set(seen_title.split())
                if len(title_words.intersection(seen_words)) / len(title_words.union(seen_words)) > 0.8:
                    is_duplicate = True
                    break

            if not is_duplicate:
                unique_papers.append(paper)
                seen_titles.add(title)

        return unique_papers

# Initialize multi-source collector
# Module-level instance; constructing it prints its own "initialized" line
# before the confirmation below.
multi_source_collector = MultiSourceCollector()
print("✅ Multi-source data collector ready!")

✅ Multi-source collector initialized!
✅ Multi-source data collector ready!


In [11]:
# ============================================================================
# ENHANCED PDF PROCESSING
# ============================================================================

class EnhancedPDFProcessor:
    """Advanced PDF processing with multiple extraction methods"""

    def __init__(self):
        """Record which extraction back-ends may be used.

        PyPDF2 is always listed; pdfplumber and fitz are listed ahead of it
        when the module-level ``PDF_ENHANCED`` flag is truthy.
        """
        enhanced_backends = ['pdfplumber', 'fitz'] if PDF_ENHANCED else []
        self.methods = enhanced_backends + ['PyPDF2']
        print(f"✅ PDF processor initialized with methods: {', '.join(self.methods)}")

    def extract_text_pypdf2(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text using PyPDF2"""
        try:
            text = ""
            metadata = {}

            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)

                # Extract metadata
                if pdf_reader.metadata:
                    metadata = {
                        'title': pdf_reader.metadata.get('/Title', ''),
                        'author': pdf_reader.metadata.get('/Author', ''),
                        'subject': pdf_reader.metadata.get('/Subject', ''),
                        'creator': pdf_reader.metadata.get('/Creator', ''),
                        'producer': pdf_reader.metadata.get('/Producer', ''),
                        'creation_date': str(pdf_reader.metadata.get('/CreationDate', '')),
                        'modification_date': str(pdf_reader.metadata.get('/ModDate', ''))
                    }

                # Extract text from all pages
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    text += page.extract_text() + "\n\n"

            return {
                'text': text.strip(),
                'metadata': metadata,
                'pages': len(pdf_reader.pages),
                'method': 'PyPDF2',
                'success': True
            }

        except Exception as e:
            return {
                'text': '',
                'metadata': {},
                'pages': 0,
                'method': 'PyPDF2',
                'success': False,
                'error': str(e)
            }

    def extract_text_pdfplumber(self, pdf_path: str) -> Dict[str, Any]:
        """Extract page text and tables from a PDF with pdfplumber.

        Pages that yield text are prefixed with a ``--- Page N ---`` marker.
        Every table found is recorded together with its 1-based page number.
        Returns a short failure dict straight away when ``PDF_ENHANCED`` is
        falsy; any exception during extraction is reported as a failure dict
        carrying ``error``.
        """
        if not PDF_ENHANCED:
            return {'success': False, 'error': 'pdfplumber not available'}

        try:
            text_chunks = []
            found_tables = []

            with pdfplumber.open(pdf_path) as pdf:
                for page_no, page in enumerate(pdf.pages, start=1):
                    # Text first, then tables, for each page in turn
                    body = page.extract_text()
                    if body:
                        text_chunks.append(f"--- Page {page_no} ---\n" + body + "\n\n")

                    found_tables.extend(
                        {'page': page_no, 'data': grid}
                        for grid in (page.extract_tables() or [])
                    )

            return {
                'text': "".join(text_chunks).strip(),
                'tables': found_tables,
                'pages': len(pdf.pages),
                'method': 'pdfplumber',
                'success': True
            }

        except Exception as exc:
            failure = {
                'text': '',
                'tables': [],
                'pages': 0,
                'method': 'pdfplumber',
                'success': False,
                'error': str(exc)
            }
            return failure

    def extract_text_fitz(self, pdf_path: str) -> Dict[str, Any]:
        """Extract text and metadata from a PDF with PyMuPDF (fitz).

        Pages that yield non-blank text are prefixed with a ``--- Page N ---``
        marker.

        Args:
            pdf_path: Path of the PDF to open.

        Returns:
            A result dict with ``text``, ``metadata``, ``pages``, ``method``
            and ``success``. When ``PDF_ENHANCED`` is falsy a short failure
            dict is returned immediately; any exception during extraction is
            reported as a failure dict carrying ``error``.
        """
        if not PDF_ENHANCED:
            return {'success': False, 'error': 'PyMuPDF not available'}

        try:
            doc = fitz.open(pdf_path)
            try:
                # Everything that touches the document must happen before
                # close(): PyMuPDF refuses len() on a closed document, which
                # previously turned every successful extraction into a failure.
                metadata = doc.metadata
                page_count = len(doc)

                chunks = []
                for page_num in range(page_count):
                    page_text = doc.load_page(page_num).get_text()
                    if page_text.strip():
                        chunks.append(f"--- Page {page_num + 1} ---\n" + page_text + "\n\n")
            finally:
                # Release the file handle even when a page fails to load
                doc.close()

            return {
                'text': "".join(chunks).strip(),
                'metadata': metadata,
                'pages': page_count,
                'method': 'PyMuPDF',
                'success': True
            }

        except Exception as e:
            return {
                'text': '',
                'metadata': {},
                'pages': 0,
                'method': 'PyMuPDF',
                'success': False,
                'error': str(e)
            }

    def extract_text_best_method(self, pdf_path: str) -> Dict[str, Any]:
        """Try multiple methods and return the best result"""
        methods = [
            ('pdfplumber', self.extract_text_pdfplumber),
            ('fitz', self.extract_text_fitz),
            ('PyPDF2', self.extract_text_pypdf2)
        ]

        best_result = None
        best_score = 0

        for method_name, method_func in methods:
            if method_name not in self.methods:
                continue

            try:
                result = method_func(pdf_path)
                if result['success']:
                    # Score based on text length and quality
                    text_length = len(result['text'])
                    score = text_length

                    # Bonus for having metadata
                    if 'metadata' in result and result['metadata']:
                        score += 1000

                    # Bonus for having tables (pdfplumber)
                    if 'tables' in result and result['tables']:
                        score += 500

                    if score > best_score:
                        best_score = score
                        best_result = result

            except Exception as e:
                print(f"⚠️  Method {method_name} failed: {e}")
                continue

        if best_result:
            print(f"✅ Best extraction method: {best_result['method']}")
            return best_result
        else:
            return {
                'text': '',
                'metadata': {},
                'pages': 0,
                'method': 'none',
                'success': False,
                'error': 'All extraction methods failed'
            }

    def identify_sections(self, text: str) -> Dict[str, str]:
        """Identify paper sections using AI"""
        try:
            # Use Groq to identify sections
            prompt = f"""Analyze this research paper text and identify the main sections.
            Extract the following sections if they exist:
            1. Abstract
            2. Introduction
            3. Methodology/Methods
            4. Results
            5. Discussion
            6. Conclusion
            7. References

            For each section found, provide the section name and its content.
            If a section is not found, return "Not found" for that section.

            Paper text (first 8000 characters):
            {text[:8000]}

            Please format your response as:
            ABSTRACT: [content or "Not found"]
            INTRODUCTION: [content or "Not found"]
            METHODOLOGY: [content or "Not found"]
            RESULTS: [content or "Not found"]
            DISCUSSION: [content or "Not found"]
            CONCLUSION: [content or "Not found"]
            REFERENCES: [content or "Not found"]"""

            response = groq_processor.generate_response(prompt, max_tokens=2000)

            # Parse response
            sections = {
                'abstract': '',
                'introduction': '',
                'methodology': '',
                'results': '',
                'discussion': '',
                'conclusion': '',
                'references': ''
            }

            lines = response.split('\n')
            current_section = None

            for line in lines:
                line = line.strip()
                if not line:
                    continue

                # Check for section headers
                if line.startswith('ABSTRACT:'):
                    current_section = 'abstract'
                    sections[current_section] = line[9:].strip()
                elif line.startswith('INTRODUCTION:'):
                    current_section = 'introduction'
                    sections[current_section] = line[13:].strip()
                elif line.startswith('METHODOLOGY:'):
                    current_section = 'methodology'
                    sections[current_section] = line[12:].strip()
                elif line.startswith('RESULTS:'):
                    current_section = 'results'
                    sections[current_section] = line[8:].strip()
                elif line.startswith('DISCUSSION:'):
                    current_section = 'discussion'
                    sections[current_section] = line[11:].strip()
                elif line.startswith('CONCLUSION:'):
                    current_section = 'conclusion'
                    sections[current_section] = line[11:].strip()
                elif line.startswith('REFERENCES:'):
                    current_section = 'references'
                    sections[current_section] = line[11:].strip()
                elif current_section and not line.startswith(('ABSTRACT:', 'INTRODUCTION:', 'METHODOLOGY:', 'RESULTS:', 'DISCUSSION:', 'CONCLUSION:', 'REFERENCES:')):
                    # Continue adding to current section
                    sections[current_section] += ' ' + line

            return sections

        except Exception as e:
            print(f"❌ Error identifying sections: {e}")
            return {
                'abstract': 'Error extracting sections',
                'introduction': 'Error extracting sections',
                'methodology': 'Error extracting sections',
                'results': 'Error extracting sections',
                'discussion': 'Error extracting sections',
                'conclusion': 'Error extracting sections',
                'references': 'Error extracting sections'
            }

    def process_pdf_file(self, pdf_path: str) -> Dict[str, Any]:
        """Complete PDF processing pipeline"""
        try:
            if not os.path.exists(pdf_path):
                return {
                    'success': False,
                    'error': f'PDF file not found: {pdf_path}'
                }

            print(f"📄 Processing PDF: {pdf_path}")

            # Extract text using best method
            extraction_result = self.extract_text_best_method(pdf_path)

            if not extraction_result['success']:
                return extraction_result

            # Identify sections
            sections = self.identify_sections(extraction_result['text'])

            # Generate summary
            summary = groq_processor.summarize_paper(
                title=extraction_result.get('metadata', {}).get('title', 'PDF Document'),
                abstract=sections.get('abstract', ''),
                content=extraction_result['text']
            )

            return {
                'success': True,
                'file_path': pdf_path,
                'text': extraction_result['text'],
                'metadata': extraction_result.get('metadata', {}),
                'pages': extraction_result.get('pages', 0),
                'extraction_method': extraction_result.get('method', 'unknown'),
                'sections': sections,
                'summary': summary,
                'tables': extraction_result.get('tables', []),
                'processed_at': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'file_path': pdf_path
            }

    def process_multiple_pdfs(self, pdf_paths: List[str]) -> List[Dict[str, Any]]:
        """Process multiple PDF files"""
        results = []

        for pdf_path in pdf_paths:
            result = self.process_pdf_file(pdf_path)
            results.append(result)

            # Add to RAG system if successful
            if result['success'] and rag_system:
                try:
                    doc_data = {
                        'title': result['metadata'].get('title', f'PDF: {os.path.basename(pdf_path)}'),
                        'content': result['text'],
                        'authors': result['metadata'].get('author', 'Unknown'),
                        'source': 'PDF',
                        'url': f'file://{pdf_path}',
                        'year': 'Unknown'
                    }
                    rag_system.add_documents([doc_data])
                    print(f"✅ Added PDF to RAG system: {doc_data['title']}")
                except Exception as e:
                    print(f"⚠️  Failed to add PDF to RAG system: {e}")

        return results

# Initialize PDF processor
# Module-level instance; constructing it prints the list of extraction
# methods it will use before the confirmation below.
pdf_processor = EnhancedPDFProcessor()
print("✅ Enhanced PDF processor ready!")

✅ PDF processor initialized with methods: pdfplumber, fitz, PyPDF2
✅ Enhanced PDF processor ready!


In [12]:
# ============================================================================
# CITATION NETWORK ANALYSIS - COMPLETELY REWRITTEN VERSION
# ============================================================================

import networkx as nx
import json
from datetime import datetime
from typing import List, Dict, Any
import matplotlib.pyplot as plt

class CitationNetworkAnalyzer:
    """Analyze citation networks and author collaborations - Robust Version"""

    def __init__(self):
        """Create an analyzer with empty state by delegating to ``reset``."""
        # reset() builds both graphs and the two lookup dicts.
        self.reset()
        print("✅ Citation network analyzer initialized (robust version)!")

    def reset(self):
        """Discard all stored papers/authors and start from empty graphs."""
        # Directed graph; add_papers adds one node per paper.
        self.citation_graph = nx.DiGraph()
        # Undirected graph: authors are nodes, co-authorships are edges.
        self.author_graph = nx.Graph()
        # paper_id -> dict of stored paper fields (filled by add_papers).
        self.paper_data = {}
        # author name -> {'papers': [...], 'total_citations': n}; kept apart
        # from the graph's own node attributes.
        self.author_data = {}  # Separate storage for author data
        print("🔄 Citation network analyzer reset")

    def _safe_get_authors(self, paper: Dict) -> List[str]:
        """Safely extract and normalize author list from paper"""
        authors = paper.get('authors', [])

        # Handle None
        if authors is None:
            return []

        # Handle string (comma-separated)
        if isinstance(authors, str):
            if not authors.strip():
                return []
            return [a.strip() for a in authors.split(',') if a.strip()]

        # Handle list
        if isinstance(authors, list):
            result = []
            for author in authors:
                if isinstance(author, str) and author.strip():
                    result.append(author.strip())
                elif isinstance(author, dict):
                    # Handle author objects with 'name' field
                    name = author.get('name', '') or author.get('authorId', '')
                    if name and isinstance(name, str):
                        result.append(name.strip())
            return result

        # Unknown format
        return []

    def _safe_add_author(self, author_name: str, paper_id: str, citation_count: int = 0):
        """Safely add author to the graph"""
        try:
            # Initialize author data if not exists
            if author_name not in self.author_data:
                self.author_data[author_name] = {
                    'papers': [],
                    'total_citations': 0
                }

            # Add to NetworkX graph if not exists
            if not self.author_graph.has_node(author_name):
                self.author_graph.add_node(author_name)

            # Update author data
            if paper_id not in self.author_data[author_name]['papers']:
                self.author_data[author_name]['papers'].append(paper_id)
                self.author_data[author_name]['total_citations'] += citation_count

            return True

        except Exception as e:
            print(f"⚠️  Error adding author {author_name}: {e}")
            return False

    def _safe_add_collaboration(self, author1: str, author2: str, paper_id: str):
        """Safely add collaboration edge between authors"""
        try:
            # Ensure both authors exist
            if not self.author_graph.has_node(author1):
                self.author_graph.add_node(author1)
            if not self.author_graph.has_node(author2):
                self.author_graph.add_node(author2)

            # Add or update edge
            if self.author_graph.has_edge(author1, author2):
                # Update existing edge
                edge_data = self.author_graph.edges[author1, author2]
                edge_data['weight'] = edge_data.get('weight', 0) + 1
                if 'papers' not in edge_data:
                    edge_data['papers'] = []
                if paper_id not in edge_data['papers']:
                    edge_data['papers'].append(paper_id)
            else:
                # Add new edge
                self.author_graph.add_edge(author1, author2, weight=1, papers=[paper_id])

            return True

        except Exception as e:
            print(f"⚠️  Error adding collaboration {author1}-{author2}: {e}")
            return False

    def add_papers(self, papers: List[Dict]):
        """Add papers to the citation network - Robust Version"""
        if not papers:
            print("⚠️  No papers provided to add_papers")
            return

        processed_count = 0
        error_count = 0

        print(f"📝 Processing {len(papers)} papers...")

        for paper_idx, paper in enumerate(papers):
            try:
                # Validate paper input
                if not isinstance(paper, dict):
                    print(f"⚠️  Paper {paper_idx} is not a dict: {type(paper)}")
                    error_count += 1
                    continue

                # Generate paper ID
                paper_id = paper.get('paper_id')
                if not paper_id:
                    paper_id = paper.get('url', '')
                    if not paper_id:
                        title = paper.get('title', f'Unknown_{paper_idx}')
                        paper_id = f"paper_{abs(hash(title)) % 1000000}"

                # Store paper data
                self.paper_data[paper_id] = {
                    'title': paper.get('title', ''),
                    'authors': self._safe_get_authors(paper),
                    'year': paper.get('year'),
                    'venue': paper.get('venue', ''),
                    'citation_count': paper.get('citation_count', 0),
                    'source': paper.get('source', ''),
                    'url': paper.get('url', ''),
                    'abstract': paper.get('abstract', '')
                }

                # Add to citation graph
                self.citation_graph.add_node(paper_id, **self.paper_data[paper_id])

                # Process authors
                authors = self._safe_get_authors(paper)
                citation_count = paper.get('citation_count', 0)

                # Validate citation count
                if not isinstance(citation_count, (int, float)):
                    citation_count = 0

                # Add authors
                valid_authors = []
                for author in authors:
                    if self._safe_add_author(author, paper_id, citation_count):
                        valid_authors.append(author)

                # Add collaborations
                for i, author1 in enumerate(valid_authors):
                    for j, author2 in enumerate(valid_authors):
                        if i < j:  # Avoid duplicates and self-loops
                            self._safe_add_collaboration(author1, author2, paper_id)

                processed_count += 1

            except Exception as e:
                print(f"⚠️  Error processing paper {paper_idx}: {e}")
                error_count += 1
                continue

        print(f"✅ Successfully processed {processed_count} papers ({error_count} errors)")
        print(f"📊 Total papers: {len(self.paper_data)}")
        print(f"📊 Total authors: {len(self.author_data)}")
        print(f"📊 Author graph nodes: {len(self.author_graph.nodes)}")

    def analyze_author_network(self) -> Dict:
        """Analyze author collaboration network"""
        try:
            if len(self.author_graph.nodes) == 0:
                return {'error': 'No authors in network'}

            # Basic network metrics
            metrics = {
                'total_authors': len(self.author_graph.nodes),
                'total_collaborations': len(self.author_graph.edges),
                'network_density': nx.density(self.author_graph),
                'number_of_components': nx.number_connected_components(self.author_graph),
                'largest_component_size': len(max(nx.connected_components(self.author_graph), key=len)) if nx.number_connected_components(self.author_graph) > 0 else 0
            }

            # Most collaborative authors
            collaboration_counts = {node: self.author_graph.degree(node) for node in self.author_graph.nodes}
            top_collaborators = sorted(collaboration_counts.items(), key=lambda x: x[1], reverse=True)[:10]

            # Most productive authors (using separate author_data)
            productivity = {}
            for author, data in self.author_data.items():
                productivity[author] = len(data.get('papers', []))
            top_productive = sorted(productivity.items(), key=lambda x: x[1], reverse=True)[:10]

            # Most cited authors
            citation_counts = {}
            for author, data in self.author_data.items():
                citation_counts[author] = data.get('total_citations', 0)
            top_cited = sorted(citation_counts.items(), key=lambda x: x[1], reverse=True)[:10]

            # Central authors (betweenness centrality)
            try:
                if len(self.author_graph.nodes) > 1:
                    centrality = nx.betweenness_centrality(self.author_graph)
                    top_central = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:10]
                else:
                    top_central = []
            except Exception as centrality_error:
                print(f"⚠️  Error calculating centrality: {centrality_error}")
                top_central = []

            return {
                'network_metrics': metrics,
                'top_collaborators': top_collaborators,
                'top_productive_authors': top_productive,
                'top_cited_authors': top_cited,
                'top_central_authors': top_central,
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def analyze_paper_network(self) -> Dict:
        """Analyze paper citation network"""
        try:
            if len(self.citation_graph.nodes) == 0:
                return {'error': 'No papers in network'}

            # Basic network metrics
            metrics = {
                'total_papers': len(self.citation_graph.nodes),
                'total_citations': len(self.citation_graph.edges),
                'network_density': nx.density(self.citation_graph),
                'number_of_components': nx.number_weakly_connected_components(self.citation_graph),
                'largest_component_size': len(max(nx.weakly_connected_components(self.citation_graph), key=len)) if nx.number_weakly_connected_components(self.citation_graph) > 0 else 0
            }

            # Most cited papers
            in_degree = dict(self.citation_graph.in_degree())
            most_cited = sorted(in_degree.items(), key=lambda x: x[1], reverse=True)[:10]

            # Most citing papers
            out_degree = dict(self.citation_graph.out_degree())
            most_citing = sorted(out_degree.items(), key=lambda x: x[1], reverse=True)[:10]

            # Convert paper IDs to titles for readability
            most_cited_titles = []
            for paper_id, count in most_cited:
                if paper_id in self.paper_data:
                    most_cited_titles.append((self.paper_data[paper_id]['title'], count))
                else:
                    most_cited_titles.append((paper_id, count))

            most_citing_titles = []
            for paper_id, count in most_citing:
                if paper_id in self.paper_data:
                    most_citing_titles.append((self.paper_data[paper_id]['title'], count))
                else:
                    most_citing_titles.append((paper_id, count))

            return {
                'network_metrics': metrics,
                'most_cited_papers': most_cited_titles,
                'most_citing_papers': most_citing_titles,
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def find_citation_relationships(self, papers: List[Dict]) -> Dict:
        """Find citation relationships between papers"""
        try:
            # This is a placeholder for AI-based analysis
            # You would integrate with your groq_processor here
            return {
                'relationships': [],
                'total_papers_analyzed': len(papers),
                'analysis_method': 'AI-inferred',
                'timestamp': datetime.now().isoformat(),
                'note': 'AI analysis not implemented in this version'
            }
        except Exception as e:
            return {
                'relationships': [],
                'error': str(e),
                'total_papers_analyzed': 0,
                'analysis_method': 'AI-inferred',
                'timestamp': datetime.now().isoformat()
            }

    def visualize_author_network(self, top_n: int = 20, save_path: str = None) -> str:
        """Visualize author collaboration network"""
        try:
            if len(self.author_graph.nodes) == 0:
                return "No authors to visualize"

            # Get top N most collaborative authors
            collaboration_counts = {node: self.author_graph.degree(node) for node in self.author_graph.nodes}
            top_authors = sorted(collaboration_counts.items(), key=lambda x: x[1], reverse=True)[:top_n]
            top_author_names = [author for author, count in top_authors]

            # Create subgraph
            subgraph = self.author_graph.subgraph(top_author_names)

            # Create visualization
            plt.figure(figsize=(12, 8))

            # Position nodes
            pos = nx.spring_layout(subgraph, k=1, iterations=50)

            # Draw nodes
            node_sizes = [subgraph.degree(node) * 100 for node in subgraph.nodes]
            nx.draw_networkx_nodes(subgraph, pos, node_size=node_sizes,
                                 node_color='lightblue', alpha=0.7)

            # Draw edges
            edge_weights = [subgraph.edges[edge].get('weight', 1) for edge in subgraph.edges]
            nx.draw_networkx_edges(subgraph, pos, width=edge_weights,
                                 alpha=0.5, edge_color='gray')

            # Draw labels
            nx.draw_networkx_labels(subgraph, pos, font_size=8, font_weight='bold')

            plt.title(f"Author Collaboration Network (Top {top_n} Authors)")
            plt.axis('off')

            # Save or show
            if save_path:
                plt.savefig(save_path, dpi=300, bbox_inches='tight')
                plt.close()
                return f"Visualization saved to {save_path}"
            else:
                plt.show()
                return "Visualization displayed"

        except Exception as e:
            return f"Error creating visualization: {str(e)}"

    def get_network_summary(self) -> Dict:
        """Get comprehensive network summary"""
        try:
            author_analysis = self.analyze_author_network()
            paper_analysis = self.analyze_paper_network()

            return {
                'author_network': author_analysis,
                'paper_network': paper_analysis,
                'overall_stats': {
                    'total_papers': len(self.paper_data),
                    'total_authors': len(self.author_data),
                    'papers_per_author': len(self.paper_data) / max(len(self.author_data), 1),
                    'collaborations_per_author': len(self.author_graph.edges) / max(len(self.author_graph.nodes), 1)
                },
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

# Create a new instance to replace the old one
def create_new_citation_analyzer():
    """Return a fresh, empty ``CitationNetworkAnalyzer``.

    Constructing the analyzer prints its reset/initialized status messages.
    """
    return CitationNetworkAnalyzer()

# Initialize citation network analyzer
# Module-level instance; later cells refer to it by the name `citation_analyzer`.
citation_analyzer = create_new_citation_analyzer()
print("✅ Citation network analyzer ready (robust version)!")

# To swap a fresh analyzer into an already-running session, rebind the
# module-level name, for example:
# globals()['citation_analyzer'] = create_new_citation_analyzer()

🔄 Citation network analyzer reset
✅ Citation network analyzer initialized (robust version)!
✅ Citation network analyzer ready (robust version)!


In [19]:
# ============================================================================
# RESEARCH TREND MONITORING
# ============================================================================

class ResearchTrendMonitor:
    """Monitor research trends and detect emerging topics"""

    def __init__(self):
        """Create the monitor with empty bookkeeping containers."""
        # NOTE(review): none of the methods of this class read or write the
        # three attributes below; presumably reserved for caching — confirm.
        self.trend_data = {}
        self.keyword_trends = defaultdict(list)
        self.topic_evolution = {}
        print("✅ Research trend monitor initialized!")

    def analyze_temporal_trends(self, papers: List[Dict], timeframe: str = "yearly") -> Dict:
        """Analyze trends over time"""
        try:
            if not papers:
                return {'error': 'No papers provided'}

            # Group papers by time period
            time_groups = defaultdict(list)

            for paper in papers:
                year = paper.get('year')
                if not year:
                    continue

                if timeframe == "yearly":
                    time_key = str(year)
                elif timeframe == "monthly":
                    # For monthly, we'd need more detailed date info
                    time_key = str(year)
                else:
                    time_key = str(year)

                time_groups[time_key].append(paper)

            # Analyze trends for each time period
            trend_analysis = {}

            for time_key, period_papers in time_groups.items():
                # Extract keywords and topics
                all_text = ' '.join([
                    (paper.get('title', '') + ' ' + paper.get('abstract', ''))
                    for paper in period_papers
                ])

                keywords = self._extract_period_keywords(all_text)

                # Count papers by category/source
                categories = defaultdict(int)
                sources = defaultdict(int)

                for paper in period_papers:
                    paper_categories = paper.get('categories', [])
                    if isinstance(paper_categories, list):
                        for cat in paper_categories:
                            categories[cat] += 1
                    elif isinstance(paper_categories, str):
                        categories[paper_categories] += 1

                    source = paper.get('source', 'Unknown')
                    sources[source] += 1

                trend_analysis[time_key] = {
                    'paper_count': len(period_papers),
                    'top_keywords': keywords[:20],
                    'top_categories': dict(sorted(categories.items(), key=lambda x: x[1], reverse=True)[:10]),
                    'sources': dict(sources),
                    'avg_citations': np.mean([paper.get('citation_count', 0) for paper in period_papers])
                }

            # Identify emerging trends
            emerging_trends = self._identify_emerging_trends(trend_analysis)

            return {
                'temporal_analysis': trend_analysis,
                'emerging_trends': emerging_trends,
                'timeframe': timeframe,
                'total_papers': len(papers),
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def _extract_period_keywords(self, text: str) -> List[str]:
        """Extract keywords from text for a specific period"""
        # Clean and tokenize
        words = re.findall(r'\b[a-zA-Z]{3,}\b', text.lower())

        # Common stop words
        stop_words = {
            'the', 'and', 'for', 'are', 'with', 'this', 'that', 'from', 'they', 'have',
            'been', 'was', 'were', 'will', 'would', 'could', 'should', 'can', 'may',
            'paper', 'study', 'research', 'method', 'approach', 'results', 'conclusion',
            'show', 'present', 'propose', 'analysis', 'experiments', 'data', 'based'
        }

        # Filter and count
        word_counts = defaultdict(int)
        for word in words:
            if word not in stop_words and len(word) > 3:
                word_counts[word] += 1

        # Return top keywords
        return [word for word, count in sorted(word_counts.items(), key=lambda x: x[1], reverse=True)]

    def _identify_emerging_trends(self, trend_analysis: Dict) -> List[Dict]:
        """Identify emerging trends from temporal analysis"""
        emerging_trends = []

        # Sort time periods
        sorted_periods = sorted(trend_analysis.keys())

        if len(sorted_periods) < 2:
            return emerging_trends

        # Compare recent periods with earlier ones
        recent_period = sorted_periods[-1]
        previous_period = sorted_periods[-2] if len(sorted_periods) >= 2 else None

        if not previous_period:
            return emerging_trends

        recent_keywords = set(trend_analysis[recent_period]['top_keywords'][:10])
        previous_keywords = set(trend_analysis[previous_period]['top_keywords'][:10])

        # Find new keywords
        new_keywords = recent_keywords - previous_keywords

        # Find growing categories
        recent_categories = trend_analysis[recent_period]['top_categories']
        previous_categories = trend_analysis[previous_period]['top_categories']

        growing_categories = []
        for cat, count in recent_categories.items():
            prev_count = previous_categories.get(cat, 0)
            if count > prev_count * 1.5:  # 50% growth threshold
                growing_categories.append({
                    'category': cat,
                    'growth_rate': (count - prev_count) / max(prev_count, 1),
                    'recent_count': count,
                    'previous_count': prev_count
                })

        emerging_trends.append({
            'type': 'new_keywords',
            'keywords': list(new_keywords),
            'period': recent_period
        })

        emerging_trends.append({
            'type': 'growing_categories',
            'categories': growing_categories,
            'period': recent_period
        })

        return emerging_trends

    def detect_research_gaps(self, papers: List[Dict]) -> Dict:
        """Detect potential research gaps using AI analysis"""
        try:
            if not papers:
                return {'error': 'No papers provided'}

            # Analyze papers to identify gaps
            sample_papers = papers[:15]  # Limit to avoid token limits

            # Prepare paper summaries
            paper_summaries = []
            for paper in sample_papers:
                summary = f"Title: {paper.get('title', 'Unknown')}\n"
                summary += f"Abstract: {paper.get('abstract', 'N/A')[:300]}...\n"
                summary += f"Year: {paper.get('year', 'Unknown')}\n"
                summary += f"Authors: {', '.join(paper.get('authors', [])[:3])}\n"
                paper_summaries.append(summary)

            # Use AI to identify gaps
            prompt = f"""Analyze these research papers and identify potential research gaps and opportunities:

{chr(10).join(paper_summaries[:10])}

Please identify:
1. **Methodological Gaps**: Missing approaches or techniques
2. **Temporal Gaps**: Time periods or recent developments not covered
3. **Interdisciplinary Gaps**: Connections between fields that could be explored
4. **Application Gaps**: Real-world applications that need more research
5. **Data Gaps**: Types of data or datasets that are underexplored

Format your response as structured points under each category.
Focus on actionable research opportunities."""

            response = groq_processor.generate_response(prompt, max_tokens=2000)

            # Extract methodological insights
            methodologies = self._extract_methodologies(sample_papers)

            return {
                'ai_analysis': response,
                'methodological_analysis': methodologies,
                'papers_analyzed': len(sample_papers),
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def _extract_methodologies(self, papers: List[Dict]) -> Dict:
        """Extract and analyze methodologies from papers across all research disciplines"""
        methodologies = defaultdict(int)
        data_types = defaultdict(int)
        evaluation_metrics = defaultdict(int)

        # General research methodology keywords (applicable across disciplines)
        method_keywords = {
            'quantitative_analysis': ['quantitative', 'statistical analysis', 'survey', 'questionnaire', 'regression', 'correlation'],
            'qualitative_analysis': ['qualitative', 'interview', 'case study', 'ethnography', 'grounded theory', 'content analysis'],
            'experimental': ['experiment', 'randomized', 'control group', 'controlled trial', 'intervention', 'treatment'],
            'observational': ['observational', 'longitudinal', 'cross-sectional', 'cohort study', 'prospective', 'retrospective'],
            'systematic_review': ['systematic review', 'meta-analysis', 'literature review', 'scoping review'],
            'theoretical': ['theoretical framework', 'conceptual model', 'theoretical analysis', 'mathematical model'],
            'simulation': ['simulation', 'modeling', 'monte carlo', 'computational model', 'numerical analysis'],
            'comparative': ['comparative study', 'comparison', 'cross-country', 'benchmarking', 'comparative analysis'],
            'mixed_methods': ['mixed methods', 'triangulation', 'multi-method', 'convergent parallel'],
            'field_study': ['field study', 'field work', 'natural setting', 'in-situ', 'real-world']
        }

        for paper in papers:
            content = (paper.get('title', '') + ' ' + paper.get('abstract', '')).lower()

            # Count methodologies
            for method, keywords in method_keywords.items():
                if any(keyword in content for keyword in keywords):
                    methodologies[method] += 1

            # General data types (applicable across disciplines)
            data_type_keywords = {
                'quantitative_data': ['numerical data', 'statistical data', 'survey data', 'measurement', 'metrics'],
                'qualitative_data': ['interview data', 'textual data', 'narrative', 'qualitative data', 'thematic'],
                'archival_data': ['archival', 'historical data', 'records', 'documents', 'archives'],
                'observational_data': ['observational data', 'behavioral data', 'field notes', 'observations'],
                'secondary_data': ['secondary data', 'existing data', 'database', 'administrative data'],
                'experimental_data': ['experimental data', 'controlled data', 'laboratory data', 'trial data']
            }

            for data_type, keywords in data_type_keywords.items():
                if any(keyword in content for keyword in keywords):
                    data_types[data_type] += 1

            # General evaluation metrics (cross-disciplinary)
            general_metrics = [
                'validity', 'reliability', 'significance', 'confidence interval', 'p-value',
                'effect size', 'correlation coefficient', 'cronbach alpha', 'inter-rater reliability',
                'sensitivity', 'specificity', 'response rate', 'sample size', 'power analysis'
            ]

            for metric in general_metrics:
                if metric in content:
                    evaluation_metrics[metric] += 1

        return {
            'methodologies': dict(methodologies),
            'data_types': dict(data_types),
            'evaluation_metrics': dict(evaluation_metrics),
            'total_papers': len(papers),
            'analysis_timestamp': datetime.now().isoformat()
        }

    def generate_trend_report(self, papers: List[Dict]) -> Dict:
        """Generate comprehensive trend report"""
        try:
            # Temporal analysis
            temporal_trends = self.analyze_temporal_trends(papers)

            # Research gaps
            research_gaps = self.detect_research_gaps(papers)

            # Overall statistics
            total_papers = len(papers)
            years = [p.get('year') for p in papers if p.get('year')]
            year_range = f"{min(years)} - {max(years)}" if years else "Unknown"

            # Top authors
            author_counts = defaultdict(int)
            for paper in papers:
                for author in paper.get('authors', []):
                    author_counts[author] += 1

            top_authors = sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:10]

            # Top venues/sources
            venue_counts = defaultdict(int)
            for paper in papers:
                venue = paper.get('venue', paper.get('source', 'Unknown'))
                venue_counts[venue] += 1

            top_venues = sorted(venue_counts.items(), key=lambda x: x[1], reverse=True)[:10]

            # AI-powered summary
            summary_prompt = f"""Generate a comprehensive research trend summary based on this data:

Total Papers: {total_papers}
Year Range: {year_range}
Top Authors: {', '.join([f"{author} ({count})" for author, count in top_authors[:5]])}
Top Venues: {', '.join([f"{venue} ({count})" for venue, count in top_venues[:5]])}

Key Trends: {temporal_trends.get('emerging_trends', [])}

Provide a 3-paragraph executive summary covering:
1. Overall research landscape and activity
2. Key trends and emerging areas
3. Future research directions and opportunities"""

            ai_summary = groq_processor.generate_response(summary_prompt, max_tokens=1500)

            return {
                'executive_summary': ai_summary,
                'statistics': {
                    'total_papers': total_papers,
                    'year_range': year_range,
                    'top_authors': top_authors,
                    'top_venues': top_venues
                },
                'temporal_trends': temporal_trends,
                'research_gaps': research_gaps,
                'report_generated': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'report_generated': datetime.now().isoformat()
            }

    def monitor_realtime_trends(self, query: str, sources: List[str] = None) -> Dict:
        """Monitor real-time trends for a specific query"""
        try:
            if not sources:
                sources = ['arxiv', 'semantic_scholar']

            # Collect recent papers
            recent_papers = []

            for source in sources:
                try:
                    if source == 'arxiv':
                        papers = arxiv_fetcher.get_trending_papers(days=30)
                        # Filter by query
                        filtered_papers = [p for p in papers if query.lower() in p.get('title', '').lower() or query.lower() in p.get('abstract', '').lower()]
                        recent_papers.extend(filtered_papers[:10])

                    elif source == 'semantic_scholar':
                        papers = multi_source_collector.search_semantic_scholar(query, limit=10)
                        recent_papers.extend(papers)

                    time.sleep(1)  # Rate limiting

                except Exception as e:
                    print(f"⚠️  Error monitoring {source}: {e}")
                    continue

            if not recent_papers:
                return {
                    'query': query,
                    'trend_status': 'No recent papers found',
                    'papers_found': 0,
                    'monitoring_timestamp': datetime.now().isoformat()
                }

            # Analyze trends
            trend_analysis = self.analyze_temporal_trends(recent_papers)

            # Generate insights
            insights_prompt = f"""Analyze these recent research papers for the query "{query}":

Found {len(recent_papers)} recent papers.

Recent trends: {trend_analysis.get('emerging_trends', [])}

Provide insights on:
1. Current research momentum (high/medium/low)
2. Key developments in the last 30 days
3. Emerging sub-topics or applications
4. Potential future directions

Keep it concise and actionable."""

            insights = groq_processor.generate_response(insights_prompt, max_tokens=1000)

            return {
                'query': query,
                'trend_status': 'active' if len(recent_papers) > 5 else 'emerging',
                'papers_found': len(recent_papers),
                'recent_papers': recent_papers[:5],  # Top 5 most recent
                'trend_analysis': trend_analysis,
                'ai_insights': insights,
                'monitoring_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'query': query,
                'error': str(e),
                'monitoring_timestamp': datetime.now().isoformat()
            }

    def create_trend_visualization(self, trend_data: Dict, save_path: str = None) -> str:
        """Create trend visualization"""
        try:
            if 'temporal_analysis' not in trend_data:
                return "No temporal data available for visualization"

            temporal_data = trend_data['temporal_analysis']

            # Extract data for plotting
            years = sorted(temporal_data.keys())
            paper_counts = [temporal_data[year]['paper_count'] for year in years]

            # Create visualization
            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

            # Plot 1: Papers over time
            ax1.plot(years, paper_counts, marker='o', linewidth=2, markersize=8)
            ax1.set_title('Research Papers Over Time', fontsize=14, fontweight='bold')
            ax1.set_xlabel('Year')
            ax1.set_ylabel('Number of Papers')
            ax1.grid(True, alpha=0.3)

            # Plot 2: Top keywords word cloud for recent period
            if years:
                recent_year = years[-1]
                recent_keywords = temporal_data[recent_year]['top_keywords'][:20]

                if recent_keywords:
                    # Create word frequency dict
                    word_freq = {word: len(recent_keywords) - i for i, word in enumerate(recent_keywords)}

                    # Create word cloud
                    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)

                    ax2.imshow(wordcloud, interpolation='bilinear')
                    ax2.axis('off')
                    ax2.set_title(f'Top Keywords ({recent_year})', fontsize=14, fontweight='bold')

            plt.tight_layout()

            # Save or show
            if save_path:
                plt.savefig(save_path, dpi=300, bbox_inches='tight')
                plt.close()
                return f"Trend visualization saved to {save_path}"
            else:
                plt.show()
                return "Trend visualization displayed"

        except Exception as e:
            return f"Error creating trend visualization: {str(e)}"

# Initialize trend monitor
# Module-level instance; later cells refer to it by the name `trend_monitor`.
trend_monitor = ResearchTrendMonitor()
print("✅ Research trend monitor ready!")

✅ Research trend monitor initialized!
✅ Research trend monitor ready!


In [14]:
# ============================================================================
# ADVANCED RESEARCH ASSISTANT
# ============================================================================

class AdvancedResearchAssistant:
    """Advanced research assistant with project management and analysis capabilities"""

    def __init__(self):
        """Start with empty stores and bind the shared pipeline components.

        The component objects are looked up by name in the module namespace;
        they are referenced here, not created.
        """
        # In-memory stores, keyed by generated ids
        self.projects, self.literature_reviews, self.analysis_cache = {}, {}, {}

        # Shared components defined at module level
        self.groq_processor = groq_processor
        self.rag_system = rag_system
        self.arxiv_fetcher = arxiv_fetcher
        self.multi_source_collector = multi_source_collector
        self.pdf_processor = pdf_processor
        self.citation_analyzer = citation_analyzer
        self.trend_monitor = trend_monitor

        print("✅ Advanced Research Assistant initialized!")

    def create_research_project(self, project_name: str, research_question: str,
                              keywords: List[str], scope: str = "comprehensive") -> Dict:
        """Create a new research project"""
        try:
            project_id = f"proj_{int(time.time())}_{hash(project_name) % 10000}"

            project = {
                'id': project_id,
                'name': project_name,
                'research_question': research_question,
                'keywords': keywords,
                'scope': scope,
                'created_at': datetime.now().isoformat(),
                'status': 'active',
                'papers': [],
                'analyses': {},
                'notes': [],
                'progress': {
                    'literature_search': 'pending',
                    'paper_analysis': 'pending',
                    'gap_analysis': 'pending',
                    'trend_analysis': 'pending',
                    'report_generation': 'pending'
                }
            }

            self.projects[project_id] = project

            # Generate initial research plan
            plan_prompt = f"""Create a comprehensive research plan for this project:

Project: {project_name}
Research Question: {research_question}
Keywords: {', '.join(keywords)}
Scope: {scope}

Generate a structured research plan including:
1. **Literature Search Strategy**
2. **Key Areas to Investigate**
3. **Methodology Approach**
4. **Expected Outcomes**
5. **Timeline Recommendations**

Format as a detailed research plan."""

            research_plan = self.groq_processor.generate_response(plan_prompt, max_tokens=2000)
            project['research_plan'] = research_plan

            return {
                'success': True,
                'project_id': project_id,
                'project': project,
                'message': f'Research project "{project_name}" created successfully'
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'message': 'Failed to create research project'
            }

    def conduct_literature_search(self, project_id: str, max_papers_per_source: int = 10) -> Dict:
        """Conduct comprehensive literature search for a project"""
        try:
            if project_id not in self.projects:
                return {'success': False, 'error': 'Project not found'}

            project = self.projects[project_id]
            keywords = project['keywords']

            print(f"🔍 Conducting literature search for project: {project['name']}")

            # Search all sources
            all_papers = []
            search_results = {}

            for keyword in keywords:
                print(f"🔍 Searching for keyword: {keyword}")

                # Multi-source search
                papers = self.multi_source_collector.search_all_sources(keyword, max_papers_per_source)
                all_papers.extend(papers)
                search_results[keyword] = len(papers)

                time.sleep(2)  # Rate limiting

            # Deduplicate papers
            unique_papers = self.multi_source_collector.deduplicate_papers(all_papers)

            # Add papers to project
            project['papers'] = unique_papers
            project['progress']['literature_search'] = 'completed'

            # Add to RAG system
            if self.rag_system:
                self.rag_system.add_documents(unique_papers)

            # Add to citation analyzer
            self.citation_analyzer.add_papers(unique_papers)

            return {
                'success': True,
                'papers_found': len(unique_papers),
                'search_results': search_results,
                'unique_papers': len(unique_papers),
                'message': f'Found {len(unique_papers)} unique papers'
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'message': 'Literature search failed'
            }

    def analyze_literature(self, project_id: str) -> Dict:
        """Analyze literature for a project"""
        try:
            if project_id not in self.projects:
                return {'success': False, 'error': 'Project not found'}

            project = self.projects[project_id]
            papers = project['papers']

            if not papers:
                return {'success': False, 'error': 'No papers found in project'}

            print(f"📊 Analyzing literature for project: {project['name']}")

            # Comprehensive analysis
            analyses = {}

            # 1. Citation network analysis
            print("📊 Analyzing citation networks...")
            citation_analysis = self.citation_analyzer.get_network_summary()
            analyses['citation_network'] = citation_analysis

            # 2. Trend analysis
            print("📊 Analyzing research trends...")
            trend_analysis = self.trend_monitor.generate_trend_report(papers)
            analyses['trends'] = trend_analysis

            # 3. Gap analysis
            print("📊 Detecting research gaps...")
            gap_analysis = self.trend_monitor.detect_research_gaps(papers)
            analyses['gaps'] = gap_analysis

            # 4. Thematic analysis using AI
            print("📊 Conducting thematic analysis...")
            thematic_analysis = self._conduct_thematic_analysis(papers, project['research_question'])
            analyses['thematic'] = thematic_analysis

            # 5. Methodology analysis
            print("📊 Analyzing methodologies...")
            methodology_analysis = self._analyze_methodologies(papers)
            analyses['methodologies'] = methodology_analysis

            # Store analyses
            project['analyses'] = analyses
            project['progress']['paper_analysis'] = 'completed'
            project['progress']['gap_analysis'] = 'completed'
            project['progress']['trend_analysis'] = 'completed'

            return {
                'success': True,
                'analyses': analyses,
                'message': 'Literature analysis completed successfully'
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'message': 'Literature analysis failed'
            }

    def _conduct_thematic_analysis(self, papers: List[Dict], research_question: str) -> Dict:
        """Conduct thematic analysis of papers"""
        try:
            # Prepare paper data for analysis
            paper_themes = []
            for paper in papers[:20]:  # Limit for token management
                theme_text = f"Title: {paper.get('title', '')}\nAbstract: {paper.get('abstract', '')[:400]}"
                paper_themes.append(theme_text)

            # AI-powered thematic analysis
            prompt = f"""Conduct a comprehensive thematic analysis of these research papers in relation to the research question: "{research_question}"

Papers to analyze:
{chr(10).join(paper_themes[:15])}

Identify:
1. **Main Themes**: 5-7 major themes across the papers
2. **Sub-themes**: Related concepts under each main theme
3. **Research Approaches**: Common methodological approaches
4. **Theoretical Frameworks**: Underlying theories and models
5. **Contradictions**: Conflicting findings or viewpoints
6. **Consensus Areas**: Where researchers agree

Format as structured analysis with clear sections."""

            analysis = self.groq_processor.generate_response(prompt, max_tokens=2500)

            return {
                'analysis': analysis,
                'papers_analyzed': len(paper_themes),
                'research_question': research_question,
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'analysis': f'Error in thematic analysis: {str(e)}',
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def _analyze_methodologies(self, papers: List[Dict]) -> Dict:
        """Analyze methodologies used in papers"""
        try:
            methodologies = defaultdict(int)
            data_types = defaultdict(int)
            evaluation_metrics = defaultdict(int)

            # Common methodology keywords
            method_keywords = {
                'machine_learning': ['machine learning', 'ml', 'supervised', 'unsupervised'],
                'deep_learning': ['deep learning', 'neural network', 'cnn', 'rnn', 'transformer'],
                'nlp': ['natural language processing', 'nlp', 'text mining', 'sentiment analysis'],
                'computer_vision': ['computer vision', 'image processing', 'object detection'],
                'reinforcement_learning': ['reinforcement learning', 'rl', 'q-learning'],
                'statistical_analysis': ['statistical analysis', 'regression', 'correlation', 'anova'],
                'experimental': ['experiment', 'randomized', 'control group', 'a/b test'],
                'survey': ['survey', 'questionnaire', 'interview', 'qualitative'],
                'simulation': ['simulation', 'model', 'monte carlo']
            }

            for paper in papers:
                content = (paper.get('title', '') + ' ' + paper.get('abstract', '')).lower()

                # Count methodologies
                for method, keywords in method_keywords.items():
                    if any(keyword in content for keyword in keywords):
                        methodologies[method] += 1

                # Identify data types
                if any(word in content for word in ['dataset', 'data', 'corpus']):
                    if 'text' in content or 'nlp' in content:
                        data_types['text'] += 1
                    elif 'image' in content or 'vision' in content:
                        data_types['image'] += 1
                    elif 'audio' in content or 'speech' in content:
                        data_types['audio'] += 1
                    else:
                        data_types['tabular'] += 1

                # Common evaluation metrics
                metrics = ['accuracy', 'precision', 'recall', 'f1', 'bleu', 'rouge', 'mae', 'mse']
                for metric in metrics:
                    if metric in content:
                        evaluation_metrics[metric] += 1

            return {
                'methodologies': dict(methodologies),
                'data_types': dict(data_types),
                'evaluation_metrics': dict(evaluation_metrics),
                'total_papers': len(papers),
                'analysis_timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'error': str(e),
                'analysis_timestamp': datetime.now().isoformat()
            }

    def generate_literature_review(self, project_id: str, review_type: str = "comprehensive") -> Dict:
        """Generate comprehensive literature review"""
        try:
            if project_id not in self.projects:
                return {'success': False, 'error': 'Project not found'}

            project = self.projects[project_id]

            if 'analyses' not in project:
                return {'success': False, 'error': 'No analyses found. Run analyze_literature first.'}

            print(f"📝 Generating literature review for project: {project['name']}")

            # Gather all analysis data
            analyses = project['analyses']
            papers = project['papers']

            # Generate comprehensive review
            review_prompt = f"""Generate a comprehensive literature review based on this research project:

Project: {project['name']}
Research Question: {project['research_question']}
Keywords: {', '.join(project['keywords'])}

Analysis Summary:
- Total Papers: {len(papers)}
- Thematic Analysis: {analyses.get('thematic', {}).get('analysis', 'N/A')[:500]}...
- Trend Analysis: {analyses.get('trends', {}).get('executive_summary', 'N/A')[:500]}...
- Gap Analysis: {analyses.get('gaps', {}).get('ai_analysis', 'N/A')[:500]}...

Generate a structured literature review with:
1. **Introduction** - Research context and objectives
2. **Literature Search Methodology** - How papers were identified
3. **Thematic Analysis** - Key themes and patterns
4. **Current State of Research** - What we know
5. **Research Gaps** - What's missing
6. **Future Directions** - Recommendations for future work
7. **Conclusion** - Summary and implications

Write in academic style, approximately 2000-3000 words."""

            literature_review = self.groq_processor.generate_response(review_prompt, max_tokens=4000)

            # Store review
            review_id = f"review_{int(time.time())}"
            review_data = {
                'id': review_id,
                'project_id': project_id,
                'review_type': review_type,
                'content': literature_review,
                'generated_at': datetime.now().isoformat(),
                'word_count': len(literature_review.split()),
                'papers_reviewed': len(papers)
            }

            self.literature_reviews[review_id] = review_data
            project['progress']['report_generation'] = 'completed'

            return {
                'success': True,
                'review_id': review_id,
                'review': review_data,
                'message': 'Literature review generated successfully'
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'message': 'Literature review generation failed'
            }

    def ask_research_question(self, project_id: str, question: str) -> Dict:
        """Ask questions about the research project using RAG"""
        try:
            if project_id not in self.projects:
                return {'success': False, 'error': 'Project not found'}

            project = self.projects[project_id]

            # Use RAG system to answer question
            if self.rag_system:
                response = self.rag_system.query(question, include_sources=True)

                # Add project context to response
                response['project_context'] = {
                    'project_name': project['name'],
                    'research_question': project['research_question'],
                    'total_papers': len(project.get('papers', [])),
                    'project_id': project_id
                }

                return {
                    'success': True,
                    'response': response
                }
            else:
                return {
                    'success': False,
                    'error': 'RAG system not available'
                }

        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def get_project_summary(self, project_id: str) -> Dict:
        """Get comprehensive project summary"""
        try:
            if project_id not in self.projects:
                return {'success': False, 'error': 'Project not found'}

            project = self.projects[project_id]

            # Calculate completion percentage
            progress = project['progress']
            completed_tasks = sum(1 for status in progress.values() if status == 'completed')
            total_tasks = len(progress)
            completion_percentage = (completed_tasks / total_tasks) * 100

            # Generate executive summary
            exec_summary_prompt = f"""Generate an executive summary for this research project:

Project: {project['name']}
Research Question: {project['research_question']}
Keywords: {', '.join(project['keywords'])}
Papers Found: {len(project.get('papers', []))}
Completion: {completion_percentage:.1f}%

Key Findings:
{project.get('analyses', {}).get('thematic', {}).get('analysis', 'Analysis pending')[:300]}...

Provide a concise 3-paragraph executive summary covering:
1. Project overview and objectives
2. Key findings and insights
3. Next steps and recommendations"""

            executive_summary = self.groq_processor.generate_response(exec_summary_prompt, max_tokens=1000)

            return {
                'success': True,
                'project_id': project_id,
                'project_name': project['name'],
                'executive_summary': executive_summary,
                'completion_percentage': completion_percentage,
                'progress': progress,
                'statistics': {
                    'papers_found': len(project.get('papers', [])),
                    'analyses_completed': len(project.get('analyses', {})),
                    'reviews_generated': len([r for r in self.literature_reviews.values() if r['project_id'] == project_id]),
                    'created_at': project['created_at']
                },
                'next_steps': self._get_next_steps(project)
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def _get_next_steps(self, project: Dict) -> List[str]:
        """Get next steps for a project"""
        next_steps = []
        progress = project['progress']

        if progress['literature_search'] == 'pending':
            next_steps.append('Conduct literature search')
        elif progress['paper_analysis'] == 'pending':
            next_steps.append('Analyze collected papers')
        elif progress['gap_analysis'] == 'pending':
            next_steps.append('Identify research gaps')
        elif progress['trend_analysis'] == 'pending':
            next_steps.append('Analyze research trends')
        elif progress['report_generation'] == 'pending':
            next_steps.append('Generate literature review')
        else:
            next_steps.append('Project completed - consider follow-up research')

        return next_steps

    def list_projects(self) -> Dict:
        """List all projects"""
        try:
            project_list = []
            for project_id, project in self.projects.items():
                progress = project['progress']
                completed_tasks = sum(1 for status in progress.values() if status == 'completed')
                total_tasks = len(progress)
                completion_percentage = (completed_tasks / total_tasks) * 100

                project_list.append({
                    'id': project_id,
                    'name': project['name'],
                    'research_question': project['research_question'],
                    'status': project['status'],
                    'completion_percentage': completion_percentage,
                    'papers_count': len(project.get('papers', [])),
                    'created_at': project['created_at']
                })

            return {
                'success': True,
                'projects': project_list,
                'total_projects': len(project_list)
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def get_system_status(self) -> Dict:
        """Get comprehensive system status"""
        try:
            # RAG system stats
            rag_stats = self.rag_system.get_stats() if self.rag_system else {'status': 'Not available'}

            # Citation network stats
            citation_stats = self.citation_analyzer.get_network_summary()

            # Project stats
            project_stats = {
                'total_projects': len(self.projects),
                'active_projects': len([p for p in self.projects.values() if p['status'] == 'active']),
                'completed_projects': len([p for p in self.projects.values() if p['status'] == 'completed']),
                'total_papers': sum(len(p.get('papers', [])) for p in self.projects.values()),
                'total_reviews': len(self.literature_reviews)
            }

            return {
                'system_status': 'operational',
                'components': {
                    'groq_processor': 'ready' if self.groq_processor else 'not available',
                    'rag_system': rag_stats.get('status', 'unknown'),
                    'arxiv_fetcher': 'ready' if self.arxiv_fetcher else 'not available',
                    'multi_source_collector': 'ready' if self.multi_source_collector else 'not available',
                    'pdf_processor': 'ready' if self.pdf_processor else 'not available',
                    'citation_analyzer': 'ready' if self.citation_analyzer else 'not available',
                    'trend_monitor': 'ready' if self.trend_monitor else 'not available'
                },
                'statistics': {
                    'projects': project_stats,
                    'rag_system': rag_stats,
                    'citation_network': citation_stats
                },
                'timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            return {
                'system_status': 'error',
                'error': str(e),
                'timestamp': datetime.now().isoformat()
            }

# Create the module-level assistant instance. Its constructor binds the
# component objects defined by earlier cells, and the ResearchMate facade
# below binds this name in turn.
research_assistant = AdvancedResearchAssistant()
print("✅ Advanced Research Assistant ready!")

✅ Advanced Research Assistant initialized!
✅ Advanced Research Assistant ready!


In [24]:
# ============================================================================
# UNIFIED RESEARCHMATE INTERFACE
# ============================================================================

class ResearchMate:
    """Unified interface for the ResearchMate AI Research Assistant"""

    def __init__(self):
        """Bind the shared assistant, record version/start time, print a banner."""
        self.assistant = research_assistant
        self.version = "2.0.0"
        self.initialized_at = datetime.now().isoformat()

        rule = "=" * 70
        banner_lines = (
            rule,
            "🚀 RESEARCHMATE - AI RESEARCH ASSISTANT",
            rule,
            f"Version: {self.version}",
            "Powered by: Groq Llama 3.3 70B",
            f"Initialized: {self.initialized_at}",
            rule,
            "✅ All systems operational!",
            rule,
        )
        for line in banner_lines:
            print(line)

    # =========================
    # QUICK START METHODS
    # =========================

    def quick_search(self, query: str, max_results: int = 10) -> Dict:
        """Quick search across all sources"""
        try:
            print(f"🔍 Quick search: '{query}'")
            results = self.assistant.multi_source_collector.search_all_sources(query, max_results)

            # Add to RAG system for immediate querying
            if results and self.assistant.rag_system:
                self.assistant.rag_system.add_documents(results)

            return {
                'success': True,
                'query': query,
                'results': results,
                'count': len(results),
                'message': f'Found {len(results)} papers'
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def ask(self, question: str) -> Dict:
        """Ask a question using RAG system"""
        try:
            if not self.assistant.rag_system:
                return {'success': False, 'error': 'RAG system not available'}

            response = self.assistant.rag_system.query(question, include_sources=True)
            return {
                'success': True,
                'question': question,
                'answer': response['answer'],
                'sources': response.get('sources', []),
                'timestamp': response.get('timestamp')
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def analyze_pdf(self, pdf_path: str) -> Dict:
        """Analyze a PDF file"""
        try:
            result = self.assistant.pdf_processor.process_pdf_file(pdf_path)
            return result
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def get_trends(self, topic: str) -> Dict:
        """Get research trends for a topic"""
        try:
            return self.assistant.trend_monitor.monitor_realtime_trends(topic)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def status(self) -> Dict:
        """Get system status"""
        try:
            return self.assistant.get_system_status()
        except Exception as e:
            return {'success': False, 'error': str(e)}

    # =========================
    # PROJECT MANAGEMENT
    # =========================

    def create_project(self, name: str, research_question: str, keywords: List[str]) -> Dict:
        """Create a new research project"""
        try:
            return self.assistant.create_research_project(name, research_question, keywords)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def list_projects(self) -> Dict:
        """List all projects"""
        try:
            return self.assistant.list_projects()
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def get_project(self, project_id: str) -> Dict:
        """Get project summary"""
        try:
            return self.assistant.get_project_summary(project_id)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def search_literature(self, project_id: str, max_papers: int = 10) -> Dict:
        """Search literature for a project"""
        try:
            return self.assistant.conduct_literature_search(project_id, max_papers)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def analyze_project(self, project_id: str) -> Dict:
        """Analyze project literature"""
        try:
            return self.assistant.analyze_literature(project_id)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def generate_review(self, project_id: str) -> Dict:
        """Generate literature review"""
        try:
            return self.assistant.generate_literature_review(project_id)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def ask_project(self, project_id: str, question: str) -> Dict:
        """Ask question about a specific project"""
        try:
            return self.assistant.ask_research_question(project_id, question)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    # =========================
    # ANALYSIS METHODS
    # =========================

    def analyze_citations(self, papers: List[Dict]) -> Dict:
        """Analyze citation network"""
        try:
            self.assistant.citation_analyzer.add_papers(papers)
            return self.assistant.citation_analyzer.get_network_summary()
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def detect_gaps(self, papers: List[Dict]) -> Dict:
        """Detect research gaps"""
        try:
            return self.assistant.trend_monitor.detect_research_gaps(papers)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def generate_report(self, papers: List[Dict]) -> Dict:
        """Generate comprehensive trend report"""
        try:
            return self.assistant.trend_monitor.generate_trend_report(papers)
        except Exception as e:
            return {'success': False, 'error': str(e)}

    # =========================
    # WORKFLOW METHODS
    # =========================

    def full_workflow(self, project_name: str, research_question: str, keywords: List[str]) -> Dict:
        """Complete research workflow"""
        try:
            print("🚀 Starting full research workflow...")

            # Step 1: Create project
            print("📝 Creating research project...")
            project_result = self.create_project(project_name, research_question, keywords)
            if not project_result['success']:
                return project_result

            project_id = project_result['project_id']

            # Step 2: Literature search
            print("🔍 Conducting literature search...")
            search_result = self.search_literature(project_id, max_papers=15)
            if not search_result['success']:
                return search_result

            # Step 3: Analysis
            print("📊 Analyzing literature...")
            analysis_result = self.analyze_project(project_id)
            if not analysis_result['success']:
                return analysis_result

            # Step 4: Generate review
            print("📝 Generating literature review...")
            review_result = self.generate_review(project_id)
            if not review_result['success']:
                return review_result

            # Step 5: Get final summary
            print("📋 Preparing final summary...")
            summary_result = self.get_project(project_id)

            return {
                'success': True,
                'project_id': project_id,
                'workflow_completed': True,
                'steps_completed': {
                    'project_creation': project_result['success'],
                    'literature_search': search_result['success'],
                    'analysis': analysis_result['success'],
                    'review_generation': review_result['success']
                },
                'final_summary': summary_result,
                'message': 'Full research workflow completed successfully!'
            }

        except Exception as e:
            return {'success': False, 'error': str(e)}

    def demo(self) -> Dict:
        """Run a scripted walkthrough of the main features, printing progress.

        Steps, in order: system status, quick search, question answering,
        project creation, literature search (only if the project was
        created), and real-time trend analysis. Each step's raw result is
        stored in ``demo_results`` under its own key.

        Returns:
            ``{'success': True, 'demo_completed': True, 'demo_results',
            'message'}`` whenever no exception escapes — individual step
            results are recorded but not checked, so a step that reported
            failure does not change the overall outcome. On an exception:
            ``{'success': False, 'error', 'message': 'Demo failed'}``.
        """
        try:
            print("🎯 ResearchMate Demo Starting...")
            print("=" * 50)

            # Raw result of every step, keyed by step name
            demo_results = {}

            # Demo 1: System Status
            print("1️⃣ System Status Check...")
            status_result = self.status()
            demo_results['system_status'] = status_result
            print(f"   ✅ System Status: {status_result.get('system_status', 'unknown')}")

            # Demo 2: Quick Search
            print("\n2️⃣ Quick Search Demo...")
            search_result = self.quick_search("transformer attention mechanism", max_results=5)
            demo_results['quick_search'] = search_result
            print(f"   ✅ Found {search_result.get('count', 0)} papers")

            # Demo 3: Ask Question
            print("\n3️⃣ Question Answering Demo...")
            qa_result = self.ask("What is attention mechanism in transformers?")
            demo_results['question_answering'] = qa_result
            # Only a successful answer is previewed; a failure prints nothing
            if qa_result['success']:
                print(f"   ✅ Answer: {qa_result['answer'][:100]}...")

            # Demo 4: Create Demo Project
            print("\n4️⃣ Project Creation Demo...")
            project_result = self.create_project(
                "Transformer Architecture Analysis",
                "How do attention mechanisms improve transformer performance?",
                ["transformer", "attention mechanism", "neural networks"]
            )
            demo_results['project_creation'] = project_result

            # The literature search needs a project id, so it is skipped
            # when project creation did not succeed
            if project_result['success']:
                project_id = project_result['project_id']
                print(f"   ✅ Project created: {project_id}")

                # Demo 5: Literature Search
                print("\n5️⃣ Literature Search Demo...")
                lit_search_result = self.search_literature(project_id, max_papers=3)
                demo_results['literature_search'] = lit_search_result
                print(f"   ✅ Found {lit_search_result.get('papers_found', 0)} papers")

            # Demo 6: Trend Analysis
            print("\n6️⃣ Trend Analysis Demo...")
            trend_result = self.get_trends("large language models")
            demo_results['trend_analysis'] = trend_result
            print(f"   ✅ Trend status: {trend_result.get('trend_status', 'unknown')}")

            print("\n" + "=" * 50)
            print("🎉 Demo completed successfully!")
            print("=" * 50)

            # NOTE(review): reported unconditionally; step failures above are
            # not reflected in 'success' or 'message'
            return {
                'success': True,
                'demo_completed': True,
                'demo_results': demo_results,
                'message': 'All demo components executed successfully!'
            }

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'message': 'Demo failed'
            }

    # =========================
    # UTILITY METHODS
    # =========================

    def help(self) -> str:
        """Show help information"""
        help_text = """
        🚀 RESEARCHMATE - AI RESEARCH ASSISTANT HELP
        ==========================================

        QUICK START METHODS:
        - quick_search(query, max_results=10) : Search all sources
        - ask(question) : Ask questions using RAG
        - analyze_pdf(pdf_path) : Analyze PDF files
        - get_trends(topic) : Get research trends
        - status() : Get system status

        PROJECT MANAGEMENT:
        - create_project(name, research_question, keywords) : Create new project
        - list_projects() : List all projects
        - get_project(project_id) : Get project summary
        - search_literature(project_id, max_papers=10) : Search literature
        - analyze_project(project_id) : Analyze project literature
        - generate_review(project_id) : Generate literature review
        - ask_project(project_id, question) : Ask project-specific questions

        ANALYSIS METHODS:
        - analyze_citations(papers) : Analyze citation networks
        - detect_gaps(papers) : Detect research gaps
        - generate_report(papers) : Generate trend reports

        WORKFLOW METHODS:
        - full_workflow(name, question, keywords) : Complete research workflow
        - demo() : Run comprehensive demonstration

        EXAMPLES:
        >>> rm = ResearchMate()
        >>> rm.demo()  # Run demo
        >>> rm.quick_search("machine learning")  # Quick search
        >>> rm.ask("What is deep learning?")  # Ask question
        >>> rm.create_project("My Research", "Research question?", ["keyword1", "keyword2"])
        """
        print(help_text)
        return help_text

    def __str__(self) -> str:
        """String representation"""
        return f"ResearchMate v{self.version} - AI Research Assistant powered by Groq Llama 3.3 70B"

    def __repr__(self) -> str:
        """Detailed representation"""
        return f"ResearchMate(version='{self.version}', initialized_at='{self.initialized_at}')"

# Build the shared ResearchMate instance that the following cells use
research_mate = ResearchMate()

# Emit the readiness banner and quick-start hints in a single call;
# sep="\n" puts every entry on its own line, exactly like separate prints.
print(
    "✅ ResearchMate unified interface ready!",
    "\n💡 Quick start:",
    "   research_mate.demo()  # Run comprehensive demo",
    "   research_mate.help()  # Show help",
    "   research_mate.quick_search('your topic')  # Quick search",
    "   research_mate.ask('your question')  # Ask questions",
    sep="\n",
)

🚀 RESEARCHMATE - AI RESEARCH ASSISTANT
Version: 2.0.0
Powered by: Groq Llama 3.3 70B
Initialized: 2025-07-08T17:54:17.016088
✅ All systems operational!
✅ ResearchMate unified interface ready!

💡 Quick start:
   research_mate.demo()  # Run comprehensive demo
   research_mate.help()  # Show help
   research_mate.quick_search('your topic')  # Quick search
   research_mate.ask('your question')  # Ask questions


In [25]:
# ============================================================================
# DEMO: COMPLETE RESEARCHMATE WORKFLOW
# ============================================================================

def run_complete_demo():
    """Run a complete, narrated demonstration of ResearchMate capabilities.

    Drives the module-level ``research_mate`` instance through six steps
    (system status, quick search, question answering, project creation with
    literature search and analysis, trend analysis, final statistics) and
    prints the outcome of each one. A step that reports failure is printed
    and the demo moves on to the next step.

    Returns:
        dict: ``{'success': True, 'demo_duration': <seconds>,
        'components_tested': [...], 'message': ...}`` when the run reaches
        the end, or ``{'success': False, 'error': ..., 'message': ...}`` if
        any call raises.
    """
    print("🚀 STARTING COMPLETE RESEARCHMATE DEMO")
    print("=" * 70)

    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by a system clock adjustment during the run.
        start_time = time.perf_counter()

        # Step 1: System Status
        print("1️⃣ SYSTEM STATUS CHECK")
        print("-" * 30)
        status = research_mate.status()
        if status.get('success'):
            # Component states are not uniformly cased (e.g. 'ready' and
            # 'Ready' both occur), so compare case-insensitively.
            ready_count = sum(
                1 for state in status.get('components', {}).values()
                if str(state).lower() == 'ready'
            )
            total_projects = (
                status.get('statistics', {})
                .get('projects', {})
                .get('total_projects', 0)
            )
            print(f"✅ System Status: {status.get('system_status', 'unknown')}")
            print(f"✅ Components Ready: {ready_count}")
            print(f"✅ Total Projects: {total_projects}")
        else:
            print(f"❌ System Error: {status.get('error', 'Unknown error')}")

        print()

        # Step 2: Quick Search Demo
        print("2️⃣ QUICK SEARCH DEMONSTRATION")
        print("-" * 30)
        search_query = "transformer attention mechanism"
        print(f"🔍 Searching for: '{search_query}'")

        search_result = research_mate.quick_search(search_query, max_results=5)
        if search_result.get('success'):
            papers = search_result.get('results') or []
            print(f"✅ Found {search_result.get('count', len(papers))} papers")
            for i, paper in enumerate(papers[:3], 1):
                # A key that is present but None bypasses dict.get's default,
                # so fall back explicitly before slicing or joining.
                title = paper.get('title') or 'Unknown Title'
                authors = paper.get('authors') or ['Unknown']
                print(f"   {i}. {title[:60]}...")
                print(f"      Authors: {', '.join(str(a) for a in authors[:2])}")
                print(f"      Source: {paper.get('source') or 'Unknown'}")
        else:
            print(f"❌ Search failed: {search_result.get('error', 'Unknown error')}")

        print()

        # Step 3: Question Answering Demo
        print("3️⃣ QUESTION ANSWERING DEMONSTRATION")
        print("-" * 30)
        question = "What is the attention mechanism in transformer models?"
        print(f"❓ Question: {question}")

        qa_result = research_mate.ask(question)
        if qa_result.get('success'):
            print(f"✅ Answer: {str(qa_result.get('answer', ''))[:200]}...")
            print(f"✅ Sources: {len(qa_result.get('sources') or [])} papers referenced")
        else:
            print(f"❌ QA failed: {qa_result.get('error', 'Unknown error')}")

        print()

        # Step 4: Project Creation and Management
        print("4️⃣ PROJECT MANAGEMENT DEMONSTRATION")
        print("-" * 30)
        project_name = "Transformer Architecture Research"
        research_question = "How do attention mechanisms improve model performance in transformers?"
        keywords = ["transformer", "attention mechanism", "neural networks", "deep learning"]

        print(f"📝 Creating project: '{project_name}'")
        project_result = research_mate.create_project(project_name, research_question, keywords)

        if project_result.get('success'):
            project_id = project_result['project_id']
            print(f"✅ Project created with ID: {project_id}")

            # Literature search (only meaningful once the project exists)
            print("🔍 Conducting literature search...")
            lit_result = research_mate.search_literature(project_id, max_papers=5)
            if lit_result.get('success'):
                print(f"✅ Found {lit_result.get('papers_found', 0)} papers for the project")

                # Project analysis runs on the papers collected just above
                print("📊 Analyzing project literature...")
                analysis_result = research_mate.analyze_project(project_id)
                if analysis_result.get('success'):
                    print("✅ Literature analysis completed")
                    print("   - Citation network analysis: ✅")
                    print("   - Trend analysis: ✅")
                    print("   - Gap analysis: ✅")
                    print("   - Thematic analysis: ✅")
                else:
                    print(f"❌ Analysis failed: {analysis_result.get('error', 'Unknown error')}")
            else:
                print(f"❌ Literature search failed: {lit_result.get('error', 'Unknown error')}")
        else:
            print(f"❌ Project creation failed: {project_result.get('error', 'Unknown error')}")

        print()

        # Step 5: Trend Analysis Demo
        print("5️⃣ TREND ANALYSIS DEMONSTRATION")
        print("-" * 30)
        trend_topic = "large language models"
        print(f"📈 Analyzing trends for: '{trend_topic}'")

        trend_result = research_mate.get_trends(trend_topic)
        # This step is judged by the presence of 'trend_status' rather than
        # a 'success' flag.
        if trend_result.get('trend_status'):
            print(f"✅ Trend Status: {trend_result['trend_status']}")
            print(f"✅ Papers Found: {trend_result.get('papers_found', 0)}")
            if 'ai_insights' in trend_result:
                print(f"✅ AI Insights: {trend_result['ai_insights'][:150]}...")
        else:
            print(f"❌ Trend analysis failed: {trend_result.get('error', 'Unknown error')}")

        print()

        # Step 6: System Statistics
        print("6️⃣ FINAL SYSTEM STATISTICS")
        print("-" * 30)
        final_status = research_mate.status()
        if final_status.get('success'):
            stats = final_status.get('statistics', {})
            total_authors = (
                stats.get('citation_network', {})
                .get('author_network', {})
                .get('network_metrics', {})
                .get('total_authors', 0)
            )
            print(f"✅ Total Projects: {stats.get('projects', {}).get('total_projects', 0)}")
            print(f"✅ RAG Documents: {stats.get('rag_system', {}).get('total_documents', 0)}")
            print(f"✅ Citation Network Authors: {total_authors}")
            print("✅ System Status: All components operational")

        # Calculate demo time
        demo_duration = time.perf_counter() - start_time

        print()
        print("🎉 DEMO COMPLETED SUCCESSFULLY!")
        print("=" * 70)
        print(f"⏱️  Demo Duration: {demo_duration:.2f} seconds")
        print("🚀 ResearchMate is ready for your research projects!")
        print("=" * 70)

        return {
            'success': True,
            'demo_duration': demo_duration,
            'components_tested': ['search', 'qa', 'projects', 'trends', 'analysis'],
            'message': 'Complete demo executed successfully'
        }

    except Exception as e:
        # Top-level boundary of the demo: report the failure and return it
        # as a result dict instead of propagating the exception.
        print(f"❌ DEMO FAILED: {str(e)}")
        return {
            'success': False,
            'error': str(e),
            'message': 'Demo execution failed'
        }

# Announce that the complete demo is available and list the quick-start
# commands. Nothing is executed here; one print call with sep="\n" writes
# each entry on its own line (an empty entry yields a blank line).
print(
    "🎯 Ready to run ResearchMate complete demo!",
    "💡 Execute: run_complete_demo() to see everything in action",
    "",
    "🚀 RESEARCHMATE IS READY!",
    "=" * 70,
    "📚 Quick Start Commands:",
    "   research_mate.demo()                    # Run built-in demo",
    "   research_mate.quick_search('topic')     # Quick paper search",
    "   research_mate.ask('question')           # Ask research questions",
    "   research_mate.help()                    # Show detailed help",
    "   run_complete_demo()                     # This comprehensive demo",
    "=" * 70,
    "",
    "🎉 Welcome to ResearchMate - Your AI Research Assistant!",
    "   Powered by Groq Llama 3.3 70B with advanced RAG capabilities",
    "   Ready to accelerate your research workflow!",
    sep="\n",
)

# Uncomment the line below to run the demo automatically
# run_complete_demo()

🎯 Ready to run ResearchMate complete demo!
💡 Execute: run_complete_demo() to see everything in action

🚀 RESEARCHMATE IS READY!
📚 Quick Start Commands:
   research_mate.demo()                    # Run built-in demo
   research_mate.quick_search('topic')     # Quick paper search
   research_mate.ask('question')           # Ask research questions
   research_mate.help()                    # Show detailed help
   run_complete_demo()                     # This comprehensive demo

🎉 Welcome to ResearchMate - Your AI Research Assistant!
   Powered by Groq Llama 3.3 70B with advanced RAG capabilities
   Ready to accelerate your research workflow!


In [26]:
 # Run the built-in demo; as the cell's last expression, the returned summary dict is echoed below.
 research_mate.demo()

🎯 ResearchMate Demo Starting...
1️⃣ System Status Check...
   ✅ System Status: operational

2️⃣ Quick Search Demo...
🔍 Quick search: 'transformer attention mechanism'
🔍 Searching all sources for: 'transformer attention mechanism'
🔍 Searching arXiv...
✅ Found 5 papers on arXiv
🔍 Searching Semantic Scholar...
✅ Found 5 papers on Semantic Scholar
🔍 Searching CrossRef...
✅ Found 5 papers on CrossRef
🔍 Searching PubMed...
✅ Found 5 papers on PubMed
✅ Total unique papers found: 20
✅ Added 15 document chunks to vectorstore
   ✅ Found 20 papers

3️⃣ Question Answering Demo...
   ✅ Answer: The attention mechanism in transformers is a computational model that is claimed to implement attent...

4️⃣ Project Creation Demo...
   ✅ Project created: proj_1751997279_3656

5️⃣ Literature Search Demo...
🔍 Conducting literature search for project: Transformer Architecture Analysis
🔍 Searching for keyword: transformer
🔍 Searching all sources for: 'transformer'
🔍 Searching arXiv...
✅ Found 3 papers on arXiv

{'success': True,
 'demo_completed': True,
 'demo_results': {'system_status': {'system_status': 'operational',
   'components': {'groq_processor': 'ready',
    'rag_system': 'Ready',
    'arxiv_fetcher': 'ready',
    'multi_source_collector': 'ready',
    'pdf_processor': 'ready',
    'citation_analyzer': 'ready',
    'trend_monitor': 'ready'},
   'statistics': {'projects': {'total_projects': 2,
     'active_projects': 2,
     'completed_projects': 0,
     'total_papers': 66,
     'total_reviews': 0},
    'rag_system': {'total_documents': 111,
     'embedding_model': 'sentence-transformers/all-MiniLM-L6-v2',
     'chunk_size': 2000,
     'chunk_overlap': 400,
     'vectorstore_type': 'ChromaDB',
     'status': 'Ready'},
    'citation_network': {'author_network': {'network_metrics': {'total_authors': 109,
       'total_collaborations': 288,
       'network_density': 0.04892966360856269,
       'number_of_components': 27,
       'largest_component_size': 13},
      'top_collaborators': [

In [18]:
 # Search all sources for 'attention' using the default result limit; the result dict is echoed below.
 research_mate.quick_search('attention')

🔍 Quick search: 'attention'
🔍 Searching all sources for: 'attention'
🔍 Searching arXiv...
✅ Found 10 papers on arXiv
🔍 Searching Semantic Scholar...
✅ Found 10 papers on Semantic Scholar
🔍 Searching CrossRef...
✅ Found 10 papers on CrossRef
🔍 Searching PubMed...
✅ Found 10 papers on PubMed
✅ Total unique papers found: 37
✅ Added 32 document chunks to vectorstore


{'success': True,
 'query': 'attention',
 'results': [{'title': 'Exploring Human-like Attention Supervision in Visual Question Answering',
   'authors': ['Tingting Qiao', 'Jianfeng Dong', 'Duanqing Xu'],
   'abstract': 'Attention mechanisms have been widely applied in the Visual Question\nAnswering (VQA) task, as they help to focus on the area-of-interest of both\nvisual and textual information. To answer the questions correctly, the model\nneeds to selectively target different areas of an image, which suggests that an\nattention-based model may benefit from an explicit attention supervision. In\nthis work, we aim to address the problem of adding attention supervision to VQA\nmodels. Since there is a lack of human attention data, we first propose a Human\nAttention Network (HAN) to generate human-like attention maps, training on a\nrecently released dataset called Human ATtention Dataset (VQA-HAT). Then, we\napply the pre-trained HAN on the VQA v2.0 dataset to automatically produce the