<a href="https://colab.research.google.com/github/akash1629/cross-domain/blob/main/Cross_domain_algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict, Counter
import json
import math
from typing import Dict, List, Tuple, Set
import requests
import time

class SemanticVectorSpace:
    """TF-IDF vector space over named concepts.

    Each concept is described by a short text; fitting the vectorizer on all
    descriptions yields one dense vector per concept, which is then compared
    with cosine similarity.
    """

    def __init__(self):
        self.concept_descriptions = {}
        self.concept_vectors = {}
        self.vectorizer = TfidfVectorizer(max_features=300, stop_words='english')

    def add_concept(self, concept: str, description: str = None):
        """Register a concept; a missing or empty description falls back to the name."""
        text = description if description else concept
        self.concept_descriptions[concept] = text

    def build_vector_space(self, concepts_with_descriptions: Dict[str, str] = None):
        """Fit TF-IDF on all known descriptions and store one vector per concept.

        Args:
            concepts_with_descriptions: Optional extra concept -> description
                entries merged into the registry before fitting.

        Raises:
            ValueError: If fewer than two concepts are registered.
        """
        if concepts_with_descriptions:
            self.concept_descriptions.update(concepts_with_descriptions)

        names = list(self.concept_descriptions)
        texts = [self.concept_descriptions[name] for name in names]

        if len(texts) < 2:
            raise ValueError("Need at least 2 concepts to build vector space")

        matrix = self.vectorizer.fit_transform(texts)

        # Row order of the matrix follows the order of the descriptions.
        for row, name in enumerate(names):
            self.concept_vectors[name] = matrix[row].toarray().flatten()

    def similarity(self, concept1: str, concept2: str) -> float:
        """Return the cosine similarity of two concepts.

        Unknown concepts and all-zero vectors yield 0.0.
        """
        vectors = self.concept_vectors
        if any(name not in vectors for name in (concept1, concept2)):
            return 0.0

        first = vectors[concept1]
        second = vectors[concept2]

        # Cosine distance is undefined for a zero vector, so treat it as unrelated.
        if not np.linalg.norm(first) or not np.linalg.norm(second):
            return 0.0

        return 1 - cosine(first, second)

    def get_neighborhood(self, concept: str, threshold: float = 0.65) -> Set[str]:
        """Return every other concept whose similarity strictly exceeds threshold."""
        if concept not in self.concept_vectors:
            return set()

        return {
            other
            for other in self.concept_vectors
            if other != concept and self.similarity(concept, other) > threshold
        }

class CitationDataAnalyzer:
    """Analyzes citation patterns and calculates novelty scores.

    Attributes set in ``__init__``:
        citation_data: concept -> citation count.
        concept_cooccurrence: order-independent concept pair -> co-occurrence
            count; keys are stored as sorted 2-tuples.
        max_frequency: largest citation count seen so far (never below 1).
    """

    def __init__(self):
        self.citation_data = {}
        self.concept_cooccurrence = defaultdict(int)
        self.max_frequency = 1

    def add_citation_data(self, concept_frequencies: Dict[str, int]):
        """Merge citation counts and keep ``max_frequency`` up to date."""
        self.citation_data.update(concept_frequencies)
        if concept_frequencies:
            self.max_frequency = max(max(concept_frequencies.values()), self.max_frequency)

    def calculate_novelty(self, concept: str) -> float:
        """Calculate a novelty score from inverse citation frequency.

        Concepts without citation data are treated as having 1% of the
        maximum frequency. The linear score ``1 - count / max`` is passed
        through a log scaling and clamped to ``[0, 1]``.
        """
        citation_count = self.citation_data.get(concept, self.max_frequency / 100)

        # Avoid division by zero (only reachable if max_frequency was reset externally)
        if self.max_frequency == 0:
            return 0.5

        novelty = 1.0 - (citation_count / self.max_frequency)
        # Apply logarithmic scaling to smooth the distribution
        novelty = 0.5 + 0.5 * (np.log1p(novelty * 9) / np.log(10))
        return max(0.0, min(1.0, novelty))

    def calculate_connection_novelty(self, concept1: str, concept2: str) -> float:
        """Calculate how novel a connection between two concepts is.

        Returns ``1 - cooccurrence / max_cooccurrence`` clamped to ``[0, 1]``.
        When no positive co-occurrence count is known, every connection is
        considered fully novel (1.0).
        """
        connection_key = tuple(sorted((concept1, concept2)))
        cooccurrence = self.concept_cooccurrence.get(connection_key, 0)

        max_cooccurrence = max(self.concept_cooccurrence.values(), default=0)
        # No data, or only zero counts: nothing has co-occurred, and dividing
        # by zero must be avoided.
        if max_cooccurrence <= 0:
            return 1.0

        # Higher novelty for less co-occurring concepts
        novelty = 1.0 - (cooccurrence / max_cooccurrence)
        return max(0.0, min(1.0, novelty))

    def add_cooccurrence_data(self, cooccurrence_data: Dict[Tuple[str, str], int]):
        """Add concept co-occurrence data.

        Pairs are order-independent: each key is normalized to a sorted tuple
        so that it matches the lookup in ``calculate_connection_novelty``. If
        both orderings of a pair are supplied, the later entry wins.
        """
        for pair, count in cooccurrence_data.items():
            self.concept_cooccurrence[tuple(sorted(pair))] = count

class CrossDomainConnector:
    """Main algorithm for discovering cross-domain connections.

    A "bridge" is a third concept scored by a weighted sum of its similarity
    to both domains and its novelty, minus a penalty for being commonly cited.
    """

    def __init__(self, vector_space: "SemanticVectorSpace", citation_analyzer: "CitationDataAnalyzer",
                 alpha: float = 0.35, beta: float = 0.35, gamma: float = 0.2,
                 delta: float = 0.1, threshold: float = 0.65):
        """
        Initialize the cross-domain connector

        Args:
            vector_space: Semantic vector space for concepts
            citation_analyzer: Citation data analyzer
            alpha: Weight for similarity to domain 1
            beta: Weight for similarity to domain 2
            gamma: Weight for novelty
            delta: Weight for commonality penalty
            threshold: Similarity threshold above which two domains count as
                directly connected
        """
        self.vector_space = vector_space
        self.citation_analyzer = citation_analyzer
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.delta = delta
        self.threshold = threshold

    def commonality(self, bridge: str, domain1: str, domain2: str) -> float:
        """Return the bridge's citation count relative to the maximum (penalty factor).

        ``domain1`` and ``domain2`` are accepted for interface symmetry but do
        not influence the result. Concepts without citation data score 0.
        """
        # Use citation frequency as proxy for commonality
        citation_count = self.citation_analyzer.citation_data.get(bridge, 0)
        max_citations = self.citation_analyzer.max_frequency

        if max_citations == 0:
            return 0.0

        return citation_count / max_citations

    def bridge_strength(self, bridge: str, domain1: str, domain2: str) -> float:
        """Calculate the strength of a bridge between two domains.

        ``alpha*sim(bridge, d1) + beta*sim(bridge, d2) + gamma*novelty
        - delta*commonality``, floored at 0.
        """
        sim1 = self.vector_space.similarity(bridge, domain1)
        sim2 = self.vector_space.similarity(bridge, domain2)
        novelty = self.citation_analyzer.calculate_novelty(bridge)
        commonality = self.commonality(bridge, domain1, domain2)

        strength = (self.alpha * sim1 +
                    self.beta * sim2 +
                    self.gamma * novelty -
                    self.delta * commonality)

        return max(0.0, strength)

    def identify_bridges(self, domain1: str, domain2: str, top_n: int = 10) -> List[Tuple[str, float]]:
        """Identify the top bridges between two domains.

        Returns up to ``top_n`` ``(concept, strength)`` pairs, strongest
        first. Equal strengths keep the order in which concepts were added to
        the vector space, so results are reproducible between runs. Returns an
        empty list if either domain is unknown.
        """
        vectors = self.vector_space.concept_vectors
        if domain1 not in vectors or domain2 not in vectors:
            return []

        similarity = self.vector_space.similarity
        others = [concept for concept in vectors if concept not in (domain1, domain2)]

        # Keep concepts with at least weak similarity to either domain. A list
        # (not a set) preserves insertion order, which makes tie-breaking in
        # the stable sort below deterministic despite string hash randomization.
        candidates = [
            concept for concept in others
            if similarity(concept, domain1) > 0.1 or similarity(concept, domain2) > 0.1
        ]

        # If no candidates found, include all concepts
        if not candidates:
            candidates = others

        bridge_scores = [
            (candidate, self.bridge_strength(candidate, domain1, domain2))
            for candidate in candidates
        ]

        # list.sort is stable, also with reverse=True
        bridge_scores.sort(key=lambda item: item[1], reverse=True)
        return bridge_scores[:top_n]

    def _build_path(self, domain1: str, domain2: str,
                    bridges: List[Tuple[str, float]] = None) -> List[str]:
        """Build a path from precomputed bridges, computing them only if not given."""
        vectors = self.vector_space.concept_vectors
        if domain1 not in vectors or domain2 not in vectors:
            return []

        # Direct connection
        if self.vector_space.similarity(domain1, domain2) > self.threshold:
            return [domain1, domain2]

        if bridges is None:
            bridges = self.identify_bridges(domain1, domain2, top_n=5)

        if not bridges:
            return []

        # Path through the strongest bridge
        return [domain1, bridges[0][0], domain2]

    def find_connection_path(self, domain1: str, domain2: str, max_path_length: int = 3) -> List[str]:
        """Find a connection path between two domains through bridge concepts.

        Returns ``[domain1, domain2]`` when the domains are directly similar
        above ``threshold``, otherwise ``[domain1, best_bridge, domain2]``, or
        an empty list when a domain is unknown or no bridge exists.

        NOTE: ``max_path_length`` is accepted for interface compatibility but
        is not consulted; paths never contain more than three concepts.
        """
        return self._build_path(domain1, domain2)

    def explain_connection(self, domain1: str, domain2: str) -> Dict:
        """Provide a detailed explanation of the connection between domains.

        Returns a dict with keys ``domain1``, ``domain2``,
        ``direct_similarity``, ``top_bridges`` (up to 5), ``connection_path``
        and ``analysis`` (details of the strongest bridge, or empty).
        """
        bridges = self.identify_bridges(domain1, domain2, top_n=5)
        # Reuse the bridges computed above instead of searching a second time.
        path = self._build_path(domain1, domain2, bridges)

        explanation = {
            'domain1': domain1,
            'domain2': domain2,
            'direct_similarity': self.vector_space.similarity(domain1, domain2),
            'top_bridges': bridges,
            'connection_path': path,
            'analysis': {}
        }

        if bridges:
            top_bridge, top_strength = bridges[0]
            explanation['analysis'] = {
                'strongest_bridge': top_bridge,
                'bridge_to_domain1_similarity': self.vector_space.similarity(top_bridge, domain1),
                'bridge_to_domain2_similarity': self.vector_space.similarity(top_bridge, domain2),
                'bridge_novelty': self.citation_analyzer.calculate_novelty(top_bridge),
                'bridge_strength': top_strength
            }

        return explanation

class ResearchDataLoader:
    """Provides a built-in sample dataset of concepts and citation counts."""

    def __init__(self):
        self.sample_citations = {}
        self.sample_concepts = {}

    def load_sample_data(self):
        """Populate and return ``(sample_concepts, sample_citations)``.

        ``sample_concepts`` maps a concept name to a keyword-style description
        (written with deliberately overlapping vocabulary); ``sample_citations``
        maps the same names to illustrative citation counts.
        """
        # Concept descriptions, written with overlapping vocabulary
        concepts = {
            'intuition': 'rapid unconscious judgment decision making gut feeling implicit knowledge tacit understanding automatic processing fast thinking heuristic cognitive shortcuts pattern recognition without deliberation subconscious awareness instinctive knowing',
            'consciousness': 'awareness subjective experience phenomenal states self-reflection qualia mental states conscious experience subjective awareness reflective thinking deliberate cognition explicit knowledge mindful attention wakeful awareness',
            'predictive_processing': 'brain prediction error minimization bayesian inference neural prediction mechanisms anticipatory processing expectation generation predictive coding cognitive prediction conscious prediction unconscious expectation',
            'metacognition': 'thinking about thinking meta-awareness cognitive monitoring knowledge about knowledge metacognitive awareness conscious reflection cognitive control executive awareness self-monitoring reflective consciousness',
            'fringe_consciousness': 'peripheral awareness william james consciousness fringe marginal conscious experience vague awareness implicit consciousness borderline awareness tacit knowing intuitive awareness subliminal consciousness',
            'system_1': 'kahneman fast thinking automatic processing intuitive judgment rapid cognition unconscious processing heuristic thinking gut reaction immediate judgment implicit decision making fast cognitive processing',
            'global_workspace': 'baars consciousness global workspace theory information integration awareness conscious access unified conscious experience cognitive integration conscious synthesis attention consciousness',
            'embodied_cognition': 'body mind interaction physical experience cognitive processing bodily states somatic awareness interoceptive consciousness bodily intuition gut feelings physical cognition embodied awareness',
            'default_mode_network': 'brain default network resting state neural activity mind wandering introspection self-referential thinking spontaneous cognition unconscious processing automatic brain activity consciousness rest state',
            'interoception': 'internal bodily signals awareness visceral perception internal sensations body awareness gut feelings somatic consciousness bodily intuition physiological awareness embodied consciousness visceral knowing',
            'attention': 'cognitive attention focus selective attention awareness concentration mental focus conscious attention deliberate focusing cognitive control executive attention mindful awareness attentional consciousness',
            'memory': 'episodic memory working memory long term memory recall recognition cognitive memory conscious memory unconscious memory implicit memory explicit memory memory awareness recollection',
            'emotion': 'emotional processing feelings affective states limbic system emotional cognition emotional awareness conscious emotion unconscious emotion gut feelings emotional intuition affective consciousness',
            'perception': 'sensory perception visual processing auditory processing perceptual awareness sensory cognition conscious perception unconscious perception implicit perception perceptual consciousness sensory awareness',
            'language': 'linguistic processing speech comprehension language production verbal cognition communication conscious language unconscious language implicit linguistic knowledge verbal awareness linguistic consciousness',
            'decision_making': 'choice selection judgment decision processes cognitive decisions rational choice conscious decision unconscious decision intuitive choice deliberate choice decision awareness judgment processes',
            'learning': 'cognitive learning skill acquisition knowledge acquisition adaptive learning neural plasticity conscious learning unconscious learning implicit learning explicit learning learning awareness cognitive adaptation',
            'creativity': 'creative thinking divergent thinking innovation artistic creativity problem solving creativity conscious creativity unconscious creativity creative intuition creative insight spontaneous creativity creative awareness',
            'social_cognition': 'theory of mind social understanding interpersonal cognition social awareness empathy social intuition conscious social processing unconscious social cues social consciousness interpersonal awareness',
            'executive_function': 'cognitive control working memory inhibitory control cognitive flexibility planning conscious control executive awareness cognitive monitoring deliberate control executive consciousness controlled processing',
        }

        # Illustrative citation frequencies (normally from academic databases),
        # listed from most to least cited
        citations = dict(
            consciousness=15420,
            attention=12890,
            memory=11650,
            emotion=9870,
            perception=8940,
            decision_making=7650,
            learning=7230,
            intuition=3450,
            metacognition=2890,
            creativity=2650,
            social_cognition=2340,
            executive_function=2120,
            embodied_cognition=1890,
            language=1650,
            global_workspace=890,
            system_1=670,
            default_mode_network=560,
            predictive_processing=450,
            interoception=340,
            fringe_consciousness=120,
        )

        self.sample_concepts = concepts
        self.sample_citations = citations
        return concepts, citations

def main_demo():
    """Run the sample-data walkthrough of the cross-domain algorithm.

    Builds the vector space and citation analyzer from the bundled sample
    data, explains the 'intuition' / 'consciousness' pair in detail, lists
    bridges for a few more pairs, and prints a handful of raw similarities.
    All results are written to stdout.
    """
    def pretty(name):
        # Concept identifiers use underscores; show them as words.
        return name.replace('_', ' ')

    print("Cross-Domain Connection Algorithm Demo")
    print("=" * 50)

    # Set up the components and feed them the sample data
    space = SemanticVectorSpace()
    analyzer = CitationDataAnalyzer()
    loader = ResearchDataLoader()

    concepts, citations = loader.load_sample_data()
    print(f"Loaded {len(concepts)} concepts for analysis...")

    space.build_vector_space(concepts)
    analyzer.add_citation_data(citations)
    print("Vector space built successfully!")

    connector = CrossDomainConnector(space, analyzer)

    # Main case: intuition vs. consciousness
    print("\nAnalyzing connection between 'intuition' and 'consciousness'")
    print("-" * 50)

    report = connector.explain_connection('intuition', 'consciousness')
    print(f"Direct similarity: {report['direct_similarity']:.3f}")

    path = report['connection_path']
    if not path:
        print("No direct connection path found")
    else:
        print(f"Connection path: {' -> '.join(path)}")

    print("\nTop 5 Bridge Concepts:")

    top_bridges = report['top_bridges']
    if not top_bridges:
        print("No bridges found")
    for rank, (name, score) in enumerate(top_bridges, 1):
        print(f"{rank}. {pretty(name)}: {score:.3f}")

    details = report['analysis']
    if details:
        print(f"\nDetailed Analysis of Strongest Bridge: '{pretty(details['strongest_bridge'])}'")
        print(f"  Similarity to intuition: {details['bridge_to_domain1_similarity']:.3f}")
        print(f"  Similarity to consciousness: {details['bridge_to_domain2_similarity']:.3f}")
        print(f"  Novelty score: {details['bridge_novelty']:.3f}")
        print(f"  Overall bridge strength: {details['bridge_strength']:.3f}")

    # A few more pairs, summarized by their three best bridges
    print("\nAdditional Domain Pair Analysis:")
    print("-" * 30)

    extra_pairs = (
        ('creativity', 'memory'),
        ('emotion', 'decision_making'),
        ('attention', 'learning'),
    )
    for left, right in extra_pairs:
        found = connector.identify_bridges(left, right, top_n=3)
        direct = connector.vector_space.similarity(left, right)
        print(f"\n{pretty(left)} <-> {pretty(right)} (direct similarity: {direct:.3f}):")
        if not found:
            print("  No bridges found")
            continue
        for rank, (name, score) in enumerate(found, 1):
            print(f"  {rank}. {pretty(name)}: {score:.3f}")

    # Upper triangle of a small similarity matrix, for debugging
    print("\nSample Similarity Matrix (for debugging):")
    print("-" * 40)
    probes = ['intuition', 'consciousness', 'metacognition', 'system_1', 'fringe_consciousness']
    for index, first in enumerate(probes):
        for second in probes[index + 1:]:
            value = connector.vector_space.similarity(first, second)
            print(f"{first} <-> {second}: {value:.3f}")

# Run the sample-data demo only when this cell/file is executed as the main
# script, not when its definitions are imported from elsewhere.
if __name__ == "__main__":
    main_demo()

Cross-Domain Connection Algorithm Demo
Loaded 20 concepts for analysis...
Vector space built successfully!

Analyzing connection between 'intuition' and 'consciousness'
--------------------------------------------------
Direct similarity: 0.067
Connection path: intuition -> system_1 -> consciousness

Top 5 Bridge Concepts:
1. system 1: 0.400
2. metacognition: 0.337
3. fringe consciousness: 0.303
4. embodied cognition: 0.273
5. global workspace: 0.249

Detailed Analysis of Strongest Bridge: 'system 1'
  Similarity to intuition: 0.529
  Similarity to consciousness: 0.058
  Novelty score: 0.991
  Overall bridge strength: 0.400

Additional Domain Pair Analysis:
------------------------------

creativity <-> memory (direct similarity: 0.008):
  1. executive function: 0.260

emotion <-> decision making (direct similarity: 0.014):
  1. system 1: 0.301
  2. embodied cognition: 0.249
  3. intuition: 0.245

attention <-> learning (direct similarity: 0.033):
  1. executive function: 0.291
  2. gl

In [None]:
import requests
import json
import time
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
import re

@dataclass
class ResearchPaper:
    """Represents a research paper with metadata.

    Instances are created by ``EnhancedSemanticScholarAPI`` from search
    results and consumed by ``RealDataProcessor``.
    """
    # Paper title (whitespace-stripped by the API client).
    title: str
    # Abstract text (whitespace-stripped by the API client).
    abstract: str
    # Author display names.
    authors: List[str]
    # Publication year; the API client stores 0 when the year is missing.
    year: int
    # Citation count; the API client stores 0 when the count is missing.
    citations: int
    # Field-of-study categories (filled from ``s2FieldsOfStudy`` by the API client).
    keywords: List[str]
    # DOI, if known; the API client in this file does not populate it.
    doi: Optional[str] = None
    # Publication venue, if known.
    venue: Optional[str] = None

class EnhancedSemanticScholarAPI:
    """Semantic Scholar paper-search client with pagination and bounded retries."""

    # Maximum number of times a single page request is retried after HTTP 429.
    MAX_RATE_LIMIT_RETRIES = 5

    def __init__(self, api_key: Optional[str] = None):
        """Create a client; ``api_key`` is sent as the ``x-api-key`` header if given."""
        self.base_url = "https://api.semanticscholar.org"
        self.headers = {'User-Agent': 'CrossDomainConnector/1.0'}
        if api_key:
            self.headers['x-api-key'] = api_key
        self.rate_limit_delay = 0.1  # 100ms between requests

    def search_papers_batch(self, query: str, total_limit: int = 500) -> List["ResearchPaper"]:
        """Search for papers with pagination support.

        Fetches pages of up to 100 results until ``total_limit`` usable papers
        have been collected, the API runs out of results, or a request fails.

        Args:
            query: Free-text search query.
            total_limit: Maximum number of papers to return.

        Returns:
            The usable papers found (possibly fewer than ``total_limit``).
        """
        all_papers = []
        batch_size = 100
        offset = 0

        print(f"Searching for papers on: '{query}'")

        while len(all_papers) < total_limit:
            requested = min(batch_size, total_limit - len(all_papers))
            batch, raw_count = self._fetch_page(query, requested, offset)

            # Nothing came back (end of results or request failure).
            if raw_count == 0:
                break

            all_papers.extend(batch)
            # Advance by the number of results the API returned, not by the
            # number that survived filtering; otherwise the next page overlaps
            # this one and filtered-out papers are fetched again.
            offset += raw_count

            print(f"  Retrieved {len(all_papers)} papers so far...")

            # A short page means the API has no further results.
            if raw_count < requested:
                break

            time.sleep(self.rate_limit_delay)

        print(f"Total papers found: {len(all_papers)}")
        return all_papers

    def _search_single_batch(self, query: str, limit: int, offset: int) -> List["ResearchPaper"]:
        """Search a single page and return only the usable papers."""
        papers, _ = self._fetch_page(query, limit, offset)
        return papers

    def _fetch_page(self, query: str, limit: int, offset: int) -> Tuple[List["ResearchPaper"], int]:
        """Fetch one result page.

        Returns:
            ``(papers, raw_count)`` where ``papers`` are the results that
            passed the content filter and ``raw_count`` is the number of
            results the API returned for this page. ``([], 0)`` on failure.
        """
        url = f"{self.base_url}/graph/v1/paper/search"

        params = {
            'query': query,
            'limit': min(limit, 100),
            'offset': offset,
            'fields': 'title,abstract,authors,year,citationCount,s2FieldsOfStudy,venue'
        }

        try:
            # Bounded retry loop: unbounded recursion on 429 could never
            # terminate while the API keeps rate limiting.
            for attempt in range(self.MAX_RATE_LIMIT_RETRIES + 1):
                response = requests.get(url, params=params, headers=self.headers, timeout=30)
                if response.status_code != 429:
                    break
                if attempt == self.MAX_RATE_LIMIT_RETRIES:
                    print("  Rate limited too many times, giving up on this batch")
                    return [], 0
                print("  Rate limited, waiting...")
                time.sleep(2 * (2 ** attempt))  # 2s, 4s, 8s, ...

            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # decode error is not a RequestException.
            print(f"  Error in batch search: {e}")
            return [], 0

        raw_items = data.get('data') or []
        papers = []
        for paper_data in raw_items:
            paper = self._parse_paper(paper_data)
            if paper is not None:
                papers.append(paper)

        return papers, len(raw_items)

    @staticmethod
    def _parse_paper(paper_data: dict) -> Optional["ResearchPaper"]:
        """Convert one API result into a ``ResearchPaper``.

        Returns ``None`` for results without meaningful content (title of at
        most 10 characters or abstract of at most 100 characters). JSON
        ``null`` values are treated like missing fields.
        """
        title = (paper_data.get('title') or '').strip()
        abstract = (paper_data.get('abstract') or '').strip()

        # Only include papers with meaningful content
        if len(title) <= 10 or len(abstract) <= 100:
            return None

        return ResearchPaper(
            title=title,
            abstract=abstract,
            authors=[(author.get('name') or '') for author in (paper_data.get('authors') or [])],
            year=paper_data.get('year', 0) or 0,
            citations=paper_data.get('citationCount', 0) or 0,
            keywords=[(field.get('category') or '') for field in (paper_data.get('s2FieldsOfStudy') or [])],
            venue=paper_data.get('venue', '')
        )

class RealDataProcessor:
    """Process real academic data for the cross-domain algorithm."""

    # Words dropped when turning a title/keyword into a concept name.
    # Built once here instead of on every _create_concept_name call.
    _CONCEPT_STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'study', 'analysis', 'research',
        'investigation',
    })

    def __init__(self, api_key: Optional[str] = None):
        self.api = EnhancedSemanticScholarAPI(api_key)

    def collect_domain_data(self, domain_queries: List[str], papers_per_query: int = 100) -> List["ResearchPaper"]:
        """Collect papers for several queries and return them de-duplicated by title."""
        all_papers = []

        for query in domain_queries:
            papers = self.api.search_papers_batch(query, papers_per_query)
            all_papers.extend(papers)
            time.sleep(0.5)  # Be nice to the API

        # Deduplicate
        unique_papers = self._deduplicate_papers(all_papers)
        print(f"After deduplication: {len(unique_papers)} unique papers")

        return unique_papers

    def _deduplicate_papers(self, papers: List["ResearchPaper"]) -> List["ResearchPaper"]:
        """Remove papers whose lower-cased, punctuation-free title was already seen.

        The first occurrence is kept; papers with an empty normalized title
        are dropped.
        """
        seen_titles = set()
        unique_papers = []

        for paper in papers:
            title_normalized = re.sub(r'[^\w\s]', '', paper.title.lower()).strip()
            if title_normalized and title_normalized not in seen_titles:
                seen_titles.add(title_normalized)
                unique_papers.append(paper)

        return unique_papers

    def extract_concepts_and_descriptions(self, papers: List["ResearchPaper"]) -> Dict[str, str]:
        """Extract concept -> description entries for the vector space.

        Two kinds of concepts are produced:

        * one per paper, named after its title and described by the first
          500 characters of its abstract (abstracts of at most 100 characters
          are skipped);
        * one per keyword that occurs in at least two *different* papers,
          described by up to three 200-character abstract excerpts.

        On a name collision the later entry wins; keyword concepts are added
        after title concepts.
        """
        concepts = {}
        keyword_descriptions = defaultdict(list)

        for paper in papers:
            if len(paper.abstract) > 100:
                concept_name = self._create_concept_name(paper.title)
                if concept_name:
                    concepts[concept_name] = paper.abstract[:500]

            # A paper may list the same keyword more than once; count it only
            # once per paper so "appears in multiple papers" really means
            # multiple papers and the description does not repeat an abstract.
            seen_in_paper = set()
            for keyword in paper.keywords:
                if not keyword or len(keyword) <= 3:
                    continue
                clean_keyword = self._create_concept_name(keyword)
                if clean_keyword and clean_keyword not in seen_in_paper:
                    seen_in_paper.add(clean_keyword)
                    keyword_descriptions[clean_keyword].append(paper.abstract[:200])

        # Add keyword concepts
        for keyword, abstracts in keyword_descriptions.items():
            if len(abstracts) >= 2:  # Only include keywords appearing in multiple papers
                concepts[keyword] = ' '.join(abstracts[:3])

        print(f"Extracted {len(concepts)} concepts from papers")
        return concepts

    def _create_concept_name(self, text: str) -> str:
        """Create a clean concept name from free text.

        Lower-cases the text, replaces punctuation with spaces, drops stop
        words and words of at most two characters, and joins the first five
        remaining words with underscores. Returns ``""`` when the result would
        be at most three characters long.
        """
        if not text:
            return ""

        # Clean and normalize
        clean = re.sub(r'[^\w\s]', ' ', text.lower())
        clean = re.sub(r'\s+', '_', clean.strip())

        words = [
            w for w in clean.split('_')
            if w not in self._CONCEPT_STOP_WORDS and len(w) > 2
        ]

        result = '_'.join(words[:5])  # Limit length
        return result if len(result) > 3 else ""

    def calculate_citation_metrics(self, papers: List["ResearchPaper"]) -> Dict[str, int]:
        """Map each paper's title-derived concept name to its citation count.

        If several papers map to the same concept name, the last one wins.
        """
        citation_data = {}

        for paper in papers:
            concept_name = self._create_concept_name(paper.title)
            if concept_name:
                citation_data[concept_name] = paper.citations

        return citation_data

# Integration with existing algorithm
from __main__ import SemanticVectorSpace, CitationDataAnalyzer, CrossDomainConnector

def run_real_data_demo(api_key: Optional[str] = None):
    """Run the end-to-end demo on papers fetched from Semantic Scholar.

    Collects papers for intuition- and consciousness-related queries, derives
    concepts and citation counts from them, builds the vector space and
    connector, and prints the bridges between the first intuition-like and
    the first consciousness-like concept found.

    Args:
        api_key: Optional Semantic Scholar API key passed to the processor.

    All output goes to stdout. Unexpected errors are reported together with
    troubleshooting tips instead of being raised.
    """
    print("🚀 REAL DATA CROSS-DOMAIN CONNECTION DEMO")
    print("=" * 60)

    # Initialize processor
    processor = RealDataProcessor(api_key)

    # Define domain queries for comprehensive coverage
    intuition_queries = [
        "intuition psychology decision making",
        "implicit cognition gut feeling",
        "tacit knowledge unconscious processing"
    ]

    consciousness_queries = [
        "consciousness awareness neuroscience",
        "phenomenal consciousness subjective experience",
        "conscious perception attention"
    ]

    try:
        print("\n📚 COLLECTING REAL RESEARCH DATA...")
        print("-" * 40)

        # Collect papers for both domains
        print("Collecting intuition-related papers...")
        intuition_papers = processor.collect_domain_data(intuition_queries, papers_per_query=50)

        print("\nCollecting consciousness-related papers...")
        consciousness_papers = processor.collect_domain_data(consciousness_queries, papers_per_query=50)

        # Combine datasets
        all_papers = intuition_papers + consciousness_papers

        # Without papers the statistics below are undefined (mean of nothing,
        # min()/max() of an empty sequence), so stop with a clear message.
        if not all_papers:
            print("⚠️ No papers were retrieved; nothing to analyze")
            return

        print(f"\n📊 DATASET SUMMARY:")
        print(f"  Total papers: {len(all_papers)}")
        print(f"  Average citations: {np.mean([p.citations for p in all_papers]):.1f}")
        # Year 0 marks a missing year; every paper may lack one.
        known_years = [p.year for p in all_papers if p.year > 0]
        if known_years:
            print(f"  Year range: {min(known_years)} - {max(known_years)}")
        else:
            print("  Year range: unknown")

        # Extract concepts
        print(f"\n🧠 BUILDING KNOWLEDGE GRAPH...")
        print("-" * 40)
        concepts = processor.extract_concepts_and_descriptions(all_papers)
        citation_data = processor.calculate_citation_metrics(all_papers)

        # The vector space needs at least two concepts to be built.
        if len(concepts) < 2:
            print("⚠️ Fewer than 2 concepts extracted; cannot build the vector space")
            return

        # Build enhanced algorithm components
        vector_space = SemanticVectorSpace()
        vector_space.build_vector_space(concepts)

        citation_analyzer = CitationDataAnalyzer()
        citation_analyzer.add_citation_data(citation_data)

        connector = CrossDomainConnector(vector_space, citation_analyzer)

        print(f"✅ Knowledge graph built with {len(concepts)} concepts")

        # Run enhanced analysis
        print(f"\n🔍 ENHANCED CROSS-DOMAIN ANALYSIS")
        print("-" * 40)

        # Find concepts whose names mention our target domains
        intuition_concepts = [c for c in concepts if 'intuition' in c or 'implicit' in c or 'tacit' in c]
        consciousness_concepts = [c for c in concepts if 'consciousness' in c or 'awareness' in c or 'conscious' in c]

        print(f"Found {len(intuition_concepts)} intuition-related concepts")
        print(f"Found {len(consciousness_concepts)} consciousness-related concepts")

        if intuition_concepts and consciousness_concepts:
            # Use the first match of each domain as its representative
            best_intuition = intuition_concepts[0]
            best_consciousness = consciousness_concepts[0]

            print(f"\nAnalyzing: {best_intuition} <-> {best_consciousness}")

            explanation = connector.explain_connection(best_intuition, best_consciousness)

            print(f"\n🎯 RESULTS FROM REAL DATA:")
            print(f"  Direct similarity: {explanation['direct_similarity']:.3f}")
            print(f"  Connection path: {' -> '.join(explanation['connection_path'])}")

            print(f"\n🌉 TOP BRIDGE CONCEPTS (from real research):")
            for i, (bridge, strength) in enumerate(explanation['top_bridges'][:7], 1):
                # Find papers that mention this bridge concept
                bridge_phrase = bridge.replace('_', ' ')
                related_papers = [
                    p for p in all_papers
                    if bridge_phrase in p.title.lower() or bridge_phrase in p.abstract.lower()
                ]
                paper_count = len(related_papers)
                avg_citations = np.mean([p.citations for p in related_papers]) if related_papers else 0

                print(f"  {i}. {bridge_phrase.title()}")
                print(f"     Strength: {strength:.3f} | Papers: {paper_count} | Avg Citations: {avg_citations:.0f}")

            # Show sample papers for validation
            print(f"\n📄 SAMPLE PAPERS FROM DATASET:")
            high_citation_papers = sorted(all_papers, key=lambda p: p.citations, reverse=True)[:3]
            for i, paper in enumerate(high_citation_papers, 1):
                print(f"  {i}. \"{paper.title}\" ({paper.year})")
                print(f"     Citations: {paper.citations} | Authors: {', '.join(paper.authors[:2])}")
                print(f"     Abstract: {paper.abstract[:100]}...")
                print()

        else:
            print("⚠️ Could not find sufficient domain-specific concepts in the dataset")
            print("This might indicate:")
            print("  - Need for more targeted search queries")
            print("  - API rate limiting affecting results")
            print("  - Need for domain-specific preprocessing")

    except Exception as e:
        # Top-level boundary of the demo: report instead of crashing the cell.
        print(f"❌ Error in real data processing: {e}")
        print("\nTroubleshooting tips:")
        print("  1. Check internet connection")
        print("  2. Verify API key if provided")
        print("  3. Try reducing papers_per_query parameter")
        print("  4. Check if Semantic Scholar API is accessible")

def demo_without_api():
    """Print a placeholder walkthrough for users who have no API key.

    No data is fetched; the function only writes messages to stdout and
    pauses for one second to mimic a collection step.
    """
    opening_lines = (
        "🔬 DEMO MODE (No API Key Required)",
        "=" * 50,
        "This demo simulates real data collection...",
        "To use real data, get a free Semantic Scholar API key!",
        "\nSimulating data collection...",
    )
    for line in opening_lines:
        print(line)

    time.sleep(1)
    print("✅ Demo completed! Get API key for real results.")

if __name__ == "__main__":
    # Interactive entry point: ask for an API key and pick the matching demo.
    print("Cross-Domain Connection Algorithm - Real Data Integration")
    print("=" * 60)

    api_key = input("Enter your Semantic Scholar API key (or press Enter to skip): ").strip()

    if not api_key:
        print("ℹ️ No API key - running demo mode")
        demo_without_api()

        # Explain how to get real results and what they would contain
        follow_up_lines = (
            "\n🔑 To get real results:",
            "   1. Get free API key: https://www.semanticscholar.org/product/api",
            "   2. Run: run_real_data_demo('your_api_key_here')",
            "\n📊 EXPECTED REAL DATA RESULTS:",
            "   - 100-500 research papers per domain",
            "   - 200-1000 extracted concepts",
            "   - Real citation frequencies",
            "   - Novel bridge discoveries from actual research",
        )
        for line in follow_up_lines:
            print(line)
    else:
        print("✅ API key provided - running with real data!")
        run_real_data_demo(api_key)

Cross-Domain Connection Algorithm - Real Data Integration
Enter your Semantic Scholar API key (or press Enter to skip): 
ℹ️ No API key - running demo mode
🔬 DEMO MODE (No API Key Required)
This demo simulates real data collection...
To use real data, get a free Semantic Scholar API key!

Simulating data collection...
✅ Demo completed! Get API key for real results.

🔑 To get real results:
   1. Get free API key: https://www.semanticscholar.org/product/api
   2. Run: run_real_data_demo('your_api_key_here')

📊 EXPECTED REAL DATA RESULTS:
   - 100-500 research papers per domain
   - 200-1000 extracted concepts
   - Real citation frequencies
   - Novel bridge discoveries from actual research
