In [None]:
"""
OpenAlex Data Fetcher for Scientific Article Recommender

This script fetches scientific articles and concept hierarchies from the OpenAlex API
for use in the recommendation system. It focuses on Computer Science, AI, and Physics domains.

Requirements:
- requests library for API calls
- tqdm for progress bars
- Valid email for OpenAlex API (replace YOUR_EMAIL_HERE)

Output:
- openalex_works.jsonl: Scientific articles with metadata
- openalex_concepts_hierarchy.jsonl: Concept hierarchies and relationships
"""

import requests
import json
from tqdm import tqdm

# Configuration
# Contact address sent as the `mailto` query parameter on every OpenAlex request.
EMAIL_FOR_OPENALEX = "YOUR_EMAIL_HERE@example.com"  # Replace with your actual email
# Upper bound on the number of articles written to openalex_works.jsonl,
# applied after all domains have been fetched.
MAX_WORKS_TO_FETCH = 2000  # Maximum number of articles to fetch

# Define research domains and their OpenAlex concept IDs
# Each tuple contains (domain_name, concept_id, target_article_count)
# The per-domain targets below add up to 2000, i.e. MAX_WORKS_TO_FETCH.
DOMAINS = [
    ("Computer Science", "https://openalex.org/C41008148", 1400),
    ("Artificial Intelligence", "https://openalex.org/C154945302", 400),
    ("Physics", "https://openalex.org/C121332964", 200),
]

def reconstruct_abstract(index_dict):
    """
    Reconstruct abstract text from OpenAlex inverted index format.

    OpenAlex stores abstracts as inverted indexes to save space:
    {"word": [position1, position2], ...}

    Words are emitted in ascending position order and joined with single
    spaces. Positions that no word claims are skipped, so gaps in the index
    do not produce doubled spaces. If two words claim the same position, the
    one iterated last wins.

    Args:
        index_dict (dict): Inverted index dictionary from OpenAlex. ``None``
            or an empty dict means "no abstract".

    Returns:
        str: Reconstructed abstract text, or "" when there is nothing to
        reconstruct or the index is malformed.
    """
    # A missing/empty index is a normal case (no abstract), not an error.
    if not index_dict:
        return ""

    try:
        # Map position -> word. Using a dict instead of a pre-sized list means
        # memory use depends on the number of words, not on the largest position.
        word_at = {}
        for word, positions in index_dict.items():
            for pos in positions:
                word_at[pos] = word

        return ' '.join(word_at[pos] for pos in sorted(word_at))
    except (AttributeError, TypeError) as e:
        # Raised when the input is not a mapping of word -> iterable of
        # mutually comparable, hashable positions.
        print(f"Error reconstructing abstract: {e}")
        return ""

def _summarize_openalex_entity(entity):
    """Return the id/name/level/wikidata subset of one OpenAlex concept or topic dict."""
    return {
        "id": entity["id"],
        "name": entity["display_name"],
        # Entries without a "level" key are recorded as level 0.
        "level": entity.get("level", 0),
        "wikidata": entity.get("wikidata"),
    }


def _build_article_record(work, domain_name):
    """Convert one raw OpenAlex work dict into the flat article record written to JSONL."""
    # `or` (rather than a .get default) also covers keys present with a JSON null.
    abstract_raw = work.get("abstract_inverted_index") or {}
    abstract = reconstruct_abstract(abstract_raw) if abstract_raw else ""
    authorships = work.get("authorships") or []

    return {
        "id": work["id"],
        "title": work["title"],
        "abstract": abstract,
        "publication_year": work.get("publication_year"),
        "cited_by_count": work.get("cited_by_count", 0),
        "authors": [
            {
                "id": (a.get("author") or {}).get("id"),
                "name": (a.get("author") or {}).get("display_name"),
            }
            for a in authorships
        ],
        # Flattened across all authorships; duplicates are not removed here.
        "institutions": [
            {"id": i.get("id"), "name": i.get("display_name")}
            for a in authorships
            for i in (a.get("institutions") or [])
        ],
        "topics": [
            _summarize_openalex_entity(t) for t in (work.get("topics") or [])
        ],
        "concepts": [
            _summarize_openalex_entity(c) for c in (work.get("concepts") or [])
        ],
        "domain": domain_name,
    }


def fetch_openalex_data():
    """
    Main function to fetch scientific articles and concepts from OpenAlex API.

    This function:
    1. Iterates through the research domains defined in DOMAINS
    2. Fetches articles with abstracts for each domain, page by page
    3. Extracts concepts and topics from articles
    4. Saves data to JSONL files for further processing

    Output files (written to the current working directory):
    - openalex_works.jsonl: one article record per line, capped at
      MAX_WORKS_TO_FETCH records
    - openalex_concepts_hierarchy.jsonl: one unique concept/topic record per
      line, in sorted order so repeated runs over the same data produce the
      same file

    Error handling is best-effort: a request failure or an unexpected error
    while processing a page is printed and ends fetching for that domain;
    whatever was collected so far is still saved.

    NOTE(review): no de-duplication is done across domains, so a work that
    matches more than one domain's concept filter would presumably be saved
    once per domain -- confirm whether that is intended.

    Returns:
        None
    """
    base_url = "https://api.openalex.org/works"
    # Seconds to wait on the server before giving up. Without a timeout,
    # requests can block indefinitely on a stalled connection.
    request_timeout = 30
    all_works = []
    concepts = set()  # JSON-serialized records; the set removes duplicates

    # Process each research domain
    for domain_name, concept_id, target_count in DOMAINS:
        domain_works = []
        page = 1
        per_page = 100  # Page size requested from OpenAlex

        print(f"Fetching {target_count} works from {domain_name}...")

        with tqdm(total=target_count, desc=f"Fetching {domain_name}") as pbar:
            while len(domain_works) < target_count:
                # API parameters for filtering and pagination
                params = {
                    "filter": f"concepts.id:{concept_id},has_abstract:true",  # Only articles with abstracts
                    "per_page": per_page,
                    "page": page,
                    "mailto": EMAIL_FOR_OPENALEX,  # Contact address sent with each request
                }

                try:
                    response = requests.get(base_url, params=params, timeout=request_timeout)
                    response.raise_for_status()
                    data = response.json()

                    results = data.get("results", [])
                    if not results:
                        print(f"No more results for {domain_name}")
                        break

                    # How many works this domain still needed before this page,
                    # used to keep the progress bar from running past its total.
                    remaining = target_count - len(domain_works)

                    for work in results:
                        article_data = _build_article_record(work, domain_name)
                        domain_works.append(article_data)

                        # Collect unique concepts for ontology
                        for entity in article_data["concepts"] + article_data["topics"]:
                            concepts.add(json.dumps(entity))

                    pbar.update(min(len(results), remaining))
                    page += 1

                except requests.RequestException as e:
                    print(f"API Error in {domain_name} (page {page}): {e}")
                    break
                except Exception as e:
                    # Deliberately broad: a malformed record ends this domain
                    # but must not lose what has already been collected.
                    print(f"Unexpected error in {domain_name} (page {page}): {e}")
                    break

        # Limit to target count for this domain
        all_works.extend(domain_works[:target_count])

    # Apply the global cap once so the file contents and the summary agree
    works_to_save = all_works[:MAX_WORKS_TO_FETCH]

    # Save articles to JSONL file
    output_works_file = "openalex_works.jsonl"
    with open(output_works_file, "w", encoding='utf-8') as f:
        for work in works_to_save:
            f.write(json.dumps(work, ensure_ascii=False) + "\n")

    # Save concepts to JSONL file (sorted: set iteration order is not stable)
    output_concepts_file = "openalex_concepts_hierarchy.jsonl"
    with open(output_concepts_file, "w", encoding='utf-8') as f:
        for concept_json in sorted(concepts):
            f.write(concept_json + "\n")

    print("✅ Data fetching completed!")
    print(f"📄 Saved {len(works_to_save)} articles to {output_works_file}")
    print(f"🔗 Saved {len(concepts)} unique concepts to {output_concepts_file}")

# Execute the data fetching
# Runs only when this code is executed as the main program, not when it is imported.
if __name__ == "__main__":
    fetch_openalex_data()


In [None]:
"""
Data Preprocessing for Scientific Article Recommender

This script processes the raw OpenAlex data fetched in the previous cell,
cleaning and structuring it for use in the recommendation system.

Input files:
- openalex_works.jsonl: Raw article data from OpenAlex
- openalex_concepts_hierarchy.jsonl: Raw concept data from OpenAlex

Output files:
- processed_articles.json: Clean article data for recommendation engine
- processed_concepts.json: Clean concept data for ontology
"""

import json
import pandas as pd
from tqdm import tqdm
import os

def _clean_work(work):
    """Return the cleaned article record for one raw work dict; only "id" is required."""
    return {
        "id": work["id"],
        # .get(...) or default: covers both a missing key and a null/empty value
        "title": work.get("title") or "Untitled",
        "abstract": work.get("abstract") or "",
        "publication_year": work.get("publication_year") or 0,
        "cited_by_count": work.get("cited_by_count") or 0,
        "authors": work.get("authors", []),
        # One entry per institution id: the position of the first occurrence
        # is kept, the value of the last occurrence wins. Entries without an
        # id are dropped.
        "institutions": list({
            i["id"]: i for i in work.get("institutions", [])
            if i.get("id")
        }.values()),
        # Keep only topics whose level is 0 (a missing level counts as 0)
        "topics": [
            t["id"] for t in work.get("topics", [])
            if t.get("level", 0) == 0
        ],
        # Include all concept IDs for broader coverage
        "concepts": [
            c["id"] for c in work.get("concepts", [])
            if c.get("id")
        ],
        "domain": work.get("domain", "Unknown"),
    }


def _clean_concept(concept):
    """Return the cleaned concept record for one raw concept dict; only "id" is required."""
    return {
        "id": concept["id"],
        "name": concept.get("name") or "Unknown",
        "level": concept.get("level", 0),
        # `or ""` so a JSON null is normalized the same way as a missing key
        "wikidata": concept.get("wikidata") or "",
    }


def _load_jsonl(path, clean, desc, file_label, item_label):
    """Read a JSONL file and return clean(record) for every line that parses and cleans."""
    records = []
    with open(path, "r", encoding='utf-8') as f:
        # Line numbers are 1-based and count blank lines too, so warnings
        # point at the real line in the file.
        for line_num, line in enumerate(tqdm(f, desc=desc), 1):
            line = line.strip()
            if not line:
                # Blank lines carry no record; skip without a warning
                continue
            try:
                records.append(clean(json.loads(line)))
            except json.JSONDecodeError as e:
                print(f"⚠️  JSON error in {file_label} file at line {line_num}: {e}")
            except Exception as e:
                # Best-effort: one bad record must not stop the whole run
                print(f"⚠️  Error processing {item_label} at line {line_num}: {e}")
    return records


def preprocess_data(works_file="openalex_works.jsonl", 
                   concepts_file="openalex_concepts_hierarchy.jsonl",
                   output_dir="../../data/cleaned data/"):
    """
    Preprocess raw OpenAlex data for the recommendation system.

    Reads both JSONL inputs line by line, normalizes each record, and writes
    `processed_articles.json` and `processed_concepts.json` (JSON arrays of
    records) into `output_dir`. Lines that fail to parse or lack a required
    "id" are reported and skipped. Concepts are de-duplicated by id; when an
    id occurs more than once, the last occurrence wins.

    If either input file does not exist, a message is printed and the
    function returns without writing any output. Errors while saving are
    printed rather than raised.

    Args:
        works_file (str): Path to raw works JSONL file
        concepts_file (str): Path to raw concepts JSONL file
        output_dir (str): Directory to save processed files (created if missing)

    Returns:
        None
    """
    
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    print("🔄 Starting data preprocessing...")
    
    # Process articles/works
    print("📚 Processing articles...")
    try:
        works = _load_jsonl(works_file, _clean_work, "Processing works", "works", "work")
    except FileNotFoundError:
        print(f"❌ Works file not found: {works_file}")
        return
    
    # Process concepts
    print("🔗 Processing concepts...")
    try:
        cleaned_concepts = _load_jsonl(
            concepts_file, _clean_concept, "Processing concepts", "concepts", "concept"
        )
    except FileNotFoundError:
        print(f"❌ Concepts file not found: {concepts_file}")
        return
    # Key by id so repeated ids collapse to a single entry
    concepts = {entry["id"]: entry for entry in cleaned_concepts}
    
    # Save processed data
    print("💾 Saving processed data...")
    
    try:
        # Save articles
        articles_output = os.path.join(output_dir, "processed_articles.json")
        articles_df = pd.DataFrame(works)
        articles_df.to_json(articles_output, orient="records", indent=2, force_ascii=False)
        
        # Save concepts
        concepts_output = os.path.join(output_dir, "processed_concepts.json")
        concepts_df = pd.DataFrame.from_dict(concepts, orient="index")
        concepts_df.to_json(concepts_output, orient="records", indent=2, force_ascii=False)
        
        # Print summary statistics
        print("✅ Data preprocessing completed!")
        print("📊 Summary:")
        print(f"   - Articles processed: {len(works)}")
        print(f"   - Concepts processed: {len(concepts)}")
        print(f"   - Articles saved to: {articles_output}")
        print(f"   - Concepts saved to: {concepts_output}")
        
        # Display domain distribution (reuses the DataFrame built for saving)
        if works:
            domain_counts = articles_df['domain'].value_counts()
            print("📈 Domain distribution:")
            for domain, count in domain_counts.items():
                print(f"   - {domain}: {count} articles")
                
    except Exception as e:
        print(f"❌ Error saving processed data: {e}")

# Execute the preprocessing
# Runs with the default input/output paths, and only when this code is
# executed as the main program, not when it is imported.
if __name__ == "__main__":
    preprocess_data()