In [None]:
import requests
import json
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import time


# JSONL file that is scanned line by line for author IDs.
INPUT_FILE = r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\gold_jocs_clean.jsonl"
# Destination JSONL file; each fetched author object is written as one line.
OUTPUT_FILE = r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\author_profiles.jsonl"
# Address sent to OpenAlex as the `mailto` query parameter of every request.
# NOTE(review): the domain reads "gmial.com" and a later cell uses a different
# address -- this looks like a typo; confirm the intended contact address.
EMAIL = "nehalsonu4@gmial.com" 

def get_unique_author_ids(path):
    """Collect the distinct author IDs referenced in a JSONL file.

    Every line is parsed as a JSON object. The IDs are read from
    ``record['openalex_work']['authorships'][i]['author']['id']`` and reduced
    to their last ``/``-separated segment (e.g. ``A5022139811``).

    Lines that are blank or not valid JSON are skipped. Missing or null
    fields are skipped individually, so one incomplete authorship does not
    discard the remaining authors of the same record.

    Args:
        path: Path of the UTF-8 encoded JSONL file to scan.

    Returns:
        Sorted list of unique author ID strings (sorted so that repeated runs
        process authors in the same order).
    """
    authors = set()
    print("Unique Author IDs nikaal raha hoon...")
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue  # blank or corrupt line
            if not isinstance(record, dict):
                continue

            work = record.get('openalex_work') or {}
            if not isinstance(work, dict):
                continue

            for auth in work.get('authorships') or []:
                if not isinstance(auth, dict):
                    continue
                author = auth.get('author') or {}
                if not isinstance(author, dict):
                    continue
                a_id = author.get('id')
                if a_id and isinstance(a_id, str):
                    # Keep only the trailing ID segment (e.g. A5022139811).
                    authors.add(a_id.split('/')[-1])
    return sorted(authors)

def fetch_author_object(author_id, max_retries=5):
    """Fetch a single author's profile object from the OpenAlex API.

    Args:
        author_id: Author ID used as the last path segment of the request URL
            (e.g. ``A5022139811``).
        max_retries: Number of additional attempts made after an HTTP 429
            (rate limit) response before giving up.

    Returns:
        The decoded JSON body on HTTP 200; ``None`` on any other status, on a
        network/decoding error, or when the rate limit persists through all
        retries.
    """
    url = f"https://api.openalex.org/authors/{author_id}?mailto={EMAIL}"
    # A bounded loop replaces unbounded recursion, so a persistent 429 can
    # neither exhaust the stack nor hang the worker forever.
    for attempt in range(max_retries + 1):
        try:
            r = requests.get(url, timeout=15)
            if r.status_code == 200:
                return r.json()
        except (requests.RequestException, ValueError):
            # Best effort: a failed request or undecodable body yields None.
            return None
        if r.status_code != 429:
            return None
        if attempt < max_retries:
            # Exponential backoff starting at 2 s, capped at 30 s.
            time.sleep(min(2 * 2 ** attempt, 30))
    return None

# Distinct author IDs referenced by the input file.
unique_ids = get_unique_author_ids(INPUT_FILE)
print(f"Total {len(unique_ids)} unique authors miley hain.")

# Parallel fetching (phase 1: profiles only).
print("OpenAlex se Author Objects fetch ho rahay hain...")
# The pool is entered last, so it is shut down before the output file closes.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out, \
        ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map yields results in the same order as unique_ids.
    fetched = executor.map(fetch_author_object, unique_ids)
    for result in tqdm(fetched, total=len(unique_ids)):
        if not result:
            continue  # failed fetches come back falsy and are not written
        f_out.write(json.dumps(result) + "\n")

print(f"Done! Saaray Author Objects yahan save hain: {OUTPUT_FILE}")

In [None]:
import requests
import json
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import time
import pandas as pd


# Address sent to OpenAlex as the `mailto` request parameter.
EMAIL = "nehsalsonu4@gmail.com"
# JSONL file that is scanned for author IDs.
INPUT_FILE = r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\gold_jocs_clean.jsonl"
OUTPUT_DIR = r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed"
# Previous JSONL target, kept for reference:
# OUTPUT_FILE = os.path.join(OUTPUT_DIR, "authors_work.jsonl")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "authors_work.parquet")

# Ensure the output directory exists. exist_ok=True avoids the race between a
# separate existence check and the creation call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_unique_author_ids(path):
    """Return the distinct author IDs found in a JSONL file.

    Each line is parsed as a JSON object and the IDs are taken from
    ``record['openalex_work']['authorships'][i]['author']['id']``, keeping
    only the text after the last ``/``.

    Only JSON decoding errors are skipped (blank or corrupt lines); a bare
    ``except`` would also swallow ``KeyboardInterrupt``. Missing or null
    fields are skipped per entry, so one incomplete authorship does not drop
    the other authors of the same record.

    Args:
        path: Path of the UTF-8 encoded JSONL file to scan.

    Returns:
        Sorted list of unique author ID strings.
    """
    authors = set()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                continue

            work = record.get('openalex_work') or {}
            if not isinstance(work, dict):
                continue

            for auth in work.get('authorships') or []:
                if not isinstance(auth, dict):
                    continue
                author = auth.get('author') or {}
                if not isinstance(author, dict):
                    continue
                a_id = author.get('id')
                if a_id and isinstance(a_id, str):
                    authors.add(a_id.split('/')[-1])
    return sorted(authors)

def fetch_full_career(author_id, max_retries=5):
    """Fetch the works of an author from the OpenAlex ``/works`` endpoint.

    Pages are requested with cursor paging until the API returns no further
    cursor or an empty page. Network errors, undecodable bodies, HTTP 429 and
    HTTP 5xx responses are retried with exponential backoff, at most
    ``max_retries`` times in a row for the same page. Any other non-200
    status ends paging immediately.

    Args:
        author_id: Author ID used in the ``author.id`` filter.
        max_retries: Maximum number of consecutive retries for a single page.

    Returns:
        Dict with ``author_id``, ``total_works`` (number of collected works)
        and ``works`` (the collected result objects). If paging stopped early,
        the dict holds whatever was collected up to that point.
    """
    all_works = []
    cursor = '*'  # initial cursor value that starts cursor paging
    base_url = "https://api.openalex.org/works"
    failures = 0  # consecutive failed attempts for the current page

    while cursor:
        params = {
            'filter': f'author.id:{author_id}',
            'select': 'id,title,abstract_inverted_index,publication_year,authorships,concepts',
            'per_page': 200,  # large pages mean fewer round trips
            'cursor': cursor,
            'mailto': EMAIL
        }
        data = None
        try:
            r = requests.get(base_url, params=params, timeout=30)
            if r.status_code == 200:
                data = r.json()
            elif r.status_code != 429 and r.status_code < 500:
                break  # non-retryable response: keep what was collected
        except (requests.RequestException, ValueError):
            pass  # treated as a transient failure below

        if not isinstance(data, dict):
            # Transient failure or unexpected payload: retry the same cursor,
            # but only a bounded number of times so the loop always ends.
            failures += 1
            if failures > max_retries:
                break
            time.sleep(min(2 ** (failures - 1), 30))
            continue

        failures = 0
        results = data.get('results') or []
        all_works.extend(results)

        # Continue only while the API supplies a cursor and the page had data.
        next_cursor = (data.get('meta') or {}).get('next_cursor')
        cursor = next_cursor if (next_cursor and results) else None

    return {"author_id": author_id, "total_works": len(all_works), "works": all_works}

# --- EXECUTION ---
# unique_ids = get_unique_author_ids(INPUT_FILE)
# print(f"Found {len(unique_ids)} authors. Fetching EVERY single paper in their careers...")

# with open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:
#     # 8 workers are optimal for OpenAlex Polite Pool
#     with ThreadPoolExecutor(max_workers=8) as executor:
#         for result in tqdm(executor.map(fetch_full_career, unique_ids), total=len(unique_ids)):
#             if result and result['works']:
#                 f_out.write(json.dumps(result) + "\n")

# print(f"MISSION COMPLETE! Full career data saved to: {OUTPUT_FILE}")

# --- New execution block ---
# Gather the distinct author IDs, then fetch each author's works in parallel.
unique_ids = get_unique_author_ids(INPUT_FILE)
print(f"Found {len(unique_ids)} authors. Fetching EVERY single paper in their careers...")

# One result dict per author that has at least one work.
all_results = []

with ThreadPoolExecutor(max_workers=8) as executor:
    careers = tqdm(executor.map(fetch_full_career, unique_ids), total=len(unique_ids))
    # Authors whose fetch produced no works are left out.
    all_results.extend(career for career in careers if career and career['works'])

print("Finalizing Parquet file...")
# Turn the collected list into a DataFrame and store it as a Parquet file.
# NOTE(review): the 'works' column holds nested lists of dicts -- confirm that
# pyarrow infers a usable schema for it at this data size.
df = pd.DataFrame(all_results)
df.to_parquet(OUTPUT_FILE, engine='pyarrow', index=False)

In [None]:
import json
from tqdm import tqdm

# Path of the per-author works file to audit (one JSON object per line).
# NOTE(review): this reads "authors_work.jsonl", while the fetch cell above
# currently writes "authors_work.parquet" (its JSONL writer is commented out)
# -- confirm this file exists and is up to date.
input_file = r"D:\ITMO Big Data & ML School\semester 3\RI3\notebooks\data\processed\authors_work.jsonl"

total_papers = 0
total_authors = 0
papers_with_abstract = 0
skipped_lines = 0  # non-blank lines that were not a valid JSON object

print("Auditing Works and Abstracts...")

with open(input_file, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        if not line.strip():
            continue
        try:
            data = json.loads(line)
        except json.JSONDecodeError:
            skipped_lines += 1
            continue
        if not isinstance(data, dict):
            skipped_lines += 1
            continue

        total_authors += 1

        # Per-author paper count as stored in the record's 'total_works' field.
        total_papers += data.get('total_works') or 0

        # A paper counts as having an abstract when its
        # 'abstract_inverted_index' is present and non-empty.
        works_list = data.get('works') or []
        papers_with_abstract += sum(
            1
            for work in works_list
            if isinstance(work, dict) and work.get('abstract_inverted_index')
        )

# Coverage in percent; guarded against division by zero when nothing was read.
coverage = (papers_with_abstract / total_papers) * 100 if total_papers > 0 else 0

print("-" * 40)
print(f"Total Authors Processed: {total_authors}")
print(f"Total Papers Found:      {total_papers}")
print(f"Papers with Abstracts:   {papers_with_abstract}")
print(f"Abstract Coverage:       {coverage:.2f}%")
if skipped_lines:
    # Reported only when data was dropped, so clean runs print as before.
    print(f"Malformed Lines Skipped: {skipped_lines}")
print("-" * 40)