In [None]:
import pandas as pd
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Module-level cache mapping gene name -> UniProt accession.
# process_tsv_fast() merges every batch lookup result into this dict and then
# uses it to fill the UniProt_ID column; entries persist for the lifetime of
# the process (or notebook kernel).
UNIPROT_CACHE = {}

def get_uniprot_ids_batch(gene_batch):
    """Fetch UniProt accessions for a batch of gene symbols.

    Runs one UniProtKB search restricted to reviewed human entries
    (organism_id 9606) and follows any "next" page links in the response so
    hits beyond the first page are not dropped.

    Args:
        gene_batch: Iterable of gene symbols. Entries that are not strings
            (e.g. None or NaN from pandas) and blank strings are skipped.

    Returns:
        dict mapping each whitespace-separated name found in a returned
        entry's ``gene_names`` column to that entry's accession. If a name
        appears in several entries, the last one seen wins. Returns an empty
        dict when there is nothing to look up or when the request fails.
    """
    # Drop missing/blank symbols up front: they would otherwise be sent to the
    # API as literal terms such as "gene_exact:nan".
    genes = [
        gene.strip()
        for gene in gene_batch
        if isinstance(gene, str) and gene.strip()
    ]
    if not genes:
        return {}

    base_url = "https://rest.uniprot.org/uniprotkb/search"

    # The OR-list must be parenthesised; without it the organism/reviewed
    # filters are not guaranteed to apply to every gene term.
    gene_clause = ' OR '.join(f'gene_exact:{gene}' for gene in genes)
    query = f'({gene_clause}) AND organism_id:9606 AND reviewed:true'

    params = {
        'query': query,
        'format': 'tsv',
        'fields': 'accession,gene_names',
        # A batch can return more entries than genes requested, so ask for
        # the largest page (500 per the UniProt API docs) instead of
        # len(gene_batch).
        'size': 500
    }

    results = {}
    url = base_url
    try:
        while url:
            # A timeout keeps a stalled connection from blocking a worker
            # thread indefinitely.
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()

            # Every page starts with a header row; skip it.
            for line in response.text.splitlines()[1:]:
                if not line.strip():
                    continue
                parts = line.split('\t')
                if len(parts) >= 2:
                    uniprot_id, gene_names = parts[0], parts[1]
                    for gene in gene_names.split():
                        results[gene] = uniprot_id

            # The next-page URL (from the Link header) already carries the
            # full query, so no params are sent with follow-up requests.
            url = response.links.get('next', {}).get('url')
            params = None

    except requests.RequestException as e:
        print(f"Batch error: {str(e)}")
        return {}

    return results

def process_tsv_fast(input_file, output_file, batch_size=100, max_workers=5):
    """Annotate a TSV file with UniProt IDs using batched, threaded lookups.

    Reads ``input_file`` (tab-separated, must contain a ``GENE_SYMBOL``
    column), keeps only the first comma-separated symbol in each row, looks
    up the unique symbols via ``get_uniprot_ids_batch`` and writes the table
    to ``output_file`` with a new leading ``UniProt_ID`` column. Rows whose
    symbol has no match (or is missing) get ``'Not found'``.

    Args:
        input_file: Path of the TSV file to read.
        output_file: Path of the TSV file to write.
        batch_size: Number of gene symbols sent per API call (>= 1).
        max_workers: Number of concurrent lookup threads.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Read TSV file
    df = pd.read_csv(input_file, sep='\t')

    # Clean gene names: keep the first symbol of a comma-separated list
    df['GENE_SYMBOL'] = df['GENE_SYMBOL'].str.split(',').str[0].str.strip()

    # Unique, non-missing genes (NaN would otherwise be queried as "nan")
    unique_genes = df['GENE_SYMBOL'].dropna().unique().tolist()
    total_genes = len(unique_genes)
    print(f"Processing {total_genes} unique genes...")

    # Only query symbols that are non-empty and not already in the cache,
    # so repeated calls do not re-fetch known IDs.
    pending = [
        gene for gene in unique_genes
        if gene and gene not in UNIPROT_CACHE
    ]

    # Process in batches with threading
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for start in range(0, len(pending), batch_size):
            batch = pending[start:start + batch_size]
            futures.append(executor.submit(get_uniprot_ids_batch, batch))
            time.sleep(0.1)  # Small delay between batch submissions

        # Merge results in this thread as each batch finishes
        for future in as_completed(futures):
            UNIPROT_CACHE.update(future.result())

    # Add UniProt IDs to dataframe
    df['UniProt_ID'] = df['GENE_SYMBOL'].map(UNIPROT_CACHE).fillna('Not found')

    # Move UniProt_ID to the front, keeping the other columns in order
    cols = ['UniProt_ID'] + [col for col in df.columns if col != 'UniProt_ID']
    df = df[cols]

    # Save as TSV
    df.to_csv(output_file, sep='\t', index=False)

    elapsed = time.time() - start_time
    print(f"\nProcessed {len(df)} rows in {elapsed:.2f} seconds")
    print(f"Results saved to {output_file}")

if __name__ == "__main__":
    # Source table and destination for the annotated copy.
    input_tsv = "Cosmic_CompleteTargetedScreensMutant_v101_GRCh38_output.tsv"
    output_tsv = "output_with_uniprot_ids.tsv"

    # Tunables: batch_size is the number of genes per API call,
    # max_workers the number of concurrent requests.
    process_tsv_fast(input_tsv, output_tsv, batch_size=100, max_workers=5)

Processing 18633 unique genes...

Processed 18633 rows in 35.06 seconds
Results saved to output_with_uniprot_ids.tsv
