In [None]:


def show_all_uniprot_fields():
    """showing ALL available UniProt fields organized by group"""
    url = "https://rest.uniprot.org/configure/uniprotkb/result-fields"
    response = requests.get(url)
    
    if response.status_code == 200:
        groups = response.json()
        
        total_fields = 0
        
        for group in groups:
            group_name = group['groupName']
            fields = group['fields']
            
            print(f"\n{'='*60}")
            print(f"GROUP: {group_name}")
            print(f"{'='*60}")
            print(f"Number of fields: {len(fields)}")
            print()
            
            for field in fields:
                field_name = field['name']
                field_label = field['label']
                print(f"{field_name:<30} | {field_label}")
                total_fields += 1
            
        print(f"\n{'='*60}")
        print(f"TOTAL AVAILABLE FIELDS: {total_fields}")
        print(f"{'='*60}")
        
        return groups
    else:
        print(f"Error: {response.status_code}")
        return []

# show everything
all_groups = show_all_uniprot_fields()


GROUP: Names & Taxonomy
Number of fields: 14

accession                      | Entry
id                             | Entry Name
gene_names                     | Gene Names
gene_oln                       | Gene Names (ordered locus)
gene_orf                       | Gene Names (ORF)
gene_primary                   | Gene Names (primary)
gene_synonym                   | Gene Names (synonym)
organism_name                  | Organism
organism_id                    | Organism (ID)
protein_name                   | Protein names
xref_proteomes                 | Proteomes
lineage                        | Taxonomic lineage
lineage_ids                    | Taxonomic lineage (Ids)
virus_hosts                    | Virus hosts

GROUP: Sequences
Number of fields: 19

cc_alternative_products        | Alternative products (isoforms)
ft_var_seq                     | Alternative sequence
error_gmodel_pred              | Erroneous gene model prediction
fragment                       | Fragment
organelle 

In [None]:
import requests
import pandas as pd
import time
from io import StringIO

def download_swissprot_stream():
    """Download ALL Swiss-Prot proteins using stream endpoint"""
    
    # fields:
    fields = [
        # Basic info (8)
        'accession', 'id', 'gene_names', 'organism_name', 'protein_name',
        'length', 'mass', 'sequence',

        # Function (11)
        'ft_act_site', 'ft_binding', 'ft_non_std', 'ft_site', 'ft_dna_bind',
        'cc_catalytic_activity', 'cc_function', 'ec', 'cc_pathway',
        'cc_cofactor', 'cc_activity_regulation',

        # Domains & Families (9)
        'ft_domain', 'ft_motif', 'ft_region', 'ft_repeat', 'ft_zn_fing',
        'ft_coiled', 'ft_compbias', 'protein_families', 'cc_domain',
        
        # PTM/Processing (11)
        'ft_mod_res', 'ft_carbohyd', 'ft_lipid', 'ft_signal', 'ft_transit',
        'ft_disulfid', 'ft_chain', 'cc_ptm', 'ft_crosslnk', 'ft_propep', 'ft_peptide',

        # Structure (3)
        'ft_helix', 'ft_strand', 'ft_turn',
        
        # Location (4)
        'ft_transmem', 'ft_intramem', 'ft_topo_dom', 'cc_subcellular_location',

        # Gene Ontology (4)
        'go', 'go_f', 'go_p', 'go_c',

        # Cross-references (13)
        'keyword', 'xref_interpro', 'xref_pfam', 'xref_smart', 'xref_prosite',
        'xref_pdb', 'xref_alphafolddb', 'xref_cdd', 'xref_hamap',
        'xref_panther', 'xref_pirsf', 'xref_prints', 'xref_supfam', 'xref_gene3d'
    ]
    
    params = {
        'query': 'reviewed:true',  # all Swiss-Prot proteins
        'format': 'tsv',           # TSV format (easier to parse)
        'fields': ','.join(fields) 
    }
    
    print("🚀 Starting stream download of ALL Swiss-Prot proteins...")
    print(f"📋 Downloading {len(fields)} fields")
    
    try:
        # use stream endpoint, which allows for all proteins to be downloaded at once
        response = requests.get(
            "https://rest.uniprot.org/uniprotkb/stream", 
            params=params,
            timeout=900, 
            stream=True  
        )
        
        if response.status_code == 200:
            print("✅ Stream connection successful!")
            print("💾 Saving data to file...")
            
            # save raw TSV to file first 
            filename = 'swissprot_full_dataset.tsv'
            with open(filename, 'w', encoding='utf-8') as f:
                for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
                    if chunk:  
                        f.write(chunk)
            
            print(f"✅ Raw data saved to {filename}")
            print("📊 Loading into pandas DataFrame...")
            
            df = pd.read_csv(filename, sep='\t', low_memory=False)
            
            print(f"🎉 SUCCESS!")
            print(f"📈 Downloaded: {len(df):,} proteins")
            print(f"📋 Fields: {len(df.columns)} columns")
            print(f"🌍 Organisms: {df['Organism'].nunique():,} species")
            
            # validation
            print(f"\n🔍 validation:")
            print(f"   unique accessions: {df['Entry'].nunique():,}")
            print(f"   unique sequences: {df['Sequence'].nunique():,}")
            print(f"   avg length: {df['Length'].mean():.0f} amino acids")
            
            return df
            
        else:
            print(f"stream failed with status: {response.status_code}")
            print(f"response: {response.text[:300]}")
            return None
            
    except requests.exceptions.Timeout:
        print("Download timed out after 15 minutes")
        return None
    except Exception as e:
        print(f"Error during download: {e}")
        return None

# Download ALL Swiss-Prot proteins
df_full = download_swissprot_stream()

if df_full is not None:
    compressed_filename = 'data/swissprot_full_dataset.tsv.gz'
    df_full.to_csv(compressed_filename, sep='\t', index=False, compression='gzip')
    print(f"💾 Compressed dataset saved as: {compressed_filename}")

🚀 Starting stream download of ALL Swiss-Prot proteins...
📋 Downloading 64 fields
✅ Stream connection successful!
💾 Saving data to file...


In [None]:
def analyze_df(df):
  
  print("=== DATASET ANALYSIS ===")
  
  # 1. Annotation coverage analysis
  print("\n Annotation Coverage:")
  high_coverage_fields = []
  coverage_data = []
  
  for col in df.columns:
      if col not in ['Entry', 'Entry Name', 'Sequence']:
          coverage = df[col].notna().sum()
          percentage = coverage / len(df) * 100
          
          coverage_data.append({
              'field': col,
              'coverage': coverage,
              'percentage': percentage
          })
          
          if percentage >= 50:
              high_coverage_fields.append(col)
  
  # Sort by percentage (highest to lowest)
  coverage_data.sort(key=lambda x: x['percentage'], reverse=True)
  
  # Print sorted coverage
  for item in coverage_data:
      col = item['field']
      coverage = item['coverage']
      percentage = item['percentage']
      
      # for visual purposes
      if percentage >= 80:
          indicator = "🟢"  
      elif percentage >= 50:
          indicator = "🟡"  
      elif percentage >= 10:
          indicator = "🟠"
      else:
          indicator = "🔴" 
          
      print(f"{indicator} {col}: {coverage:,}/{len(df):,} ({percentage:.1f}%)")
  
  # 2. organsism diversity
  print(f"\n Organism Diversity:")
  top_organisms = df['Organism'].value_counts().head(10)
  print("Top 10 organisms:")
  for org, count in top_organisms.items():
      print(f"  {org}: {count:,} proteins")
  
  # 3. protein length dist
  print(f"\n📏 Protein Length Statistics:")
  print(f"  Min length: {df['Length'].min()} amino acids")
  print(f"  Max length: {df['Length'].max()} amino acids")
  print(f"  Median length: {df['Length'].median():.0f} amino acids")
  print(f"  Proteins >1000 AA: {(df['Length'] > 1000).sum():,}")
  
  # 4. d0main analysis
  if 'Domain [FT]' in df.columns:
      domain_coverage = df['Domain [FT]'].notna().sum()
      print(f"\n Domain Analysis:")
      print(f"  Proteins with domains: {domain_coverage:,} ({domain_coverage/len(df)*100:.1f}%)")
  
  # 5. Overall coverage summary
  print(f"\n Coverage Summary:")
  high_cov = sum(1 for item in coverage_data if item['percentage'] >= 80)
  medium_cov = sum(1 for item in coverage_data if 50 <= item['percentage'] < 80)
  low_cov = sum(1 for item in coverage_data if 10 <= item['percentage'] < 50)
  very_low_cov = sum(1 for item in coverage_data if item['percentage'] < 10)
  
  print(f"  High coverage (≥80%): {high_cov} fields")
  print(f"  Medium coverage (50-79%): {medium_cov} fields") 
  print(f"  Low coverage (10-49%): {low_cov} fields")
  print(f"  Very low coverage (<10%): {very_low_cov} fields")
  
  return {
      'total_proteins': len(df),
      'unique_sequences': df['Sequence'].nunique(),
      'organisms': df['Organism'].nunique(),
      'high_coverage_fields': high_coverage_fields,
      'coverage_data': coverage_data 
  }

analysis = analyze_df(df_full)

In [None]:
## testing with *almost * all fields:

import requests
import pandas as pd
import time
from io import StringIO

def download_swissprot_stream():
    """Download ALL Swiss-Prot proteins using stream endpoint"""
    
    # fields:
    fields = [
        # Basic info (8)
        'accession', 'id', 'gene_names', 'organism_name', 'protein_name',
        'length', 'mass', 'sequence', 
        # additional fields:
        'gene_oln', 'gene_orf', 'gene_primary', 'gene_synonym', 'lineage', 'lineage_ids', 'virus_hosts',

        # Function (11)
        'ft_act_site', 'ft_binding', 'ft_non_std', 'ft_site', 'ft_dna_bind',
        'cc_catalytic_activity', 'cc_function', 'ec', 'cc_pathway',
        'cc_cofactor', 'cc_activity_regulation',
        # additional fields:
        'temp_dependence', 'redox_potential', 'ph_dependence', 'kinetics',

        # additional fields (interaction + expression):
        'cc_interaction', 'cc_subunit', 'cc_developmental_stage', 'cc_induction', 'cc_tissue_specificity',

        # Domains & Families (9)
        'ft_domain', 'ft_motif', 'ft_region', 'ft_repeat', 'ft_zn_fing',
        'ft_coiled', 'ft_compbias', 'protein_families', 'cc_domain',
        
        # PTM/Processing (11)
        'ft_mod_res', 'ft_carbohyd', 'ft_lipid', 'ft_signal', 'ft_transit',
        'ft_disulfid', 'ft_chain', 'cc_ptm', 'ft_crosslnk', 'ft_propep', 'ft_peptide',

        # Structure (3)
        'ft_helix', 'ft_strand', 'ft_turn',
        
        # Location (4)
        'ft_transmem', 'ft_intramem', 'ft_topo_dom', 'cc_subcellular_location',

        # Gene Ontology (4)
        'go', 'go_f', 'go_p', 'go_c',

        # Cross-references (13)
        'keyword', 'xref_interpro', 'xref_pfam', 'xref_smart', 'xref_prosite',
        'xref_pdb', 'xref_alphafolddb', 'xref_cdd', 'xref_hamap',
        'xref_panther', 'xref_pirsf', 'xref_prints', 'xref_supfam', 'xref_gene3d',

        # extra references:
        'xref_ccds', 
        'xref_embl', 
        'xref_generif', 
        'xref_pir',
        'xref_refseq',
        'xref_bmrb',
        'xref_emdb', 
        'xref_pcddb',
        'xref_pdbsum', 
        'xref_sasbdb', 
        'xref_smr',
        'xref_antifam',
        'xref_cdd',
        'xref_disprot',
        'xref_funfam',
        'xref_gene3d',
        'xref_hamap',
        'xref_ideal',
        'xref_interpro',
        'xref_ncbifam',
        'xref_panther',
        'xref_pirsf',
        'xref_prints',
        'xref_prosite', 
        'xref_pfam', 
        'xref_sfld', 
        'xref_smart', 
        'xref_supfam', 
        'xref_bgee',
        'xref_cleanex',
        'xref_collectf',
        'xref_expressionatlas',
        'xref_biogrid-orcs',
        'xref_cd-code',
        'xref_chitars',
        'xref_evolutionarytrace',
        'xref_genewiki',
        'xref_genomernai',
        'xref_orcid',
        'xref_pgenn',
        'xref_phi-base',
        'xref_pro',
        'xref_pharos',
        'xref_pubtator',
        'xref_rnact',
        'xref_emind',
        'xref_brenda',
        'xref_biocyc',
        'xref_pathwaycommons',
        'xref_plantreactome',
        'xref_reactome',
        'xref_sabio-rk',
        'xref_signor',
        'xref_strenda-db',
        'xref_signalink',
        'xref_unipathway',
        'xref_genetree',
        'xref_hogenom',
        'xref_inparanoid',
        'xref_oma',
        'xref_orthodb',
        'xref_pan-go',
        'xref_phylomedb',
        'xref_treefam',
        'xref_eggnog',
        'xref_agr',
        'xref_arachnoserver',
        'xref_araport',
        'xref_cgd',
        'xref_ctd',
        'xref_conoserver',
        'xref_disgenet',
        'xref_echobase',
        'xref_flybase',
        'xref_genecards',
        'xref_genereviews',
        'xref_hgnc',
        'xref_hpa',
        'xref_ic4r',
        'xref_japonicusdb',
        'xref_legiolist',
        'xref_leproma',
        'xref_mgi',
        'xref_mim',
        'xref_maizegdb',
        'xref_malacards',
        'xref_niagads',
        'xref_opentargets',
        'xref_orphanet',
        'xref_pharmgkb',
        'xref_pombase',
        'xref_pseudocap',
        'xref_rgd',
        'xref_sgd',
        'xref_tair',
        'xref_tuberculist',
        'xref_veupathdb',
        'xref_vgnc',
        'xref_wormbase',
        'xref_xenbase',
        'xref_zfin',
        'xref_dictybase',
        'xref_euhcvdb',
        'xref_nextprot',
        'xref_ensembl',
        'xref_ensemblbacteria',
        'xref_ensemblfungi',
        'xref_ensemblmetazoa',
        'xref_ensemblplants',
        'xref_ensemblprotists',
        'xref_geneid',
        'xref_gramene',
        'xref_kegg',
        'xref_mane-select',
        'xref_patric',
        'xref_ucsc',
        'xref_vectorbase',
        'xref_wbparasite',
        'xref_abcd',
        'xref_antibodypedia',
        'xref_cptc',
        'xref_dnasu',
        'xref_ycharos',
        'xref_cptac',
        'xref_massive',
        'xref_pride',
        'xref_paxdb',
        'xref_peptideatlas',
        'xref_promex',
        'xref_proteomicsdb',
        'xref_pumba',
        'xref_topdownproteomics',
        'xref_jpost',
        'xref_ogp',
        'xref_reproduction-2dpage',
        'xref_alzforum',
        'xref_biomuta',
        'xref_dmdm',
        'xref_dbsnp',
        'xref_carbonyldb',
        'xref_depod',
        'xref_glyconnect',
        'xref_glycosmos',
        'xref_glygen',
        'xref_metosite',
        'xref_phosphositeplus',
        'xref_swisspalm',
        'xref_unicarbkb',
        'xref_iptmnet',
        'xref_allergome',
        'xref_card',
        'xref_cazy',
        'xref_esther',
        'xref_imgt_gene-db',
        'xref_merops',
        'xref_moondb',
        'xref_moonprot',
        'xref_peroxibase',
        'xref_rebase',
        'xref_tcdb',
        'xref_unilectin',
        'xref_bindingdb',
        'xref_chembl',
        'xref_drugbank',
        'xref_drugcentral',
        'xref_guidetopharmacology',
        'xref_swisslipids',
        'xref_biogrid',
        'xref_corum',
        'xref_complexportal',
        'xref_dip',
        'xref_elm',
        'xref_funcoup',
        'xref_intact',
        'xref_mint',
        'xref_string',
        'xref_pdbsum',
        'xref_sasbdb',
        'xref_smr'
    ]
    
    params = {
        'query': 'reviewed:true',  # all Swiss-Prot proteins
        'format': 'tsv',           # TSV format (easier to parse)
        'fields': ','.join(fields) 
    }
    
    print("🚀 Starting stream download of ALL Swiss-Prot proteins...")
    print(f"📋 Downloading {len(fields)} fields")
    
    try:
        # use stream endpoint, which allows for all proteins to be downloaded at once
        response = requests.get(
            "https://rest.uniprot.org/uniprotkb/stream", 
            params=params,
            timeout=900, 
            stream=True  
        )
        
        if response.status_code == 200:
            print("Stream connection successful!")
            print("Saving data to file...")
            
            # save raw TSV to file first 
            filename = 'swissprot_full_dataset.tsv'
            with open(filename, 'w', encoding='utf-8') as f:
                for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
                    if chunk:  
                        f.write(chunk)
            
            print(f"Raw data saved to {filename}")
            print("Loading into pandas DataFrame...")

            df = pd.read_csv(filename, sep='\t', low_memory=False)
            
            print(f"SUCCESS!")
            print(f"Downloaded: {len(df):,} proteins")
            print(f"Fields: {len(df.columns)} columns")
            print(f"Organisms: {df['Organism'].nunique():,} species")
            
            # validation
            print(f"\nvalidation:")
            print(f"   unique accessions: {df['Entry'].nunique():,}")
            print(f"   unique sequences: {df['Sequence'].nunique():,}")
            print(f"   avg length: {df['Length'].mean():.0f} amino acids")
            
            return df
            
        else:
            print(f"stream failed with status: {response.status_code}")
            print(f"response: {response.text[:300]}")
            return None
            
    except requests.exceptions.Timeout:
        print("Download timed out after 15 minutes")
        return None
    except Exception as e:
        print(f"Error during download: {e}")
        return None

# Download ALL Swiss-Prot proteins
df_full = download_swissprot_stream()

if df_full is not None:
    compressed_filename = 'swissprot_all_fields.tsv.gz'
    df_full.to_csv(compressed_filename, sep='\t', index=False, compression='gzip')
    print(f"💾 Compressed dataset saved as: {compressed_filename}")

🚀 Starting stream download of ALL Swiss-Prot proteins...
📋 Downloading 259 fields
✅ Stream connection successful!
💾 Saving data to file...
✅ Raw data saved to swissprot_full_dataset.tsv
📊 Loading into pandas DataFrame...
🎉 SUCCESS!
📈 Downloaded: 573,661 proteins
📋 Fields: 259 columns
🌍 Organisms: 14,803 species

🔍 validation:
   unique accessions: 573,661
   unique sequences: 485,423
   avg length: 362 amino acids
💾 Compressed dataset saved as: swissprot_all_fields.tsv.gz


In [49]:
def analyze_df(df):
  
  print("=== DATASET ANALYSIS ===")
  
  # 1. Annotation coverage analysis
  print("\n Annotation Coverage:")
  high_coverage_fields = []
  coverage_data = []
  
  for col in df.columns:
      if col not in ['Entry', 'Entry Name', 'Sequence']:
          coverage = df[col].notna().sum()
          percentage = coverage / len(df) * 100
          
          coverage_data.append({
              'field': col,
              'coverage': coverage,
              'percentage': percentage
          })
          
          if percentage >= 50:
              high_coverage_fields.append(col)
  
  # Sort by percentage (highest to lowest)
  coverage_data.sort(key=lambda x: x['percentage'], reverse=True)
  
  # Print sorted coverage
  for item in coverage_data:
      col = item['field']
      coverage = item['coverage']
      percentage = item['percentage']
      
      # for visual purposes
      if percentage >= 80:
          indicator = "🟢"  
      elif percentage >= 50:
          indicator = "🟡"  
      elif percentage >= 10:
          indicator = "🟠"
      else:
          indicator = "🔴" 
          
      print(f"{indicator} {col}: {coverage:,}/{len(df):,} ({percentage:.1f}%)")
  
  # 2. organsism diversity
  print(f"\n Organism Diversity:")
  top_organisms = df['Organism'].value_counts().head(10)
  print("Top 10 organisms:")
  for org, count in top_organisms.items():
      print(f"  {org}: {count:,} proteins")
  
  # 3. protein length dist
  print(f"\n📏 Protein Length Statistics:")
  print(f"  Min length: {df['Length'].min()} amino acids")
  print(f"  Max length: {df['Length'].max()} amino acids")
  print(f"  Median length: {df['Length'].median():.0f} amino acids")
  print(f"  Proteins >1000 AA: {(df['Length'] > 1000).sum():,}")
  
  # 4. d0main analysis
  if 'Domain [FT]' in df.columns:
      domain_coverage = df['Domain [FT]'].notna().sum()
      print(f"\n Domain Analysis:")
      print(f"  Proteins with domains: {domain_coverage:,} ({domain_coverage/len(df)*100:.1f}%)")
  
  # 5. Overall coverage summary
  print(f"\n Coverage Summary:")
  high_cov = sum(1 for item in coverage_data if item['percentage'] >= 80)
  medium_cov = sum(1 for item in coverage_data if 50 <= item['percentage'] < 80)
  low_cov = sum(1 for item in coverage_data if 10 <= item['percentage'] < 50)
  very_low_cov = sum(1 for item in coverage_data if item['percentage'] < 10)
  
  print(f"  High coverage (≥80%): {high_cov} fields")
  print(f"  Medium coverage (50-79%): {medium_cov} fields") 
  print(f"  Low coverage (10-49%): {low_cov} fields")
  print(f"  Very low coverage (<10%): {very_low_cov} fields")
  
  return {
      'total_proteins': len(df),
      'unique_sequences': df['Sequence'].nunique(),
      'organisms': df['Organism'].nunique(),
      'high_coverage_fields': high_coverage_fields,
      'coverage_data': coverage_data 
  }

analysis = analyze_df(df_full)

=== DATASET ANALYSIS ===

 Annotation Coverage:
🟢 Organism: 573,661/573,661 (100.0%)
🟢 Protein names: 573,661/573,661 (100.0%)
🟢 Length: 573,661/573,661 (100.0%)
🟢 Mass: 573,661/573,661 (100.0%)
🟢 Taxonomic lineage: 573,661/573,661 (100.0%)
🟢 Taxonomic lineage (Ids): 573,661/573,661 (100.0%)
🟢 Keywords: 566,515/573,661 (98.8%)
🟢 Chain: 565,890/573,661 (98.6%)
🟢 EMBL: 560,693/573,661 (97.7%)
🟢 InterPro: 555,533/573,661 (96.8%)
🟢 InterPro.1: 555,533/573,661 (96.8%)
🟢 Gene Ontology (GO): 552,917/573,661 (96.4%)
🟢 Gene Names: 548,889/573,661 (95.7%)
🟢 AlphaFoldDB: 548,729/573,661 (95.7%)
🟢 Pfam: 545,517/573,661 (95.1%)
🟢 Pfam.1: 545,517/573,661 (95.1%)
🟢 SMR: 524,392/573,661 (91.4%)
🟢 SMR.1: 524,392/573,661 (91.4%)
🟢 Protein families: 516,427/573,661 (90.0%)
🟢 PANTHER: 505,013/573,661 (88.0%)
🟢 PANTHER.1: 505,013/573,661 (88.0%)
🟢 Gene Names (primary): 502,595/573,661 (87.6%)
🟢 Gene Ontology (molecular function): 492,124/573,661 (85.8%)
🟢 KEGG: 481,691/573,661 (84.0%)
🟢 Gene3D: 477,613/573