In [5]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd
import requests
from io import StringIO
from tqdm import tqdm
from modlamp.descriptors import GlobalDescriptor
import pyarrow as pa


In [50]:
import requests
import pandas as pd
from io import StringIO
from tqdm.auto import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def fetch_uniprot_cursor(query: str = "reviewed:true", total: int = 1000, batch_size=500) -> pd.DataFrame:
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    fields = (
        "accession,id,protein_name,organism_name,length,sequence,mass,"
        "cc_subcellular_location,go_p,go_f,ec,cc_disruption_phenotype,"
        "cc_catalytic_activity,cc_pathway,cc_subcellular_location,"
        "cc_function,cc_domain,cc_induction,cc_pharmaceutical,"
        "cc_disruption_phenotype"
    )

    headers = {"Accept": "text/tab-separated-values"}
    params = {
        "query": query,
        "format": "tsv",
        "fields": fields,
        "size": batch_size,
    }

    # Set up retry strategy
    retry_strategy = Retry(
        total=30,
        backoff_factor=2,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    results = []
    next_url = base_url
    fetched = 0

    with tqdm(total=total, desc="Fetching UniProt entries") as pbar:
        while next_url and fetched < total:
            try:
                if fetched == 0:
                    resp = session.get(next_url, params=params, headers=headers, timeout=10)
                else:
                    resp = session.get(next_url, headers=headers, timeout=10)
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Request failed: {e}")
                break

            df = pd.read_csv(StringIO(resp.text), sep="\t")
            if df.empty:
                break

            results.append(df)
            fetched += len(df)
            pbar.update(len(df))

            # get next URL from headers (cursor-based)
            next_url = resp.links.get("next", {}).get("url", None)

    if results:
        return pd.concat(results, ignore_index=True).iloc[:total].reset_index(drop=True)
    else:
        return pd.DataFrame()


In [51]:
import math
import requests
import pandas as pd
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm

def fetch_uniprot_batches_parallel(
    query: str = "reviewed:true",
    batch_size: int = 500,
    total: int = 1000,
    max_workers: int = 32 
) -> pd.DataFrame:
    """
    Parallelized fetch of UniProt search results in TSV format.

    Args:
      query: UniProt query string
      batch_size: number of entries per page
      total: total entries desired
      max_workers: number of threads to use for HTTP fetches

    Returns:
      Pandas DataFrame with up to `total` rows.
    """
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    fields = (
        "accession,id,protein_name,organism_name,length,sequence,mass,"
        "cc_subcellular_location,go_p,go_f,ec,cc_disruption_phenotype,"
        "cc_catalytic_activity,cc_pathway,cc_subcellular_location,"
        "cc_function,cc_domain,cc_induction,cc_pharmaceutical,"
        "cc_disruption_phenotype"
    )

    # how many pages we need
    num_pages = math.ceil(total / batch_size)
    
    def fetch_page(offset: int) -> pd.DataFrame:
        params = {
            "query": query,
            "format": "tsv",
            "fields": fields,
            "size": batch_size,
            "offset": offset
        }
        resp = requests.get(base_url, params=params)
        resp.raise_for_status()
        return pd.read_csv(StringIO(resp.text), sep="\t")

    # kick off all page fetches
    offsets = [i * batch_size for i in range(num_pages)]
    dfs = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_page, off): off for off in offsets}
        for future in tqdm(as_completed(futures),
                           total=len(futures),
                           desc="UniProt pages"):
            df = future.result()
            if df.empty:
                break
            dfs.append(df)

    # combine and cap to `total`
    all_data = pd.concat(dfs, ignore_index=True)
    return all_data.iloc[:total].reset_index(drop=True)


In [52]:
def fetch_uniprot_batches(query="reviewed:true", batch_size=500, total=10000):
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    all_dataframes = []
    fetched = 0
    params = {
        "query": query,
        "format": "tsv",
        "fields": (
            "accession,id,protein_name,organism_name,length,sequence,mass,"
            "cc_subcellular_location,go_p,go_f,ec,cc_disruption_phenotype,"
            "cc_catalytic_activity,cc_pathway,cc_function,cc_domain,"
            "cc_induction,cc_pharmaceutical"
        ),
        "size": batch_size,
    }

    response = requests.get(base_url, params=params)
    if not response.ok:
        raise Exception(f"Initial fetch failed: {response.status_code} {response.text}")

    with tqdm(total=total, desc="Fetching UniProt entries") as pbar:
        while True:
            df_batch = pd.read_csv(StringIO(response.text), sep="\t")
            if df_batch.empty:
                break

            all_dataframes.append(df_batch)
            batch_size = len(df_batch)
            fetched += batch_size
            pbar.update(batch_size)

            if fetched >= total:
                break

          # Get full next URL from response.links
            next_url = response.links.get("next", {}).get("url", None)
            if not next_url:
                break

            response = requests.get(next_url)
            if not response.ok:
                raise Exception(f"Next fetch failed: {response.status_code} {response.text}")

    return pd.concat(all_dataframes, ignore_index=True)



In [53]:
def compute_biopython_features(seq: str, entry: str) -> dict:
    analysis = ProteinAnalysis(seq)

    try:
        aa_counts = analysis.count_amino_acids()
        aa_percents = analysis.get_amino_acids_percent()
        sec_frac = analysis.secondary_structure_fraction()
    except Exception as e:
        print(f"Failed on sequence: {seq} - {e}")
        return None

    try:
        features = {
            "Entry": entry,
            "Sequence": seq,
            "length": len(seq),
            "mol_weight": analysis.molecular_weight(),
            "iso_point": analysis.isoelectric_point(),
            "aromaticity": analysis.aromaticity(),
            "instability_index": analysis.instability_index(),
            "gravy": analysis.gravy(),
            "helix_frac": sec_frac[0],
            "turn_frac": sec_frac[1],
            "sheet_frac": sec_frac[2],
        }
    except Exception as e:
        #print(f"Failed to compute features for sequence: {seq} - {e}")
        return None

    # Add amino acid counts and percents
    for aa in aa_counts:
        features[f"count_{aa}"] = aa_counts[aa]
        features[f"percent_{aa}"] = round(aa_percents[aa],3)

    
    # --- modlamp GlobalDescriptor ---
    try:
        desc = GlobalDescriptor([seq])
        desc.calculate_all()
        modlamp_feats = desc.descriptor[0]
        modlamp_names = [
            "charge_pH7", "boman_index", "aliphatic_index", "hydrophobic_moment"
        ]

        for name, value in zip(modlamp_names, modlamp_feats):
            orig_name = name.split("_", 1)[-1]
            features[name] = value

    except Exception as e:
        #print(f"modlamp failed on sequence: {seq} - {e}")
        return None

    return features


In [54]:
from concurrent.futures import ProcessPoolExecutor

def parallel_compute_features(seq_entry_list, max_workers=None):
    """
    seq_entry_list: List of (seq, entry) tuples
    max_workers:    Number of processes to spin up (defaults to os.cpu_count())
    """
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # schedule all the jobs
        futures = {
            executor.submit(compute_biopython_features, seq, entry): (seq, entry)
            for seq, entry in seq_entry_list
        }

        # progress over completions
        for future in tqdm(as_completed(futures),
                           total=len(futures),
                           desc="Computing features"):
            seq, entry = futures[future]
            try:
                feat = future.result()
                if feat is not None:
                    results.append(feat)
            except Exception as exc:
                print(f"[Worker error] seq={entry} raised {exc!r}")
    return results

In [56]:
df = fetch_uniprot_cursor(batch_size=500, total= 1000000)
df_filtered = df[df['Length'] <= 1024]
df_filtered = df = df.drop(columns=['Pharmaceutical use'])

Fetching UniProt entries:  57%|█████▋    | 573230/1000000 [31:27<23:25, 303.64it/s]  


In [57]:
df_filtered.nunique()


Entry                                 573230
Entry Name                            573230
Protein names                         169016
Organism                               14778
Length                                  3476
Sequence                              484998
Mass                                  110205
Subcellular location [CC]              59530
Gene Ontology (biological process)     66513
Gene Ontology (molecular function)     40407
EC number                               7141
Disruption phenotype                   19948
Catalytic activity                     30481
Pathway                                 8622
Subcellular location [CC].1            59530
Function [CC]                         115427
Domain [CC]                            13940
Induction                              16973
Disruption phenotype.1                 19948
dtype: int64

In [58]:
sequences = df_filtered["Sequence"].tolist()
entry = df_filtered["Entry"].tolist()
seq_entry_list = list(zip(sequences, entry))

feature_dicts = []
feature_dicts = parallel_compute_features(seq_entry_list, max_workers=64)

Computing features: 100%|██████████| 573230/573230 [03:36<00:00, 2653.24it/s]


In [59]:
len(feature_dicts)

570558

In [67]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

output_path = "datatest573230.parquet"
chunk_size = 5000
writer = None
schema = None
reference_columns_and_types = None

def fix_null_columns(df, reference_columns_and_types):
    for col, dtype in reference_columns_and_types.items():
        if col not in df.columns:
            df[col] = pd.NA
        try:
            df[col] = df[col].astype(dtype)
        except Exception:
            df[col] = df[col].astype("object")
    return df

def sanitize_for_arrow(df):
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].astype(str)  # Coerce to string
    return df

for i in tqdm(range(0, len(feature_dicts), chunk_size)):
    chunk_df = pd.DataFrame(feature_dicts[i:i + chunk_size])

    if reference_columns_and_types is None:
        reference_columns_and_types = {
            col: str(chunk_df[col].dtype) for col in chunk_df.columns
        }

    chunk_df = fix_null_columns(chunk_df, reference_columns_and_types)
    merged = chunk_df.merge(df, on="Sequence", how="left")

    if len(merged) > 2 * len(chunk_df):
        print(len(merged))
        print(len(chunk_df))
        print("Merge resulted in too many rows. Check for duplicates.")

    merged = sanitize_for_arrow(merged)

    table = pa.Table.from_pandas(merged, preserve_index=False)

    if writer is None:
        schema = pa.schema([
            pa.field(name, typ).with_nullable(True)
            for name, typ in zip(table.schema.names, table.schema.types)
        ])
        writer = pq.ParquetWriter(output_path, schema, compression="snappy")

    table = pa.Table.from_pandas(merged, schema=schema, preserve_index=False)
    writer.write_table(table)

if writer is not None:
    writer.close()


  0%|          | 0/115 [00:00<?, ?it/s]

 23%|██▎       | 27/115 [00:19<01:04,  1.37it/s]

10801
5000
Merge resulted in too many rows. Check for duplicates.


 24%|██▍       | 28/115 [00:20<01:05,  1.34it/s]

12678
5000
Merge resulted in too many rows. Check for duplicates.


 25%|██▌       | 29/115 [00:21<01:05,  1.31it/s]

12708
5000
Merge resulted in too many rows. Check for duplicates.


 26%|██▌       | 30/115 [00:22<01:05,  1.29it/s]

11238
5000
Merge resulted in too many rows. Check for duplicates.


 27%|██▋       | 31/115 [00:22<01:05,  1.29it/s]

10276
5000
Merge resulted in too many rows. Check for duplicates.


 30%|███       | 35/115 [00:25<01:01,  1.30it/s]

10448
5000
Merge resulted in too many rows. Check for duplicates.
16767
5000
Merge resulted in too many rows. Check for duplicates.


 32%|███▏      | 37/115 [00:27<01:02,  1.25it/s]

10131
5000
Merge resulted in too many rows. Check for duplicates.


 34%|███▍      | 39/115 [00:29<01:00,  1.26it/s]

10169
5000
Merge resulted in too many rows. Check for duplicates.


 37%|███▋      | 43/115 [00:32<00:56,  1.28it/s]

12269
5000
Merge resulted in too many rows. Check for duplicates.


 38%|███▊      | 44/115 [00:33<00:57,  1.25it/s]

16040
5000
Merge resulted in too many rows. Check for duplicates.


 40%|████      | 46/115 [00:34<00:55,  1.25it/s]

11678
5000
Merge resulted in too many rows. Check for duplicates.


 44%|████▍     | 51/115 [00:38<00:50,  1.28it/s]

13039
5000
Merge resulted in too many rows. Check for duplicates.


 45%|████▌     | 52/115 [00:39<00:50,  1.26it/s]

14423
5000
Merge resulted in too many rows. Check for duplicates.


 46%|████▌     | 53/115 [00:40<00:49,  1.26it/s]

10535
5000
Merge resulted in too many rows. Check for duplicates.


 47%|████▋     | 54/115 [00:40<00:48,  1.27it/s]

10205
5000
Merge resulted in too many rows. Check for duplicates.


 50%|█████     | 58/115 [00:44<00:43,  1.30it/s]

10474
5000
Merge resulted in too many rows. Check for duplicates.


 51%|█████▏    | 59/115 [00:44<00:44,  1.27it/s]

14541
5000
Merge resulted in too many rows. Check for duplicates.


 52%|█████▏    | 60/115 [00:45<00:43,  1.26it/s]

12714
5000
Merge resulted in too many rows. Check for duplicates.


 54%|█████▍    | 62/115 [00:47<00:41,  1.28it/s]

10545
5000
Merge resulted in too many rows. Check for duplicates.


 60%|██████    | 69/115 [00:52<00:35,  1.31it/s]

15378
5000
Merge resulted in too many rows. Check for duplicates.


 61%|██████    | 70/115 [00:53<00:35,  1.26it/s]

20265
5000
Merge resulted in too many rows. Check for duplicates.


 62%|██████▏   | 71/115 [00:54<00:35,  1.24it/s]

18030
5000
Merge resulted in too many rows. Check for duplicates.


 63%|██████▎   | 72/115 [00:54<00:34,  1.24it/s]

14968
5000
Merge resulted in too many rows. Check for duplicates.


 63%|██████▎   | 73/115 [00:55<00:34,  1.23it/s]

14898
5000
Merge resulted in too many rows. Check for duplicates.


 64%|██████▍   | 74/115 [00:56<00:33,  1.24it/s]

12249
5000
Merge resulted in too many rows. Check for duplicates.


 65%|██████▌   | 75/115 [00:57<00:31,  1.25it/s]

10770
5000
Merge resulted in too many rows. Check for duplicates.


 66%|██████▌   | 76/115 [00:58<00:30,  1.26it/s]

10769
5000
Merge resulted in too many rows. Check for duplicates.


 67%|██████▋   | 77/115 [00:58<00:30,  1.26it/s]

13706
5000
Merge resulted in too many rows. Check for duplicates.
23123
5000
Merge resulted in too many rows. Check for duplicates.


 69%|██████▊   | 79/115 [01:00<00:29,  1.22it/s]

14540
5000
Merge resulted in too many rows. Check for duplicates.


 70%|██████▉   | 80/115 [01:01<00:28,  1.21it/s]

14906
5000
Merge resulted in too many rows. Check for duplicates.


 70%|███████   | 81/115 [01:02<00:27,  1.22it/s]

14120
5000
Merge resulted in too many rows. Check for duplicates.


 71%|███████▏  | 82/115 [01:03<00:26,  1.23it/s]

12117
5000
Merge resulted in too many rows. Check for duplicates.


 73%|███████▎  | 84/115 [01:04<00:24,  1.27it/s]

10592
5000
Merge resulted in too many rows. Check for duplicates.


 74%|███████▍  | 85/115 [01:05<00:23,  1.26it/s]

14087
5000
Merge resulted in too many rows. Check for duplicates.
25687
5000
Merge resulted in too many rows. Check for duplicates.


 76%|███████▌  | 87/115 [01:07<00:23,  1.20it/s]

14366
5000
Merge resulted in too many rows. Check for duplicates.


 77%|███████▋  | 88/115 [01:07<00:22,  1.21it/s]

13394
5000
Merge resulted in too many rows. Check for duplicates.


 77%|███████▋  | 89/115 [01:08<00:21,  1.22it/s]

15272
5000
Merge resulted in too many rows. Check for duplicates.


 78%|███████▊  | 90/115 [01:09<00:20,  1.23it/s]

11869
5000
Merge resulted in too many rows. Check for duplicates.


 79%|███████▉  | 91/115 [01:10<00:19,  1.25it/s]

10080
5000
Merge resulted in too many rows. Check for duplicates.


 80%|████████  | 92/115 [01:11<00:18,  1.26it/s]

10780
5000
Merge resulted in too many rows. Check for duplicates.


 81%|████████  | 93/115 [01:11<00:17,  1.25it/s]

15211
5000
Merge resulted in too many rows. Check for duplicates.
24402
5000
Merge resulted in too many rows. Check for duplicates.


 83%|████████▎ | 95/115 [01:13<00:16,  1.21it/s]

12996
5000
Merge resulted in too many rows. Check for duplicates.


 83%|████████▎ | 96/115 [01:14<00:15,  1.23it/s]

13162
5000
Merge resulted in too many rows. Check for duplicates.


 84%|████████▍ | 97/115 [01:15<00:14,  1.22it/s]

15180
5000
Merge resulted in too many rows. Check for duplicates.


 85%|████████▌ | 98/115 [01:16<00:13,  1.22it/s]

11775
5000
Merge resulted in too many rows. Check for duplicates.


 87%|████████▋ | 100/115 [01:17<00:11,  1.26it/s]

11007
5000
Merge resulted in too many rows. Check for duplicates.


 88%|████████▊ | 101/115 [01:18<00:11,  1.24it/s]

17336
5000
Merge resulted in too many rows. Check for duplicates.
23378
5000
Merge resulted in too many rows. Check for duplicates.


 90%|████████▉ | 103/115 [01:20<00:09,  1.21it/s]

13667
5000
Merge resulted in too many rows. Check for duplicates.


 90%|█████████ | 104/115 [01:20<00:09,  1.22it/s]

13929
5000
Merge resulted in too many rows. Check for duplicates.


 91%|█████████▏| 105/115 [01:21<00:08,  1.23it/s]

13960
5000
Merge resulted in too many rows. Check for duplicates.


 92%|█████████▏| 106/115 [01:22<00:07,  1.23it/s]

12753
5000
Merge resulted in too many rows. Check for duplicates.


 95%|█████████▍| 109/115 [01:24<00:04,  1.31it/s]

11336
5000
Merge resulted in too many rows. Check for duplicates.


 96%|█████████▌| 110/115 [01:25<00:03,  1.30it/s]

13064
5000
Merge resulted in too many rows. Check for duplicates.


 97%|█████████▋| 111/115 [01:26<00:03,  1.29it/s]

13697
5000
Merge resulted in too many rows. Check for duplicates.


 97%|█████████▋| 112/115 [01:27<00:02,  1.30it/s]

11347
5000
Merge resulted in too many rows. Check for duplicates.


 98%|█████████▊| 113/115 [01:27<00:01,  1.30it/s]

12132
5000
Merge resulted in too many rows. Check for duplicates.


 99%|█████████▉| 114/115 [01:28<00:00,  1.30it/s]

13494
5000
Merge resulted in too many rows. Check for duplicates.


100%|██████████| 115/115 [01:29<00:00,  1.29it/s]


In [68]:
check = pd.read_parquet("datatest573230.parquet", engine="pyarrow")


In [69]:
check.nunique()


Entry_x                        570558
Sequence                       482356
length                           3464
mol_weight                     480443
iso_point                      101191
                                ...  
Subcellular location [CC].1     59332
Function [CC]                  114896
Domain [CC]                     13902
Induction                       16931
Disruption phenotype.1          19927
Length: 73, dtype: int64

In [72]:
checkdropped = check.drop_duplicates(subset=['Sequence']).reset_index(drop=True)


In [None]:
len(checkdropped)


482356

: 