In [None]:
import pandas as pd
import numpy as np
import metapredict as meta

In [None]:
# Load the input table; replace the path with your own CSV.
# The later cells read the "Meta_disordered_sequence" and
# "Meta_folded_sequence" columns from this frame.
df = pd.read_csv("CIDER_Condenseq_Metapredict.csv")

# Every later cell reads from and writes to `rbp`, which was never defined in
# this notebook, so a fresh-kernel run stopped with NameError. Bind it to the
# loaded frame (same object as `df`, not a copy).
# NOTE(review): assumes this CSV is the table the later cells call `rbp` - confirm.
rbp = df

In [None]:
import localcider

In [None]:
from localcider.sequenceParameters import SequenceParameters

In [None]:
# Names of the zero-argument SequenceParameters methods evaluated for every
# sequence. Each name later becomes a column name by swapping the "get_"
# prefix for a region-specific prefix.
cider_functions = [
    f"get_{metric}"
    for metric in (
        "FCR",
        "NCPR",
        "isoelectric_point",
        "molecular_weight",
        "countNeg",
        "countPos",
        "countNeut",
        "fraction_negative",
        "fraction_positive",
        "fraction_expanding",
        "amino_acid_fractions",
        "fraction_disorder_promoting",
        "mean_net_charge",
        "mean_hydropathy",
        "uversky_hydropathy",
        "PPII_propensity",
    )
]

In [None]:
def clean_sequence(seq):
    """Remove every ".." substring from a sequence string.

    Parameters
    ----------
    seq : str or missing value
        Sequence text. Any value for which ``pd.isna`` is true is treated
        as missing.

    Returns
    -------
    str or float
        ``seq`` with all ".." occurrences removed, or ``np.nan`` when
        ``seq`` is missing.
    """
    if pd.isna(seq):
        # NumPy has no `np.na` attribute; the previous `np.na` raised
        # AttributeError for every missing value.
        return np.nan
    # presumably ".." marks the join between concatenated segments - confirm
    return seq.replace("..", "")

In [None]:
# Clean each region's sequence column row by row, producing exactly one entry
# per row of `rbp`.
#
# Missing sequences are kept as None placeholders rather than filtered out:
# dropping them made these lists shorter than the frame, so the later
# `rbp[...] = ...` column assignments no longer lined up with the rows.
# None is used (not NaN) because it is falsy, so the `if sequence:` checks
# that build the SequenceParameters objects skip it, and `pd.isna(None)` is
# true, so `compute_fraction` treats it as missing as well.
clean_disordered_sequences = [
    clean_sequence(seq) if pd.notna(seq) else None
    for seq in rbp["Meta_disordered_sequence"]
]

clean_folded_sequences = [
    clean_sequence(seq) if pd.notna(seq) else None
    for seq in rbp["Meta_folded_sequence"]
]

In [None]:
# Wrap every truthy cleaned disordered sequence in a SequenceParameters
# object; falsy entries (e.g. an empty string) get a None placeholder so list
# positions are preserved.
disordered_sequence_objects = [
    SequenceParameters(cleaned) if cleaned else None
    for cleaned in clean_disordered_sequences
]

In [None]:
# Wrap every truthy cleaned folded sequence in a SequenceParameters object;
# falsy entries (e.g. an empty string) get a None placeholder so list
# positions are preserved.
folded_sequence_objects = [
    SequenceParameters(cleaned) if cleaned else None
    for cleaned in clean_folded_sequences
]

In [None]:
# One inner list per entry of `cider_functions`, each holding that method's
# result for every disordered sequence object. Falsy placeholders yield NaN
# so each inner list keeps the same length as the object list.
disordered_outputs = [
    [
        np.nan if not seq_obj else getattr(seq_obj, method_name)()
        for seq_obj in disordered_sequence_objects
    ]
    for method_name in cider_functions
]

In [None]:
# One inner list per entry of `cider_functions`, each holding that method's
# result for every folded sequence object. Falsy placeholders yield NaN so
# each inner list keeps the same length as the object list.
folded_outputs = [
    [
        np.nan if not seq_obj else getattr(seq_obj, method_name)()
        for seq_obj in folded_sequence_objects
    ]
    for method_name in cider_functions
]

In [None]:
# Store each method's results as a column named after the method, with the
# "get_" prefix swapped for "Meta_disordered_".
for func_name, values in zip(cider_functions, disordered_outputs):
    disordered_column = func_name.replace("get_", "Meta_disordered_")
    rbp[disordered_column] = values

In [None]:
# Store each method's results as a column named after the method, with the
# "get_" prefix swapped for "Meta_folded_".
for func_name, values in zip(cider_functions, folded_outputs):
    folded_column = func_name.replace("get_", "Meta_folded_")
    rbp[folded_column] = values

In [None]:
# Every entry below maps a label to a (numerator residues, denominator
# residues) pair; the fraction is count(numerator) / count(denominator).
all_amino_acids = "ACDEFGHIKLMNPQRSTVWY"
aa_list = list(all_amino_acids)

# Single residues, each measured against all twenty residue letters.
individual_fractions = {
    f"Fraction_{aa}": (aa, all_amino_acids) for aa in aa_list
}

# Residue groups measured against all twenty residue letters ...
composite_fractions = {
    f"Fraction_{group}": (group, all_amino_acids)
    for group in ("ILMV", "RK", "DE", "GS", "FWY")
}
# ... followed by ratios of one residue set within a larger set.
composite_fractions.update(
    {
        f"Fraction_{part}_over_{whole}": (part, whole)
        for part, whole in (
            ("R", "RK"),
            ("D", "DE"),
            ("S", "SG"),
            ("N", "NQ"),
            ("Y", "YF"),
            ("F", "FW"),
            ("Y", "YW"),
            ("R", "RQ"),
            ("K", "KQ"),
            ("FWY", "FWYILV"),
            ("FWY", "FWYR"),
        )
    }
)

# Single lookup with the individual entries first, then the composite ones.
fractions = {}
fractions.update(individual_fractions)
fractions.update(composite_fractions)

In [None]:
# Same (numerator, denominator) specs as `fractions`, with each label
# prefixed for the disordered region so it can be used as a column name.
# NOTE(review): this prefix capitalises "Disordered", whereas the CIDER
# columns use "Meta_disordered_" - confirm the difference is intentional.
disordered_fractions = {
    "Meta_Disordered_" + label: spec for label, spec in fractions.items()
}

In [None]:
# Same (numerator, denominator) specs as `fractions`, with each label
# prefixed for the folded region so it can be used as a column name.
# NOTE(review): this prefix capitalises "Folded", whereas the CIDER columns
# use "Meta_folded_" - confirm the difference is intentional.
folded_fractions = {
    "Meta_Folded_" + label: spec for label, spec in fractions.items()
}

In [None]:
def compute_fraction(seq, numerator_set, denominator_set):
    """Return the ratio of numerator residues to denominator residues in ``seq``.

    Parameters
    ----------
    seq : str or missing value
        Sequence to analyse; it is upper-cased before counting.
    numerator_set, denominator_set
        Containers supporting ``in`` for single characters. The fraction
        tables in this notebook pass strings of one-letter residue codes.

    Returns
    -------
    float
        Number of residues found in ``numerator_set`` divided by the number
        found in ``denominator_set``. ``np.nan`` when ``seq`` is missing or
        empty, or when no residue falls in ``denominator_set``.
    """
    # Missing or empty input has no defined fraction.
    if pd.isna(seq) or not seq:
        return np.nan

    residues = seq.upper()
    numerator_hits = len([aa for aa in residues if aa in numerator_set])
    denominator_hits = len([aa for aa in residues if aa in denominator_set])

    # Avoid dividing by zero when the denominator residues are absent.
    if denominator_hits > 0:
        return numerator_hits / denominator_hits
    return np.nan

In [None]:
# Store the cleaned sequences on the frame so the fraction calculations can
# read them back per row.
for combined_column, cleaned_sequences in (
    ("Meta_disordered_combined", clean_disordered_sequences),
    ("Meta_folded_combined", clean_folded_sequences),
):
    rbp[combined_column] = cleaned_sequences

In [None]:
# Add one column per disordered fraction spec, computed row by row from the
# cleaned disordered sequences. The residue sets are forwarded to
# `compute_fraction` as extra positional arguments.
for name, (num_set, denom_set) in disordered_fractions.items():
    rbp[name] = rbp["Meta_disordered_combined"].apply(
        compute_fraction, args=(num_set, denom_set)
    )

In [None]:
# Add one column per folded fraction spec, computed row by row from the
# cleaned folded sequences. The residue sets are forwarded to
# `compute_fraction` as extra positional arguments.
for name, (num_set, denom_set) in folded_fractions.items():
    rbp[name] = rbp["Meta_folded_combined"].apply(
        compute_fraction, args=(num_set, denom_set)
    )