In [None]:
import pandas as pd
import numpy as np
import localcider
from localcider.sequenceParameters import SequenceParameters

In [None]:
# Input table; later cells read its "sequence" column.
# REPLACE WITH YOUR CSV
csv_path = "ZincFinger_Classical_RBD.csv"
df = pd.read_csv(csv_path)

In [None]:
# Sanity check: count missing entries in the "sequence" column (expected: 0).
nan_count = int(df["sequence"].isna().sum())
print("Number of NaN sequences:", nan_count)

In [None]:
# Count the sequences that contain the letter "U" (expected: 0).
# If the count is non-zero, turn every U into C before continuing.
# Missing sequences are treated as "no U" (na=False) so this check cannot
# crash on a NaN entry; regex=False makes it a plain substring match.
int(df["sequence"].str.contains("U", regex=False, na=False).sum())

In [None]:
# Wrap every sequence string in a localcider SequenceParameters object,
# keeping the same order as the rows of df.
sequences = df["sequence"].tolist()
sequence_objects = [SequenceParameters(sequence) for sequence in sequences]

In [None]:
# Names of the SequenceParameters methods to evaluate for every sequence.
# Each one is looked up by name and called without arguments in a later cell,
# and the column it fills is named after the part following "get_".
# Only that part is spelled out here; the shared prefix is attached once.
# NOTE(review): amino_acid_fractions presumably yields a per-residue mapping
# rather than a single number -- confirm that is what the column should hold.
cider_functions = [
    f"get_{suffix}"
    for suffix in (
        "FCR",
        "NCPR",
        "isoelectric_point",
        "molecular_weight",
        "countNeg",
        "countPos",
        "countNeut",
        "fraction_negative",
        "fraction_positive",
        "fraction_expanding",
        "amino_acid_fractions",
        "fraction_disorder_promoting",
        "mean_net_charge",
        "mean_hydropathy",
        "uversky_hydropathy",
        "PPII_propensity",
    )
]

In [None]:
# Apply the functions: outputs[i][j] is the result of cider_functions[i]
# evaluated on sequence_objects[j].
def _evaluate_on_all(method_name):
    """Call the zero-argument method `method_name` on every sequence object."""
    return [getattr(obj, method_name)() for obj in sequence_objects]


outputs = [_evaluate_on_all(method_name) for method_name in cider_functions]

In [None]:
# Add the outputs to the dataframe, one column per function.
# The column name is the function name with "get_" removed.
column_names = [func_name.replace("get_", "") for func_name in cider_functions]
for column, values in zip(column_names, outputs):
    df[column] = values

In [None]:
# Specify amino acid fractions to be calculated.
# Every entry maps a column name to a (numerator residues, denominator residues)
# pair; the fraction is count(numerator) / count(denominator) within a sequence.

STANDARD_AA = "ACDEFGHIKLMNPQRSTVWY"
aa_list = list(STANDARD_AA)

# One fraction per residue, relative to all residues of the alphabet above.
individual_fractions = dict(
    (f"Fraction_{aa}", (aa, STANDARD_AA)) for aa in aa_list
)

# Residue groups, relative to all residues of the alphabet above.
composite_fractions = {
    f"Fraction_{group}": (group, STANDARD_AA)
    for group in ("ILMV", "RK", "DE", "GS", "FWY")
}

# Ratios of one residue group to a restricted set of residues.
composite_fractions.update(
    {
        f"Fraction_{numerator}_over_{denominator}": (numerator, denominator)
        for numerator, denominator in (
            ("R", "RK"),
            ("D", "DE"),
            ("S", "SG"),
            ("N", "NQ"),
            ("Y", "YF"),
            ("F", "FW"),
            ("Y", "YW"),
            ("R", "RQ"),
            ("K", "KQ"),
            ("FWY", "FWYILV"),
            ("FWY", "FWYR"),
        )
    }
)

# Individual fractions first, composite fractions after them.
fractions = dict(individual_fractions)
fractions.update(composite_fractions)

In [None]:
# Function to compute the fraction
def compute_fraction(seq, numerator_set, denominator_set):
    """Return count(numerator residues) / count(denominator residues) in `seq`.

    Parameters
    ----------
    seq : str
        Sequence to analyse; it is upper-cased before counting.
    numerator_set : container of single-letter codes (e.g. a str)
        Residues counted in the numerator.
    denominator_set : container of single-letter codes (e.g. a str)
        Residues counted in the denominator.

    Returns
    -------
    float
        The ratio of the two counts, or NaN when `seq` is missing or empty,
        or when no residue of `denominator_set` occurs in `seq`.

    Notes
    -----
    The numerator is counted independently of the denominator, so the result
    can exceed 1 if `numerator_set` is not contained in `denominator_set`.
    """
    # Missing or empty input: nothing to count.
    if pd.isna(seq) or not seq:
        return np.nan

    hits = 0
    total = 0
    for residue in seq.upper():
        if residue in numerator_set:
            hits += 1
        if residue in denominator_set:
            total += 1

    # Avoid dividing by zero when no denominator residue is present.
    if total == 0:
        return np.nan
    return hits / total

In [None]:
# Compute the fractions and add into dataframe: one new column per entry of
# `fractions`, evaluated on the "sequence" column of df.
# The numerator/denominator sets are handed to compute_fraction through
# `args`, so each call receives the sets of the current loop iteration.
for name, (num_set, denom_set) in fractions.items():
    df[name] = df["sequence"].apply(compute_fraction, args=(num_set, denom_set))