In [None]:
import pandas as pd
# Load the raw table; 'cas.csv' is resolved relative to the working directory.
df = pd.read_csv('cas.csv')
# Preview the first rows. The first CSV column has no header name, so pandas
# labels it 'Unnamed: 0' (visible in the output below).
df.head()

Unnamed: 0,target,sequence,design,cas_avg,kd_avg
0,P,GCaagcatcctGT,LLDDDDDDDDDLL,285.381886,25.500862
1,P,GTTactgccttcTTAc,LLLDDDDDDDDDLLLD,185.488569,27.145766
2,C,TTGaataagtggaTGT,LLLDDDDDDDDDDLLL,113.422667,78.176219
3,C,CcAAAtcttataataACtAC,LDLLLDDDDDDDDDDLLDLL,163.37202,78.082731
4,A,TGGCaagcatccTGTA,LLLLDDDDDDDDLLLL,348.966482,88.271469


In [None]:
# Binary label derived from 'kd_avg': 1 when the value is at or below the
# threshold, 0 otherwise (missing values compare False and therefore map to 0).
ACTIVITY_KD_THRESHOLD = 50
# Vectorized comparison instead of a per-row lambda; int64 matches the dtype
# the original apply() produced.
df['activity'] = (df['kd_avg'] <= ACTIVITY_KD_THRESHOLD).astype('int64')
df.head()

Unnamed: 0,target,sequence,design,cas_avg,kd_avg,activity
0,P,GCaagcatcctGT,LLDDDDDDDDDLL,285.381886,25.500862,1
1,P,GTTactgccttcTTAc,LLLDDDDDDDDDLLLD,185.488569,27.145766,1
2,C,TTGaataagtggaTGT,LLLDDDDDDDDDDLLL,113.422667,78.176219,0
3,C,CcAAAtcttataataACtAC,LDLLLDDDDDDDDDDLLDLL,163.37202,78.082731,0
4,A,TGGCaagcatccTGTA,LLLLDDDDDDDDLLLL,348.966482,88.271469,0


In [None]:
# Persist the labelled table without the pandas index; the augmentation cell
# below reads this file back via main("cas_activity.csv", ...).
df.to_csv('cas_activity.csv', index=False)

In [None]:
from IPython import get_ipython
from IPython.display import display
# %%
import pandas as pd
import random
import os
from google.colab import files # Import the files module

# Replacement alphabet used by random_replace. Despite the variable name, these
# five letters are nucleotide base letters (A, G, U, C, T), not amino acids.
AMINO_ACIDS = list("AGUCT")  # nucleotide letters; name kept as-is
# Mask token string; not referenced by any function visible in this cell.
MASK_TOKEN = "[MASK]"

def random_replace(seq, fraction):
    """Overwrite a random subset of positions in ``seq`` with random letters.

    Roughly ``fraction`` of the positions (at least one, at most all of them)
    are chosen without replacement, and each chosen position is overwritten
    with a letter drawn from the module-level ``AMINO_ACIDS`` list. The drawn
    letter may equal the one already present, so fewer characters than
    selected positions may visibly change.

    Args:
        seq: Input sequence string.
        fraction: Share of positions to overwrite.

    Returns:
        A new string with the same length as ``seq``. An empty ``seq`` is
        returned unchanged instead of raising ``ValueError``.
    """
    chars = list(seq)
    if not chars:
        # Nothing to sample from; random.sample would raise on an empty range.
        return ''
    # At least one position, but never more than exist (fraction > 1).
    n = min(len(chars), max(1, int(len(chars) * fraction)))
    for i in random.sample(range(len(chars)), n):
        chars[i] = random.choice(AMINO_ACIDS)
    return ''.join(chars)

def random_delete(seq, fraction):
    """Delete a random subset of positions from ``seq``.

    Roughly ``fraction`` of the positions (at least one, at most all of them)
    are chosen without replacement and removed; the remaining characters keep
    their relative order.

    Args:
        seq: Input sequence string.
        fraction: Share of positions to delete.

    Returns:
        The shortened string. An empty ``seq`` is returned unchanged instead
        of raising ``ValueError``; a one-character ``seq`` becomes empty.
    """
    chars = list(seq)
    if not chars:
        # Nothing to sample from; random.sample would raise on an empty range.
        return ''
    # At least one position, but never more than exist (fraction > 1).
    n = min(len(chars), max(1, int(len(chars) * fraction)))
    indices_to_delete = set(random.sample(range(len(chars)), n))
    return ''.join(aa for i, aa in enumerate(chars) if i not in indices_to_delete)

def random_replace_with_U(seq, fraction):
    """Overwrite a random subset of positions in ``seq`` with the letter 'U'.

    Roughly ``fraction`` of the positions (at least one, at most all of them)
    are chosen without replacement and set to ``'U'``.

    Args:
        seq: Input sequence string.
        fraction: Share of positions to overwrite.

    Returns:
        A new string with the same length as ``seq``. An empty ``seq`` is
        returned unchanged instead of raising ``ValueError``.
    """
    chars = list(seq)
    if not chars:
        # Nothing to sample from; random.sample would raise on an empty range.
        return ''
    # At least one position, but never more than exist (fraction > 1).
    n = min(len(chars), max(1, int(len(chars) * fraction)))
    for i in random.sample(range(len(chars)), n):
        chars[i] = 'U'
    return ''.join(chars)

def random_swap(seq, fraction):
    """Swap randomly chosen adjacent character pairs in ``seq``.

    Roughly ``fraction * len(seq)`` left-hand indices (at least one) are drawn
    without replacement from the ``len(seq) - 1`` adjacent pairs, and each pair
    ``(i, i + 1)`` is swapped in the order drawn. Pairs may overlap, so a
    character can move more than one position.

    Args:
        seq: Input sequence string.
        fraction: Share of the sequence length used to size the number of swaps.

    Returns:
        A string with the same characters as ``seq`` in a perturbed order.
        Sequences shorter than two characters are returned unchanged, and the
        swap count is capped at the number of adjacent pairs; both cases
        previously raised ``ValueError`` from ``random.sample``.
    """
    chars = list(seq)
    pair_count = len(chars) - 1
    if pair_count < 1:
        # No adjacent pair exists, so there is nothing to swap.
        return ''.join(chars)
    # At least one swap, but never more than the number of adjacent pairs.
    n = min(pair_count, max(1, int(len(chars) * fraction)))
    for i in random.sample(range(pair_count), n):
        chars[i], chars[i + 1] = chars[i + 1], chars[i]
    return ''.join(chars)

def random_insertion_with_U(seq, fraction):
    """Insert 'U' characters at random positions of ``seq``.

    The number of insertions is ``int(len(seq) * fraction)``, but never fewer
    than one. Each insertion position is drawn uniformly over the current
    (already grown) sequence, including both ends.

    Args:
        seq: Input sequence string.
        fraction: Share of the original length that sizes the insertion count.

    Returns:
        The lengthened string; removing the inserted 'U's from the result of a
        'U'-free input gives back the input.
    """
    chars = [*seq]
    insert_count = max(1, int(len(chars) * fraction))
    done = 0
    while done < insert_count:
        # randint is inclusive on both ends, so appending at the end is possible.
        position = random.randint(0, len(chars))
        chars.insert(position, 'U')
        done += 1
    return ''.join(chars)

def apply_augmentation(df, func, fraction, output_filename):
    """Augment the 'sequence' column of ``df`` and write the result to CSV.

    Spaces are stripped from every sequence before ``func(sequence, fraction)``
    is applied. Only the 'sequence' column is replaced; all other columns are
    copied unchanged, and ``df`` itself is not modified.

    Args:
        df: DataFrame containing a 'sequence' column of strings.
        func: Callable taking ``(sequence, fraction)`` and returning a string.
        fraction: Passed through to ``func``.
        output_filename: Destination CSV path (written without the index).
    """
    def _augment(raw_sequence):
        return func(raw_sequence.replace(" ", ""), fraction)

    augmented_df = df.copy().assign(sequence=df["sequence"].apply(_augment))
    augmented_df.to_csv(output_filename, index=False)
    print(f"Saved: {output_filename}")

def main(csv_path, fraction=0.1, seed=None):
    """Generate one augmented CSV per augmentation function and download them.

    Reads ``csv_path``, applies each augmentation to the 'sequence' column via
    ``apply_augmentation``, writes the results under ``augmented_csvs/`` and
    then hands every written file to ``files.download`` (imported from
    ``google.colab`` at the top of this cell).

    Args:
        csv_path: Path of the input CSV; must contain a 'sequence' column.
        fraction: Share of each sequence to perturb, passed to every
            augmentation function.
        seed: Optional seed for the ``random`` module. When given, the
            augmented outputs are reproducible across runs; ``None`` (the
            default) keeps the previous unseeded behavior.
    """
    if seed is not None:
        random.seed(seed)

    df = pd.read_csv(csv_path)

    os.makedirs("augmented_csvs", exist_ok=True)

    # Explicit output-path -> function mapping. This replaces looking the
    # function up in globals() by a name parsed out of the file path, which
    # broke silently on any rename. Dict order fixes the processing order.
    augmentations = {
        "augmented_csvs/random_replace.csv": random_replace,
        "augmented_csvs/random_delete.csv": random_delete,
        "augmented_csvs/random_replace_with_U.csv": random_replace_with_U,
        "augmented_csvs/random_swap.csv": random_swap,
        "augmented_csvs/random_insertion_with_U.csv": random_insertion_with_U,
    }

    for output_file, augment in augmentations.items():
        apply_augmentation(df, augment, fraction, output_file)

    # Download the generated files
    for file_to_download in augmentations:
        if os.path.exists(file_to_download):
            files.download(file_to_download)
            print(f"Downloaded: {file_to_download}")
        else:
            print(f"File not found: {file_to_download}")


# Run the augmentation pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main("cas_activity.csv", fraction=0.1) # change "cas_activity.csv" to your input file path

Saved: augmented_csvs/random_replace.csv
Saved: augmented_csvs/random_delete.csv
Saved: augmented_csvs/random_replace_with_U.csv
Saved: augmented_csvs/random_swap.csv
Saved: augmented_csvs/random_insertion_with_U.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_delete.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace_with_U.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_swap.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_insertion_with_U.csv


In [None]:
import pandas as pd
import os


def merge_csv_files(file_paths):
    """Read every existing CSV in ``file_paths`` and stack them row-wise.

    Each file is read with its own header row. Reading later files with
    ``header=None`` (as this cell used to) turns their header line into a data
    row and forces every column to string dtype.

    Args:
        file_paths: Iterable of CSV paths; missing paths are reported and skipped.

    Returns:
        One DataFrame with a fresh integer index, or an empty DataFrame when
        none of the paths exist.
    """
    frames = []
    for file_path in file_paths:
        if os.path.exists(file_path):
            frames.append(pd.read_csv(file_path))
            print(f"Processed: {file_path}")
        else:
            print(f"File not found, skipping: {file_path}")
    if not frames:
        # pd.concat raises on an empty list, so return an empty frame instead.
        return pd.DataFrame()
    # Single concat instead of growing the frame inside the loop.
    return pd.concat(frames, ignore_index=True)


# List of the generated augmented CSV files
augmented_files = [
    "augmented_csvs/random_replace.csv",
    "augmented_csvs/random_delete.csv",
    "augmented_csvs/random_replace_with_U.csv",
    "augmented_csvs/random_swap.csv",
    "augmented_csvs/random_insertion_with_U.csv",
]

merged_df = merge_csv_files(augmented_files)

# Drop rows that are identical across all columns and report how many went.
initial_row_count = len(merged_df)
merged_df_cleaned = merged_df.drop_duplicates()
rows_dropped = initial_row_count - len(merged_df_cleaned)

print(f"Initial row count: {initial_row_count}")
print(f"Row count after dropping duplicates: {len(merged_df_cleaned)}")
print(f"Number of rows dropped: {rows_dropped}")

# Save the merged DataFrame to a new CSV file
output_merged_file = "augmented_csvs/cas_augmented_data.csv"
# Make sure the target directory exists so the save cannot fail on a fresh run.
os.makedirs(os.path.dirname(output_merged_file), exist_ok=True)
merged_df_cleaned.to_csv(output_merged_file, index=False)
print(f"Merged data saved to: {output_merged_file}")

Processed: augmented_csvs/random_replace.csv
Processed: augmented_csvs/random_delete.csv
Processed: augmented_csvs/random_replace_with_U.csv
Processed: augmented_csvs/random_swap.csv
Processed: augmented_csvs/random_insertion_with_U.csv
Initial row count: 3864
Row count after dropping duplicates: 3861
Number of rows dropped: 3
Merged data saved to: augmented_csvs/cas_augmented_data.csv


In [None]:
import pandas as pd

# NOTE(review): this reads "cas_augmented_data.csv" from the working directory,
# whereas the merge cell above writes "augmented_csvs/cas_augmented_data.csv".
# As written, this cell does not load the file produced by the merge step —
# confirm which file is intended.
data = pd.read_csv("cas_augmented_data.csv")

# Number of rows per 'activity' class.
counts = data['activity'].value_counts()
print(counts)

activity
0    2400
1    2232
Name: count, dtype: int64


In [None]:
# Remove rows that are exact duplicates across every column and report the effect.
data_cleaned = data.drop_duplicates()
initial_row_count = len(data)
rows_dropped = initial_row_count - len(data_cleaned)

for report_line in (
    f"Initial row count: {initial_row_count}",
    f"Row count after dropping duplicates: {len(data_cleaned)}",
    f"Number of rows dropped: {rows_dropped}",
):
    print(report_line)

Initial row count: 4632
Row count after dropping duplicates: 4439
Number of rows dropped: 193


In [None]:
# Continue from the de-duplicated frame and run the same check again; a second
# pass is expected to remove nothing.
data = data_cleaned
data_cleaned = data.drop_duplicates()
initial_row_count = len(data)
rows_dropped = initial_row_count - len(data_cleaned)

for report_line in (
    f"Initial row count: {initial_row_count}",
    f"Row count after dropping duplicates: {len(data_cleaned)}",
    f"Number of rows dropped: {rows_dropped}",
):
    print(report_line)

Initial row count: 4439
Row count after dropping duplicates: 4439
Number of rows dropped: 0


In [None]:
data = data_cleaned

# Identify the two class labels explicitly. Calling idxmin() and idxmax() on
# the same counts returns the SAME label when the classes are tied, which
# would duplicate one class and drop the other; taking the majority from the
# remaining labels avoids that and is identical when there is no tie.
class_counts = data['activity'].value_counts()
if len(class_counts) < 2:
    raise ValueError("Need at least two 'activity' classes to balance the data")
minority_label = class_counts.idxmin()
majority_label = class_counts.drop(minority_label).idxmax()

# Separate classes
minority_class = data[data['activity'] == minority_label]
majority_class = data[data['activity'] == majority_label]

# Undersample majority class down to the minority size (fixed seed for reproducibility)
majority_class_sampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the balanced classes
balanced_data = pd.concat([minority_class, majority_class_sampled])

# Shuffle the balanced data
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

counts = balanced_data['activity'].value_counts()
print(counts)

activity
1    2136
0    2136
Name: count, dtype: int64


In [None]:
# Write the balanced, shuffled table to the working directory without the index.
balanced_data.to_csv('cas_augmented_balanced_data.csv', index=False)