In [1]:
from IPython import get_ipython
from IPython.display import display
# %%
import pandas as pd
import random
import os
from google.colab import files # Import the files module

AMINO_ACIDS = list("ACDEFGHIKLMNPQRSTVWY")  # standard amino acids
MASK_TOKEN = "[MASK]"

def random_replace(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = random.choice(AMINO_ACIDS)
    return ''.join(seq)

def random_delete(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices_to_delete = set(random.sample(range(len(seq)), n))
    return ''.join([aa for i, aa in enumerate(seq) if i not in indices_to_delete])

def random_replace_with_A(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = 'A'
    return ''.join(seq)

def random_swap(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq) - 1), n)
    for i in indices:
        seq[i], seq[i + 1] = seq[i + 1], seq[i]
    return ''.join(seq)

def random_insertion_with_A(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    for _ in range(n):
        idx = random.randint(0, len(seq))
        seq.insert(idx, 'A')
    return ''.join(seq)

def random_replace_with_G(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = 'G'
    return ''.join(seq)

def random_insertion_with_G(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    for _ in range(n):
        idx = random.randint(0, len(seq))
        seq.insert(idx, 'G')
    return ''.join(seq)

def random_replace_with_V(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = 'V'
    return ''.join(seq)

def random_insertion_with_V(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    for _ in range(n):
        idx = random.randint(0, len(seq))
        seq.insert(idx, 'V')
    return ''.join(seq)

def random_replace_with_I(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = 'I'
    return ''.join(seq)

def random_insertion_with_I(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    for _ in range(n):
        idx = random.randint(0, len(seq))
        seq.insert(idx, 'I')
    return ''.join(seq)

def random_replace_with_L(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = 'L'
    return ''.join(seq)

def random_insertion_with_L(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    for _ in range(n):
        idx = random.randint(0, len(seq))
        seq.insert(idx, 'L')
    return ''.join(seq)

def apply_augmentation(df, func, fraction, output_filename):
    augmented_df = df.copy()
    augmented_df["sequence"] = df["sequence"].apply(lambda x: func(x.replace(" ", ""), fraction))
    augmented_df.to_csv(output_filename, index=False)
    print(f"Saved: {output_filename}")

def main(csv_path, fraction=0.1):
    df = pd.read_csv(csv_path)

    os.makedirs("augmented_csvs", exist_ok=True)

    augmented_files = [
        "augmented_csvs/random_replace.csv",
        "augmented_csvs/random_delete.csv",
        "augmented_csvs/random_replace_with_A.csv",
        "augmented_csvs/random_swap.csv",
        "augmented_csvs/random_insertion_with_A.csv",
        "augmented_csvs/random_replace_with_G.csv",
        "augmented_csvs/random_insertion_with_G.csv",
        "augmented_csvs/random_replace_with_V.csv",
        "augmented_csvs/random_insertion_with_V.csv",
        "augmented_csvs/random_replace_with_I.csv",
        "augmented_csvs/random_insertion_with_I.csv",
        "augmented_csvs/random_replace_with_L.csv",
        "augmented_csvs/random_insertion_with_L.csv"
    ]

    for output_file in augmented_files:
        apply_augmentation(df, globals()[output_file.split("/")[1].split(".")[0]], fraction, output_file)

    # Download the generated files
    for file_to_download in augmented_files:
        if os.path.exists(file_to_download):
            files.download(file_to_download)
            print(f"Downloaded: {file_to_download}")
        else:
            print(f"File not found: {file_to_download}")


if __name__ == "__main__":
    main("anticancer.csv", fraction=0.1) # change "peptides.csv" to your input file path

Saved: augmented_csvs/random_replace.csv
Saved: augmented_csvs/random_delete.csv
Saved: augmented_csvs/random_replace_with_A.csv
Saved: augmented_csvs/random_swap.csv
Saved: augmented_csvs/random_insertion_with_A.csv
Saved: augmented_csvs/random_replace_with_G.csv
Saved: augmented_csvs/random_insertion_with_G.csv
Saved: augmented_csvs/random_replace_with_V.csv
Saved: augmented_csvs/random_insertion_with_V.csv
Saved: augmented_csvs/random_replace_with_I.csv
Saved: augmented_csvs/random_insertion_with_I.csv
Saved: augmented_csvs/random_replace_with_L.csv
Saved: augmented_csvs/random_insertion_with_L.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_delete.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace_with_A.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_swap.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_insertion_with_A.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace_with_G.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_insertion_with_G.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace_with_V.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_insertion_with_V.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace_with_I.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_insertion_with_I.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace_with_L.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_insertion_with_L.csv


In [2]:
import pandas as pd
import os

# List of the generated augmented CSV files
augmented_files = [
    "augmented_csvs/random_replace.csv",
    "augmented_csvs/random_delete.csv",
    "augmented_csvs/random_replace_with_A.csv",
    "augmented_csvs/random_swap.csv",
    "augmented_csvs/random_insertion_with_A.csv",
    "augmented_csvs/random_replace_with_G.csv",
    "augmented_csvs/random_insertion_with_G.csv",
    "augmented_csvs/random_replace_with_V.csv",
    "augmented_csvs/random_insertion_with_V.csv",
    "augmented_csvs/random_replace_with_I.csv",
    "augmented_csvs/random_insertion_with_I.csv",
    "augmented_csvs/random_replace_with_L.csv",
    "augmented_csvs/random_insertion_with_L.csv"
]

# Initialize an empty DataFrame to store the merged data
merged_df = pd.DataFrame()

# Flag to check if it's the first file
first_file = True

for file_path in augmented_files:
    if os.path.exists(file_path):
        if first_file:
            # Read the first file with the header
            merged_df = pd.read_csv(file_path)
            first_file = False
        else:
            # Read the rest of the files without the header and append
            temp_df = pd.read_csv(file_path, header=None)
            # Assign column names from the first DataFrame's columns
            temp_df.columns = merged_df.columns
            merged_df = pd.concat([merged_df, temp_df], ignore_index=True)
        print(f"Processed: {file_path}")
    else:
        print(f"File not found, skipping: {file_path}")

initial_row_count = len(merged_df)
merged_df_cleaned = merged_df.drop_duplicates()
rows_dropped = initial_row_count - len(merged_df_cleaned)

print(f"Initial row count: {initial_row_count}")
print(f"Row count after dropping duplicates: {len(merged_df_cleaned)}")
print(f"Number of rows dropped: {rows_dropped}")

# Save the merged DataFrame to a new CSV file
output_merged_file = "augmented_csvs/anticancer_augmented_data.csv"
merged_df_cleaned.to_csv(output_merged_file, index=False)
print(f"Merged data saved to: {output_merged_file}")

Processed: augmented_csvs/random_replace.csv
Processed: augmented_csvs/random_delete.csv
Processed: augmented_csvs/random_replace_with_A.csv
Processed: augmented_csvs/random_swap.csv
Processed: augmented_csvs/random_insertion_with_A.csv
Processed: augmented_csvs/random_replace_with_G.csv
Processed: augmented_csvs/random_insertion_with_G.csv
Processed: augmented_csvs/random_replace_with_V.csv
Processed: augmented_csvs/random_insertion_with_V.csv
Processed: augmented_csvs/random_replace_with_I.csv
Processed: augmented_csvs/random_insertion_with_I.csv
Processed: augmented_csvs/random_replace_with_L.csv
Processed: augmented_csvs/random_insertion_with_L.csv
Initial row count: 22398
Row count after dropping duplicates: 22264
Number of rows dropped: 134
Merged data saved to: augmented_csvs/anticancer_augmented_data.csv


In [12]:
import pandas as pd

data = pd.read_csv("anticancer_augmented_data.csv")

counts = data['anticancer'].value_counts()
print(counts)

anticancer
0    11163
1    11100
Name: count, dtype: int64


In [13]:
initial_row_count = len(data)
data_cleaned = data.drop_duplicates()
rows_dropped = initial_row_count - len(data_cleaned)

print(f"Initial row count: {initial_row_count}")
print(f"Row count after dropping duplicates: {len(data_cleaned)}")
print(f"Number of rows dropped: {rows_dropped}")

Initial row count: 22263
Row count after dropping duplicates: 22232
Number of rows dropped: 31


In [14]:
data = data_cleaned
initial_row_count = len(data)
data_cleaned = data.drop_duplicates()
rows_dropped = initial_row_count - len(data_cleaned)

print(f"Initial row count: {initial_row_count}")
print(f"Row count after dropping duplicates: {len(data_cleaned)}")
print(f"Number of rows dropped: {rows_dropped}")

Initial row count: 22232
Row count after dropping duplicates: 22232
Number of rows dropped: 0


In [15]:
data = data_cleaned
# Separate classes
minority_class = data[data['anticancer'] == data['anticancer'].value_counts().idxmin()]
majority_class = data[data['anticancer'] == data['anticancer'].value_counts().idxmax()]

# Undersample majority class
majority_class_sampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the balanced classes
balanced_data = pd.concat([minority_class, majority_class_sampled])

# Shuffle the balanced data
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

counts = balanced_data['anticancer'].value_counts()
print(counts)

anticancer
1    11085
0    11085
Name: count, dtype: int64


In [19]:
balanced_data.to_csv('anticancer_augmented_balanced_data.csv', index=False)