In [1]:
import pandas as pd
df = pd.read_excel('neurotoxicity_dataset.xlsx')
df.head()

Unnamed: 0,Sequence,Set,Target,Length,Number_LNA,Number_A,Number_C,Number_T,Number_G,Length_Gfree_5,Length_Gfree_3,Calculated_score,Measured_CaO_score_cells,Acute_tolerance_score_mice
0,AAAtctataataaccacCAC,Control,,20,6,10,6,4,0,20,20,82.36,80.17,0.0
1,CAAAtcatccatctatAAAC,Control,,20,8,9,6,5,0,20,20,80.77,88.84,0.0
2,TACcatacaataactttAAC,Control,,20,6,9,5,6,0,20,20,81.15,76.53,0.0
3,CTAAatccttaatatcAAAC,Control,,20,8,9,5,6,0,20,20,81.15,79.26,0.0
4,CcAAAtcttataataACtAC,Control,,20,8,9,5,6,0,20,20,81.15,80.17,0.0


In [2]:
df['toxicity'] = df['Calculated_score'].apply(lambda x: 0 if x > 70 else 1)
df.head()

Unnamed: 0,Sequence,Set,Target,Length,Number_LNA,Number_A,Number_C,Number_T,Number_G,Length_Gfree_5,Length_Gfree_3,Calculated_score,Measured_CaO_score_cells,Acute_tolerance_score_mice,toxicity
0,AAAtctataataaccacCAC,Control,,20,6,10,6,4,0,20,20,82.36,80.17,0.0,0
1,CAAAtcatccatctatAAAC,Control,,20,8,9,6,5,0,20,20,80.77,88.84,0.0,0
2,TACcatacaataactttAAC,Control,,20,6,9,5,6,0,20,20,81.15,76.53,0.0,0
3,CTAAatccttaatatcAAAC,Control,,20,8,9,5,6,0,20,20,81.15,79.26,0.0,0
4,CcAAAtcttataataACtAC,Control,,20,8,9,5,6,0,20,20,81.15,80.17,0.0,0


In [3]:
counts = df['toxicity'].value_counts()
print(counts)

toxicity
1    1151
0     674
Name: count, dtype: int64


In [4]:
df.to_csv('neurotoxicity.csv', index=False)

In [5]:
from IPython import get_ipython
from IPython.display import display
# %%
import pandas as pd
import random
import os
from google.colab import files # Import the files module

NUCLEOTIDES = list("AGCT")  # standard amino acids
MASK_TOKEN = "[MASK]"

def random_replace(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = random.choice(NUCLEOTIDES)
    return ''.join(seq)

def random_delete(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices_to_delete = set(random.sample(range(len(seq)), n))
    return ''.join([aa for i, aa in enumerate(seq) if i not in indices_to_delete])

def random_replace_with_C(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = 'C'
    return ''.join(seq)

def random_replace_with_T(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq)), n)
    for i in indices:
        seq[i] = 'T'
    return ''.join(seq)

def random_swap(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    indices = random.sample(range(len(seq) - 1), n)
    for i in indices:
        seq[i], seq[i + 1] = seq[i + 1], seq[i]
    return ''.join(seq)

def random_insertion_with_C(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    for _ in range(n):
        idx = random.randint(0, len(seq))
        seq.insert(idx, 'C')
    return ''.join(seq)

def random_insertion_with_T(seq, fraction):
    seq = list(seq)
    n = max(1, int(len(seq) * fraction))
    for _ in range(n):
        idx = random.randint(0, len(seq))
        seq.insert(idx, 'T')
    return ''.join(seq)

def apply_augmentation(df, func, fraction, output_filename):
    augmented_df = df.copy()
    augmented_df["Sequence"] = df["Sequence"].apply(lambda x: func(x.replace(" ", ""), fraction))
    augmented_df.to_csv(output_filename, index=False)
    print(f"Saved: {output_filename}")

def main(csv_path, fraction=0.1):
    df = pd.read_csv(csv_path)

    os.makedirs("augmented_csvs", exist_ok=True)

    augmented_files = [
        "augmented_csvs/random_replace.csv",
        "augmented_csvs/random_delete.csv",
        "augmented_csvs/random_replace_with_C.csv",
        "augmented_csvs/random_replace_with_T.csv",
        "augmented_csvs/random_swap.csv",
        "augmented_csvs/random_insertion_with_C.csv",
        "augmented_csvs/random_insertion_with_T.csv"
    ]

    for output_file in augmented_files:
        apply_augmentation(df, globals()[output_file.split("/")[1].split(".")[0]], fraction, output_file)

    # Download the generated files
    for file_to_download in augmented_files:
        if os.path.exists(file_to_download):
            files.download(file_to_download)
            print(f"Downloaded: {file_to_download}")
        else:
            print(f"File not found: {file_to_download}")


if __name__ == "__main__":
    main("neurotoxicity.csv", fraction=0.1)

Saved: augmented_csvs/random_replace.csv
Saved: augmented_csvs/random_delete.csv
Saved: augmented_csvs/random_replace_with_C.csv
Saved: augmented_csvs/random_replace_with_T.csv
Saved: augmented_csvs/random_swap.csv
Saved: augmented_csvs/random_insertion_with_C.csv
Saved: augmented_csvs/random_insertion_with_T.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_delete.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace_with_C.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_replace_with_T.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_swap.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_insertion_with_C.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded: augmented_csvs/random_insertion_with_T.csv


In [6]:
import pandas as pd
import os

# List of the generated augmented CSV files
augmented_files = [
        "augmented_csvs/random_replace.csv",
        "augmented_csvs/random_delete.csv",
        "augmented_csvs/random_replace_with_C.csv",
        "augmented_csvs/random_replace_with_T.csv",
        "augmented_csvs/random_swap.csv",
        "augmented_csvs/random_insertion_with_C.csv",
        "augmented_csvs/random_insertion_with_T.csv"
]

# Initialize an empty DataFrame to store the merged data
merged_df = pd.DataFrame()

# Flag to check if it's the first file
first_file = True

for file_path in augmented_files:
    if os.path.exists(file_path):
        if first_file:
            # Read the first file with the header
            merged_df = pd.read_csv(file_path)
            first_file = False
        else:
            # Read the rest of the files without the header and append
            temp_df = pd.read_csv(file_path, header=None)
            # Assign column names from the first DataFrame's columns
            temp_df.columns = merged_df.columns
            merged_df = pd.concat([merged_df, temp_df], ignore_index=True)
        print(f"Processed: {file_path}")
    else:
        print(f"File not found, skipping: {file_path}")

initial_row_count = len(merged_df)
merged_df_cleaned = merged_df.drop_duplicates()
rows_dropped = initial_row_count - len(merged_df_cleaned)

print(f"Initial row count: {initial_row_count}")
print(f"Row count after dropping duplicates: {len(merged_df_cleaned)}")
print(f"Number of rows dropped: {rows_dropped}")

# Save the merged DataFrame to a new CSV file
output_merged_file = "augmented_csvs/neurotoxicity_augmented_data.csv"
merged_df_cleaned.to_csv(output_merged_file, index=False)
print(f"Merged data saved to: {output_merged_file}")

Processed: augmented_csvs/random_replace.csv
Processed: augmented_csvs/random_delete.csv
Processed: augmented_csvs/random_replace_with_C.csv
Processed: augmented_csvs/random_replace_with_T.csv
Processed: augmented_csvs/random_swap.csv
Processed: augmented_csvs/random_insertion_with_C.csv
Processed: augmented_csvs/random_insertion_with_T.csv
Initial row count: 12781
Row count after dropping duplicates: 12677
Number of rows dropped: 104
Merged data saved to: augmented_csvs/neurotoxicity_augmented_data.csv


In [7]:
import pandas as pd

data = pd.read_csv("neurotoxicity_augmented_data.csv")

counts = data['toxicity'].value_counts()
print(counts)

toxicity
1    8007
0    4669
Name: count, dtype: int64


In [8]:
initial_row_count = len(data)
data_cleaned = data.drop_duplicates()
rows_dropped = initial_row_count - len(data_cleaned)

print(f"Initial row count: {initial_row_count}")
print(f"Row count after dropping duplicates: {len(data_cleaned)}")
print(f"Number of rows dropped: {rows_dropped}")

Initial row count: 12676
Row count after dropping duplicates: 12599
Number of rows dropped: 77


In [9]:
data = data_cleaned
initial_row_count = len(data)
data_cleaned = data.drop_duplicates()
rows_dropped = initial_row_count - len(data_cleaned)

print(f"Initial row count: {initial_row_count}")
print(f"Row count after dropping duplicates: {len(data_cleaned)}")
print(f"Number of rows dropped: {rows_dropped}")

Initial row count: 12599
Row count after dropping duplicates: 12599
Number of rows dropped: 0


In [10]:
# Separate classes
minority_class = data[data['toxicity'] == data['toxicity'].value_counts().idxmin()]
majority_class = data[data['toxicity'] == data['toxicity'].value_counts().idxmax()]

# Undersample majority class
majority_class_sampled = majority_class.sample(n=len(minority_class), random_state=42)

# Combine the balanced classes
balanced_data = pd.concat([minority_class, majority_class_sampled])

# Shuffle the balanced data
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

counts = balanced_data['toxicity'].value_counts()
print(counts)

toxicity
1    4647
0    4647
Name: count, dtype: int64


In [11]:
balanced_data.to_csv('toxicity_augmented_balanced_data.csv', index=False)