In [3]:
import pandas as pd

# The three DAVIS splits, listed in the order they are stacked below.
split_paths = (
    'davispharosDataset/DAVIS/train.csv',
    'davispharosDataset/DAVIS/test.csv',
    'davispharosDataset/DAVIS/val.csv',
)
train, test, val = map(pd.read_csv, split_paths)

# Stack the splits into one table with a fresh 0..n-1 index.
combined = pd.concat((train, test, val), ignore_index=True)

# Persist the merged table; the later cells read it back from this path.
combined.to_csv('davispharosDataset/DAVIS.csv', index=False)

print(f"Combined dataset saved to 'davispharosDataset/DAVIS.csv' with shape: {combined.shape}")

Combined dataset saved to 'davispharosDataset/DAVIS.csv' with shape: (11103, 6)


Creating drug_protein matrix

In [5]:
import pandas as pd
import numpy as np

# Build a (drug x protein) matrix of labels from the combined DAVIS table.
file_path = 'davispharosDataset/DAVIS.csv'
data = pd.read_csv(file_path)

# Row / column order follows first appearance in the file.
unique_drugs = data['SMILES'].unique()
unique_proteins = data['Target Sequence'].unique()

drug_to_idx = {drug: idx for idx, drug in enumerate(unique_drugs)}
protein_to_idx = {protein: idx for idx, protein in enumerate(unique_proteins)}

labels = data['Label'].to_numpy()

# Promote the matrix dtype with the label dtype: an integer-only matrix would
# silently truncate fractional labels on assignment. Integer/boolean labels
# still produce the default integer matrix, exactly as before.
# NOTE: pairs that never occur in the file stay 0 and are therefore
# indistinguishable from pairs whose recorded label is 0.
interaction_matrix = np.zeros(
    (len(unique_drugs), len(unique_proteins)),
    dtype=np.result_type(np.int_, labels.dtype),
)

# Plain column iteration instead of DataFrame.iterrows(): same visiting order
# (so a repeated drug/protein pair keeps its last label) without building a
# Series object for every row.
for smiles, sequence, label in zip(data['SMILES'], data['Target Sequence'], labels):
    interaction_matrix[drug_to_idx[smiles], protein_to_idx[sequence]] = label

interaction_df = pd.DataFrame(
    interaction_matrix,
    index=unique_drugs,
    columns=unique_proteins,
)

interaction_df.to_csv('davispharosDataset/drug_protein.csv')

print(f"Interaction matrix saved to 'davispharosDataset/drug_protein.csv' with shape {interaction_df.shape}")

Interaction matrix saved to 'davispharosDataset/drug_protein.csv' with shape (68, 379)


Creating the drug_drug matrix based on chemical structure similarity computed with RDKit

In [9]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Read the combined DAVIS table.
file_path = 'davispharosDataset/DAVIS.csv'
data = pd.read_csv(file_path)

# Distinct SMILES strings in order of first appearance, and how many there are.
unique_drugs = pd.unique(data['SMILES'])
n_drugs = len(unique_drugs)

def get_fingerprint(smiles):
    """Return a 2048-bit Morgan fingerprint (radius 2) for a SMILES string.

    Returns None when ``Chem.MolFromSmiles`` does not yield a usable molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if not parsed:
        return None
    return AllChem.GetMorganFingerprintAsBitVect(parsed, radius=2, nBits=2048)

# One fingerprint (or None) per unique drug, in the same order as unique_drugs.
fingerprints = list(map(get_fingerprint, unique_drugs))

# Symmetric Tanimoto similarity; only the upper triangle is computed and then
# mirrored. Entries involving a drug without a fingerprint stay at 0.
drug_similarity = np.zeros((n_drugs, n_drugs))

for row, fp_row in enumerate(fingerprints):
    if fp_row is None:
        continue
    for col in range(row, n_drugs):
        fp_col = fingerprints[col]
        if fp_col is None:
            continue
        score = DataStructs.TanimotoSimilarity(fp_row, fp_col)
        drug_similarity[row, col] = drug_similarity[col, row] = score

# Binarise: 1 where similarity reaches the threshold, 0 elsewhere.
threshold = 0.5
is_similar = drug_similarity >= threshold
drug_drug_binary = is_similar.astype(int)

# Write the matrix with SMILES strings as both row and column labels.
drug_drug_df = pd.DataFrame(drug_drug_binary, index=unique_drugs, columns=unique_drugs)
drug_drug_df.to_csv('davispharosDataset/drug_drug.csv')

print(" drug_drug.csv created!")

 drug_drug.csv created!




Creating the protein_protein matrix based on sequence similarity (make sure to check which similarity metric is being used)

In [10]:
%pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-macosx_11_0_arm64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-macosx_11_0_arm64.whl (2.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
import numpy as np
from Bio import pairwise2

# Load the combined DAVIS table.
file_path = 'davispharosDataset/DAVIS.csv'
data = pd.read_csv(file_path)

# Unique protein sequences, in order of first appearance.
unique_proteins = data['Target Sequence'].unique()
n_proteins = len(unique_proteins)

# Pairwise similarity = global alignment score / length of the longer sequence.
# Start from the identity matrix: a sequence aligned with itself scores its own
# length, i.e. similarity 1.0. Previously the diagonal was left at 0, which
# marked every protein as NOT similar to itself after thresholding and
# disagreed with drug_drug.csv, whose diagonal is scored.
# NOTE(review): Bio.pairwise2 is marked deprecated upstream in favour of
# Bio.Align.PairwiseAligner - consider migrating once scoring parity is confirmed.
protein_similarity = np.eye(n_proteins)

for i in range(n_proteins):
    seq_i = unique_proteins[i]
    for j in range(i + 1, n_proteins):
        seq_j = unique_proteins[j]
        # score_only=True returns just the best score, not alignment objects.
        score = pairwise2.align.globalxx(seq_i, seq_j, score_only=True)
        similarity = score / max(len(seq_i), len(seq_j))
        # The measure is symmetric, so mirror the upper triangle.
        protein_similarity[i, j] = similarity
        protein_similarity[j, i] = similarity

# Convert to binary matrix (threshold = 0.7)
threshold = 0.7
protein_protein_binary = (protein_similarity >= threshold).astype(int)

# Save to CSV, labelled by sequence on both axes.
protein_protein_df = pd.DataFrame(protein_protein_binary, index=unique_proteins, columns=unique_proteins)
protein_protein_df.to_csv('davispharosDataset/protein_protein.csv')

print(" protein_protein.csv created!")


 protein_protein.csv created!


Creating lists of the unique drugs and unique proteins. We don't have the FASTA files or the drug IDs, so look for errors that this could cause in the code.

In [None]:
import pandas as pd

# Load the combined DAVIS table.
file_path = 'davispharosDataset/DAVIS.csv'
data = pd.read_csv(file_path)

unique_drugs = data['SMILES'].unique()
unique_proteins = data['Target Sequence'].unique()

# No drug IDs are available, so each line is written as "<empty id>,<SMILES>".
# The empty ID is a real (blank) first column rather than a ',' glued onto the
# value: a field that itself contains the delimiter gets quoted by to_csv, so
# the old output was '",<SMILES>"' (one quoted field) instead of ',<SMILES>'.
drug_smiles_df = pd.DataFrame({'id': '', 'smiles': unique_drugs})
drug_smiles_df.to_csv('davispharosDataset/drug_smiles.csv', index=False, header=False)

# Same layout for proteins: blank ID column followed by the sequence.
protein_sequences_df = pd.DataFrame({'id': '', 'sequence': unique_proteins})
protein_sequences_df.to_csv('davispharosDataset/protein_fasta.csv', index=False, header=False)

print("drug_smiles.csv created!")
print("protein_fasta.csv created!")


drug_smiles.csv created!
protein_fasta.csv created!


In [25]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import numpy as np

# Load the drug SMILES data (no header row; SMILES are read from column 0).
drug_smiles_file = 'origin_dataset/drug_smiles.csv'
drug_smiles_df = pd.read_csv(drug_smiles_file, header=None)


def smiles_to_molecule(smiles):
    """Parse a SMILES string with RDKit and return the result of ``Chem.MolFromSmiles``."""
    return Chem.MolFromSmiles(smiles)


def _morgan_fingerprint(mol):
    """Return the 2048-bit Morgan fingerprint (radius 2) of an RDKit molecule."""
    return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)


def dice_similarity(mol1, mol2):
    """Return the Dice similarity between the Morgan fingerprints of two molecules."""
    return DataStructs.DiceSimilarity(_morgan_fingerprint(mol1), _morgan_fingerprint(mol2))


# Generate the drug similarity matrix
n_drugs = len(drug_smiles_df)
similarity_matrix = np.zeros((n_drugs, n_drugs))

# Parse every SMILES and fingerprint every molecule exactly once. The previous
# loop re-parsed the second molecule and recomputed both fingerprints for each
# of the O(n^2) pairs, although they only depend on the individual drug.
drug_molecules = [smiles_to_molecule(smiles) for smiles in drug_smiles_df.iloc[:, 0]]
drug_fingerprints = [
    None if mol is None else _morgan_fingerprint(mol)
    for mol in drug_molecules
]

# Unparseable SMILES keep all-zero rows/columns (as before), but say so
# instead of dropping them silently.
n_unparsed = sum(fp is None for fp in drug_fingerprints)
if n_unparsed:
    print(f"Warning: {n_unparsed} SMILES could not be parsed; their similarities are left at 0")

for i in range(n_drugs):
    fp_i = drug_fingerprints[i]
    if fp_i is None:
        continue
    for j in range(i, n_drugs):
        fp_j = drug_fingerprints[j]
        if fp_j is None:
            continue
        similarity = DataStructs.DiceSimilarity(fp_i, fp_j)
        # Symmetric measure: fill both triangles.
        similarity_matrix[i, j] = similarity
        similarity_matrix[j, i] = similarity

# Save as a .txt file (space-separated, no row or column labels)
similarity_matrix_df = pd.DataFrame(similarity_matrix)
similarity_matrix_df.to_csv('origin_dataset/Similarity_Matrix_Drugs.txt', sep=' ', header=False, index=False)

print("Drug similarity matrix saved to Similarity_Matrix_Drugs.txt")




Drug similarity matrix saved to Similarity_Matrix_Drugs.txt




In [26]:
import pandas as pd
from Bio import pairwise2
import numpy as np

# Read the header-less protein sequence file; sequences are taken from column 0 below.
protein_sequences_file = 'origin_dataset/protein_sequences.csv'
protein_sequences_df = pd.read_csv(
    protein_sequences_file,
    header=None,
)

# Function to compute sequence similarity using local alignment
def sequence_similarity(seq1, seq2):
    """Return the local-alignment score of two sequences, normalised by the longer length.

    Uses ``pairwise2.align.localxx`` and divides the best score by
    ``max(len(seq1), len(seq2))``.

    Args:
        seq1: First sequence.
        seq2: Second sequence.

    Returns:
        The normalised score as a float; 0.0 when either sequence is empty.
    """
    # Nothing to align (also avoids dividing by a zero length).
    if not seq1 or not seq2:
        return 0.0
    # Only the score is used, so ask for the score alone instead of building
    # every alignment traceback and then reading the score off the first one.
    best_score = pairwise2.align.localxx(seq1, seq2, score_only=True)
    return best_score / max(len(seq1), len(seq2))  # Normalized score

# Square matrix of pairwise protein similarities, filled symmetrically.
n_proteins = len(protein_sequences_df)
similarity_matrix_proteins = np.zeros((n_proteins, n_proteins))

# Walk the upper triangle (diagonal included) in row-major order and mirror
# each score into the lower triangle.
upper_rows, upper_cols = np.triu_indices(n_proteins)
for row, col in zip(upper_rows, upper_cols):
    score = sequence_similarity(
        protein_sequences_df.iat[row, 0],
        protein_sequences_df.iat[col, 0],
    )
    similarity_matrix_proteins[row, col] = score
    similarity_matrix_proteins[col, row] = score

# Write the raw matrix as space-separated text without row or column labels.
similarity_matrix_proteins_df = pd.DataFrame(similarity_matrix_proteins)
similarity_matrix_proteins_df.to_csv(
    'origin_dataset/Similarity_Matrix_Proteins.txt',
    sep=' ',
    header=False,
    index=False,
)

print("Protein similarity matrix saved to Similarity_Matrix_Proteins.txt")

Protein similarity matrix saved to Similarity_Matrix_Proteins.txt
