<a href="https://colab.research.google.com/github/bartala/PalliativEtiological/blob/main/Preprocess_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import necessary libraries.

In [4]:
import pandas as pd
import re
import csv

### Preprocess Datasets

1. **Creating the format of the labled dataset & preprocess DrugBank data.**  
Please remember that this code will not produce the full dataset included with the paper, as we manually annotated the mechanisms of action of the drugs. <br>
You can find the labled data set in this repository named 'DrugLabels.xlsx'

In [None]:
# Read in the input file
df_drugBank = pd.read_csv('approved_all_drug_target.csv')

# Filter out rows where 'Species' column is not 'Humans'
df_drugBank = df_drugBank[df_drugBank['Species'] == 'Humans']

# Split the values in the "Drug IDs" column by ";"
df_drugBank['Drug IDs'] = df_drugBank['Drug IDs'].apply(lambda x: x.split("; "))

# Explode the "Drug IDs" column so that each value is in a new row
df_drugBank = df_drugBank.explode('Drug IDs')

# Drugs without genes name : DB00693, DB13979, DB16900
# Fill in missing 'Gene Name' values from the 'Name' column and upcase both 'Gene Name' and 'Name'
# Convert genes name to uppercase for consistency
df_drugBank['Gene Name'] = df_drugBank['Gene Name'].fillna(df_drugBank['Name']).str.upper()
df_drugBank['Name'] = df_drugBank['Name'].str.upper()

# Export the final DataFrame to a CSV file
df_drugBank.to_csv('temp.csv', index=False)

# Group by the Drug ID and aggregate the gene names into a list for each drug
df_drugBank_grouped = df_drugBank.groupby('Drug IDs')['Gene Name'].apply(list).reset_index()

# Find the maximum number of genes associated with any drug
num_max_genes = df_drugBank_grouped['Gene Name'].str.len().max()

# Padding the gene lists so that each list has the same length
df_drugBank_grouped['Gene Name'] = df_drugBank_grouped['Gene Name'].apply(lambda x: x + [None] * (num_max_genes - len(x)))

# Convert them into a DataFrame
df_genes = pd.DataFrame(df_drugBank_grouped['Gene Name'].to_list(), columns=[f'gene{i+1}' for i in range(num_max_genes)])

# Merge the gene columns back with the Drug IDs
df_final = pd.concat([df_drugBank_grouped[['Drug IDs']], df_genes], axis=1)

# Include the additional columns that we need to track on each drug
additional_columns = {
    'Drug Name' : '',
    'Associated Conditions' : '',
    'ATC class': '',
    'MoA': '',
    'Drug_Description' : '',
    'URL': ''
}

# Add the additional columns to the DataFrame
for col, placeholder in additional_columns.items():
  df_final[col] = placeholder

# Reorder the columns in the df
column_order = ['Drug IDs', 'Drug Name', 'Associated Conditions', 'ATC class', 'MoA', 'Drug_Description', 'URL'] + [f'gene{i+1}' for i in range(num_max_genes)]

df_final = df_final[column_order]
print(f"Column of the dataset:")
print(df_final.columns)
# Export the final DataFrame to a CSV file
df_final.to_excel('DrugLabels-V1.xlsx', index=False)

Column of the dataset:
Index(['Drug IDs', 'Drug Name', 'Associated Conditions', 'ATC class', 'MoA',
       'Drug_Description', 'URL', 'gene1', 'gene2', 'gene3',
       ...
       'gene295', 'gene296', 'gene297', 'gene298', 'gene299', 'gene300',
       'gene301', 'gene302', 'gene303', 'gene304'],
      dtype='object', length=311)


2. **Preprocess OMIM (morbidmap) dataset.**  
Contrain diseases+genes

In [51]:
def preprocess_disease_name_old(disease_name):
      # Extract the text inside [] or {}
    match = re.search(r'[\{\[]([^}\]]+)[\}\]]', disease_name)
    if match:
        disease_name = match.group(1)
    else:
        # If there are no brackets, we just clean up the remaining numbers and commas
        disease_name = re.sub(r', \d+$', '', disease_name)

    # Extract text up to a number in parenthesis like (1), (2), etc.
    # This number explain which way the disease-gene interaction discover.
    disease_name = re.sub(r'\s+\(\d+\)', '', disease_name)
    # Remove all '?' characters
    disease_name = disease_name.replace('?', '')
    # Remove the unique ID numbers.
    disease_name = re.sub(r'\s+\d{3,}$', '', disease_name)
    # Remove commas and extra spaces from the new string
    disease_name = disease_name.replace(",", "")
    disease_name = ' '.join(disease_name.split())

    return disease_name

In [52]:
"""
We used hex notation for special characters and digit characters in the regular expression to resolve an error from GitHub.
"""
def preprocess_disease_name(disease_name):
    # First, try to extract text inside curly braces {} or square brackets []
    match_curly = re.search(r'\{([^}]+)\}', disease_name)
    # '\x5B([^\x5D]+)\x5D' = '\[([^\]]+)\]'
    match_square = re.search(r'\x5B([^\x5D]+)\x5D', disease_name)
    if match_curly:
        disease_name = match_curly.group(1)
    elif match_square:
        disease_name = match_square.group(1)
    else:
        # If there are no brackets, we just clean up the remaining numbers and commas
        disease_name = re.sub(r', \d+$', '', disease_name)

    # Extract text up to a number in parenthesis like (1), (2), etc.
    # This number explain which way the disease-gene interaction discover.
    disease_name = re.sub(r'\s+\([0-9]+\)', '', disease_name)

    # Remove all '?' characters
    disease_name = disease_name.replace('?', '')
    # Remove the unique ID numbers.
    disease_name = re.sub(r'\s+\d{3,}$', '', disease_name)
    # Remove commas and extra spaces from the new string
    disease_name = disease_name.replace(",", "")
    disease_name = ' '.join(disease_name.split())

    return disease_name


In [53]:
omim_dataset_path = "morbidmap.txt"

# Convert the dataset from txt to csv file
df_omim = pd.read_csv(omim_dataset_path, sep='\t', comment='#')

# Set the columns name
df_omim.columns = ["Disease","Gene","MIM Number","Cyto Location"]

# Keep only the disease and the genes that they are associated with.
df_omim = df_omim[['Disease', 'Gene']]

# Process the data
explode_data = []
for index, row in df_omim.iterrows():
    disease = row['Disease']
    disease = preprocess_disease_name(disease)
    genes = row['Gene'].split(',')

    for gene in genes:
        # In the data
        gene = gene.strip().upper()  # Remove white spaces and convert gene to uppercase
        explode_data.append({'Disease': disease, 'Gene': gene})

# Create a new DataFrame from the array
df_explode_omim = pd.DataFrame(explode_data)

# Drop rows where 'Gene Symbole' is NaN or empty string
df_explode_omim.dropna(subset=['Gene'], inplace=True)
df_explode_omim = df_explode_omim[df_explode_omim['Gene'] != '']

df_explode_omim.to_csv('processed_omim.csv', index=False)

print(df_explode_omim.head())

                                      Disease     Gene
0  17-alpha-hydroxylase/1720-lyase deficiency  CYP17A1
1  17-alpha-hydroxylase/1720-lyase deficiency    CYP17
2  17-alpha-hydroxylase/1720-lyase deficiency  P450C17
3         24-dienoyl-CoA reductase deficiency    NADK2
4         24-dienoyl-CoA reductase deficiency  C5ORF33


In [54]:
omim_dataset_path = "morbidmap.txt"

# Convert the dataset from txt to csv file
df_omim = pd.read_csv(omim_dataset_path, sep='\t', comment='#')

# Set the columns name
df_omim.columns = ["Disease","Gene","MIM Number","Cyto Location"]

# Keep only the disease and the genes that they are associated with.
df_omim = df_omim[['Disease', 'Gene']]

# Process the data
explode_data = []
for index, row in df_omim.iterrows():
    disease = row['Disease']
    disease = preprocess_disease_name_old(disease)
    genes = row['Gene'].split(',')

    for gene in genes:
        # In the data
        gene = gene.strip().upper()  # Remove white spaces and convert gene to uppercase
        explode_data.append({'Disease': disease, 'Gene': gene})

# Create a new DataFrame from the array
df_explode_omim = pd.DataFrame(explode_data)

# Drop rows where 'Gene Symbole' is NaN or empty string
df_explode_omim.dropna(subset=['Gene'], inplace=True)
df_explode_omim = df_explode_omim[df_explode_omim['Gene'] != '']

df_explode_omim.to_csv('processed_omim_old.csv', index=False)

print(df_explode_omim.head())

                                      Disease     Gene
0  17-alpha-hydroxylase/1720-lyase deficiency  CYP17A1
1  17-alpha-hydroxylase/1720-lyase deficiency    CYP17
2  17-alpha-hydroxylase/1720-lyase deficiency  P450C17
3         24-dienoyl-CoA reductase deficiency    NADK2
4         24-dienoyl-CoA reductase deficiency  C5ORF33


In [55]:
import pandas as pd

# Load the two datasets
df_new = pd.read_csv('processed_omim.csv')
df_old = pd.read_csv('processed_omim_old.csv')

# Check if the two DataFrames are identical
if df_new.equals(df_old):
    print("The files are identical.")
else:
    print("The files are not identical.")

    # Find differences
    df_diff = pd.concat([df_new, df_old]).drop_duplicates(keep=False)
    print("Differences:")
    print(df_diff)


The files are identical.


3. **Preprocess Biogrid (PPI) dataset.**  
 Contrain (PPI - genes+genes)

In [None]:
# read the csv file
df_biogrid = pd.read_csv('BIOGRID-ORGANISM-Homo_sapiens-4.4.217.tab3.txt', sep='\t', low_memory=False)


# Filter the DataFrame to keep rows with 'Homo sapiens' in both organism columns
df_filtered_biogrid = df_biogrid[(df_biogrid['Organism Name Interactor A'] == 'Homo sapiens') & (df_biogrid['Organism Name Interactor B'] == 'Homo sapiens')].copy()

# Convert genes name to uppercase for consistency
df_filtered_biogrid['Official Symbol Interactor A'] = df_filtered_biogrid['Official Symbol Interactor A'].str.upper()
df_filtered_biogrid['Official Symbol Interactor B'] = df_filtered_biogrid['Official Symbol Interactor B'].str.upper()

# Keep only the Official Symbol of the genes
df_filtered_biogrid = df_filtered_biogrid[['Official Symbol Interactor A',	'Official Symbol Interactor B']]

df_filtered_biogrid.to_csv('processed_biogrid.csv', index=False)

print(f"Head of processed_omim: ")
print(df_filtered_biogrid.head())

Head of processed_omim: 
  Official Symbol Interactor A Official Symbol Interactor B
0                       MAP2K4                         FLNC
1                         MYPN                        ACTN2
2                        ACVR1                         FNTA
3                        GATA2                          PML
4                         RPA2                        STAT3


# Union all the datasets

In [None]:
"""
It is important to use the dataset which we fill manually with labels and the other information of each drug.
You can find this dataset in the repository.
"""

df_drug = pd.read_excel('DrugLabels.xlsx')
# Convert and save as a CSV file
df_drug.to_csv('DrugLabels.csv', index=False)
df_disease_gene = pd.read_csv('processed_omim.csv')
df_gene_gene = pd.read_csv('processed_biogrid.csv')

# Extracting drug-gene interactions
gene_columns = [col for col in df_drug.columns if 'gene' in col]
df_drug_melted = df_drug.melt(id_vars=['Drug IDs', 'MoA', 'ATC class', 'Drug_Description_New'],
                              value_vars=gene_columns,
                              value_name='gene').dropna()

df_drug_interactions = df_drug_melted[['Drug IDs', 'gene', 'MoA', 'ATC class', 'Drug_Description_New']].copy()
df_drug_interactions.rename(columns={"Drug IDs": "Source Name", "gene": "Target Name"}, inplace=True)
df_drug_interactions['Source Type'] = 'Drug'
df_drug_interactions['Target Type'] = 'Gene'
df_drug_interactions['Interaction Type'] = 'drug_target'

print(f"Head of drug_target interaction: ")
print(df_drug_interactions.head())

# Extracting Disease-gene interactions
df_disease_interactions = df_disease_gene[['Disease', 'Gene']].copy()
df_disease_interactions.rename(columns={"Disease": "Source Name", "Gene": "Target Name"}, inplace=True)
df_disease_interactions['Source Type'] = 'Disease'
df_disease_interactions['Target Type'] = 'Gene'
df_disease_interactions['Interaction Type'] = 'association'
df_disease_interactions['MoA'] = '-'
df_disease_interactions['ATC class'] = '-'
df_disease_interactions['Drug_Description_New'] = '-'

print('\n'*3)
print(f"Head of association interaction: ")
print(df_disease_interactions.head())

# Extracting gene-gene interactions
df_gene_interactions = df_gene_gene[['Official Symbol Interactor A', 'Official Symbol Interactor B']].copy()
df_gene_interactions.rename(columns={"Official Symbol Interactor A": "Source Name", "Official Symbol Interactor B": "Target Name"}, inplace=True)
df_gene_interactions['Source Type'] = 'Gene'
df_gene_interactions['Target Type'] = 'Gene'
df_gene_interactions['Interaction Type'] = 'PPI'
df_gene_interactions['MoA'] = '-'
df_gene_interactions['ATC class'] = '-'
df_gene_interactions['Drug_Description_New'] = '-'

print('\n'*3)
print(f"Head of PPI interaction: ")
print(df_gene_interactions.head())

# Merging all the interaction dataframes
merged_df = pd.concat([df_drug_interactions, df_disease_interactions, df_gene_interactions], ignore_index=True)

# Save the merged dataframe to a CSV file
merged_df.to_csv("comprehensive_dataset.csv.gz", index=False, compression='gzip')


Head of drug_target interaction: 
  Source Name Target Name MoA ATC class  \
0     DB00001          F2   p         B   
1     DB00002      FCGR1A   e         L   
2     DB00004       IL2RB   e         L   
3     DB00005         TNF   p         L   
4     DB00006          F2   p         B   

                                Drug_Description_New Source Type Target Type  \
0  Lepirudin is a protein-based direct thrombin i...        Drug        Gene   
1  Cetuximab is an endothelial growth factor rece...        Drug        Gene   
2  Denileukin diftitox is a recombinant cytotoxic...        Drug        Gene   
3  Etanercept is a protein therapy based on the b...        Drug        Gene   
4  Bivalirudin is a direct thrombin inhibitor use...        Drug        Gene   

  Interaction Type  
0      drug_target  
1      drug_target  
2      drug_target  
3      drug_target  
4      drug_target  




Head of association interaction: 
                                  Source Name Target Name Sour