### Testing file for preprocessing_functions.py ###

In [1]:
# Import everything needed
from preprocessing_functions import *

Test get_reactome_data method

In [2]:
def test_get_reactome():
    lowest_pathways_df, all_pathways_df = get_reactome_data()
    assert lowest_pathways_df.shape == (51056, 3)
    assert all_pathways_df.shape == (149564, 3)

    prothrombin_lowest_pathways = lowest_pathways_df[lowest_pathways_df['UniProtKB_ID'] == 'P00734']
    assert prothrombin_lowest_pathways.shape[0] == 14

    prothrombin_all_pathways = all_pathways_df[all_pathways_df['UniProtKB_ID'] == 'P00734']
    assert prothrombin_all_pathways.shape[0] == 33


def test_get_processed_reactome():
    lowest_pathways_df = pd.read_csv('data_processed/reactome_lowest_pathways_homo_sapiens.csv')
    assert lowest_pathways_df.shape == (51056, 3)

    all_pathways_df = pd.read_csv('data_processed/reactome_all_pathways_homo_sapiens.csv')
    assert all_pathways_df.shape == (149564, 3)

# Run the Reactome checks in order; a failing assert raises here, so the
# success message below is printed only when both checks pass.
test_get_reactome()
test_get_processed_reactome()
print("All tests passed!")

Original lowest pathways shape: (302009, 6)
Original all pathways shape: (871849, 6)
Filtered lowest pathways shape restricting to Homo sapiens and removing unnecessary columns: (51056, 3)
Filtered all pathways shape restricting to Homo sapiens and removing unnecessary columns: (149564, 3)
All tests passed!


Test get_drug_comb_data method

In [3]:
def test_get_drug_comb_df_all_relevant_values():
    drugcomb_df = get_drug_comb_data()
    assert drugcomb_df.shape == (722344, 26)

def test_remove_all_na_values():
    drugcomb_df = get_drug_comb_data(bliss=True, loewe=True, hsa=True, zip=True, s_max=True, s_mean=True, s_sum=True)
    assert drugcomb_df['synergy_bliss'].notna().all()
    assert drugcomb_df['synergy_loewe'].notna().all()
    assert drugcomb_df['synergy_hsa'].notna().all()
    assert drugcomb_df['synergy_zip'].notna().all()
    assert drugcomb_df['S_max'].notna().all()
    assert drugcomb_df['S_mean'].notna().all()
    assert drugcomb_df['S_sum'].notna().all()
    assert drugcomb_df.shape == (722032, 26)

def test_get_preprocessed_drugcomb():
    # Assumes using all synergy scores
    drugcomb_df = pd.read_csv('data_processed/drugcomb_data.csv')
    assert drugcomb_df.shape == (722032, 26)

# Run the DrugComb checks in order; the first failing assert raises and stops
# the cell, so the message below means all three checks passed.
test_get_drug_comb_df_all_relevant_values()
test_remove_all_na_values()
test_get_preprocessed_drugcomb()

print("All tests passed!")

  drugcomb_df = pd.read_csv('data/DrugComb/drugcomb_summary_v_1_5.csv', sep=',', index_col=False)


Original shape of drugcomb data:  (1432351, 26)
Final shape of filtered drugcomb data:  (722344, 26)


  drugcomb_df = pd.read_csv('data/DrugComb/drugcomb_summary_v_1_5.csv', sep=',', index_col=False)


Original shape of drugcomb data:  (1432351, 26)
Final shape of filtered drugcomb data:  (722032, 26)


  drugcomb_df = pd.read_csv('data_processed/drugcomb_data.csv')


All tests passed!


Test get_ddinter_data method

In [4]:
# Build the DDInter frame once at cell scope; test_get_ddinter_data() reads this
# module-level name instead of calling get_ddinter_data() itself.
# NOTE(review): a later cell rebinds `ddinter_df` to the processed CSV, so
# re-running test_get_ddinter_data() after that cell checks the CSV instead.
ddinter_df = get_ddinter_data()

def test_get_ddinter_data():
    assert ddinter_df.shape == (160235, 5)

def test_get_preprocessed_ddinter():
    ddinter_df = pd.read_csv('data_processed/ddinter_data.csv')
    assert ddinter_df.shape == (160235, 5)

# Run the DDInter checks in order; the message prints only if neither assert fails.
test_get_ddinter_data()
test_get_preprocessed_ddinter()
print("All tests passed!")

DDInter Shape without NA values: (160235, 5)
All tests passed!


Test get_drugbank_ddi method

In [5]:
def test_get_drugbank_ddi():
    drugbank_ddi_df, _ = get_drugbank_ddi()
    # Test the severity levels are limited to 3 levels and there are no missing values
    assert len(drugbank_ddi_df['severity'].unique()) == 3
    assert drugbank_ddi_df['severity'].isna().sum() == 0
    # Test all drug names are lower case
    assert drugbank_ddi_df['subject_drug_name'].str.islower().all()
    assert drugbank_ddi_df['affected_drug_name'].str.islower().all()
    # Test no NA values for drug names
    assert drugbank_ddi_df['subject_drug_name'].notna().all()
    assert drugbank_ddi_df['affected_drug_name'].notna().all()
    # Test the shape of the dataframe
    assert drugbank_ddi_df.shape == (1430454, 18)

def test_get_preprocessed_drugbank_ddi():
    preprocessed_drugbank = pd.read_csv('data_processed/drugbank_ddi.csv')
    assert preprocessed_drugbank.shape == (1430454, 18)

# Run the DrugBank DDI checks in order; the message prints only if neither assert fails.
test_get_drugbank_ddi()
test_get_preprocessed_drugbank_ddi()
print("All tests passed!")

  drugbank_ddi_df = pd.read_csv(drugbank_ddi_fp)


How many interactions are in each severity category in the drugbank database?
severity
1    742987
0    535009
2    152458
Name: count, dtype: int64
Shape of drugbank ddi dataframe:  (1430454, 18)


  preprocessed_drugbank = pd.read_csv('data_processed/drugbank_ddi.csv')


All tests passed!


Test parse_drugbank_xml method (DrugBank drugs-to-targets table)

In [2]:
# Parse once at cell scope; test_parse_drugbank_xml() inspects this module-level frame.
drugs_to_targets_df = parse_drugbank_xml()

def test_parse_drugbank_xml():
    assert drugs_to_targets_df.shape == (19435, 9)
    
    # Check if there are any drugs with multiple SMILES
    drugs_with_multiple_smiles = drugs_to_targets_df.groupby('drug_name')['SMILES'].nunique()
    assert all(drugs_with_multiple_smiles <= 1)

# Run the DrugBank XML check; the message prints only if no assert fails.
test_parse_drugbank_xml()
print("All tests passed!")

All tests passed!


Test getting STRING graph

In [7]:
# Build the STRING graph once at cell scope; test_get_string_graph() inspects this module-level object.
STRING_G = get_STRING_graph()

def test_get_string_graph():
    assert STRING_G.number_of_nodes() == 18382
    assert STRING_G.number_of_edges() == 591429

# Run the STRING graph check; the message prints only if no assert fails.
test_get_string_graph()
print("All tests passed!")

Original shape of STRING edge list, physical detailed: (1477610, 6)
All tests passed!


Test Jaccard Similarity

In [8]:
def test_jaccard_similarity_equal():
    # Equal sets
    s1 = set(["Alopecia", "Vomiting", "Diarrhea"])
    s2 = set(["Alopecia", "Vomiting", "Diarrhea"])
    assert jaccard_similarity(s1, s2) == 1.0

def test_jaccard_similarity_disjoint():
    # Disjoint sets
    s1 = set(["Alopecia", "Vomiting", "Diarrhea"])
    s2 = set(["Anorexia", "Hypertension", "Nausea"])
    assert jaccard_similarity(s1, s2) == 0.0

def test_jaccard_similarity_partial_overlap():
    # Partial overlap
    s1 = set(["Alopecia", "Vomiting", "Diarrhea"])
    s2 = set(["Alopecia", "Hypertension", "Nausea"])
    assert jaccard_similarity(s1, s2) == 1/5


# Run the three Jaccard checks in order; the message prints only if none of them fails.
test_jaccard_similarity_equal()
test_jaccard_similarity_disjoint()
test_jaccard_similarity_partial_overlap()
print("All tests passed!")

All tests passed!


Test Jonckheere-Terpstra

In [9]:
# Worked example from https://www.statext.com/practice/JonckheereTest03.php
# Three ordered groups of six observations each.
statext_3_samples = [
    [40, 35, 38, 43, 44, 41],
    [38, 40, 47, 44, 40, 42],
    [48, 40, 45, 43, 46, 44],
]

# Reference values quoted for this example, each paired with a tolerance of
# half a unit in its last printed digit.
statext_3_expected = ((2.02113, 5e-6), (0.021633, 5e-7))

statext_3_observed = jonckheere_terpestra_test(statext_3_samples)
print(statext_3_observed) # Expected: 2.02113, 0.021633

# Assert the match instead of relying on a visual comparison of the printed
# tuple, so a regression in jonckheere_terpestra_test fails the cell.
assert len(statext_3_observed) == len(statext_3_expected)
assert all(
    abs(observed - reference) <= tolerance
    for observed, (reference, tolerance) in zip(statext_3_observed, statext_3_expected)
)

(2.0211302086361083, 0.021633143978495584)


Test drugcomb_ddinter intersect

In [10]:
# Load the processed DrugComb and DDInter tables at cell scope; the intersect
# test in this cell passes both to find_drugcomb_ddinter_intersect().
# NOTE(review): this rebinds `ddinter_df`, which an earlier cell assigned from
# get_ddinter_data(); `drugcomb_df` is also read by the DrugBank intersect cell.
drugcomb_df = pd.read_csv('data_processed/drugcomb_data.csv')
ddinter_df = pd.read_csv('data_processed/ddinter_data.csv')

def test_find_drugcomb_ddinter_intersect():
    drug_syntox_df, major_pairs, moderate_pairs, minor_pairs, unknown_pairs = find_drugcomb_ddinter_intersect(drugcomb_df, ddinter_df)
    assert drug_syntox_df.shape == (37845, 11) # only known toxicity
    assert len(major_pairs) == 788 # removing duplicates, aka drug A, drug B is considered same as drug B, drug A
    assert len(moderate_pairs) == 2341
    assert len(minor_pairs) == 175
    assert len(unknown_pairs) == 1425

    # Verify if there are any repeated drug pairs
    example_one = ('methotrexate', 'fenoprofen')
    example_two = ('fenoprofen', 'methotrexate')
    assert example_one in major_pairs
    assert example_two not in major_pairs

def test_get_preprocessed_drugcomb_ddinter_known():
    drug_syntox_df = pd.read_csv('data_processed/ddinter_syntox_known.csv')
    assert drug_syntox_df.shape == (37845, 11) # only known, not including unknown

# Run the DrugComb/DDInter intersection checks in order; the message prints only if neither fails.
test_find_drugcomb_ddinter_intersect()
test_get_preprocessed_drugcomb_ddinter_known()
print("All tests passed!")

  drugcomb_df = pd.read_csv('data_processed/drugcomb_data.csv')


Number of drugs in common between drugcomb and ddinter [lowercase enforced]:  683
Major pairs in both DrugComb and in DDInter:  788
Moderate pairs in both DrugComb and in DDInter:  2341
Minor pairs in both DrugComb and in DDInter:  175
Unknown toxicity pairs in both DrugComb and in DDInter:  1425
Total common pairs:  4729
Total known pairs:  3304
All tests passed!


Test the find_drugcomb_drugbankddi_intersect method

In [13]:
# Load the processed DrugBank DDI table at cell scope for the intersect test in this cell.
# NOTE(review): that test also reads `drugcomb_df`, which this cell does not
# define; it must already exist in the kernel (the DDInter intersect cell loads it).
drugbank_ddi_df = pd.read_csv('data_processed/drugbank_ddi.csv')

def test_find_drugcomb_drugbank_intersect():
    drug_syntox_df, major_pairs, moderate_pairs, minor_pairs, unknown_pairs = find_drugcomb_drugbankddi_intersect(drugcomb_df, drugbank_ddi_df)
    
    assert drug_syntox_df.shape == (88219, 11)
    assert len(major_pairs) == 3211 # removing duplicates, aka drug A, drug B is considered same as drug B, drug A
    assert len(moderate_pairs) == 5534
    assert len(minor_pairs) == 4799
    assert len(unknown_pairs) == 56177

    # Verify if there are any repeated drug pairs
    example_one = ('methotrexate', 'fenoprofen')
    example_two = ('fenoprofen', 'methotrexate')
    assert example_one in major_pairs
    assert example_two not in major_pairs

def test_get_preprocessed_drugcomb_drugbank_known():
    drug_syntox_df = pd.read_csv('data_processed/drugbank_syntox_known.csv')
    assert drug_syntox_df.shape == (88219, 11)

# Run the DrugComb/DrugBank intersection checks in order; the message prints only if neither fails.
test_find_drugcomb_drugbank_intersect()
test_get_preprocessed_drugcomb_drugbank_known()
print("All tests passed!")

  drugbank_ddi_df = pd.read_csv('data_processed/drugbank_ddi.csv')


Number of drugs in common between drugcomb and drugbank [lowercase enforced]:  1079
Major pairs in both DrugComb and in DrugBank:  3211
Moderate pairs in both DrugComb and in DrugBank:  5534
Minor pairs in both DrugComb and in DrugBank:  4799
Unknown toxicity pairs in both DrugComb and in DrugBank:  56177
Total common pairs:  69721
Total known pairs:  13544
All tests passed!
