In [1]:
import pandas as pd

# Load PrimeKG data with a specified data type to handle mixed types
file_path = "/Users/akadirerol/Downloads/kg.csv"
dtype_spec = {'x_id': str, 'y_id': str, 'x_name': str, 'y_name': str}
chunk_size = 100000

In [2]:
# Function to load data in chunks and normalize text
def load_data_in_chunks(file_path, chunk_size, dtype_spec):
    chunk_iterator = pd.read_csv(file_path, chunksize=chunk_size, dtype=dtype_spec, low_memory=False)
    for chunk in chunk_iterator:
        chunk['x_name'] = chunk['x_name'].str.lower().str.strip()
        chunk['y_name'] = chunk['y_name'].str.lower().str.strip()
        yield chunk

In [3]:
# Function to find broader matches based on patterns
def find_broader_matches(patterns, all_names):
    return {name for name in all_names if any(pattern in name for pattern in patterns)}


In [4]:
# Load unique names
def get_unique_names(file_path, chunk_size, dtype_spec):
    unique_x_names, unique_y_names = set(), set()
    for chunk in load_data_in_chunks(file_path, chunk_size, dtype_spec):
        unique_x_names.update(chunk['x_name'].unique())
        unique_y_names.update(chunk['y_name'].unique())
    return sorted(unique_x_names.union(unique_y_names))


Generates pairs based on specified x_type and y_type (e.g., disease, drug, phenotype) and target names.
   
Parameters:

file_path (str): Path to the data file.

chunk_size (int): Number of rows per chunk to load.

dtype_spec (dict): Data type specifications for loading the file.

target_names (set): A set of names to search for in the x_name or y_name columns.

x_type (str): The type to match for x_name (e.g., "disease", "drug", "phenotype").

y_type (str): The type to match for y_name (e.g., "disease", "drug", "phenotype").

relationship_column (str, optional): Name of the column containing the relationship type information. If None, it will not be included.

   
Returns:

list of tuples: Pairs where either x_name or y_name matches the target names and the corresponding types match x_type and y_type.

In [59]:
def generate_pairs(file_path, chunk_size, dtype_spec, target_names, x_type, y_type, relationship_columns=None):
    pairs = []
    for chunk in load_data_in_chunks(file_path, chunk_size, dtype_spec):
        for _, row in chunk.iterrows():
            # Only check for pairs where x_name has x_type and y_name has y_type
            if row['x_name'] in target_names and y_type in row.get('y_type', '').lower():
                # Create dictionary with x_name and y_name
                pair = {'x_name': row['x_name'], 'y_name': row['y_name']}
                
                # Add relationship columns if specified
                if relationship_columns:
                    for col in relationship_columns:
                        pair[col] = row.get(col, '')
                
                pairs.append(pair)
    return pairs

In [16]:
# Get all unique names from the dataset
all_unique_names = get_unique_names(file_path, chunk_size, dtype_spec)

print(f"Total number of unique names: {len(all_unique_names)}")
for name in all_unique_names[:50]:  # Print the first 50 for a quick check
    print(name)

Total number of unique names: 128549
'de novo' actin filament nucleation
'de novo' amp biosynthetic process
'de novo' cotranslational protein folding
'de novo' ctp biosynthetic process
'de novo' gdp-l-fucose biosynthetic process
'de novo' imp biosynthetic process
'de novo' l-methionine biosynthetic process
'de novo' nad biosynthetic process
'de novo' nad biosynthetic process from aspartate
'de novo' nad biosynthetic process from tryptophan
'de novo' posttranslational protein folding
'de novo' protein folding
'de novo' pyridoxal 5'-phosphate biosynthetic process
'de novo' pyrimidine nucleobase biosynthetic process
'de novo' ump biosynthetic process
'de novo' xmp biosynthetic process
'psoriatic arthritis, susceptibility to
(+)-2-(4-biphenyl)propionic acid
(+)-2-epi-prezizaene synthase activity
(+)-3'-hydroxylarreatricin biosynthetic process
(+)-abscisic acid 8'-hydroxylase activity
(+)-abscisic acid d-glucopyranosyl ester transmembrane transport
(+)-abscisic acid d-glucopyranosyl ester t

In [23]:
breast_and_cervical_cancer = [
    "breast cancer", "ductal carcinoma", "invasive ductal carcinoma", "invasive lobular carcinoma",
    "triple-negative", "her2-positive", "metastatic breast", "dcis",
    "hormone receptor-positive", "cervical cancer", "cervical squamous cell carcinoma", "cervical adenocarcinoma",
    "stage i cervical", "stage ii cervical", "stage iii cervical", "stage iv cervical",
    "hpv"
]

In [19]:
# Find names related to breast and cervical cancer using broader patterns
breast_and_cervical_cancer_dataset = find_broader_matches(breast_and_cervical_cancer, all_unique_names)

# Display the number of broader matches found
print(f"Number of broader Breast and Cervical Cancer-related names: {len(breast_and_cervical_cancer_dataset)}")

# Display some of the related names for verification
print("\nSample of Breast and Cervical Cancer-related names:")
print(list(breast_and_cervical_cancer_dataset)[:10])  # Display the first 10 as a sample

Number of broader Breast and Cervical Cancer-related names: 15

Sample of Breast and Cervical Cancer-related names:
['cervical cancer', 'estrogen-receptor negative breast cancer', 'endocervical adenocarcinoma', 'progesterone-receptor positive breast cancer', 'triple-negative breast carcinoma', 'her2-receptor negative breast cancer', 'microinvasive cervical squamous cell carcinoma', 'estrogen-receptor positive breast cancer', 'progesterone-receptor negative breast cancer', 'sporadic breast cancer']


In [None]:
# Generate disease pairs using broader cancer matches
disease_disease_pairs_first = generate_pairs(file_path, chunk_size, dtype_spec, breast_and_cervical_cancer_dataset, 'disease', 'disease')

print(f"Total disease-disease pairs found: {len(disease_disease_pairs_first)}")
print("Sample of disease-disease pairs:", disease_disease_pairs_first[:10])

Total disease-disease pairs found: 104
Sample of disease-disease pairs: [('ductal carcinoma in situ', 'cowden disease'), ('endocervical adenocarcinoma', 'endocervical carcinoma'), ('cervical adenocarcinoma', 'endocervical adenocarcinoma'), ('progesterone-receptor positive breast cancer', 'breast carcinoma by gene expression profile'), ('progesterone-receptor negative breast cancer', 'breast carcinoma by gene expression profile'), ('her2-receptor negative breast cancer', 'breast carcinoma by gene expression profile'), ('breast cancer', 'malignant breast phyllodes tumor'), ('breast cancer', 'breast sarcoma'), ('cervical adenocarcinoma', 'cervical mucinous adenocarcinoma'), ('cervical squamous cell carcinoma', 'cervical verrucous carcinoma')]


In [21]:
def find_unique_second_elements(pairs):
    second_of_pairs = [pair[1] for pair in pairs]
    return sorted(set(second_of_pairs))

second_of_pairs_first = find_unique_second_elements(disease_disease_pairs_first)

# Display the result
print(f"Number of unique entries in the second of pairs: {len(second_of_pairs_first)}")
print("Second of pairs list:")
print(second_of_pairs_first)


Number of unique entries in the second of pairs: 48
Second of pairs list:
['adenocarcinoma of cervix uteri', 'adenoid basal carcinoma of the cervix uteri', 'basal-like breast carcinoma', 'breast carcinoma', 'breast carcinoma by gene expression profile', 'breast lymphoma', 'breast neoplasm', 'breast sarcoma', 'cervical adenocarcinoma', 'cervical adenosquamous carcinoma', 'cervical basaloid carcinoma', 'cervical carcinoma', 'cervical clear cell adenocarcinoma', 'cervical endometrioid adenocarcinoma', 'cervical intraepithelial neoplasia grade 2/3', 'cervical keratinizing squamous cell carcinoma', 'cervical lymphoepithelioma-like carcinoma', 'cervical metaplasia', 'cervical mucinous adenocarcinoma', 'cervical non-keratinizing squamous cell carcinoma', 'cervical serous adenocarcinoma', 'cervical squamous cell carcinoma', 'cervical verrucous carcinoma', 'cervical wilms tumor', 'cervix melanoma', 'cowden disease', 'early invasive cervical adenocarcinoma', 'endocervical adenocarcinoma', 'endoc

In [None]:
# Generate disease pairs using second pairs of cancer matches
disease_disease_pairs_second = generate_pairs(file_path, chunk_size, dtype_spec, second_of_pairs_first, 'disease', 'disease')

print(f"Total disease-disease pairs found: {len(disease_disease_pairs_second)}")
print("Sample of disease-disease pairs:", disease_disease_pairs_second[:10])

Total disease-disease pairs found: 642
Sample of disease-disease pairs: [('squamous cell carcinoma', 'junctional epidermolysis bullosa'), ('squamous cell carcinoma', 'late-onset junctional epidermolysis bullosa'), ('squamous cell carcinoma', 'recessive dystrophic epidermolysis bullosa'), ('squamous cell carcinoma', 'dystrophic epidermolysis bullosa'), ('squamous cell carcinoma', 'transient bullous dermolysis of the newborn'), ('breast carcinoma', 'cowden disease'), ('squamous cell carcinoma', 'schc6pf-schulz-passarge syndrome'), ('squamous cell carcinoma', 'xeroderma pigmentosum'), ('breast carcinoma', 'breast-ovarian cancer, familial, susceptibility to'), ('breast carcinoma', 'familial cutaneous telangiectasia and oropharyngeal predisposition cancer syndrome')]


In [25]:
breast_and_cervical_cancer_from_web = [disease.lower().strip() for disease in [
    "Ovarian Cancer",
    "Endometrial (Uterine) Cancer",
    "Vulvar Cancer",
    "Vaginal Cancer",
    "Fallopian Tube Cancer",
    "Lobular Carcinoma In Situ (LCIS)",
    "Ductal Carcinoma In Situ (DCIS)",
    "Triple-Negative Breast Cancer",
    "Inflammatory Breast Cancer",
    "Peritoneal Cancer"
]]

In [26]:
# Find names related to breast and cervical cancer using broader patterns
breast_and_cervical_cancer_dataset_from_web = find_broader_matches(breast_and_cervical_cancer_from_web, all_unique_names)

# Display the number of broader matches found
print(f"Number of broader Breast and Cervical Cancer-related names web: {len(breast_and_cervical_cancer_dataset_from_web)}")

# Display some of the related names for verification
print("\nSample of Breast and Cervical Cancer-related names web:")
print(list(breast_and_cervical_cancer_dataset_from_web)[:10])  # Display the first 10 as a sample

Number of broader Breast and Cervical Cancer-related names web: 10

Sample of Breast and Cervical Cancer-related names web:
['ovarian cancer', 'vaginal cancer', 'fallopian tube cancer', 'familial ovarian cancer', 'ovarian cancer, susceptibility to, 1', 'hereditary breast ovarian cancer syndrome', 'retroperitoneal cancer', 'hereditary site-specific ovarian cancer syndrome', 'breast-ovarian cancer, familial, susceptibility to', 'mucinous ovarian cancer']


In [None]:
# Generate disease pairs using online cancer matches
disease_disease_pairs_online = generate_pairs(file_path, chunk_size, dtype_spec, breast_and_cervical_cancer_dataset_from_web, 'disease', 'disease')

print(f"Total disease-disease pairs found: {len(disease_disease_pairs_online)}")
print("Sample of disease-disease pairs:", disease_disease_pairs_online[:10])

Total disease-disease pairs found: 96
Sample of disease-disease pairs: [('vaginal cancer', 'female reproductive organ cancer'), ('vaginal cancer', 'vaginal neoplasm'), ('retroperitoneal cancer', 'retroperitoneal lymphoma'), ('retroperitoneal cancer', 'retroperitoneal sarcoma'), ('retroperitoneal cancer', 'retroperitoneum carcinoma'), ('vaginal cancer', 'vagina sarcoma'), ('fallopian tube cancer', 'fallopian tube neoplasm'), ('fallopian tube cancer', 'female reproductive organ cancer'), ('fallopian tube cancer', 'fallopian tube leiomyosarcoma'), ('fallopian tube cancer', 'fallopian tube adenosarcoma')]


#### 1. disease_disease_pairs_first (from the related breast and cervical cancer)
#### 2. disease_disease_pairs_second (from the second pair of disease_disease_pairs_first)
#### 3. disease_disease_pairs_online (from the related breast and cervical cancer searched from web)

In [28]:
# Concatenate the three lists into one combined list
all_disease_disease_pairs = disease_disease_pairs_first + disease_disease_pairs_second + disease_disease_pairs_online

# Convert the combined list of tuples into a DataFrame
df_disease_pairs = pd.DataFrame(all_disease_disease_pairs, columns=['Disease_1', 'Disease_2'])

# Save the DataFrame to a CSV file locally
output_file_path = "/Users/akadirerol/Desktop/disease_disease_pairs.csv"
df_disease_pairs.to_csv(output_file_path, index=False)

print(f"Table saved as {output_file_path}")

Table saved as /Users/akadirerol/Desktop/disease_disease_pairs.csv


In [29]:
# Find all unique values from both the first and second elements of the pairs in all_disease_disease_pairs
unique_diseases = set([pair[0] for pair in all_disease_disease_pairs]).union(
    set([pair[1] for pair in all_disease_disease_pairs])
)

# Display the number of unique diseases found
print(f"Number of unique diseases: {len(unique_diseases)}")

# Display a sample of the unique diseases
print("Sample of unique diseases:")
print(list(unique_diseases)[:10])  # Display the first 10 as a sample


Number of unique diseases: 338
Sample of unique diseases:
['pleural cancer', 'lip and oral cavity squamous cell carcinoma', 'uterine cancer', 'breast cancer', 'fallopian tube carcinoma', 'uterine cervix neoplasm', 'cervical intraepithelial neoplasia', 'endometrial cancer', 'breast-ovarian cancer, familial, susceptibility to', 'high-grade neuroendocrine carcinoma of the cervix uteri']


### Disease - Drug

In [60]:
# Generate disease-drug pair using combined diseases
disease_drug_pairs = generate_pairs(file_path, chunk_size, dtype_spec, unique_diseases, 'disease', 'drug', relationship_columns=["relation", "display_relation"])

print(f"Total disease-drug pairs found: {len(disease_drug_pairs)}")
print("Sample of disease-drug pairs:", disease_drug_pairs[:10])

Total disease-drug pairs found: 808
Sample of disease-drug pairs: [{'x_name': 'epidermodysplasia verruciformis', 'y_name': 'imiquimod', 'relation': 'indication', 'display_relation': 'indication'}, {'x_name': 'epidermodysplasia verruciformis', 'y_name': 'pimecrolimus', 'relation': 'contraindication', 'display_relation': 'contraindication'}, {'x_name': 'lymphoma', 'y_name': 'vinblastine', 'relation': 'indication', 'display_relation': 'indication'}, {'x_name': 'lymphoma', 'y_name': 'methylprednisolone', 'relation': 'indication', 'display_relation': 'indication'}, {'x_name': 'lymphoma', 'y_name': 'uracil mustard', 'relation': 'indication', 'display_relation': 'indication'}, {'x_name': 'lymphoma', 'y_name': 'methotrexate', 'relation': 'indication', 'display_relation': 'indication'}, {'x_name': 'lymphoma', 'y_name': 'vincristine', 'relation': 'indication', 'display_relation': 'indication'}, {'x_name': 'lymphoma', 'y_name': 'prednisone', 'relation': 'indication', 'display_relation': 'indicati

In [61]:
# Convert the disease-drug pairs into a DataFrame
df_disease_drug = pd.DataFrame(disease_drug_pairs)

# Rename columns for clarity if desired
df_disease_drug.rename(columns={'x_name': 'Disease', 'y_name': 'Drug', 'relation': 'Relation', 'display_relation': 'Display Relation'}, inplace=True)

# Save DataFrame as a CSV file
df_disease_drug.to_csv("/Users/akadirerol/Desktop/disease_drug_pairs.csv", index=False)
print("Disease-drug pairs saved as /Users/akadirerol/Desktop/disease_drug_pairs.csv")

Disease-drug pairs saved as /Users/akadirerol/Desktop/disease_drug_pairs.csv


### Disease - Phenotype

In [62]:
# Generate disease-drug pair using combined diseases
disease_phenotype_pairs = generate_pairs(file_path, chunk_size, dtype_spec, unique_diseases, 'disease', 'phenotype', relationship_columns=["relation", "display_relation"])

print(f"Total disease-phenotype pairs found: {len(disease_phenotype_pairs)}")
print("Sample of disease-phenotype pairs:", disease_phenotype_pairs[:10])

Total disease-phenotype pairs found: 2309
Sample of disease-phenotype pairs: [{'x_name': 'breast carcinoma', 'y_name': 'multifocal breast carcinoma', 'relation': 'phenotype_phenotype', 'display_relation': 'parent-child'}, {'x_name': 'squamous cell carcinoma', 'y_name': 'squamous cell carcinoma of the skin', 'relation': 'phenotype_phenotype', 'display_relation': 'parent-child'}, {'x_name': 'ovarian neoplasm', 'y_name': 'ovarian papillary adenocarcinoma', 'relation': 'phenotype_phenotype', 'display_relation': 'parent-child'}, {'x_name': 'sarcoma', 'y_name': 'renal sarcoma', 'relation': 'phenotype_phenotype', 'display_relation': 'parent-child'}, {'x_name': 'soft tissue sarcoma', 'y_name': 'fibroma', 'relation': 'phenotype_phenotype', 'display_relation': 'parent-child'}, {'x_name': 'lymphoma', 'y_name': 'pulmonary lymphoma', 'relation': 'phenotype_phenotype', 'display_relation': 'parent-child'}, {'x_name': 'squamous cell carcinoma', 'y_name': 'oropharyngeal squamous cell carcinoma', 'relat

In [63]:
# Convert the disease-drug pairs into a DataFrame
df_disease_phenotype = pd.DataFrame(disease_phenotype_pairs)

# Rename columns for clarity if desired
df_disease_phenotype.rename(columns={'x_name': 'Disease', 'y_name': 'Phenotype', 'relation': 'Relation', 'display_relation': 'Display Relation'}, inplace=True)

# Save DataFrame as a CSV file
df_disease_phenotype.to_csv("/Users/akadirerol/Desktop/disease_phenotype_pairs.csv", index=False)
print("Disease-phenotype pairs saved as /Users/akadirerol/Desktop/disease_phenotype_pairs.csv")

Disease-phenotype pairs saved as /Users/akadirerol/Desktop/disease_phenotype_pairs.csv


### Phenotype - Drug

In [66]:
import pandas as pd

# Convert lists to DataFrames
df_disease_drug = pd.DataFrame(disease_drug_pairs)
df_disease_phenotype = pd.DataFrame(disease_phenotype_pairs)

# Rename columns to clearly identify entities and differentiate relations
df_disease_drug.rename(columns={
    'x_name': 'Disease', 
    'y_name': 'Drug', 
    'relation': 'drug_relation', 
    'display_relation': 'drug_display_relation'
}, inplace=True)

df_disease_phenotype.rename(columns={
    'x_name': 'Disease', 
    'y_name': 'Phenotype', 
    'relation': 'phenotype_relation', 
    'display_relation': 'phenotype_display_relation'
}, inplace=True)

# Merge on the 'Disease' column to find drug-phenotype pairs through shared diseases
drug_phenotype_df = pd.merge(
    df_disease_drug[['Disease', 'Drug', 'drug_relation', 'drug_display_relation']],
    df_disease_phenotype[['Disease', 'Phenotype', 'phenotype_relation', 'phenotype_display_relation']],
    on='Disease'
)

# Keep Disease in the final result to show the shared connection
drug_phenotype_relationships = drug_phenotype_df[
    ['Drug', 'Phenotype', 'Disease', 'drug_relation', 'drug_display_relation', 'phenotype_relation', 'phenotype_display_relation']
]

# Display the result
print("Drug-Phenotype relationships through shared diseases:")
print(drug_phenotype_relationships.head())


Drug-Phenotype relationships through shared diseases:
        Drug                              Phenotype  \
0  imiquimod  abnormality of metabolism/homeostasis   
1  imiquimod        autosomal recessive inheritance   
2  imiquimod                               verrucae   
3  imiquimod             hypopigmented skin patches   
4  imiquimod              recurrent skin infections   

                           Disease drug_relation drug_display_relation  \
0  epidermodysplasia verruciformis    indication            indication   
1  epidermodysplasia verruciformis    indication            indication   
2  epidermodysplasia verruciformis    indication            indication   
3  epidermodysplasia verruciformis    indication            indication   
4  epidermodysplasia verruciformis    indication            indication   

           phenotype_relation phenotype_display_relation  
0  disease_phenotype_positive          phenotype present  
1  disease_phenotype_positive          phenotype pre

In [67]:
# Save the DataFrame to a new CSV file
drug_phenotype_relationships.to_csv("/Users/akadirerol/Desktop/drug_phenotype_relationships.csv", index=False)
print("Drug-Phenotype relationships saved as /Users/akadirerol/Desktop/drug_phenotype_relationships.csv")


Drug-Phenotype relationships saved as /Users/akadirerol/Desktop/drug_phenotype_relationships.csv
