In [1]:
import pandas as pd
import random
import numpy as np
random.seed(2022)
np.random.seed(2022)
df = pd.read_csv("../../data/pretrain/disgenet_dda.csv")

In [2]:
df = df[["diseaseId1","diseaseId2","diseaseDes1","diseaseDes2"]]
df = df.dropna()
df["key1"] = df["diseaseId1"]+df["diseaseId2"]
df["key2"] = df["diseaseId2"]+df["diseaseId1"]
df.head()

Unnamed: 0,diseaseId1,diseaseId2,diseaseDes1,diseaseDes2,key1,key2
0,C0000737,C0000731,An unpleasant sensation characterized by physi...,Distention of the abdomen.,C0000737C0000731,C0000731C0000737
1,C0005745,C0000731,The upper eyelid margin is positioned 3 mm or ...,Distention of the abdomen.,C0005745C0000731,C0000731C0005745
2,C0006325,C0000731,Bruxism is characterized by the grinding of th...,Distention of the abdomen.,C0006325C0000731,C0000731C0006325
3,C0007642,C0000731,A bacterial infection and inflammation of the ...,Distention of the abdomen.,C0007642C0000731,C0000731C0007642
4,C0009806,C0000731,Infrequent or difficult evacuation of feces.,Distention of the abdomen.,C0009806C0000731,C0000731C0009806


In [3]:
disease_dict = {}
for _,item in df.iterrows():
    disease_dict[item["diseaseId1"]]=item["diseaseDes1"]

In [4]:
disease_ids = list(set(df.diseaseId1.unique()).union(df.diseaseId2.unique()))
len(disease_ids)

14403

In [5]:
keys = (set(df.key1.unique()).union(df.key2.unique()))

In [6]:
total_example_n = 4096+1024+1024
negative_example_n = total_example_n//2
positive_example_n = total_example_n - negative_example_n
df_sample = df.sample(positive_example_n)

In [7]:
feasible_negatives = []
while True:
    id1, id2 = random.sample(disease_ids, 2)
    if id1+id2 not in keys:
        feasible_negatives.append(id1+id2)
    if len(feasible_negatives)==negative_example_n:
        break

In [8]:
feasible_negatives = [[key[:8],key[8:]]for key in feasible_negatives]

In [9]:
df_sample["label"]=1
df_sample = df_sample[["diseaseId1","diseaseId2","diseaseDes1","diseaseDes2","label"]]
df_sample.head()

Unnamed: 0,diseaseId1,diseaseId2,diseaseDes1,diseaseDes2,label
1474484,C1843075,C1835671,A rare hereditary motor and sensory neuropathy...,#8208;normal nerve conduction velocities. Addi...,1
508690,C0002949,C0032914,A separation (dissection) of the layers of an ...,"Preeclampsia, which along with chronic hyperte...",1
1518880,C0426415,C1850049,Distance between subnasale and pronasale more ...,Clinodactyly refers to a bending or curvature ...,1
952600,C0027651,C0282313,An organ or organ-system abnormality that cons...,cellular state in which there is evidence of i...,1
997910,C0085261,C0345905,Proteus syndrome is characterized by progressi...,A carcinoma that arises from the intrahepatic ...,1


In [10]:
diseaseId1_list = []
diseaseId2_list = []
diseaseDes1_list = []
diseaseDes2_list = []
for diseaseId1,diseaseId2 in feasible_negatives:
    diseaseId1_list.append(diseaseId1)
    diseaseId2_list.append(diseaseId2)
    diseaseDes1_list.append(disease_dict[diseaseId1])
    diseaseDes2_list.append(disease_dict[diseaseId2])

In [11]:
label_list = [0]*len(diseaseDes2_list)

In [12]:
pd.DataFrame({
    "diseaseId1":diseaseId1_list,
    "diseaseId2":diseaseId2_list,
    "diseaseDes1":diseaseDes1_list,
    "diseaseDes2":diseaseDes2_list,
    "label":label_list,
})
df_sample = df_sample.append(pd.DataFrame({
    "diseaseId1":diseaseId1_list,
    "diseaseId2":diseaseId2_list,
    "diseaseDes1":diseaseDes1_list,
    "diseaseDes2":diseaseDes2_list,
    "label":label_list,
}))
df_sample

Unnamed: 0,diseaseId1,diseaseId2,diseaseDes1,diseaseDes2,label
1474484,C1843075,C1835671,A rare hereditary motor and sensory neuropathy...,#8208;normal nerve conduction velocities. Addi...,1
508690,C0002949,C0032914,A separation (dissection) of the layers of an ...,"Preeclampsia, which along with chronic hyperte...",1
1518880,C0426415,C1850049,Distance between subnasale and pronasale more ...,Clinodactyly refers to a bending or curvature ...,1
952600,C0027651,C0282313,An organ or organ-system abnormality that cons...,cellular state in which there is evidence of i...,1
997910,C0085261,C0345905,Proteus syndrome is characterized by progressi...,A carcinoma that arises from the intrahepatic ...,1
...,...,...,...,...,...
3067,C0419415,C4316787,Birth length,"A rare, primary bone dysplasia characterized b...",0
3068,C4552070,C4310712,Primary pulmonary arterial hypertension is a r...,Developmental and epileptic encephalopathy-43 ...,0
3069,C2931401,C1857389,Long QT syndrome type 3,"Cystinuria, Type B",0
3070,C0000822,C4017089,"Abortion, Tubal","LONG QT SYNDROME 1, RECESSIVE",0


In [13]:
df_sample = df_sample.sample(frac=1)
df_sample = df_sample.reset_index()
df_sample.head()

Unnamed: 0,index,diseaseId1,diseaseId2,diseaseDes1,diseaseDes2,label
0,724,C3809872,C1859775,Any periventricular nodular heterotopia in whi...,Underdevelopment of the anterior pituitary gland.,0
1,1855,C0206710,C0238198,A neoplastic proliferation of basal cells in t...,Gastrointestinal stromal tumors are mesenchyma...,0
2,2707,C0751733,C4022428,"Degenerative Diseases, Spinal Cord",Unilateral conductive hearing impairment,0
3,1329138,C0038356,C0887800,A tumor (abnormal growth of tissue) of the sto...,Psychogenic Inversion of Circadian Rhythm,1
4,215821,C1257931,C0014859,Tumor or cancer of the human MAMMARY GLAND.,A tumor (abnormal growth of tissue) of the eso...,1


In [14]:
df_sample_train = df_sample.iloc[:4096]
df_sample_valid = df_sample.iloc[4096:4096+1024]
df_sample_test = df_sample.iloc[4096+1024:]

In [15]:
df_sample_train.to_csv("../../data/downstream/disgenet_dda_6k_train.csv",index=None)
df_sample_valid.to_csv("../../data/downstream/disgenet_dda_6k_valid.csv",index=None)
df_sample_test.to_csv("../../data/downstream/disgenet_dda_6k_test.csv",index=None)

In [21]:
df_sample_valid

Unnamed: 0,index,diseaseId1,diseaseId2,diseaseDes1,diseaseDes2,label
4096,917,C0270969,C0206731,) gene may be involved.,Angiofibroma consist of many often dilated ves...,0
4097,1109364,C0086692,C0521174,A neoplasm which is characterized by the absen...,Microcalcification,1
4098,367,C0030437,C0086647,Parakeratosis Variegata,Mucopolysaccharidosis type III (MPS III) is a ...,0
4099,1236,C1857569,C3275476,Corneal endothelial dystrophy is characterized...,X-linked dominant chondrodysplasia Chassaing-L...,0
4100,692493,C3714644,C0151779,A tumor (abnormal growth of tissue) of the thy...,The presence of a melanoma of the skin.,1
...,...,...,...,...,...,...
5115,886949,C0015695,C0265326,Lipid infiltration of the hepatic parenchymal ...,The PTEN hamartoma tumor syndrome (PHTS) inclu...,1
5116,264633,C0031511,C0018923,Hereditary paraganglioma-pheochromocytoma (PGL...,A rare vascular tumor characterized by a malig...,1
5117,612100,C0007102,C0041755,A primary or metastatic malignant neoplasm tha...,An unpleasant or harmful reaction resulting fr...,1
5118,541279,C0027051,C0035317,Necrosis of the myocardium caused by an obstru...,Hemorrhage occurring within the retina.,1
