### Part 1: Preprocessing Duplicates After Scraping
##### Read data, remove duplicates, and get the frequencies

In [1]:
import pandas as pd

# Load the scraped workbook into a DataFrame.
scraped = pd.read_excel('./../data/1676917147-diseases-17607.xlsx')

# Discard every row that has a missing value in any column.
scraped_complete = scraped.dropna(axis=0)

# Collapse identical rows into a single row each; the number of times a row
# occurred is recorded in a new "size" column. The result is then ordered
# from the most frequent entry to the least frequent one.
all_columns = scraped_complete.columns.tolist()
occurrences = scraped_complete.groupby(all_columns, as_index=False).size()
data = occurrences.sort_values(by=["size"], ascending=False)

# Render the deduplicated table as the cell output.
data

Unnamed: 0,Disease,size
431,breast cancer,3068
605,cancer,2091
3556,tumor,1201
3613,tumors,421
644,cancers,405
...,...,...
1442,haematoma and seroma,1
1443,haematoma rates,1
1444,haematoma rates reduced,1
1445,hallmark of cancer,1


##### Export result to Excel format

In [2]:
import calendar
import time

# Row count of the deduplicated table; embedded in the output file name.
row_count = len(data)

# Current UTC time as whole seconds since the epoch; also embedded in the
# output file name.
epoch_seconds = calendar.timegm(time.gmtime())

# Resulting pattern: <timestamp>-disease-unique-<row count>.xlsx
output_path = "./../data/{}-disease-unique-{}.xlsx".format(epoch_seconds, row_count)

# Write the table (including its index, the to_excel default) to disk.
data.to_excel(output_path)

### Part 2: Preprocessing Duplicates After Manual Annotation/Labeling
##### Read data, Limit only to DOID column, Drop NaN values

In [3]:
import pandas as pd
import warnings
# Suppress all warnings. NOTE: this filter is installed globally, so it stays
# in effect for every cell executed afterwards in the same kernel.
warnings.simplefilter(action='ignore')

# Load the manually annotated workbook.
annotated = pd.read_excel('./../data/1682736366-disease-unique-3762-annotated.xlsx')

# Keep only the DOID column (as a one-column DataFrame) and drop the rows
# where it is missing.
df = annotated[['DOID']].dropna(axis=0)

df

Unnamed: 0,DOID
0,-
1,-
2,-
3,-
4,-
...,...
3757,-
3758,-
3759,-
3760,-


##### Search for mislabeled data (human error)

In [4]:
# Select the rows whose DOID value contains a tab character.
has_tab = df["DOID"].str.contains("\t")
error = df.loc[has_tab]

error

Unnamed: 0,DOID


In [5]:
# Select the rows whose DOID value does not begin with the "DOID:" prefix.
# Rows holding just "-" lack the prefix as well, so they are listed here too.
has_prefix = df["DOID"].str.startswith("DOID:")
error2 = df.loc[~has_prefix]

error2

Unnamed: 0,DOID
0,-
1,-
2,-
3,-
4,-
...,...
3757,-
3758,-
3759,-
3760,-


##### Remove duplicates and get the frequencies

In [6]:
# Collapse identical rows into one row each, storing the occurrence count in
# a "size" column, then order from most to least frequent.
# NOTE: `df` is rebound here, so running this cell a second time would group
# the already-counted table again.
group_keys = df.columns.tolist()
doid_counts = df.groupby(group_keys, as_index=False).size()
df = doid_counts.sort_values(by=["size"], ascending=False)

df

Unnamed: 0,DOID,size
0,-,2088
202,DOID:0081246,14
269,DOID:10283,12
554,DOID:3008,12
513,DOID:2394,12
...,...,...
530,DOID:2691,1
191,DOID:0080964,1
532,DOID:2739,1
533,DOID:2741,1


In [7]:
# Drop the rows whose DOID is exactly "-" (presumably the annotators' marker
# for "no DOID assigned" -- confirm against the annotation guidelines).
#
# An exact comparison is used instead of `str.match("-")`: `str.match`
# interprets its argument as a regular expression anchored only at the start
# of the string, so it would also discard any label that merely *begins* with
# "-" (e.g. a mistyped "-DOID:1234"), silently hiding it from the export.
is_placeholder = df["DOID"] == "-"
df = df[~is_placeholder]

df

Unnamed: 0,DOID,size
202,DOID:0081246,14
269,DOID:10283,12
554,DOID:3008,12
513,DOID:2394,12
774,DOID:5683,11
...,...,...
530,DOID:2691,1
191,DOID:0080964,1
532,DOID:2739,1
533,DOID:2741,1


##### Export result to Excel format

In [8]:
import calendar
import time

# Row count of the deduplicated DOID table; embedded in the output file name.
row_count = len(df)

# Current UTC time as whole seconds since the epoch; also embedded in the
# output file name.
epoch_seconds = calendar.timegm(time.gmtime())

# Resulting pattern: <timestamp>-DOID-unique-<row count>.xlsx
output_path = "./../data/{}-DOID-unique-{}.xlsx".format(epoch_seconds, row_count)

# Write the table (including its index, the to_excel default) to disk.
df.to_excel(output_path)