In [None]:
%%capture
!pip install --no-cache-dir transformers sentencepiece
!pip install sacremoses

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests as req
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## Data Loading and Cleaning

In [None]:
# Data Source 1: Kaggle Dataset (https://www.kaggle.com/datasets/itachi9604/disease-symptom-description-dataset?select=dataset.csv)
# Keep only the first row seen for each disease and renumber the rows from 0.
df = (
  pd.read_csv('disease_symptoms.csv')
  .drop_duplicates(subset='Disease')
  .reset_index(drop=True)
)
df.head()

In [None]:
# Collapse the wide Symptom_1 .. Symptom_17 columns into one comma-separated
# 'Symptoms' string per disease and normalise the disease names.
# Results are assigned to df column-wise: pandas documents that the row object
# yielded by DataFrame.iterrows() may be a copy, so writing to it is not
# guaranteed to change the frame.
nums = [i for i in range(1, 18)]
symptom_cols = [f'Symptom_{num}' for num in nums]


def _join_symptoms(row):
  """Return the row's leading symptom strings joined with ', '.

  Reading stops at the first cell that is not a string (a missing symptom is
  read back as a NaN float).  Underscores become spaces and surrounding
  whitespace is trimmed so the joined text contains no doubled spaces.
  """
  symptoms = []
  for col in symptom_cols:
    value = row[col]
    if not isinstance(value, str):
      break
    symptoms.append(value.replace('_', ' ').strip())
  return ', '.join(symptoms)


df['Symptoms'] = [_join_symptoms(row) for _, row in df[symptom_cols].iterrows()]

# Lower-case each name and keep only the part before the first '(' --
# trimmed, so 'Foo (bar)' becomes 'foo' rather than 'foo '.
df['Disease'] = [name.lower().split('(')[0].strip() for name in df['Disease']]

# Hard-coded override for the row labelled 36.
# NOTE(review): presumably that entry's name starts with '(' and is therefore
# emptied by the split above -- confirm against the CSV.
# The membership test avoids .loc appending a new row when the label is absent.
if 36 in df.index:
  df.loc[36, 'Disease'] = 'vertigo'

In [None]:
# Narrow source 1 down to the disease name and its joined symptom text.
ndf1 = df.loc[:, ['Disease', 'Symptoms']]
ndf1.head()

In [None]:
# Data Source 2: Kaggle Dataset (https://www.kaggle.com/datasets/hagari/disease-and-their-symptoms)
source_two_csv = 'cleaned_disease_symptoms.csv'
df = pd.read_csv(filepath_or_buffer=source_two_csv)
df.head()

In [None]:
# Lower-case both text columns of source 2 and switch the symptom separator
# from ';' to ','.
records = [
  {
    'Disease': disease.lower(),
    'Symptoms': symptom_text.replace(';', ',').lower(),
  }
  for disease, symptom_text in zip(df['Disease'], df['Symptoms'])
]
ndf2 = pd.DataFrame(records)
ndf2.head()

In [None]:
# Data Source 3: Columbia University Disease-Symptom Knowledge Database
# The timeout keeps the cell from blocking indefinitely on an unresponsive
# server, and raise_for_status() stops here on an HTTP error instead of
# handing an error page to the parser.
site = req.get('https://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html', timeout=30)
site.raise_for_status()
text = BeautifulSoup(site.text, 'html.parser')

In [None]:
# Locate the knowledge-base table and collect every <p> element inside it.
search = text.find("table", {"class":"MsoTableWeb3"})
if search is None:
  # find() returns None when nothing matches; fail with a message that names
  # the missing table instead of an AttributeError on the next line.
  raise ValueError("table with class 'MsoTableWeb3' not found in the downloaded page")
# find_all is the current name of the deprecated findAll alias.
spl_search = search.find_all("p")

In [None]:
def scrape_text(text):
  """Extract the readable name(s) from a 'CODE_name^CODE_name...' entry.

  The text is split on '_'.  The part before the first underscore is
  discarded; from every later part only letters and whitespace are kept, up
  to the first '^'.  A part that contains '^' is followed by ', ' in the
  result, so several names come back comma-separated.

  Every run of whitespace (spaces, newlines, tabs, ...) is collapsed to one
  space and leading/trailing whitespace is removed, so a name that is wrapped
  across lines keeps its word boundary.

  Args:
    text: raw entry string.

  Returns:
    The cleaned name(s) as a single string; '' when text has no underscore.
  """
  pieces = []
  for segment in text.split('_')[1:]:
    # Everything after the first '^' in a segment is ignored.
    name, caret, _ = segment.partition('^')
    pieces.append(''.join(ch for ch in name if ch.isalpha() or ch.isspace()))
    if caret:
      pieces.append(', ')
  # split() with no argument splits on any whitespace run and drops the ends.
  return ' '.join(''.join(pieces).split())

In [None]:
# Walk the table's <p> elements and group them into one dict per disease:
#   {'Disease': <name text>, 'Symptoms': [<name text>, ...]}
# Only elements whose text starts with 'U' are used.  Such an element is a
# disease when the markup of the following <p> contains 'align', otherwise it
# is a symptom of the disease currently being accumulated.
# The first three <p> elements are skipped (presumably header cells -- confirm).
switch = False  # not read anywhere in the visible cells
d = {}
l = []
for i in range(3, len(spl_search)):
  entry = spl_search[i]
  # startswith() also copes with an empty <p>, where .text[0] would raise.
  if not entry.text.startswith('U'):
    continue
  # The last element has no successor, so it can only be a symptom.
  is_disease = i + 1 < len(spl_search) and 'align' in str(spl_search[i+1])
  if is_disease:
    if d:
      # A new disease begins: the previous one is complete.
      l.append(d)
      d = {}
    d['Disease'] = scrape_text(entry.text)
  else:
    d.setdefault('Symptoms', []).append(scrape_text(entry.text))

# Flush the disease still being accumulated when the loop ends; without this
# the final disease of the table is never added to l.
if d:
  l.append(d)

# Turn the scraped dicts into a two-column frame with one row per disease
# *name*: an entry whose 'Disease' text lists several comma-separated names
# yields one row for the first name plus one extra row per further name, all
# sharing the same symptom text.
# Rows are rebuilt from plain Python values instead of assigning to the row
# objects yielded by iterrows(), because pandas does not guarantee that such
# writes reach the frame.
# Naming the columns keeps the frame well-formed even when l is empty; dropna
# discards entries that lack either a disease name or a symptom list.
df = pd.DataFrame(l, columns=['Disease', 'Symptoms']).dropna().reset_index(drop=True)

primary_rows = []
alias_rows = []
for _, row in df.iterrows():
  symptoms = ', '.join(row['Symptoms'])
  # Trim every name: splitting 'a, b' on ',' leaves a leading space on ' b'.
  names = [name.strip() for name in row['Disease'].split(',')]
  primary_rows.append({'Disease': names[0], 'Symptoms': symptoms})
  # Blank fragments (e.g. from a trailing comma) are not real names.
  alias_rows.extend(
    {'Disease': name, 'Symptoms': symptoms} for name in names[1:] if name
  )

df = pd.DataFrame(primary_rows, columns=['Disease', 'Symptoms'])

# First names come first, the additional names are appended after them.
ndf3 = pd.concat(
  [df, pd.DataFrame(alias_rows, columns=['Disease', 'Symptoms'])]
).reset_index(drop=True)
ndf3.head()

In [None]:
# Stack the three cleaned sources into one frame with a fresh 0..n-1 index and
# save it (the row index is written to the CSV as well).
raw_df = pd.concat([ndf1, ndf2, ndf3], ignore_index=True)
raw_df.to_csv('agg_data.csv')

## Data Paraphrasing

In [None]:
# Load the tokenizer and the seq2seq model from the same checkpoint.
paraphrase_checkpoint = "Vamsi/T5_Paraphrase_Paws"
tokenizer = AutoTokenizer.from_pretrained(paraphrase_checkpoint, use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained(paraphrase_checkpoint)

In [None]:
# Reload the aggregated data and discard the 'Unnamed: 0' column that comes
# back from the CSV.
df = pd.read_csv('agg_data.csv').drop(columns=['Unnamed: 0'])

In [None]:
# For every disease, build two template sentences from its symptom text, ask
# the model for 5 sampled paraphrases of each, and collect the results as
# (Disease, text) rows in ndf.
# do_sample=True means the generated text differs from run to run.
paraphrase_rows = []
for _, r in df.iterrows():
  # Read the symptoms from the row and never assign to it: a write to an
  # iterrows() row may or may not reach df, and where it does it would replace
  # the stored symptoms with a template sentence for every later cell.
  symptoms = r['Symptoms']
  sentences = [f"I am experiencing {symptoms}", f"I have {symptoms}"]

  for sentence in sentences:
    prompt = f"paraphrase: {sentence}"

    encoding = tokenizer.encode_plus(prompt, padding=True, return_tensors='pt')
    input_ids, attention_masks = encoding['input_ids'], encoding['attention_mask']
    outputs = model.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=256,
        do_sample=True,
        top_k=120,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=5
    )

    for output in outputs:
        line = tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        paraphrase_rows.append([r['Disease'], line])

# Build the frame once from the collected rows instead of growing it one
# .loc assignment at a time; the index is 0..n-1 either way.
ndf = pd.DataFrame(paraphrase_rows, columns=df.columns)

In [None]:
# Add one un-paraphrased "I am experiencing ..." sentence per disease row to
# the end of ndf.
for disease, symptom_text in zip(df['Disease'], df['Symptoms']):
  ndf.loc[len(ndf)] = [disease, f"I am experiencing {symptom_text}"]

In [None]:
# Persist the paraphrased dataset (the row index is written as well).
ndf.to_csv(path_or_buf='para_data.csv')