In [2]:
# Dependencies
import pandas as pd
from pathlib import Path

In [16]:
# Read the dataset file with the Pandas 
dataset_df = pd.read_csv("../Resources/dataset.csv")

# Display the dataframe
print(dataset_df.head())

# Show the shape of the dataframe
dataset_df.shape

            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

(4920, 18)

In [13]:
# Gather all diseases into a list
diseases_list = dataset_df['Disease'].unique().tolist()
print(f"The diseases in the dataset: {diseases_list}")
print(f"Number of diseases in the dataset: {len(diseases_list)}")


The diseases in the dataset: ['Fungal infection', 'Allergy', 'GERD', 'Chronic cholestasis', 'Drug Reaction', 'Peptic ulcer diseae', 'AIDS', 'Diabetes ', 'Gastroenteritis', 'Bronchial Asthma', 'Hypertension ', 'Migraine', 'Cervical spondylosis', 'Paralysis (brain hemorrhage)', 'Jaundice', 'Malaria', 'Chicken pox', 'Dengue', 'Typhoid', 'hepatitis A', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Alcoholic hepatitis', 'Tuberculosis', 'Common Cold', 'Pneumonia', 'Dimorphic hemmorhoids(piles)', 'Heart attack', 'Varicose veins', 'Hypothyroidism', 'Hyperthyroidism', 'Hypoglycemia', 'Osteoarthristis', 'Arthritis', '(vertigo) Paroymsal  Positional Vertigo', 'Acne', 'Urinary tract infection', 'Psoriasis', 'Impetigo']
Number of diseases in the dataset: 41


In [27]:
# Create a new dataframe, which each row contains a disease and a symptom
df_melted = dataset_df.melt(id_vars=dataset_df.columns[0], value_vars=dataset_df.columns[1:], var_name='variable', value_name='symptom')

# Display the dataframe
df_melted

Unnamed: 0,Disease,variable,symptom
0,Fungal infection,Symptom_1,itching
1,Fungal infection,Symptom_1,skin_rash
2,Fungal infection,Symptom_1,itching
3,Fungal infection,Symptom_1,itching
4,Fungal infection,Symptom_1,itching
...,...,...,...
83635,(vertigo) Paroymsal Positional Vertigo,Symptom_17,
83636,Acne,Symptom_17,
83637,Urinary tract infection,Symptom_17,
83638,Psoriasis,Symptom_17,


In [29]:
# Drop any rows where the symptom is NaN
df_melted = df_melted.dropna(subset=['symptom'])

# Display the dataframe
df_melted

Unnamed: 0,Disease,variable,symptom
0,Fungal infection,Symptom_1,itching
1,Fungal infection,Symptom_1,skin_rash
2,Fungal infection,Symptom_1,itching
3,Fungal infection,Symptom_1,itching
4,Fungal infection,Symptom_1,itching
...,...,...,...
83461,Common Cold,Symptom_17,muscle_pain
83502,Common Cold,Symptom_17,muscle_pain
83543,Common Cold,Symptom_17,muscle_pain
83584,Common Cold,Symptom_17,muscle_pain


In [34]:
# Gather all symptoms into a list
symptoms_list = df_melted['symptom'].unique().tolist()
print(f'The symptoms in the dataset: {symptoms_list}')
print(f'Number of symptoms in the dataset: {len(symptoms_list)}')

The symptoms in the dataset: ['itching', ' skin_rash', ' continuous_sneezing', ' shivering', ' stomach_pain', ' acidity', ' vomiting', ' indigestion', ' muscle_wasting', ' patches_in_throat', ' fatigue', ' weight_loss', ' sunken_eyes', ' cough', ' headache', ' chest_pain', ' back_pain', ' weakness_in_limbs', ' chills', ' joint_pain', ' yellowish_skin', ' constipation', ' pain_during_bowel_movements', ' breathlessness', ' cramps', ' weight_gain', ' mood_swings', ' neck_pain', ' muscle_weakness', ' stiff_neck', ' pus_filled_pimples', ' burning_micturition', ' bladder_discomfort', ' high_fever', ' nodal_skin_eruptions', ' ulcers_on_tongue', ' loss_of_appetite', ' restlessness', ' dehydration', ' dizziness', ' weakness_of_one_body_side', ' lethargy', ' nausea', ' abdominal_pain', ' pain_in_anal_region', ' sweating', ' bruising', ' cold_hands_and_feets', ' anxiety', ' knee_pain', ' swelling_joints', ' blackheads', ' foul_smell_of urine', ' skin_peeling', ' blister', ' dischromic _patches', 

In [40]:
# Create a pivot table showing numbers of patients experience the same symptom for each disease 
pivot_table_df = df_melted.pivot_table(index=df_melted.columns[0], columns='symptom', aggfunc='size', fill_value=0)

# Display the dataframe
pivot_table_df

symptom,abdominal_pain,abnormal_menstruation,acidity,acute_liver_failure,altered_sensorium,anxiety,back_pain,belly_pain,blackheads,bladder_discomfort,...,watering_from_eyes,weakness_in_limbs,weakness_of_one_body_side,weight_gain,weight_loss,yellow_crust_ooze,yellow_urine,yellowing_of_eyes,yellowish_skin,itching
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AIDS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acne,0,0,0,0,0,0,0,0,108,0,...,0,0,0,0,0,0,0,0,0,0
Alcoholic hepatitis,114,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,114,0
Allergy,0,0,0,0,0,0,0,0,0,0,...,108,0,0,0,0,0,0,0,0,0
Arthritis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bronchial Asthma,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cervical spondylosis,0,0,0,0,0,0,108,0,0,0,...,0,108,0,0,0,0,0,0,0,0
Chicken pox,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,114
Chronic cholestasis,114,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,114,114,114


In [49]:
# Create a new column named 'list_of_symptoms'
dataset_df["list_of_symptoms"] = 0

# Iterate each row and add all the symptoms into the value of 'list_of_symptoms' column as a list
for i in range(dataset_df.shape[0]):
    values = dataset_df.iloc[i].values
    values = values.tolist()
    if 0 in values:
        dataset_df["list_of_symptoms"][i] = values[1:values.index(0)]
    else:
        dataset_df["list_of_symptoms"][i] = values[1:]

# Display the dataframe
dataset_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df["list_of_symptoms"][i] = values[1:values.index(0)]
  dataset_df["list_of_symptoms"][i] = values[1:values.index(0)]


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17,list_of_symptoms
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,"[itching, skin_rash, nodal_skin_eruptions, ..."
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,"[ skin_rash, nodal_skin_eruptions, dischromi..."
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,,"[itching, nodal_skin_eruptions, dischromic _..."
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,,"[itching, skin_rash, dischromic _patches, na..."
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,,"[itching, skin_rash, nodal_skin_eruptions, n..."


In [63]:
# Create a new dataframe with all the symptoms as columns, and have the same number of rows as dataset_df
final_df = pd.DataFrame(columns= symptoms_list, index= dataset_df.index)

# Add the column 'list_of_symptoms' from dataset_df to the new dataframe
final_df['list_of_symptoms'] = dataset_df['list_of_symptoms']

for i in final_df:
    final_df[i] = dataset_df.apply(lambda x:1 if i in x.list_of_symptoms else 0, axis=1)

final_df = final_df.drop(columns=['list_of_symptoms'], axis=1)

final_df['disease'] = dataset_df['Disease']

# Display the dataframe
final_df

  final_df['disease'] = dataset_df['Disease']


Unnamed: 0,itching,skin_rash,continuous_sneezing,shivering,stomach_pain,acidity,vomiting,indigestion,muscle_wasting,patches_in_throat,...,receiving_unsterile_injections,coma,sinus_pressure,palpitations,stomach_bleeding,runny_nose,congestion,blood_in_sputum,loss_of_smell,disease
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,(vertigo) Paroymsal Positional Vertigo
4916,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Acne
4917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Urinary tract infection
4918,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Psoriasis


In [61]:
final_df.iloc[0]

itching                 1
 skin_rash              1
 continuous_sneezing    0
 shivering              0
 stomach_pain           0
                       ..
 runny_nose             0
 congestion             0
 blood_in_sputum        0
 loss_of_smell          0
list_of_symptoms        0
Name: 0, Length: 132, dtype: int64