## **Data Preprocessing**
### **Steps**
* Handling Missing values 
* Handling Duplicates
* Check data type
* Outlier Handling
* Data Transformation
* Encoding
* Feature Creation
* Data Splitting
* Data Balancing

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
import joblib
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
import numpy as np

In [2]:
pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

In [3]:
df = pd.read_csv("../data/raw/Training.csv")

In [4]:
df.shape

(4920, 134)

In [5]:
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,muscle_wasting,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,fluid_overload,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,...,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,loss_of_smell,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,belly_pain,abnormal_menstruation,dischromic _patches,watering_from_eyes,increased_appetite,polyuria,family_history,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,receiving_blood_transfusion,receiving_unsterile_injections,coma,stomach_bleeding,distention_of_abdomen,history_of_alcohol_consumption,fluid_overload.1,blood_in_sputum,prominent_veins_on_calf,palpitations,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,
4,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection,


In [6]:
df.drop("Unnamed: 133", axis=1, inplace=True)
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,muscle_wasting,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,fluid_overload,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,...,movement_stiffness,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,loss_of_smell,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,belly_pain,abnormal_menstruation,dischromic _patches,watering_from_eyes,increased_appetite,polyuria,family_history,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,receiving_blood_transfusion,receiving_unsterile_injections,coma,stomach_bleeding,distention_of_abdomen,history_of_alcohol_consumption,fluid_overload.1,blood_in_sputum,prominent_veins_on_calf,palpitations,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection


In [7]:
df.shape

(4920, 133)

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

In [10]:
df.shape

(304, 133)

In [11]:
# droped "fluid_overload" as it has constant value
df.drop("fluid_overload", axis=1, inplace=True)
df.rename(columns={"fluid_overload.1": "fluid_overload"}, inplace=True)
df.shape

(304, 132)

In [12]:
# Duplicate or irrelivant features.
duplicate_cols = ["belly_pain", "stomach_pain", "muscle_weakness", "muscle_wasting", "family_history", "extra_marital_contacts", "receiving_blood_transfusion", "receiving_unsterile_injections", "history_of_alcohol_consumption"]

In [13]:
# Removing duplicate or irrelivant features.
df.drop(columns=duplicate_cols, axis=1, inplace=True, errors="ignore")

In [14]:
df.shape

(304, 123)

In [15]:
x = df.drop(columns=["prognosis"])
y = df["prognosis"]

In [16]:
x

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,acidity,ulcers_on_tongue,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,phlegm,throat_irritation,redness_of_eyes,...,drying_and_tingling_lips,slurred_speech,knee_pain,hip_joint_pain,stiff_neck,swelling_joints,movement_stiffness,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,loss_of_smell,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,abnormal_menstruation,dischromic _patches,watering_from_eyes,increased_appetite,polyuria,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,coma,stomach_bleeding,distention_of_abdomen,fluid_overload,blood_in_sputum,prominent_veins_on_calf,palpitations,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
402,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
403,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
405,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
406,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [39]:
y

0      Fungal infection
1      Fungal infection
2      Fungal infection
3      Fungal infection
4      Fungal infection
             ...       
402            Impetigo
403            Impetigo
405            Impetigo
406            Impetigo
407            Impetigo
Name: prognosis, Length: 304, dtype: object

In [40]:
y.isnull().sum()

np.int64(0)

In [18]:
var_thresh = VarianceThreshold(threshold=0.01)

In [19]:
x_var = var_thresh.fit_transform(x)
x_var

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 1, 1],
       [0, 1, 0, ..., 1, 0, 1],
       [0, 1, 0, ..., 1, 1, 0]], shape=(304, 122))

In [20]:
x_var = pd.DataFrame(x_var, columns=x.columns[var_thresh.get_support()])
x_var

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,acidity,ulcers_on_tongue,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,phlegm,throat_irritation,redness_of_eyes,...,drying_and_tingling_lips,slurred_speech,knee_pain,hip_joint_pain,stiff_neck,swelling_joints,movement_stiffness,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,loss_of_smell,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,abnormal_menstruation,dischromic _patches,watering_from_eyes,increased_appetite,polyuria,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,coma,stomach_bleeding,distention_of_abdomen,fluid_overload,blood_in_sputum,prominent_veins_on_calf,palpitations,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
300,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
301,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
302,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [21]:
print(f"Low-variance filter: {x.shape[1]} -> {x_var.shape[1]} features")

Low-variance filter: 122 -> 122 features


In [22]:
corr_matrix = x_var.corr().abs()
corr_matrix

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,acidity,ulcers_on_tongue,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,phlegm,throat_irritation,redness_of_eyes,...,drying_and_tingling_lips,slurred_speech,knee_pain,hip_joint_pain,stiff_neck,swelling_joints,movement_stiffness,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,loss_of_smell,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,abnormal_menstruation,dischromic _patches,watering_from_eyes,increased_appetite,polyuria,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,coma,stomach_bleeding,distention_of_abdomen,fluid_overload,blood_in_sputum,prominent_veins_on_calf,palpitations,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
itching,1.000000,0.260536,0.207941,0.080041,0.045591,0.181433,0.166718,0.089952,0.056025,0.055014,0.158300,0.251803,0.076183,0.060616,0.064910,0.060616,0.089952,0.084229,0.093063,0.335264,0.045591,0.064910,0.144710,0.063421,0.045591,0.120748,0.160261,0.045591,0.089952,0.044437,0.283090,0.238802,0.097096,0.219474,0.072818,0.089952,0.083452,0.241534,0.142420,0.147545,0.356215,0.151406,0.064910,0.060616,0.154729,0.250065,0.118190,0.120748,0.068964,0.068964,...,0.064910,0.068964,0.056025,0.056025,0.086752,0.076503,0.051058,0.056025,0.093063,0.056025,0.045591,0.068964,0.045591,0.045591,0.045591,0.056025,0.056025,0.064910,0.096094,0.142420,0.142420,0.045591,0.256107,0.096094,0.207941,0.045591,0.068964,0.068964,0.056025,0.068964,0.051058,0.068964,0.068964,0.068964,0.060616,0.060616,0.068964,0.060616,0.068964,0.076503,0.045591,0.045591,0.045591,0.056025,0.056025,0.056025,0.056025,0.051058,0.051058,0.051058
skin_rash,0.260536,1.000000,0.204742,0.081166,0.046232,0.017026,0.176761,0.091216,0.056812,0.159960,0.155037,0.248029,0.048255,0.061467,0.065822,0.061467,0.091216,0.137330,0.094371,0.111172,0.046232,0.065822,0.146743,0.161398,0.046232,0.122444,0.162513,0.046232,0.091216,0.150526,0.200602,0.158084,0.035908,0.125262,0.407183,0.304922,0.084625,0.218985,0.144422,0.143080,0.065822,0.194411,0.065822,0.061467,0.150252,0.243160,0.119851,0.122444,0.069933,0.069933,...,0.065822,0.069933,0.056812,0.056812,0.087971,0.077578,0.051775,0.056812,0.094371,0.056812,0.046232,0.069933,0.046232,0.046232,0.046232,0.056812,0.056812,0.065822,0.097445,0.144422,0.094513,0.046232,0.566118,0.097445,0.204742,0.046232,0.069933,0.069933,0.056812,0.069933,0.051775,0.069933,0.069933,0.069933,0.061467,0.061467,0.069933,0.061467,0.069933,0.077578,0.204742,0.204742,0.204742,0.285864,0.285864,0.285864,0.285864,0.248029,0.248029,0.248029
nodal_skin_eruptions,0.207941,0.204742,1.000000,0.023408,0.013333,0.053060,0.048757,0.026307,0.016385,0.091336,0.020169,0.014932,0.102517,0.017727,0.018983,0.017727,0.026307,0.039606,0.027217,0.039606,0.013333,0.018983,0.042321,0.074293,0.013333,0.035313,0.046869,0.013333,0.026307,0.065497,0.057854,0.045591,0.067251,0.069592,0.021296,0.026307,0.024406,0.063155,0.041651,0.036051,0.018983,0.056068,0.018983,0.017727,0.035313,0.053060,0.034565,0.035313,0.020169,0.020169,...,0.018983,0.020169,0.016385,0.016385,0.025371,0.022373,0.014932,0.016385,0.027217,0.016385,0.013333,0.020169,0.013333,0.013333,0.013333,0.016385,0.016385,0.018983,0.028103,0.041651,0.041651,0.013333,0.029814,0.028103,0.746667,0.013333,0.020169,0.020169,0.016385,0.020169,0.014932,0.020169,0.020169,0.020169,0.017727,0.017727,0.020169,0.017727,0.020169,0.022373,0.013333,0.013333,0.013333,0.016385,0.016385,0.016385,0.016385,0.014932,0.014932,0.014932
continuous_sneezing,0.080041,0.081166,0.023408,1.000000,0.421348,0.352109,0.085599,0.046184,0.028765,0.160351,0.035409,0.026215,0.058203,0.031122,0.033327,0.031122,0.046184,0.069533,0.047782,0.069533,0.023408,0.033327,0.291692,0.129452,0.023408,0.061996,0.082284,0.023408,0.046184,0.160568,0.101569,0.080041,0.118068,0.122177,0.037387,0.046184,0.042847,0.110877,0.073124,0.063291,0.033327,0.098434,0.033327,0.031122,0.360848,0.218530,0.060683,0.421255,0.761942,0.761942,...,0.033327,0.035409,0.028765,0.028765,0.044541,0.039279,0.026215,0.028765,0.047782,0.028765,0.023408,0.761942,0.023408,0.023408,0.023408,0.028765,0.028765,0.033327,0.049338,0.073124,0.350296,0.023408,0.052342,0.049338,0.023408,0.421348,0.035409,0.035409,0.028765,0.035409,0.026215,0.035409,0.035409,0.035409,0.031122,0.031122,0.035409,0.031122,0.035409,0.039279,0.023408,0.023408,0.023408,0.028765,0.028765,0.028765,0.028765,0.026215,0.026215,0.026215
shivering,0.045591,0.046232,0.013333,0.421348,1.000000,0.175199,0.048757,0.026307,0.016385,0.091336,0.020169,0.014932,0.102517,0.017727,0.018983,0.017727,0.026307,0.039606,0.027217,0.039606,0.013333,0.018983,0.042321,0.074293,0.013333,0.035313,0.046869,0.013333,0.026307,0.065497,0.057854,0.045591,0.067251,0.069592,0.021296,0.026307,0.024406,0.063155,0.041651,0.036051,0.018983,0.056068,0.018983,0.017727,0.035313,0.053060,0.034565,0.035313,0.020169,0.020169,...,0.018983,0.020169,0.016385,0.016385,0.025371,0.022373,0.014932,0.016385,0.027217,0.016385,0.013333,0.020169,0.013333,0.013333,0.013333,0.016385,0.016385,0.018983,0.028103,0.041651,0.041651,0.013333,0.029814,0.028103,0.013333,0.746667,0.020169,0.020169,0.016385,0.020169,0.014932,0.020169,0.020169,0.020169,0.017727,0.017727,0.020169,0.017727,0.020169,0.022373,0.013333,0.013333,0.013333,0.016385,0.016385,0.016385,0.016385,0.014932,0.014932,0.014932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
small_dents_in_nails,0.056025,0.285864,0.016385,0.028765,0.016385,0.065203,0.270052,0.032327,0.020134,0.112238,0.024784,0.018349,0.125978,0.021784,0.023327,0.021784,0.032327,0.048670,0.033445,0.048670,0.016385,0.023327,0.052006,0.091294,0.016385,0.043394,0.057595,0.016385,0.032327,0.080486,0.071093,0.056025,0.082642,0.085518,0.026169,0.032327,0.029991,0.077608,0.051183,0.044301,0.023327,0.068899,0.023327,0.021784,0.043394,0.065203,0.042475,0.043394,0.024784,0.024784,...,0.023327,0.024784,0.020134,0.020134,0.031177,0.027493,0.018349,0.020134,0.033445,0.020134,0.016385,0.024784,0.016385,0.016385,0.016385,0.020134,0.020134,0.023327,0.034534,0.051183,0.051183,0.016385,0.036637,0.034534,0.016385,0.016385,0.024784,0.024784,0.020134,0.024784,0.018349,0.024784,0.024784,0.024784,0.021784,0.021784,0.024784,0.021784,0.024784,0.027493,0.016385,0.016385,0.016385,0.829978,0.829978,1.000000,0.829978,0.018349,0.018349,0.018349
inflammatory_nails,0.056025,0.285864,0.016385,0.028765,0.016385,0.065203,0.270052,0.032327,0.020134,0.112238,0.024784,0.018349,0.125978,0.021784,0.023327,0.021784,0.032327,0.048670,0.033445,0.048670,0.016385,0.023327,0.052006,0.091294,0.016385,0.043394,0.057595,0.016385,0.032327,0.080486,0.071093,0.056025,0.082642,0.085518,0.026169,0.032327,0.029991,0.077608,0.051183,0.044301,0.023327,0.068899,0.023327,0.021784,0.043394,0.065203,0.042475,0.043394,0.024784,0.024784,...,0.023327,0.024784,0.020134,0.020134,0.031177,0.027493,0.018349,0.020134,0.033445,0.020134,0.016385,0.024784,0.016385,0.016385,0.016385,0.020134,0.020134,0.023327,0.034534,0.051183,0.051183,0.016385,0.036637,0.034534,0.016385,0.016385,0.024784,0.024784,0.020134,0.024784,0.018349,0.024784,0.024784,0.024784,0.021784,0.021784,0.024784,0.021784,0.024784,0.027493,0.016385,0.016385,0.016385,0.829978,0.829978,0.829978,1.000000,0.018349,0.018349,0.018349
blister,0.051058,0.248029,0.014932,0.026215,0.014932,0.059422,0.054603,0.029461,0.018349,0.102287,0.022587,0.016722,0.114809,0.019853,0.021259,0.019853,0.029461,0.044355,0.030480,0.044355,0.014932,0.021259,0.047395,0.144152,0.014932,0.039547,0.052488,0.014932,0.029461,0.073350,0.064790,0.051058,0.075315,0.077936,0.023849,0.029461,0.027332,0.070728,0.046645,0.040373,0.021259,0.062791,0.021259,0.019853,0.039547,0.059422,0.038709,0.039547,0.022587,0.022587,...,0.021259,0.022587,0.018349,0.018349,0.028413,0.025056,0.016722,0.018349,0.030480,0.018349,0.014932,0.022587,0.014932,0.014932,0.014932,0.018349,0.018349,0.021259,0.031473,0.046645,0.046645,0.014932,0.033389,0.031473,0.014932,0.014932,0.022587,0.022587,0.018349,0.022587,0.016722,0.022587,0.022587,0.022587,0.019853,0.019853,0.022587,0.019853,0.022587,0.025056,0.014932,0.014932,0.014932,0.018349,0.018349,0.018349,0.018349,1.000000,0.796656,0.796656
red_sore_around_nose,0.051058,0.248029,0.014932,0.026215,0.014932,0.059422,0.054603,0.029461,0.018349,0.102287,0.022587,0.016722,0.114809,0.019853,0.021259,0.019853,0.029461,0.044355,0.030480,0.044355,0.014932,0.021259,0.047395,0.144152,0.014932,0.039547,0.052488,0.014932,0.029461,0.073350,0.064790,0.051058,0.075315,0.077936,0.023849,0.029461,0.027332,0.070728,0.046645,0.040373,0.021259,0.062791,0.021259,0.019853,0.039547,0.059422,0.038709,0.039547,0.022587,0.022587,...,0.021259,0.022587,0.018349,0.018349,0.028413,0.025056,0.016722,0.018349,0.030480,0.018349,0.014932,0.022587,0.014932,0.014932,0.014932,0.018349,0.018349,0.021259,0.031473,0.046645,0.046645,0.014932,0.033389,0.031473,0.014932,0.014932,0.022587,0.022587,0.018349,0.022587,0.016722,0.022587,0.022587,0.022587,0.019853,0.019853,0.022587,0.019853,0.022587,0.025056,0.014932,0.014932,0.014932,0.018349,0.018349,0.018349,0.018349,0.796656,1.000000,0.796656


In [23]:
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,acidity,ulcers_on_tongue,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,phlegm,throat_irritation,redness_of_eyes,...,drying_and_tingling_lips,slurred_speech,knee_pain,hip_joint_pain,stiff_neck,swelling_joints,movement_stiffness,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,loss_of_smell,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,abnormal_menstruation,dischromic _patches,watering_from_eyes,increased_appetite,polyuria,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,coma,stomach_bleeding,distention_of_abdomen,fluid_overload,blood_in_sputum,prominent_veins_on_calf,palpitations,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
itching,,0.260536,0.207941,0.080041,0.045591,0.181433,0.166718,0.089952,0.056025,0.055014,0.158300,0.251803,0.076183,0.060616,0.064910,0.060616,0.089952,0.084229,0.093063,0.335264,0.045591,0.064910,0.144710,0.063421,0.045591,0.120748,0.160261,0.045591,0.089952,0.044437,0.283090,0.238802,0.097096,0.219474,0.072818,0.089952,0.083452,0.241534,0.142420,0.147545,0.356215,0.151406,0.064910,0.060616,0.154729,0.250065,0.118190,0.120748,0.068964,0.068964,...,0.064910,0.068964,0.056025,0.056025,0.086752,0.076503,0.051058,0.056025,0.093063,0.056025,0.045591,0.068964,0.045591,0.045591,0.045591,0.056025,0.056025,0.064910,0.096094,0.142420,0.142420,0.045591,0.256107,0.096094,0.207941,0.045591,0.068964,0.068964,0.056025,0.068964,0.051058,0.068964,0.068964,0.068964,0.060616,0.060616,0.068964,0.060616,0.068964,0.076503,0.045591,0.045591,0.045591,0.056025,0.056025,0.056025,0.056025,0.051058,0.051058,0.051058
skin_rash,,,0.204742,0.081166,0.046232,0.017026,0.176761,0.091216,0.056812,0.159960,0.155037,0.248029,0.048255,0.061467,0.065822,0.061467,0.091216,0.137330,0.094371,0.111172,0.046232,0.065822,0.146743,0.161398,0.046232,0.122444,0.162513,0.046232,0.091216,0.150526,0.200602,0.158084,0.035908,0.125262,0.407183,0.304922,0.084625,0.218985,0.144422,0.143080,0.065822,0.194411,0.065822,0.061467,0.150252,0.243160,0.119851,0.122444,0.069933,0.069933,...,0.065822,0.069933,0.056812,0.056812,0.087971,0.077578,0.051775,0.056812,0.094371,0.056812,0.046232,0.069933,0.046232,0.046232,0.046232,0.056812,0.056812,0.065822,0.097445,0.144422,0.094513,0.046232,0.566118,0.097445,0.204742,0.046232,0.069933,0.069933,0.056812,0.069933,0.051775,0.069933,0.069933,0.069933,0.061467,0.061467,0.069933,0.061467,0.069933,0.077578,0.204742,0.204742,0.204742,0.285864,0.285864,0.285864,0.285864,0.248029,0.248029,0.248029
nodal_skin_eruptions,,,,0.023408,0.013333,0.053060,0.048757,0.026307,0.016385,0.091336,0.020169,0.014932,0.102517,0.017727,0.018983,0.017727,0.026307,0.039606,0.027217,0.039606,0.013333,0.018983,0.042321,0.074293,0.013333,0.035313,0.046869,0.013333,0.026307,0.065497,0.057854,0.045591,0.067251,0.069592,0.021296,0.026307,0.024406,0.063155,0.041651,0.036051,0.018983,0.056068,0.018983,0.017727,0.035313,0.053060,0.034565,0.035313,0.020169,0.020169,...,0.018983,0.020169,0.016385,0.016385,0.025371,0.022373,0.014932,0.016385,0.027217,0.016385,0.013333,0.020169,0.013333,0.013333,0.013333,0.016385,0.016385,0.018983,0.028103,0.041651,0.041651,0.013333,0.029814,0.028103,0.746667,0.013333,0.020169,0.020169,0.016385,0.020169,0.014932,0.020169,0.020169,0.020169,0.017727,0.017727,0.020169,0.017727,0.020169,0.022373,0.013333,0.013333,0.013333,0.016385,0.016385,0.016385,0.016385,0.014932,0.014932,0.014932
continuous_sneezing,,,,,0.421348,0.352109,0.085599,0.046184,0.028765,0.160351,0.035409,0.026215,0.058203,0.031122,0.033327,0.031122,0.046184,0.069533,0.047782,0.069533,0.023408,0.033327,0.291692,0.129452,0.023408,0.061996,0.082284,0.023408,0.046184,0.160568,0.101569,0.080041,0.118068,0.122177,0.037387,0.046184,0.042847,0.110877,0.073124,0.063291,0.033327,0.098434,0.033327,0.031122,0.360848,0.218530,0.060683,0.421255,0.761942,0.761942,...,0.033327,0.035409,0.028765,0.028765,0.044541,0.039279,0.026215,0.028765,0.047782,0.028765,0.023408,0.761942,0.023408,0.023408,0.023408,0.028765,0.028765,0.033327,0.049338,0.073124,0.350296,0.023408,0.052342,0.049338,0.023408,0.421348,0.035409,0.035409,0.028765,0.035409,0.026215,0.035409,0.035409,0.035409,0.031122,0.031122,0.035409,0.031122,0.035409,0.039279,0.023408,0.023408,0.023408,0.028765,0.028765,0.028765,0.028765,0.026215,0.026215,0.026215
shivering,,,,,,0.175199,0.048757,0.026307,0.016385,0.091336,0.020169,0.014932,0.102517,0.017727,0.018983,0.017727,0.026307,0.039606,0.027217,0.039606,0.013333,0.018983,0.042321,0.074293,0.013333,0.035313,0.046869,0.013333,0.026307,0.065497,0.057854,0.045591,0.067251,0.069592,0.021296,0.026307,0.024406,0.063155,0.041651,0.036051,0.018983,0.056068,0.018983,0.017727,0.035313,0.053060,0.034565,0.035313,0.020169,0.020169,...,0.018983,0.020169,0.016385,0.016385,0.025371,0.022373,0.014932,0.016385,0.027217,0.016385,0.013333,0.020169,0.013333,0.013333,0.013333,0.016385,0.016385,0.018983,0.028103,0.041651,0.041651,0.013333,0.029814,0.028103,0.013333,0.746667,0.020169,0.020169,0.016385,0.020169,0.014932,0.020169,0.020169,0.020169,0.017727,0.017727,0.020169,0.017727,0.020169,0.022373,0.013333,0.013333,0.013333,0.016385,0.016385,0.016385,0.016385,0.014932,0.014932,0.014932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
small_dents_in_nails,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.829978,0.018349,0.018349,0.018349
inflammatory_nails,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.018349,0.018349,0.018349
blister,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.796656,0.796656
red_sore_around_nose,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.796656


In [24]:
to_drop = [col for col in upper.columns if any(upper[col] > 0.9)]
to_drop

['redness_of_eyes',
 'sinus_pressure',
 'runny_nose',
 'congestion',
 'enlarged_thyroid',
 'brittle_nails',
 'swollen_extremeties',
 'slurred_speech',
 'loss_of_smell',
 'abnormal_menstruation',
 'increased_appetite',
 'polyuria',
 'coma',
 'stomach_bleeding',
 'palpitations']

In [25]:
x_uncorr = x_var.drop(columns=to_drop)
x_uncorr

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,acidity,ulcers_on_tongue,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,phlegm,throat_irritation,chest_pain,...,dizziness,cramps,bruising,obesity,swollen_legs,swollen_blood_vessels,puffy_face_and_eyes,excessive_hunger,drying_and_tingling_lips,knee_pain,hip_joint_pain,stiff_neck,swelling_joints,movement_stiffness,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,dischromic _patches,watering_from_eyes,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,distention_of_abdomen,fluid_overload,blood_in_sputum,prominent_veins_on_calf,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
300,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
301,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
302,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [26]:
print(f"Removed {len(to_drop)} highly correlated features")

Removed 15 highly correlated features


In [27]:
selector = SelectKBest(score_func=chi2, k=40)
x_best = selector.fit_transform(x_uncorr, y)
best_cols = x_uncorr.columns[selector.get_support()]
x = pd.DataFrame(x_best, columns=best_cols)
x

Unnamed: 0,ulcers_on_tongue,weight_gain,anxiety,cold_hands_and_feets,irregular_sugar_level,pain_behind_the_eyes,back_pain,mild_fever,yellow_urine,acute_liver_failure,swelling_of_stomach,swelled_lymph_nodes,phlegm,throat_irritation,fast_heart_rate,cramps,bruising,swollen_legs,swollen_blood_vessels,puffy_face_and_eyes,drying_and_tingling_lips,knee_pain,hip_joint_pain,spinning_movements,unsteadiness,toxic_look_(typhos),depression,irritability,muscle_pain,red_spots_over_body,mucoid_sputum,rusty_sputum,visual_disturbances,distention_of_abdomen,fluid_overload,blood_in_sputum,prominent_veins_on_calf,silver_like_dusting,small_dents_in_nails,inflammatory_nails
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
300,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
302,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [48]:
best_cols

Index(['ulcers_on_tongue', 'weight_gain', 'anxiety', 'cold_hands_and_feets',
       'irregular_sugar_level', 'pain_behind_the_eyes', 'back_pain',
       'mild_fever', 'yellow_urine', 'acute_liver_failure',
       'swelling_of_stomach', 'swelled_lymph_nodes', 'phlegm',
       'throat_irritation', 'fast_heart_rate', 'cramps', 'bruising',
       'swollen_legs', 'swollen_blood_vessels', 'puffy_face_and_eyes',
       'drying_and_tingling_lips', 'knee_pain', 'hip_joint_pain',
       'spinning_movements', 'unsteadiness', 'toxic_look_(typhos)',
       'depression', 'irritability', 'muscle_pain', 'red_spots_over_body',
       'mucoid_sputum', 'rusty_sputum', 'visual_disturbances',
       'distention_of_abdomen', 'fluid_overload', 'blood_in_sputum',
       'prominent_veins_on_calf', 'silver_like_dusting',
       'small_dents_in_nails', 'inflammatory_nails'],
      dtype='object')

In [47]:
joblib.dump(list(x.columns), "../models/features.joblib")

['../models/features.joblib']

In [42]:
x = x[best_cols]
x["prognosis"] = y.values
x.shape

(304, 41)

In [44]:
x.to_csv("../data/processed/clean_training.csv", index=False)

In [30]:
df_test = pd.read_csv("../data/raw/Testing.csv")

In [31]:
df_test

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,muscle_wasting,vomiting,burning_micturition,spotting_ urination,fatigue,weight_gain,anxiety,cold_hands_and_feets,mood_swings,weight_loss,restlessness,lethargy,patches_in_throat,irregular_sugar_level,cough,high_fever,sunken_eyes,breathlessness,sweating,dehydration,indigestion,headache,yellowish_skin,dark_urine,nausea,loss_of_appetite,pain_behind_the_eyes,back_pain,constipation,abdominal_pain,diarrhoea,mild_fever,yellow_urine,yellowing_of_eyes,acute_liver_failure,fluid_overload,swelling_of_stomach,swelled_lymph_nodes,malaise,blurred_and_distorted_vision,...,movement_stiffness,spinning_movements,loss_of_balance,unsteadiness,weakness_of_one_body_side,loss_of_smell,bladder_discomfort,foul_smell_of urine,continuous_feel_of_urine,passage_of_gases,internal_itching,toxic_look_(typhos),depression,irritability,muscle_pain,altered_sensorium,red_spots_over_body,belly_pain,abnormal_menstruation,dischromic _patches,watering_from_eyes,increased_appetite,polyuria,family_history,mucoid_sputum,rusty_sputum,lack_of_concentration,visual_disturbances,receiving_blood_transfusion,receiving_unsterile_injections,coma,stomach_bleeding,distention_of_abdomen,history_of_alcohol_consumption,fluid_overload.1,blood_in_sputum,prominent_veins_on_calf,palpitations,painful_walking,pus_filled_pimples,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Drug Reaction
5,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Peptic ulcer diseae
6,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,AIDS
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Diabetes
8,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Gastroenteritis
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Bronchial Asthma


In [32]:
df_test.isnull().sum()

itching                 0
skin_rash               0
nodal_skin_eruptions    0
continuous_sneezing     0
shivering               0
                       ..
inflammatory_nails      0
blister                 0
red_sore_around_nose    0
yellow_crust_ooze       0
prognosis               0
Length: 133, dtype: int64

In [33]:
test_cols = list(best_cols)
test_cols.append("prognosis")
df_test = df_test[test_cols]

In [38]:
df_test["prognosis"]

0                            Fungal infection
1                                     Allergy
2                                        GERD
3                         Chronic cholestasis
4                               Drug Reaction
5                         Peptic ulcer diseae
6                                        AIDS
7                                   Diabetes 
8                             Gastroenteritis
9                            Bronchial Asthma
10                              Hypertension 
11                                   Migraine
12                       Cervical spondylosis
13               Paralysis (brain hemorrhage)
14                                   Jaundice
15                                    Malaria
16                                Chicken pox
17                                     Dengue
18                                    Typhoid
19                                hepatitis A
20                                Hepatitis B
21                                

In [34]:
df_test.shape

(42, 41)

In [36]:
df_test.to_csv("../data/processed/clean_testing.csv", index=False)