In [9]:
import pandas as pd
import numpy as np

In [10]:
df = pd.read_csv("drug200_v2.csv")
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [11]:
df[df["BP"].isna()]

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
14,50,F,,HIGH,12.703,drugX
27,49,F,,NORMAL,9.381,drugX
42,50,M,,NORMAL,15.79,DrugY


In [12]:
#fill empty rows in BP column with the most frequently appearing value from records within the same age range(49-51)
female_mode = df[(df["Age"] >= 49) & (df["Age"] <=51)& (df["Sex"] =="F")]["BP"].mode().iat[0]
male_mode = df[(df["Age"] >= 49) & (df["Age"] <=51)& (df["Sex"] =="M")]["BP"].mode().iat[0]

df.loc[(df["Sex"]=="M") & (df["BP"].isna())] = df.loc[(df["Sex"]=="M") & (df["BP"].isna())].fillna(male_mode)
df.loc[(df["Sex"]=="F") & (df["BP"].isna())] = df.loc[(df["Sex"]=="F") & (df["BP"].isna())].fillna(female_mode)

In [14]:
df[df["BP"].isna()]

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug


In [15]:
stdev = df["Na_to_K"].std()
mean = df["Na_to_K"].mean()
df["Na_to_K_z_normalised"] = df["Na_to_K"].apply(lambda x: (x-mean)/stdev)
print("Mean of normalised z-score of Na_to_K:", df["Na_to_K_z_normalised"].mean())
print("Variance of normalised z-score of Na_to_K:", df["Na_to_K_z_normalised"].var())

Mean of normalised z-score of Na_to_K: -8.43769498715119e-17
Variance of normalised z-score of Na_to_K: 1.0


In [16]:
print(df["Na_to_K_z_normalised"].min())
print(df["Na_to_K_z_normalised"].max())

-1.358741061089162
3.067919633874889


In [17]:
bins = [-1.5, -1, 0, 1, 2, 3.1]
labels = [1,2,3,4]
df["bin"] = pd.qcut(df["Na_to_K_z_normalised"], 4)
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Na_to_K_z_normalised,bin
0,23,F,HIGH,HIGH,25.355,DrugY,1.283302,"(0.456, 3.068]"
1,47,M,LOW,HIGH,13.093,drugC,-0.414106,"(-0.781, -0.297]"
2,47,M,LOW,HIGH,10.114,drugC,-0.826484,"(-1.3599999999999999, -0.781]"
3,28,F,NORMAL,HIGH,7.798,drugX,-1.147084,"(-1.3599999999999999, -0.781]"
4,61,F,LOW,HIGH,18.043,DrugY,0.271114,"(-0.297, 0.456]"
...,...,...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC,-0.625348,"(-0.781, -0.297]"
196,16,M,LOW,HIGH,12.006,drugC,-0.564578,"(-0.781, -0.297]"
197,52,M,NORMAL,HIGH,9.894,drugX,-0.856938,"(-1.3599999999999999, -0.781]"
198,23,M,NORMAL,NORMAL,14.020,drugX,-0.285783,"(-0.297, 0.456]"


In [18]:
df["bin"].value_counts()

bin
(-1.3599999999999999, -0.781]    50
(-0.781, -0.297]                 50
(-0.297, 0.456]                  50
(0.456, 3.068]                   50
Name: count, dtype: int64

In [19]:
chol_ohe = pd.get_dummies(df["Cholesterol"])


Unnamed: 0,HIGH,NORMAL
0,True,False
1,True,False
2,True,False
3,True,False
4,True,False
...,...,...
195,True,False
196,True,False
197,True,False
198,False,True


In [26]:
#Task1 qn4
# We create an age range feature. This reduces the number of unique values and makes it easier for classification models to learn
age_bins = [15,25,35,45,55,65,74]
df["age_range"] = pd.cut(df["Age"], bins=age_bins)
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug,Na_to_K_z_normalised,bin,age_range
0,23,F,HIGH,HIGH,25.355,DrugY,1.283302,"(0.456, 3.068]","(15, 25]"
1,47,M,LOW,HIGH,13.093,drugC,-0.414106,"(-0.781, -0.297]","(45, 55]"
2,47,M,LOW,HIGH,10.114,drugC,-0.826484,"(-1.3599999999999999, -0.781]","(45, 55]"
3,28,F,NORMAL,HIGH,7.798,drugX,-1.147084,"(-1.3599999999999999, -0.781]","(25, 35]"
4,61,F,LOW,HIGH,18.043,DrugY,0.271114,"(-0.297, 0.456]","(55, 65]"
...,...,...,...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC,-0.625348,"(-0.781, -0.297]","(55, 65]"
196,16,M,LOW,HIGH,12.006,drugC,-0.564578,"(-0.781, -0.297]","(15, 25]"
197,52,M,NORMAL,HIGH,9.894,drugX,-0.856938,"(-1.3599999999999999, -0.781]","(45, 55]"
198,23,M,NORMAL,NORMAL,14.020,drugX,-0.285783,"(-0.297, 0.456]","(15, 25]"
