In [1]:
# import library
import pandas as pd
import numpy as np
import scipy.stats as stats

# Q.1

In [2]:
# load the dataset
df = pd.read_csv('diabetic_data.csv')

# include only time_in_hospital ,insulin and num_lab_procedures column
df = df[[ 'time_in_hospital', 'insulin' , 'num_lab_procedures']]

# Q.2

In [3]:
df.isna().any() # check for any missing values

time_in_hospital      False
insulin               False
num_lab_procedures    False
dtype: bool

In [4]:
# compute Z score
df['time_in_hospital_normalized'] = stats.zscore(df['time_in_hospital'])

In [5]:
df # display dataframe

Unnamed: 0,time_in_hospital,insulin,num_lab_procedures,time_in_hospital_normalized
0,1,No,41,-1.137649
1,3,Up,59,-0.467653
2,2,No,11,-0.802651
3,2,Up,44,-0.802651
4,1,Steady,51,-1.137649
...,...,...,...,...
101761,3,Down,51,-0.467653
101762,5,Steady,33,0.202343
101763,1,Down,53,-1.137649
101764,10,Up,45,1.877333


# Q.3

In [6]:
df['insulin'].unique() # get the uniques values of insulin

array(['No', 'Up', 'Steady', 'Down'], dtype=object)

In [7]:
# library for one hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# [1] represent column number which you want to encode
transform = ColumnTransformer([('encoder', OneHotEncoder(), [1])], remainder='passthrough')
df = np.array(transform.fit_transform(df), dtype = np.float)

In [8]:
df

array([[ 0.        ,  1.        ,  0.        , ...,  1.        ,
        41.        , -1.13764856],
       [ 0.        ,  0.        ,  0.        , ...,  3.        ,
        59.        , -0.46765271],
       [ 0.        ,  1.        ,  0.        , ...,  2.        ,
        11.        , -0.80265063],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
        53.        , -1.13764856],
       [ 0.        ,  0.        ,  0.        , ..., 10.        ,
        45.        ,  1.87733278],
       [ 0.        ,  1.        ,  0.        , ...,  6.        ,
        13.        ,  0.53734107]])

In [9]:
# get column name
column = [i.split('__')[1] if '__' in i else i for i in transform.get_feature_names()]
column

['x0_Down',
 'x0_No',
 'x0_Steady',
 'x0_Up',
 'time_in_hospital',
 'num_lab_procedures',
 'time_in_hospital_normalized']

In [10]:
df = pd.DataFrame(df,columns=column)
df # dataframe with new column added

Unnamed: 0,x0_Down,x0_No,x0_Steady,x0_Up,time_in_hospital,num_lab_procedures,time_in_hospital_normalized
0,0.0,1.0,0.0,0.0,1.0,41.0,-1.137649
1,0.0,0.0,0.0,1.0,3.0,59.0,-0.467653
2,0.0,1.0,0.0,0.0,2.0,11.0,-0.802651
3,0.0,0.0,0.0,1.0,2.0,44.0,-0.802651
4,0.0,0.0,1.0,0.0,1.0,51.0,-1.137649
...,...,...,...,...,...,...,...
101761,1.0,0.0,0.0,0.0,3.0,51.0,-0.467653
101762,0.0,0.0,1.0,0.0,5.0,33.0,0.202343
101763,1.0,0.0,0.0,0.0,1.0,53.0,-1.137649
101764,0.0,0.0,0.0,1.0,10.0,45.0,1.877333


# Q.4

In [11]:
# Create 5 bins for column num_lab_procedures
# size of bins is automatically decided by pandas library
df['num_lab_procedures_BIN'] = pd.qcut(df['num_lab_procedures'],q=5)

In [12]:
df

Unnamed: 0,x0_Down,x0_No,x0_Steady,x0_Up,time_in_hospital,num_lab_procedures,time_in_hospital_normalized,num_lab_procedures_BIN
0,0.0,1.0,0.0,0.0,1.0,41.0,-1.137649,"(40.0, 49.0]"
1,0.0,0.0,0.0,1.0,3.0,59.0,-0.467653,"(49.0, 60.0]"
2,0.0,1.0,0.0,0.0,2.0,11.0,-0.802651,"(0.999, 27.0]"
3,0.0,0.0,0.0,1.0,2.0,44.0,-0.802651,"(40.0, 49.0]"
4,0.0,0.0,1.0,0.0,1.0,51.0,-1.137649,"(49.0, 60.0]"
...,...,...,...,...,...,...,...,...
101761,1.0,0.0,0.0,0.0,3.0,51.0,-0.467653,"(49.0, 60.0]"
101762,0.0,0.0,1.0,0.0,5.0,33.0,0.202343,"(27.0, 40.0]"
101763,1.0,0.0,0.0,0.0,1.0,53.0,-1.137649,"(49.0, 60.0]"
101764,0.0,0.0,0.0,1.0,10.0,45.0,1.877333,"(40.0, 49.0]"
