In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load the dataset from '../diabetes.csv' (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv('../diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Build a reproducible 80/20 train/eval split by hand: draw one uniform number
# per row with a fixed seed and cut at the empirical 80th percentile of the draws.
np_random = np.random.RandomState(12345)
rand_unifs = np_random.uniform(low=0, high=1, size=len(df))
division_thresh = np.percentile(rand_unifs, q=80)
# Rows whose draw falls below the threshold go to training; every other row
# goes to evaluation, so the two masks are exact complements.
train_indicator = np.less(rand_unifs, division_thresh)
eval_indicator = np.logical_not(train_indicator)

In [6]:
# Training subset: keep the rows flagged by train_indicator and renumber from 0.
train_df = df.loc[train_indicator].reset_index(drop=True)
# Every column except the 'Outcome' label is a feature.
train_features = train_df.drop(columns='Outcome').values
train_labels = train_df['Outcome'].values
train_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [7]:
# Evaluation subset: keep the rows flagged by eval_indicator and renumber from 0.
eval_df = df.loc[eval_indicator].reset_index(drop=True)
# Every column except the 'Outcome' label is a feature.
eval_features = eval_df.drop(columns='Outcome').values
eval_labels = eval_df['Outcome'].values
eval_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,3,78,50,32,88,31.0,0.248,26,1
2,10,168,74,0,0,38.0,0.537,34,1
3,0,118,84,47,230,45.8,0.551,31,1
4,7,107,74,0,0,29.6,0.254,31,1


In [8]:
# Display the shapes of the four split arrays (features and labels for train and eval).
train_features.shape, train_labels.shape, eval_features.shape, eval_labels.shape

((614, 8), (614,), (154, 8), (154,))

In [9]:
# Work on deep copies so train_df / eval_df keep their original zeros.
train_df_with_nans = train_df.copy(deep=True)
eval_df_with_nans = eval_df.copy(deep=True)
# In these columns a value of 0 is treated as "missing" and replaced by NaN.
for col_with_nans in ('BloodPressure', 'SkinThickness', 'BMI', 'Age'):
    for frame_with_nans in (train_df_with_nans, eval_df_with_nans):
        frame_with_nans[col_with_nans] = frame_with_nans[col_with_nans].replace(0, np.nan)
# Feature matrices (all columns but the 'Outcome' label), now containing NaNs.
train_features_with_nans = train_df_with_nans.drop(columns='Outcome').values
eval_features_with_nans = eval_df_with_nans.drop(columns='Outcome').values

In [10]:
print('Here are the training rows with at least one missing values.')
print('')
print('You can see that such incomplete data points constitute a substantial part of the data.')
print('')
# Row mask: True where any column of the training frame is NaN.
has_missing_value = train_df_with_nans.isna().any(axis='columns')
nan_training_data = train_df_with_nans.loc[has_missing_value]
# Only the size of the incomplete subset is displayed.
nan_training_data.shape

Here are the training rows with at least one missing values.

You can see that such incomplete data points constitute a substantial part of the data.



(186, 9)

In [11]:
def log_prior(train_labels):
    """Estimate the log class priors from a vector of binary labels.

    Args:
        train_labels: 1-D numpy array of labels. The fraction
            ``sum(labels) / len(labels)`` is used as p(y=1), so the entries
            are expected to be 0/1.

    Returns:
        A (2, 1) numpy array ``[[log p(y=0)], [log p(y=1)]]``.
    """
    n_samples = len(train_labels)
    p_positive = train_labels.sum() / n_samples
    p_negative = 1 - p_positive

    # Row 0 is the negative class, row 1 the positive class.
    log_py = np.array([np.log(p_negative), np.log(p_positive)]).reshape(2, 1)
    assert log_py.shape == (2,1)

    return log_py

# Sanity check for log_prior: compare the result, rounded to 3 decimals,
# with the expected log priors for this label vector.
some_labels = np.array([0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1])
some_log_py = log_prior(some_labels)
print(some_log_py)
assert np.array_equal(some_log_py.round(3), np.array([[-0.916], [-0.511]]))

[[-0.91629073]
 [-0.51082562]]


In [12]:
def cc_mean_ignore_missing2(train_features, train_labels):
    """Class-conditional feature means computed by boolean row selection.

    Args:
        train_features: (N, d) numpy array of feature values.
        train_labels: length-N numpy array of 0/1 class labels.

    Returns:
        A (d, 2) numpy array; column k holds the per-feature mean over the
        rows whose label equals k.
    """
    N, d = train_features.shape

    # One mean vector of length d per class, in label order 0 then 1.
    per_class_means = [train_features[train_labels == label, :].mean(axis=0)
                       for label in (0, 1)]
    mu_y = np.column_stack(per_class_means)

    assert mu_y.shape == (d, 2)
    return mu_y

def cc_mean_ignore_missing(train_features, train_labels):
    """Class-conditional feature means computed with one matrix product.

    Args:
        train_features: (N, d) numpy array of feature values.
        train_labels: length-N numpy array of 0/1 class labels.

    Returns:
        A (d, 2) numpy array; column k holds the per-feature mean over the
        rows whose label equals k.
    """
    N, d = train_features.shape
    # One-hot class membership, shape (N, 2): column k is 1 where label == k.
    # The builtin int is used because the np.int alias was deprecated in
    # NumPy 1.20 and removed in 1.24 (AttributeError on current releases).
    class_membership = (train_labels[:, np.newaxis] == np.array([0, 1])).astype(int)
    # (d, N) @ (N, 2) gives per-class feature sums; divide by per-class counts.
    class_counts = class_membership.sum(axis=0)
    mu_y = (train_features.T @ class_membership) / class_counts

    assert mu_y.shape == (d, 2)
    return mu_y

# Performing sanity checks on your implementation

some_feats = np.array([[  1. ,  85. ,  66. ,  29. ,   0. ,  26.6,   0.4,  31. ],
                       [  8. , 183. ,  64. ,   0. ,   0. ,  23.3,   0.7,  32. ],
                       [  1. ,  89. ,  66. ,  23. ,  94. ,  28.1,   0.2,  21. ],
                       [  0. , 137. ,  40. ,  35. , 168. ,  43.1,   2.3,  33. ],
                       [  5. , 116. ,  74. ,   0. ,   0. ,  25.6,   0.2,  30. ]])
some_labels = np.array([0, 1, 0, 1, 0])

# Expected class-conditional means, rounded to 2 decimals (column 0: label 0, column 1: label 1).
expected_mu_y = np.array([[  2.33,   4.  ],
                          [ 96.67, 160.  ],
                          [ 68.67,  52.  ],
                          [ 17.33,  17.5 ],
                          [ 31.33,  84.  ],
                          [ 26.77,  33.2 ],
                          [  0.27,   1.5 ],
                          [ 27.33,  32.5 ]])

# Check both implementations against the same table: the matrix-product version
# is the one the log_prob sanity check calls, so it must be covered here too.
# The row-selection version runs last so some_mu_y and the printed array come from it.
for mean_fn in (cc_mean_ignore_missing, cc_mean_ignore_missing2):
    some_mu_y = mean_fn(some_feats, some_labels)
    assert np.array_equal(some_mu_y.round(2), expected_mu_y), mean_fn.__name__
print(some_mu_y)

[[  2.33333333   4.        ]
 [ 96.66666667 160.        ]
 [ 68.66666667  52.        ]
 [ 17.33333333  17.5       ]
 [ 31.33333333  84.        ]
 [ 26.76666667  33.2       ]
 [  0.26666667   1.5       ]
 [ 27.33333333  32.5       ]]


In [13]:
def cc_std_ignore_missing(train_features, train_labels):
    """Class-conditional feature standard deviations.

    Args:
        train_features: (N, d) numpy array of feature values.
        train_labels: length-N numpy array of 0/1 class labels.

    Returns:
        A (d, 2) numpy array; column k holds the per-feature standard
        deviation (numpy's default, i.e. ddof=0) over the rows labelled k.
    """
    N, d = train_features.shape

    # One standard-deviation vector of length d per class, label 0 first.
    per_class_stds = [train_features[train_labels == label, :].std(axis=0)
                      for label in (0, 1)]
    sigma_y = np.column_stack(per_class_stds)

    assert sigma_y.shape == (d, 2)

    return sigma_y

# Performing sanity checks on your implementation
# Five sample rows with labels [0, 1, 0, 1, 0]; the result is compared, rounded
# to 3 decimals, with the expected per-class standard deviations.

some_feats = np.array([[  1. ,  85. ,  66. ,  29. ,   0. ,  26.6,   0.4,  31. ],
                       [  8. , 183. ,  64. ,   0. ,   0. ,  23.3,   0.7,  32. ],
                       [  1. ,  89. ,  66. ,  23. ,  94. ,  28.1,   0.2,  21. ],
                       [  0. , 137. ,  40. ,  35. , 168. ,  43.1,   2.3,  33. ],
                       [  5. , 116. ,  74. ,   0. ,   0. ,  25.6,   0.2,  30. ]])
some_labels = np.array([0, 1, 0, 1, 0])

some_std_y = cc_std_ignore_missing(some_feats, some_labels)
print(some_std_y)
assert np.array_equal(some_std_y.round(3), np.array([[ 1.886,  4.   ],
                                                     [13.768, 23.   ],
                                                     [ 3.771, 12.   ],
                                                     [12.499, 17.5  ],
                                                     [44.312, 84.   ],
                                                     [ 1.027,  9.9  ],
                                                     [ 0.094,  0.8  ],
                                                     [ 4.497,  0.5  ]]))

[[ 1.88561808  4.        ]
 [13.76791762 23.        ]
 [ 3.77123617 12.        ]
 [12.49888884 17.5       ]
 [44.31202495 84.        ]
 [ 1.02740233  9.9       ]
 [ 0.0942809   0.8       ]
 [ 4.49691252  0.5       ]]


In [14]:
def log_prob(train_features, mu_y, sigma_y, log_py):
    """Joint log-probability log p(x, y) under a Gaussian Naive Bayes model.

    For each sample and each class, the per-feature log densities of
    independent normal distributions are summed and the class log prior added.

    Args:
        train_features: (N, d) numpy array of feature values.
        mu_y: (d, 2) numpy array of per-class feature means.
        sigma_y: (d, 2) numpy array of per-class feature standard deviations.
        log_py: numpy array with two entries (log prior of class 0, then of
            class 1); it is reshaped to (1, 2) internally.

    Returns:
        An (N, 2) numpy array whose entry [i, k] is log p(x_i, y=k).
    """
    N, d = train_features.shape

    # Flatten the parameters class by class: d values for class 0, then d for class 1.
    flat_means = mu_y.ravel(order='F')
    flat_stds = sigma_y.ravel(order='F')
    # Two side-by-side copies of the features, one aligned with each class.
    doubled_features = np.tile(train_features, 2)

    # Per-feature normal log density: log(1 / (sigma * sqrt(2*pi))) - z^2 / 2.
    squared_z = ((doubled_features - flat_means) ** 2) / (flat_stds ** 2)
    log_normalizer = np.log(1 / (flat_stds * np.sqrt(2 * np.pi)))
    per_feature_log_density = log_normalizer - squared_z / 2

    # Regroup to (N, 2, d), sum over the d features, then add the class log prior.
    class_log_likelihood = per_feature_log_density.reshape(-1, 2, d).sum(axis=2)
    log_p_x_y = class_log_likelihood + log_py.reshape(1, -1)

    assert log_p_x_y.shape == (N,2)
    return log_p_x_y

# Sanity check for log_prob: fit means, standard deviations and log priors on
# five sample rows, then compare the joint log-probabilities (rounded to 3
# decimals) with the expected table (column 0: class 0, column 1: class 1).
some_feats = np.array([[  1. ,  85. ,  66. ,  29. ,   0. ,  26.6,   0.4,  31. ],
                       [  8. , 183. ,  64. ,   0. ,   0. ,  23.3,   0.7,  32. ],
                       [  1. ,  89. ,  66. ,  23. ,  94. ,  28.1,   0.2,  21. ],
                       [  0. , 137. ,  40. ,  35. , 168. ,  43.1,   2.3,  33. ],
                       [  5. , 116. ,  74. ,   0. ,   0. ,  25.6,   0.2,  30. ]])
some_labels = np.array([0, 1, 0, 1, 0])

# Parameters estimated from the same five rows.
some_mu_y = cc_mean_ignore_missing(some_feats, some_labels)
some_std_y = cc_std_ignore_missing(some_feats, some_labels)
some_log_py = log_prior(some_labels)

some_log_p_x_y = log_prob(some_feats, some_mu_y, some_std_y, some_log_py)
assert np.array_equal(some_log_p_x_y.round(3), np.array([[ -20.822,  -36.606],
                                                         [ -60.879,  -27.944],
                                                         [ -21.774, -295.68 ],
                                                         [-417.359,  -27.944],
                                                         [ -23.2  ,  -42.6  ]]))

In [15]:
## Calculate only for p(y=0) -> logpy_0
# Step-by-step recomputation of the class-0 column of log_prob for the five
# sample rows. NOTE: this cell reads some_mu_y, some_std_y and some_log_py left
# in the kernel by the log_prob sanity-check cell, so that cell must run first.
x1 = np.array([[  1. ,  85. ,  66. ,  29. ,   0. ,  26.6,   0.4,  31. ],
                 [  8. , 183. ,  64. ,   0. ,   0. ,  23.3,   0.7,  32. ],
                 [  1. ,  89. ,  66. ,  23. ,  94. ,  28.1,   0.2,  21. ],
                 [  0. , 137. ,  40. ,  35. , 168. ,  43.1,   2.3,  33. ],
                 [  5. , 116. ,  74. ,   0. ,   0. ,  25.6,   0.2,  30. ]])
# Class-0 parameters: first column of the mean/std tables, first log prior.
mu_0 = some_mu_y[:, 0]
st_0 = some_std_y[:, 0]
logpy_0 = some_log_py[0, 0]
N = x1.shape[0]
# Squared standardized distance of every feature from the class-0 mean.
a = ((x1 - mu_0)**2) / (st_0**2)
# Log of the normal normalizing constant 1 / (sigma * sqrt(2*pi)).
b = np.log(1 / ( st_0 * (np.sqrt(2*np.pi)) ))
# Per-feature log density; summing over features and adding the log prior
# gives log p(x, y=0) for each row.
c = b - (a / 2)
print(c.sum(axis=1) + logpy_0)

[ -20.8224128   -60.87893018  -21.77385572 -417.35938061  -23.20026355]


In [16]:
def cc_mean_consider_missing(train_features_with_nans, train_labels):
    """Class-conditional feature means that skip NaN entries.

    Args:
        train_features_with_nans: (N, d) numpy array of feature values in
            which missing entries are NaN.
        train_labels: length-N numpy array of 0/1 class labels.

    Returns:
        A (d, 2) numpy array; column k holds, per feature, the mean of the
        non-NaN values among the rows labelled k. An assertion fails if any
        resulting mean is NaN.
    """
    N, d = train_features_with_nans.shape

    # np.nanmean ignores NaNs feature by feature; classes go in label order 0, 1.
    per_class_means = [np.nanmean(train_features_with_nans[train_labels == label, :], axis=0)
                       for label in (0, 1)]
    mu_y = np.column_stack(per_class_means)

    assert not np.isnan(mu_y).any()
    assert mu_y.shape == (d, 2)
    return mu_y

# Sanity check for cc_mean_consider_missing: same five sample rows, with one
# entry per row overwritten by NaN before the means are computed.
some_feats = np.array([[  1. ,  85. ,  66. ,  29. ,   0. ,  26.6,   0.4,  31. ],
                       [  8. , 183. ,  64. ,   0. ,   0. ,  23.3,   0.7,  32. ],
                       [  1. ,  89. ,  66. ,  23. ,  94. ,  28.1,   0.2,  21. ],
                       [  0. , 137. ,  40. ,  35. , 168. ,  43.1,   2.3,  33. ],
                       [  5. , 116. ,  74. ,   0. ,   0. ,  25.6,   0.2,  30. ]])
some_labels = np.array([0, 1, 0, 1, 0])

# (row, column) positions that are marked as missing.
for i,j in [(0,0), (1,1), (2,3), (3,4), (4, 2)]:
    some_feats[i,j] = np.nan

some_mu_y = cc_mean_consider_missing(some_feats, some_labels)
print(some_mu_y)

# Expected means rounded to 2 decimals (column 0: label 0, column 1: label 1).
assert np.array_equal(some_mu_y.round(2), np.array([[  3.  ,   4.  ],
                                                    [ 96.67, 137.  ],
                                                    [ 66.  ,  52.  ],
                                                    [ 14.5 ,  17.5 ],
                                                    [ 31.33,   0.  ],
                                                    [ 26.77,  33.2 ],
                                                    [  0.27,   1.5 ],
                                                    [ 27.33,  32.5 ]]))

[[  3.           4.        ]
 [ 96.66666667 137.        ]
 [ 66.          52.        ]
 [ 14.5         17.5       ]
 [ 31.33333333   0.        ]
 [ 26.76666667  33.2       ]
 [  0.26666667   1.5       ]
 [ 27.33333333  32.5       ]]


In [17]:
def cc_std_consider_missing(train_features_with_nans, train_labels):
    """Class-conditional feature standard deviations that skip NaN entries.

    Args:
        train_features_with_nans: (N, d) numpy array of feature values in
            which missing entries are NaN.
        train_labels: length-N numpy array of 0/1 class labels.

    Returns:
        A (d, 2) numpy array; column k holds, per feature, the standard
        deviation (numpy's default, i.e. ddof=0) of the non-NaN values among
        the rows labelled k. An assertion fails if any result is NaN.
    """
    N, d = train_features_with_nans.shape

    # np.nanstd ignores NaNs feature by feature; classes go in label order 0, 1.
    per_class_stds = [np.nanstd(train_features_with_nans[train_labels == label, :], axis=0)
                      for label in (0, 1)]
    sigma_y = np.column_stack(per_class_stds)

    assert not np.isnan(sigma_y).any()
    assert sigma_y.shape == (d, 2)
    return sigma_y

# Sanity check for cc_std_consider_missing: same five sample rows, with one
# entry per row overwritten by NaN before the standard deviations are computed.
some_feats = np.array([[  1. ,  85. ,  66. ,  29. ,   0. ,  26.6,   0.4,  31. ],
                       [  8. , 183. ,  64. ,   0. ,   0. ,  23.3,   0.7,  32. ],
                       [  1. ,  89. ,  66. ,  23. ,  94. ,  28.1,   0.2,  21. ],
                       [  0. , 137. ,  40. ,  35. , 168. ,  43.1,   2.3,  33. ],
                       [  5. , 116. ,  74. ,   0. ,   0. ,  25.6,   0.2,  30. ]])
some_labels = np.array([0, 1, 0, 1, 0])

# (row, column) positions that are marked as missing.
for i,j in [(0,0), (1,1), (2,3), (3,4), (4, 2)]:
    some_feats[i,j] = np.nan

some_std_y = cc_std_consider_missing(some_feats, some_labels)
print(some_std_y)

# Expected values rounded to 2 decimals. Note that some entries are exactly 0
# in this sample (a class with a single non-NaN value for that feature).
assert np.array_equal(some_std_y.round(2), np.array([[ 2.  ,  4.  ],
                                                     [13.77,  0.  ],
                                                     [ 0.  , 12.  ],
                                                     [14.5 , 17.5 ],
                                                     [44.31,  0.  ],
                                                     [ 1.03,  9.9 ],
                                                     [ 0.09,  0.8  ],
                                                     [ 4.5 ,  0.5 ]]))

[[ 2.          4.        ]
 [13.76791762  0.        ]
 [ 0.         12.        ]
 [14.5        17.5       ]
 [44.31202495  0.        ]
 [ 1.02740233  9.9       ]
 [ 0.0942809   0.8       ]
 [ 4.49691252  0.5       ]]
