In [338]:
import numpy as np
import pandas as pd
import matplotlib as plt
import math
import re

In [339]:
def import_dataset(split_percent = 70):
    dataset = pd.read_csv('E:\ELL_project\problem1\health_data.csv')
    dataset = dataset.sample(frac = 1)
    X = dataset.iloc[:,:-1].values
    y = dataset.iloc[:,-1:].values
    datasize = X.shape[0]

    split_point = split_percent//10

    X_train = X[:(datasize*split_point)//10,:]
    y_train = y[:(datasize*split_point)//10,:]
    X_test = X[(datasize*split_point)//10:,:]
    y_test = y[(datasize*split_point)//10:,:]

    return(X_train,X_test,y_train,y_test)


In [340]:
def feature_scaling(X_train,X_test):
    X_mean = np.sum(X_train,axis=0)
    X_var = np.sqrt(np.sum(np.square(X_train - X_mean), axis=0))

    X_train_feat_scaled = (X_train - X_mean ) / X_var
    X_test_feat_scaled = (X_test - X_mean) / X_var

    return (X_mean,X_var, X_train_feat_scaled,X_test_feat_scaled)

In [341]:
def sep_01(X_train,y_train):
    x_0_train = X_train[y_train[:,0]==0]
    x_1_train = X_train[y_train[:,0]==1]
    return(x_0_train,x_1_train)

In [512]:
def random_generate_models(X_train_avg,num_feat,modes_1=3,modes_0=3):
    para_0 = []
    para_1 = []
    for i in range(modes_1):
        m = np.random.rand(1,num_feat) + X_train_avg.reshape(1,-1)
        v = np.random.rand(num_feat,num_feat) / 100
        para_1.append((1/modes_1,m,v))           # 1: prior 2: mean 3: covariance
    for i in range(modes_0):
        m = np.random.rand(1,num_feat) + X_train_avg.reshape(1,-1)
        v = np.random.rand(num_feat,num_feat) / 100
        para_0.append((1/modes_0,m,v))          # 1: prior 2: mean 3: covariance
    return(modes_1,modes_0,para_1,para_0)
# print(random_generate_models()

In [513]:
def calc_prob(X,meu,cov):
    num_of_feat = cov.shape[0]
    det = np.abs(np.linalg.det(cov))
    p0 = 1/math.pow((2*math.pi),num_of_feat/2)
    p0 = p0 / (det**0.5)
    # print('p0: ',p0)
    # p = np.exp(-0.5 * (np.dot(X-meu,np.linalg.inv(cov)))*(X-meu))
    p = np.exp(-0.5 * np.sum(np.multiply(np.dot(X-meu,np.linalg.inv(cov)) , (X-meu)),axis=1).reshape(-1,1))
    # print(p)
    return (p0*p)
# run_model(1,1)


In [514]:
def ret_alpha_mean_var(modes,prob,X_train):
    num_elements = X_train.shape[0]
    num_in_class_i = {}
    all_probs = prob[0]
    updated_params = []

    # print(prob[0].shape)

    for i in range(1,modes):
        all_probs = np.append(all_probs,prob[i],axis=1)
    mode_distribute = np.argmax(all_probs,axis=1)
    for i in range(modes):
        class_i = np.zeros((num_elements))
        class_i[[mode_distribute==i]] = 1
        num_in_class_i[i] = np.sum(class_i,axis=0)

        # print('herenow',prob[i].shape)

        u_m = np.dot(np.transpose(prob[i]),X_train) / np.sum(prob[i],axis=0)
        dif = X_train - u_m
        u_c = np.dot(np.transpose(dif),dif) / num_elements

        updated_params.append( (num_in_class_i[i]/num_elements , u_m , u_c) )
    
    return ( updated_params )


In [515]:
def one_step_optimize(X_train,modes,params):
    num_points = X_train.shape[0]
    alpha = {}
    meu = {}
    covar = {}
    prob = {}
    sum_mat = np.zeros((params[0][1].shape[0],1))

    for i in range(modes):
        alpha[i],meu[i],covar[i] = params[i]
        prob[i] = calc_prob(X_train,meu[i],covar[i])
        # if(i==0):
        #     print(prob[i])
        # print('checkk',prob[i].shape)
        prob[i] = prob[i] * (3 + alpha[i])
        #EDIT
        sum_mat = sum_mat + prob[i]
    for i in range(modes):
        prob[i] = prob[i] / (sum_mat + (3*modes))
    
    updated_params = ret_alpha_mean_var(modes,prob,X_train)

    return(updated_params)




In [547]:
def optimize_the_params(X_train,modes,params,iter=100):
    updated_params = params
    for iterator in range(iter):
        updated_params = one_step_optimize(X_train,modes,updated_params)
    
    return (updated_params)

In [540]:
def GMM_training(X_train, y_train, modes_1 = 1, modes_0 = 1):
    X_0_train,X_1_train = sep_01(X_train,y_train)
    num_feat = X_train.shape[1]
    X_train_avg = np.sum(X_train,axis=0).reshape(1,-1) / X_train.shape[0]
    modes_1,modes_0,para_1,para_0 = random_generate_models(X_train_avg,num_feat,modes_1,modes_0)
    # print('init generated 1: ',para_1)
    # print('init generated 0: ',para_0)
    opt_para_0 = optimize_the_params(X_0_train,modes_0,para_0)
    opt_para_1 = optimize_the_params(X_1_train,modes_1,para_1)

    prior = [np.sum(y_train),0]
    prior[1] = y_train.shape[0]-prior[0]

    return(opt_para_0,opt_para_1,prior)
    

In [541]:
def GMM_testing(X_test,y_test, prior, opt_para_0, opt_para_1, modes_1 = 1, modes_0 = 1):
    test_size = X_test.shape[0]
    prob_0, prob_1 = np.zeros((test_size,1)) , np.zeros((test_size,1))
    for i in range(modes_1):
        alpha,meu,cov = opt_para_1[i]
        prob = alpha*calc_prob(X_test,meu,cov)
        prob_1 += prob
    for i in range(modes_0):
        alpha,meu,cov = opt_para_0[i]
        prob = alpha*calc_prob(X_test,meu,cov)
        prob_0 += prob
    all_probs = prob_0
    all_probs = np.append(all_probs,prob_1,axis=1)
    all_probs[:,0] = all_probs[:,0] * prior[1]
    all_probs[:,1] = all_probs[:,1] * prior[0]
    ##########################################################################
    ############ SEE IF YOU WANT TO DELETE THE ABOVE 2 LINES #################
    ##########################################################################
    y_pred = np.argmax(all_probs,axis=1)

    return (y_pred.reshape(-1,1))
    
    

In [674]:
def accuracy_metrics(y_pred_thresh, y_test):

    # print(y_pred_thresh.shape,y_test.shape)

    test_size = y_test.shape[0]

    tp = np.sum((y_pred_thresh+y_test)==2 , axis=0)[0]
    tn = np.sum(y_pred_thresh==y_test , axis=0)[0] - tp
    fp = np.sum(y_pred_thresh , axis=0)[0]-tp
    fn = test_size-tp-tn-fp


    print('tp: {} , tn: {} , fp: {} , fn: {}'.format(tp,tn,fp,fn))

    acc = (tp+tn)/test_size
    prec = (tp)/(tp+fp)
    recl = (tp)/(tp+fn)
    f1 = 2*prec*recl/(prec+recl)
    print('total number of y=1 in test: ', np.sum(y_test))
    print('Accuracy: {}'.format( acc  ))
    print('Precision: {}'.format( prec  ))
    print('Recall: {}'.format( recl  ))
    print('F1 score: {}'.format( f1  ))

In [786]:
def run_model(modes_0=1,modes_1=1):
    X_train,X_test,y_train,y_test = import_dataset()
    # print(np.sum(y_train)+np.sum(y_test))
    X_mean,X_var, X_train_feat_scaled,X_test_feat_scaled = feature_scaling(X_train,X_test)
    opt_para_0,opt_para_1,prior = GMM_training(X_train_feat_scaled, y_train, modes_1, modes_0)
    # print('opt_para_0: ', opt_para_0)
    # print('opt_para_1: ', opt_para_1)
    # opt_para_0 =[(1.0, np.array([[ 31.36872631, 113.26210433, 230.2234253 ]]), np.array([[ 85.29047877,  93.62669958, -12.91715177],
    #    [ 93.62669958, 169.03096647, -11.18666143],
    #    [-12.91715177, -11.18666143, 976.00337193]]))]
    # opt_para_1 =  [(1.0, np.array([[ 54.98721629, 141.71574088, 292.22935313]]), np.array([[  91.83235288,   34.83131231, 441.01092185],
    #    [  34.83131231,  131.8626332 ,  230.38970845],
    #    [ 441.01092185,  230.38970845, 3637.23236746]]))]
    y_pred = GMM_testing(X_test_feat_scaled,y_test, prior, opt_para_0, opt_para_1, modes_1, modes_0)
    accuracy_metrics(y_pred, y_test)

In [787]:
run_model(10,10)

tp: 77 , tn: 105 , fp: 10 , fn: 18
total number of y=1 in test:  95
Accuracy: 0.8666666666666667
Precision: 0.8850574712643678
Recall: 0.8105263157894737
F1 score: 0.8461538461538461
