In [1]:
import numpy as np
import csv
import random
from numpy.linalg import det, inv

def _read_csv_without_header(path, dtype):
    """Read a CSV file, drop its first row (the header) and cast the rest to ``dtype``."""
    # The csv module asks for newline='' so that the reader, not the file
    # object, decides how line endings inside the data are interpreted.
    with open(path, 'r', newline='') as f:
        rows = list(csv.reader(f))
    # Slicing off row 0 (instead of np.delete) also works for a completely
    # empty file, which then simply yields an empty array.
    return np.array(rows)[1:].astype(dtype)


def load_data(x_path = None, y_path = None, mode = 'train'):
    """Load the feature CSV and, unless ``mode == 'test'``, the label CSV.

    The first row of each file is treated as a header and discarded.

    Parameters
    ----------
    x_path : str
        Path of the feature CSV; its values are parsed as ``float``.
    y_path : str, optional
        Path of the label CSV; its values are parsed as ``int``.
        Required for every mode other than ``'test'``.
    mode : str
        ``'test'`` returns only the features; any other value is handled
        as training mode and returns features and labels.

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        ``x_data`` in test mode, otherwise ``(x_data, y_data)``.

    Raises
    ------
    TypeError
        If a required path is ``None``.
    ValueError
        If a cell cannot be converted to the requested numeric type.
    """
    is_test = mode == 'test'
    # Fail before doing any I/O rather than after the features were parsed.
    if x_path is None:
        raise TypeError('x_path is required')
    if not is_test and y_path is None:
        raise TypeError("y_path is required unless mode == 'test'")

    x_data = _read_csv_without_header(x_path, float)
    if is_test:
        return x_data
    y_data = _read_csv_without_header(y_path, int)
    return x_data, y_data

In [2]:
def Gaussian_function(mu, sigma, x):
    """Evaluate the multivariate normal density N(x; mu, sigma).

    ``mu`` and ``x`` are 1-D vectors of the same length D and ``sigma`` is the
    D x D covariance matrix.  The value is computed directly (not in log
    space), so ``det(sigma)`` appears in a denominator and ``sigma`` must be
    invertible.
    """
    dim = mu.shape[0]
    offset = x - mu

    # 1 / (2*pi)^(D/2)
    norm_const = 1/((2*np.pi)**(dim/2))
    # 1 / |sigma|^(1/2)
    det_factor = 1/(det(sigma)**(1/2))
    # (x - mu) sigma^-1 (x - mu)^T
    quad_form = np.dot(np.dot(offset, inv(sigma)), offset.transpose())

    return norm_const * det_factor * np.exp((-1/2) * quad_form)

def _class_log_scores(X, mu, sigma_inv, prior):
    """Return log(prior) - 0.5 * Mahalanobis^2 for every row of X w.r.t. one class."""
    diff = X - mu
    # Row-wise quadratic form (x - mu) sigma^-1 (x - mu)^T.
    mahalanobis_sq = np.sum(np.dot(diff, sigma_inv) * diff, axis=1)
    return np.log(prior) - 0.5 * mahalanobis_sq


def generative_model(X_data, Y_data, X_test, output_path='ans_generative.csv'):
    """Fit a two-class Gaussian generative classifier and write test predictions.

    Samples whose label equals 0 form class 1 (written as label ``0``); every
    other sample forms class 2 (written as label ``1``).  Each class gets its
    own mean; both share one covariance matrix, the prior-weighted average of
    the per-class covariances.

    A test row is labelled ``0`` when P(C1) P(x|C1) >= P(C2) P(x|C2).  The
    comparison is done on logarithms: because the covariance is shared, the
    Gaussian normalisation constant is identical for both classes and cancels,
    so no determinant is needed and the densities cannot underflow to 0/0.

    Parameters
    ----------
    X_data : array-like, 2-D
        Training features, one row per sample.
    Y_data : array-like
        Training labels with one entry per row of ``X_data``.
    X_test : array-like, 2-D
        Features to classify; an empty array writes only the header line.
    output_path : str
        CSV file to write, with header ``id,label`` and 1-based ids.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If one of the two classes has no training sample, or the number of
        labels does not match the number of training rows.
    numpy.linalg.LinAlgError
        If the shared covariance matrix is singular (raised by ``inv``).
    """
    X_data = np.asarray(X_data, dtype=float)
    X_test = np.asarray(X_test, dtype=float)
    labels = np.asarray(Y_data).reshape(X_data.shape[0])

    is_class1 = labels == 0
    X1_data = X_data[is_class1]
    X2_data = X_data[~is_class1]
    if len(X1_data) == 0 or len(X2_data) == 0:
        raise ValueError('generative_model needs at least one training sample of each class')

    mu1 = np.mean(X1_data, axis = 0)
    mu2 = np.mean(X2_data, axis = 0)

    prob1 = len(X1_data) / len(X_data)
    prob2 = 1 - prob1

    sigma1 = np.dot((X1_data - mu1).transpose(), (X1_data - mu1)) / len(X1_data)
    sigma2 = np.dot((X2_data - mu2).transpose(), (X2_data - mu2)) / len(X2_data)
    sigma = prob1 * sigma1 + prob2 * sigma2
    # Inverted once here instead of twice per test row.
    sigma_inv = inv(sigma)

    if X_test.size:
        score1 = _class_log_scores(X_test, mu1, sigma_inv, prob1)
        score2 = _class_log_scores(X_test, mu2, sigma_inv, prob2)
        # score1 >= score2  <=>  posterior of class 1 >= 0.5
        predicted = np.where(score1 >= score2, 0, 1)
    else:
        predicted = []

    with open(output_path, 'w') as f:
        f.write('id,label\n')
        for i, label in enumerate(predicted):
            f.write(str(i + 1) + ',' + str(label) + '\n')

In [54]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), clipped away from exactly 0 and 1.

    Evaluated through ``exp(-|x|)``, which never exceeds 1, so very negative
    inputs no longer overflow inside ``np.exp``.  For ``x >= 0`` the
    expression is the textbook one; for ``x < 0`` the algebraically equal form
    ``e / (1 + e)`` with ``e = exp(x)`` is used.

    The result is clipped to [1e-14, 0.99999999999999].
    """
    decay = np.exp(-np.abs(x))
    res = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return np.clip(res, 0.00000000000001, 0.99999999999999)

def _adagrad_step(lr, grad, accum):
    """Return the Adagrad update ``lr / sqrt(accum) * grad``, zero where ``accum`` is 0."""
    # accum is a sum of squared gradients, so accum == 0 means the gradient is
    # 0 as well; dividing there would give inf * 0 = NaN.
    no_history = accum == 0
    safe_accum = np.where(no_history, 1.0, accum)
    return np.where(no_history, 0.0, lr / np.sqrt(safe_accum) * grad)


def log_reg_model(X_data, Y_data, batch_size, epoch, lr, mr = 0.3):
    """Train logistic regression with mini-batches, momentum and Adagrad.

    Features are standardised column-wise with the mean and standard deviation
    of ``X_data`` before training, so the returned parameters apply to
    standardised inputs.  Every epoch draws a fresh random permutation of the
    samples (via the ``random`` module) and restarts the momentum buffers and
    the Adagrad accumulators.

    After every 100th epoch the accuracy on the (standardised) training set is
    printed.

    Parameters
    ----------
    X_data : numpy.ndarray, 2-D
        Training features, one row per sample.
    Y_data : numpy.ndarray
        Labels (0 or 1), one per row of ``X_data``; reshaped to 1-D.
    batch_size : int
        Number of samples per mini-batch; the last batch of an epoch may be
        smaller.
    epoch : int
        Number of passes over the data.
    lr : float
        Base learning rate of the Adagrad update.
    mr : float
        Weight of the previous gradient in the momentum average.

    Returns
    -------
    (numpy.ndarray, float)
        Weight vector ``W`` (one entry per feature) and bias ``b``.
    """
    # scaling; the 1e-100 only matters for constant columns (std == 0), whose
    # centred values are 0 and therefore stay 0 instead of becoming 0/0
    mean = np.mean(X_data, axis=0)
    std = np.std(X_data, axis=0)
    X_data = (X_data - mean) / (std + 1e-100)

    n_samples, n_features = X_data.shape
    # Ceiling division: the trailing partial batch is trained on as well, and a
    # batch_size larger than the data set still yields one batch.
    n_batches = (n_samples + batch_size - 1) // batch_size

    W = np.ones(n_features)
    b = 0.0
    Y_data = Y_data.reshape(n_samples)
    for epoch_i in range(epoch):
        #randomize batch
        rand_series = random.sample(range(n_samples), n_samples)
        lr_w, lr_b = 0, 0
        W_grad = np.zeros(n_features)
        b_grad = 0.0
        for batch_i in range(n_batches):
            start = batch_i * batch_size
            data_picker = rand_series[start:start + batch_size]
            X = X_data[data_picker, :]
            Y = Y_data[data_picker]

            error = Y - sigmoid(np.dot(X, W) + b)

            # exponential average of the (negative log-likelihood) gradient
            W_grad = mr*W_grad - (1-mr)*np.dot(error.T, X)
            b_grad = mr*b_grad - (1-mr)*np.sum(error)

            #adagrad
            lr_w = lr_w + W_grad**2
            lr_b = lr_b + b_grad**2
            W = W - _adagrad_step(lr, W_grad, lr_w)
            b = b - _adagrad_step(lr, b_grad, lr_b)

        if epoch_i % 100 == 99:
            f_x = sigmoid(np.dot(X_data, W) + b)
            correct = ((f_x >= 0.5) & (Y_data == 1)) | ((f_x < 0.5) & (Y_data == 0))
            print(np.count_nonzero(correct) / n_samples)

    return W, b

def predict(X_test, W, b, mean=None, std=None, output_path='ans_log_reg.csv'):
    """Classify ``X_test`` with a logistic-regression model and write a CSV.

    The features are standardised as ``(X_test - mean) / (std + 1e-100)``
    before the linear model is applied.  A row is labelled ``1`` when the
    sigmoid output is at least 0.5, otherwise ``0``.

    Parameters
    ----------
    X_test : numpy.ndarray, 2-D
        Features to classify, one row per sample.
    W : numpy.ndarray
        Weight vector, one entry per feature.
    b : float
        Bias term.
    mean, std : numpy.ndarray, optional
        Column statistics used for standardisation.  When omitted they are
        computed from ``X_test`` itself (the historical behaviour).
        NOTE(review): weights fitted on data standardised with training-set
        statistics should normally be applied with those same statistics;
        pass them here when they are available.
    output_path : str
        CSV file to write, with header ``id,label`` and 1-based ids.

    Returns
    -------
    None
    """
    # scaling
    if mean is None:
        mean = np.mean(X_test, axis=0)
    if std is None:
        std = np.std(X_test, axis=0)
    X_scaled = (X_test - mean) / (std + 1e-100)

    labels = (sigmoid(np.dot(X_scaled, W) + b) >= 0.5).astype(int)
    with open(output_path, 'w') as f:
        f.write('id,label\n')
        for i, label in enumerate(labels):
            f.write(str(i + 1) + ',' + str(label) + '\n')
        

In [5]:
# Training features/labels and the unlabelled test features.
X_data, Y_data = load_data(x_path='data/X_train', y_path='data/Y_train', mode='train')
X_test = load_data(x_path='data/X_test', mode='test')

In [45]:
# Fit the generative model on the training set and write the X_test
# predictions to 'ans_generative.csv'.
generative_model(X_data, Y_data, X_test)

In [57]:
# Train logistic regression (mini-batches of 1000, 1000 epochs), then write
# the X_test predictions to 'ans_log_reg.csv'.
W, b = log_reg_model(X_data, Y_data, batch_size=1000, epoch=1000, lr=0.001, mr=0.5)
predict(X_test, W, b)

0.8449986179785633
0.853075765486318
0.8539049783483308
0.8534750161235835
0.8534443045361014
0.8538128435858849
0.8531986118362458
0.8533214581861737
0.8534443045361014
0.8532907465986916
