In [1]:
# Import libraries

import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import utils.mnist_reader as mnist_reader

In [2]:
# Read the training ('train') and test ('t10k') splits from 'data/fashion'.
# Each call yields a (features, labels) pair; both parts are held as NumPy arrays.
x_train, y_train = map(np.array, mnist_reader.load_mnist('data/fashion', kind='train'))
x_test, y_test = map(np.array, mnist_reader.load_mnist('data/fashion', kind='t10k'))

# Accumulators for the filtered samples and their binary labels.
x_train_temp, y_train_temp = [], []
x_test_temp, y_test_temp = [], []

# Running counts of training samples kept per class.
sneaker = sandal = 0

# Sizes of the unfiltered splits.
train_len, test_len = len(x_train), len(x_test)

In [3]:
# Reduce both splits to a two-class problem:
#   original label 5 (counted as "sandal")  -> class 0
#   original label 7 (counted as "sneaker") -> class 1
# The training split keeps at most 4000 class-0 and 2000 class-1 samples,
# taken in their original order; the test split keeps every matching sample.
for idx in range(train_len):
    label = y_train[idx]
    if label == 5:
        if sandal < 4000:
            y_train_temp.append(0)
            x_train_temp.append(x_train[idx])
            sandal += 1
    elif label == 7:
        if sneaker < 2000:
            y_train_temp.append(1)
            x_train_temp.append(x_train[idx])
            sneaker += 1

for idx in range(test_len):
    label = y_test[idx]
    if label == 5:
        # Sandal samples are class 0.
        y_test_temp.append(0)
        x_test_temp.append(x_test[idx])
    elif label == 7:
        # Sneaker samples are class 1.
        y_test_temp.append(1)
        x_test_temp.append(x_test[idx])

# Replace the full splits with the filtered ones. Features are divided by 255
# (presumably mapping 8-bit pixel intensities into [0, 1] - confirm against the loader).
# Labels stay plain Python lists.
x_train = np.array(x_train_temp) / 255
y_train = y_train_temp
x_test = np.array(x_test_temp) / 255
y_test = y_test_temp

for caption, split in (("Training set size: ", x_train), ("Test set size: ", x_test)):
    print(caption + str(len(split)))

Training set size: 6000
Test set size: 2000


In [47]:
# Values of the regularization parameter C to sweep.
# Alternative grids kept for reference:
#   [0.0000001, 0.0000005, 0.000001, 0.000005, 0.00001, 0.00005]
#   [1, 5, 10, 50, 100, 500]
c_vals = [0.01, 0.05, 0.1, 0.5, 1, 5]

# Accuracy per C value, in the same order as c_vals.
test_accs = []
train_accs = []

for c in c_vals:
    # Fit an L2-penalised logistic regression on the full training split.
    model = LogisticRegression(C=c, penalty='l2', max_iter=1000000)
    model.fit(x_train, y_train)

    # Record accuracy on the held-out test split and on the training split itself.
    test_accs.append(metrics.accuracy_score(y_test, model.predict(x_test)))
    train_accs.append(model.score(x_train, y_train))
    


In [48]:
# Plot test and training accuracy against C (logarithmic x-axis) and save to disk.
plt.title("Logistic Regression")
plt.xscale('log')
plt.xlabel("Regularization Parameter C")
plt.ylabel("Accuracy")

for accs, fmt, series_name in (
    (test_accs, 'r', "Test Accuracy"),
    (train_accs, 'b', "Training Accuracy"),
):
    plt.plot(c_vals, accs, fmt, label=series_name)
plt.legend()

filename = "LR_underfit.png"
plt.savefig(filename)

# The figure is cleared right after saving, so the cell itself renders an empty figure.
plt.clf()

<Figure size 432x288 with 0 Axes>

In [6]:
k_vals = [5, 6, 7, 8, 9, 10]
c_vals = [0.115, 0.12, 0.125, 0.13, 0.135, 0.14, 0.145]


def kfold_accuracy(features, labels, k, c):
    """Return the mean held-out accuracy of L2 logistic regression over k folds.

    The samples are cut into ``k`` contiguous folds in their existing order
    (no shuffling). Each fold is held out once while a model with
    regularization parameter ``c`` is fitted on all remaining samples.

    Parameters
    ----------
    features : array-like, one row per sample
    labels : array-like, one label per sample
    k : int, number of folds (2 <= k <= number of samples)
    c : float, value passed to ``LogisticRegression`` as ``C``

    Returns
    -------
    float
        Average of the ``k`` held-out accuracy scores.

    Raises
    ------
    ValueError
        If ``k`` is outside ``[2, number of samples]``.
    """
    features = np.asarray(features)
    labels = np.asarray(labels)
    n_samples = len(features)
    if not 2 <= k <= n_samples:
        raise ValueError("k must be between 2 and the number of samples")

    fold_scores = []
    for group in range(k):
        # Integer arithmetic keeps the boundaries exact, so the last fold always
        # ends at n_samples (float products such as (n / k) * k can round down).
        start = group * n_samples // k
        stop = (group + 1) * n_samples // k

        # Every sample outside [start, stop) is used for training. Slicing the
        # tail from stop + 1 would silently drop the sample at index `stop`.
        held_out = np.zeros(n_samples, dtype=bool)
        held_out[start:stop] = True

        model = LogisticRegression(penalty='l2', max_iter=1000000, C=c)
        model.fit(features[~held_out], labels[~held_out])

        predictions = model.predict(features[held_out])
        fold_scores.append(metrics.accuracy_score(labels[held_out], predictions))

    return sum(fold_scores) / len(fold_scores)


# test_accs[i][j] is the mean validation accuracy for k_vals[i] and c_vals[j].
test_accs = [[kfold_accuracy(x_train, y_train, k, c) for c in c_vals] for k in k_vals]

In [7]:
# Plot mean k-fold validation accuracy against C, one curve per value of k,
# and save the figure to disk.
plt.title("Logistic Regression (with K-Fold Validation)")
plt.xlabel("Regularization Parameter C")
plt.ylabel("Accuracy")

# Curves and legend labels are derived from k_vals, so they cannot drift out of
# sync with the values that were actually evaluated. Colours repeat if there
# are more k values than colours.
curve_colors = ['r', 'g', 'b', 'y', 'k', 'm']
for position, (k_val, accs) in enumerate(zip(k_vals, test_accs)):
    plt.plot(c_vals, accs, curve_colors[position % len(curve_colors)],
             label="k = " + str(k_val))
plt.legend()

filename = "LR_kfold.png"
plt.savefig(filename)

# The figure is cleared right after saving, so the cell itself renders an empty figure.
plt.clf()

<Figure size 432x288 with 0 Axes>