In [15]:
import pandas as pd
import numpy as np
from Layer import *

# Load the training and test splits from CSV.
df = pd.read_csv("./data/salary/train_sep.csv")
df_test = pd.read_csv("./data/salary/test_sep.csv")

# Test features as a float matrix in (dim, n) layout: one row per feature,
# one column per sample.
test_x = df_test.values.astype(float).T

# Feature scaling: standardise every feature row to zero mean / unit std.
# The small epsilon keeps the division finite for constant features.
# NOTE(review): the statistics come from the test set itself, not from the
# training set -- confirm this is intended.
test_x = (test_x - test_x.mean(axis=1, keepdims=True)) / (test_x.std(axis=1, keepdims=True) + 1e-10)


In [18]:
# Split the training frame into the target column and the feature columns.
df_label = df.loc[:, 'income']
df_train = df.loc[:, :].drop('income', axis=1)

# Features in (dim, n) layout, labels as a (1, n) row vector.
# The builtin `float` is used because the `np.float` alias (deprecated in
# NumPy 1.20) was removed in NumPy 1.24 and now raises AttributeError.
data_x = df_train.values.astype(float).T
data_y = df_label.values.astype(float).reshape((1, -1))

# Standardise every feature row to zero mean / unit std. The epsilon mirrors
# the scaling applied to test_x and prevents 0/0 = NaN for constant features.
data_x = (data_x - np.mean(data_x, axis=1).reshape((-1, 1))) / (np.std(data_x, axis=1) + 1e-10).reshape((-1, 1))

In [19]:
def loss(pred, y):
    """Summed binary cross-entropy between predictions and 0/1 labels.

    Args:
        pred: predicted probability of class 1; the training loop passes a
            (1, n) array.
        y: ground-truth labels with the same shape as ``pred``.

    Returns:
        The matrix product result; for (1, n) inputs this is a (1, 1) array
        holding the loss summed (not averaged) over the n samples.
    """
    eps = 1e-20  # keeps log() finite when a probability is exactly 0 or 1
    log_pos = np.log(pred + eps)        # log q(1), q(1) = f(x)
    log_neg = np.log(1.0 - pred + eps)  # log q(0), q(0) = 1 - f(x)
    return -(y @ log_pos.T + (1.0 - y) @ log_neg.T)

def accuarcy(pred,y):
    """Fraction of predictions that match the labels after rounding.

    Args:
        pred: predicted probabilities; rounded to the nearest integer with
            ``np.rint`` before comparison.
        y: ground-truth labels, broadcast-compatible with ``pred``.

    Returns:
        Mean of the element-wise matches as a float in [0, 1].
    """
    hits = np.equal(np.rint(pred), y)
    return hits.astype('float').mean()

def grad_cross_entropy(pred,y):
    """Element-wise derivative of binary cross-entropy w.r.t. the prediction.

    Args:
        pred: predicted probability of class 1.
        y: ground-truth labels, same shape as ``pred``.

    Returns:
        Array shaped like ``pred`` with ``-(y / pred - (1 - y) / (1 - pred))``
        per element. This is the gradient, not the loss value.
    """
    eps = 1e-20  # avoids division by zero at pred == 0 or pred == 1
    pos_term = y / (pred + eps)
    neg_term = (1. - y) / (1. - pred + eps)
    return -(pos_term - neg_term)

def shuffle():
    """Shuffle the samples of the module-level data_x / data_y in place.

    Both arrays store one sample per column, so a single random permutation
    of the column indices is drawn and applied to both. This keeps every
    feature column paired with its label.

    Unlike reseeding the global RNG before each of two separate
    ``np.random.shuffle`` calls, this draws from the global random stream
    exactly once and never resets it, so a seed set by the caller stays
    in effect.
    """
    perm = np.random.permutation(data_x.shape[1])
    # Slice assignment writes into the existing arrays, so the module-level
    # names keep pointing at the same (now shuffled) buffers.
    data_x[:] = data_x[:, perm]
    data_y[:] = data_y[:, perm]

# Input width: number of rows of data_x (data_x is laid out as [dim, n]).
dim = data_x.shape[0]
# Candidate layer widths. Only dim3 and dim5 are used by the active layer
# stack below; dim1, dim2 and dim4 appear only in commented-out layers or not at all.
dim1 = 64
dim2 = 16
dim3 = 4
dim4 = 2
dim5 = 1

# Network: one ReLU hidden layer followed by a single sigmoid output unit.
# Layer's arguments are presumably (in_dim, out_dim, activation,
# activation_derivative) -- confirm against the Layer module.
layers = [Layer(dim,dim3,relu,relu_rev),
          #Layer(dim2,dim4,relu,relu_rev),
          #Layer(dim2,dim3,relu,relu_rev),
          #Layer(dim3,dim4,relu,relu_rev),
          Layer(dim3,dim5,sigmoid,sigmoid_rev)]
# Split point: mini-batches are taken from columns [0, n) and evaluation uses
# columns [n, end), i.e. 20% of the samples train and 80% evaluate.
# NOTE(review): an 80/20 split may have been intended -- confirm.
n = np.floor(data_x.shape[1] * 0.2).astype("int")
lr = 0.3  # learning rate handed to Layer.backpropagation
epochs = 2000
batch_size = 512
# Floor division: columns in [batch_count * batch_size, n) are in neither the
# mini-batches nor the evaluation slice.
batch_count = n // batch_size
# One accumulated loss value is appended per epoch by the training loop.
loss_list = []

def predict(epoch):
    """Run the network on test_x and write the rounded labels to a CSV file.

    The module-level ``test_x`` is passed through every layer in ``layers``;
    the outputs are rounded to 0/1 and saved with 1-based ids to
    ``./result/salary/pred_<epoch>.csv``. The output directory is created
    when it does not exist, so the write no longer fails on a fresh checkout.

    Args:
        epoch: value substituted into the output file name.
    """
    import os

    pred = test_x
    for layer in layers:
        pred = layer.forward(pred)

    pred = np.rint(pred).reshape((-1,))#<=50K : 0; >50K : 1
    pred_list = list(pred)
    id_list = range(1,len(pred)+1)
    df_out = pd.DataFrame({'id':id_list,'label':pred_list},dtype='int')

    out_path = "./result/salary/pred_{}.csv".format(epoch)
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df_out.to_csv(out_path,index=False)

# Shuffle once before training; the data is not reshuffled between epochs.
shuffle()
for epoch in range(epochs):
    # Running total of the per-batch losses for this epoch (a sum, not a mean).
    loss_val = 0

    for i in range(batch_count):
        # Column window of the i-th mini-batch.
        batch_cols = slice(i * batch_size, (i + 1) * batch_size)
        a = data_x[:, batch_cols]
        y = data_y[:, batch_cols]

        # Forward pass through every layer.
        for layer in layers:
            a = layer.forward(a)

        # Gradient of the cross-entropy w.r.t. the network output (not the
        # loss itself), then pushed backwards layer by layer; each layer
        # presumably updates its own parameters using lr.
        d_c_a = grad_cross_entropy(a, y)
        for layer in reversed(layers):
            d_c_a = layer.backpropagation(d_c_a, lr)

        # Loss of the activations computed before this batch's update.
        loss_val += loss(a, y)[0][0]

    loss_list.append(loss_val)

    # Evaluate on the columns from n onwards, which the mini-batches never touch.
    a = data_x[:, n:]
    y = data_y[:, n:]
    for layer in layers:
        a = layer.forward(a)
    acc = accuarcy(a, y)

    # Progress is reported after every epoch.
    print("{:<5d},loss:{:.5f},acc={:.5f}".format(epoch, loss_val, acc))
    if (epoch + 1) % 10 == 0:
        # NOTE(review): predict() is disabled, so "save" is printed without
        # any file being written -- confirm whether it should be re-enabled.
        #predict(epoch)
        print("save")

0    ,loss:2895.35251,acc=0.75926
1    ,loss:2269.40539,acc=0.82852
2    ,loss:2160.44843,acc=0.83147
3    ,loss:2094.55878,acc=0.83424
4    ,loss:2050.81131,acc=0.83581
5    ,loss:2020.71547,acc=0.83711
6    ,loss:1994.40363,acc=0.83761
7    ,loss:1974.79521,acc=0.83773
8    ,loss:1956.69760,acc=0.83807
9    ,loss:1941.97882,acc=0.83811
save
10   ,loss:1931.31626,acc=0.83819
11   ,loss:1921.58282,acc=0.83854
12   ,loss:1912.26679,acc=0.83827
13   ,loss:1903.93237,acc=0.83796
14   ,loss:1895.58593,acc=0.83788
15   ,loss:1887.67572,acc=0.83796
16   ,loss:1883.60721,acc=0.83815
17   ,loss:1878.75818,acc=0.83777
18   ,loss:1873.09401,acc=0.83804
19   ,loss:1869.59426,acc=0.83800
save
20   ,loss:1865.62662,acc=0.83830
21   ,loss:1860.39763,acc=0.83819
22   ,loss:1858.08724,acc=0.83777
23   ,loss:1853.76997,acc=0.83792
24   ,loss:1849.57635,acc=0.83796
25   ,loss:1847.63160,acc=0.83773
26   ,loss:1845.31582,acc=0.83796
27   ,loss:1841.88579,acc=0.83784
28   ,loss:1839.70016,acc=0.83800
29  

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
# Draw the per-epoch accumulated training loss on an explicit Axes object.
fig, ax = plt.subplots()
ax.plot(loss_list)
plt.show()