In [1]:
import pandas as pd
import numpy as np

In [2]:

def split(n,percentage = 0.7,rng=None):
    if rng:
        np.random.seed(rng)
    seq = np.random.permutation(n)
    train_ids = seq[:int(percentage*n)]
    test_ids = seq[int(percentage*n):]
    return train_ids, test_ids

In [3]:
df = pd.read_csv("v3data.csv")
df.head()

Unnamed: 0,Responden,Y,EAR,MAR,MOE
0,1,0,0.231,0.74,3.203
1,1,0,0.355,0.716,2.017
2,1,0,0.258,0.635,2.461
3,1,0,0.155,0.611,3.942
4,1,0,0.186,0.644,3.462


In [4]:
df.drop(columns=['Responden', 'MOE'], inplace=True)

In [5]:
X = df.drop(columns=['Y']).to_numpy()
y = df['Y'].to_numpy().reshape(X.shape[0], 1)

print(X.shape,y.shape)


(19500, 2) (19500, 1)


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=99)

In [7]:
train_ids, test_ids = split(n = X.shape[0], percentage = 0.8, rng = 10000)

In [8]:
train_x, train_y = X[train_ids], y[train_ids]
test_x, test_y = X[test_ids], y[test_ids]

In [9]:
# sigmoid activation function
def sigmoid(x):
    return 1.0/(1+np.exp(-x))

# Negative CE
def lossfunc(t,y):
    # t is the target (actual)
    # y is the prediction
    NCE = -(t*np.log(y)+(1-t)*np.log(1-y)).mean()
    return NCE

# Accuracy Metric 
def accuracy(t,y):
    # t is the target (actual)
    # y is the prediction
    y = (y>0.5).astype(int)
    return (t==y).mean()

In [10]:
class LogisticModel():
    def __init__(self,ni):
        # ni is the number of input features
        self.w = np.random.randn(ni,1)
        self.b = np.random.rand()
        
    def feed(self,x):
        # feed is a method for outputing the prediction 
        # based on your input data x
        output = np.dot(x, self.w)+self.b
        return sigmoid(output)
    
    def predict(self, X):
        preds = self.feed(X)
      
        # Empty List to store predictions.
        pred_class = []
        # if preds >= 0.5 --> round up to 1
        # if preds < 0.5 --> round up to 0
        pred_class = np.array([1 if i > 0.5 else 0 for i in preds])
        
        return pred_class.reshape(X.shape[0],1)

In [11]:
def train(train_x,train_y, epoch=10000, learning_rate=0.001, verbose = True):
    n_samples, n_features = train_x.shape

    mdl = LogisticModel(ni = n_features)

    loss_record=[]

    # training loop
    for i in range(epoch):
        # feed your data and compute the loss 
        y_pred = mdl.feed(train_x)
        loss = lossfunc(train_y,y_pred)
        loss_record.append(loss)

        # the loss should be decreasing if you compute the gradient properly
        if verbose:
          if i%20==0:
              print(f'epoch {i+1}/{epoch} \t: loss: {round(loss,4)},  \tacc: {round(accuracy(train_y,y_pred),4)}')
          
        # compute grad here (of course manually :)
        # you will have gradient for each sample  
        # use the mean gradient instead --> .mean(axis=0)
        grad_w = np.dot(train_x.T, (y_pred-train_y)).mean(axis=1).reshape(2,1)
        grad_b = (1/n_samples)*np.sum(y_pred - train_y)

        # update your parameter
        # ensure your grad is the same shape/dimension with your parameter
        mdl.w -= learning_rate*grad_w
        mdl.b -= learning_rate*grad_b
    
    return mdl, y_pred, loss_record

In [12]:
model,y_pred, loss_record = train(train_x, train_y, epoch = 10000, learning_rate = 0.01)

epoch 1/10000 	: loss: 0.7723,  	acc: 0.3645
epoch 21/10000 	: loss: nan,  	acc: 0.4506
epoch 41/10000 	: loss: nan,  	acc: 0.8956
epoch 61/10000 	: loss: nan,  	acc: 0.8938
epoch 81/10000 	: loss: nan,  	acc: 0.8862


  NCE = -(t*np.log(y)+(1-t)*np.log(1-y)).mean()
  NCE = -(t*np.log(y)+(1-t)*np.log(1-y)).mean()


epoch 101/10000 	: loss: nan,  	acc: 0.8825
epoch 121/10000 	: loss: nan,  	acc: 0.8796
epoch 141/10000 	: loss: nan,  	acc: 0.8773
epoch 161/10000 	: loss: nan,  	acc: 0.8734
epoch 181/10000 	: loss: nan,  	acc: 0.8712
epoch 201/10000 	: loss: nan,  	acc: 0.8701
epoch 221/10000 	: loss: nan,  	acc: 0.8692
epoch 241/10000 	: loss: nan,  	acc: 0.8676
epoch 261/10000 	: loss: nan,  	acc: 0.866
epoch 281/10000 	: loss: nan,  	acc: 0.8648
epoch 301/10000 	: loss: nan,  	acc: 0.8639
epoch 321/10000 	: loss: nan,  	acc: 0.8629
epoch 341/10000 	: loss: nan,  	acc: 0.8624
epoch 361/10000 	: loss: nan,  	acc: 0.8615
epoch 381/10000 	: loss: nan,  	acc: 0.8608
epoch 401/10000 	: loss: nan,  	acc: 0.8598
epoch 421/10000 	: loss: nan,  	acc: 0.8594
epoch 441/10000 	: loss: nan,  	acc: 0.8587
epoch 461/10000 	: loss: nan,  	acc: 0.8585
epoch 481/10000 	: loss: nan,  	acc: 0.8579
epoch 501/10000 	: loss: nan,  	acc: 0.8579
epoch 521/10000 	: loss: nan,  	acc: 0.8576
epoch 541/10000 	: loss: nan,  	a

In [14]:
y_tested = model.predict(test_x)

In [15]:

def cross_val_score(X, y, scoring='accuracy', *, cv):
    
    scores = []

    for train_idx, test_idx in cv.split(X):
        X_train_val, X_test_val = X[train_idx], X[test_idx]
        y_train_val, y_test_val = y[train_idx], y[test_idx]

        model,_,_ = train(X_train_val, y_train_val, epoch = 20000, 
                          learning_rate = 0.01, verbose = False)
        
        pred = model.predict(X_test_val)
        scores.append(accuracy(y_test_val, pred))
    return scores

In [16]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, random_state = 42, shuffle = True)
scores = cross_val_score(X, y, scoring='accuracy', cv=cv)

  NCE = -(t*np.log(y)+(1-t)*np.log(1-y)).mean()
  NCE = -(t*np.log(y)+(1-t)*np.log(1-y)).mean()


In [23]:
scores

[0.9253846153846154,
 0.8682051282051282,
 0.8753846153846154,
 0.8748717948717949,
 0.9325641025641026]

In [24]:
print(f"K-Fold Cross Validation Mean Scores: {np.mean(scores)}")

K-Fold Cross Validation Mean Scores: 0.8952820512820512


In [25]:
print(model.w)

[[-122.66536712]
 [  37.90871274]]


In [26]:
print(model.b)

-1.8760894255664542
