# Section 1: Data Import and Mini Feature Exploration

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [3]:
digits = pd.read_csv("digits.csv", header=None)
digits = digits.sample(frac=1, random_state=200).reset_index(drop=True)

In [4]:
digits.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
1028,0,0,1,9,15,12,5,0,0,0,...,0,0,0,0,11,5,0,0,0,7
300,0,0,0,15,16,16,12,4,0,0,...,0,0,0,2,16,2,0,0,0,7
488,0,0,1,12,6,0,0,0,0,0,...,0,0,0,1,10,15,14,4,0,6
860,0,0,0,13,9,0,0,0,0,0,...,0,0,0,0,12,15,0,0,0,1
237,0,0,1,7,13,10,0,0,0,2,...,0,0,0,0,7,15,16,10,0,9


In [5]:
# dimensions of dataset
digits.shape

(1797, 65)

In [6]:
print("Sample size:", digits.shape[0])
print("Number of features in dataset:", digits.shape[1])

Sample size: 1797
Number of features in dataset: 65


In [7]:
# global variables
sample_size = digits.shape[0]
num_of_features = digits.shape[1] - 1
k = 10
train_test_split_ratio = 0.2

In [8]:
X = digits.iloc[:,:-1] # features
y = digits.iloc[:,-1]  # labels

In [9]:
X.shape

(1797, 64)

In [10]:
X.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
count,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,...,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0
mean,0.0,0.30384,5.204786,11.835838,11.84808,5.781859,1.36227,0.129661,0.005565,1.993879,...,3.725097,0.206455,0.000556,0.279354,5.557596,12.089037,11.809126,6.764051,2.067891,0.364496
std,0.0,0.907192,4.754826,4.248842,4.287388,5.666418,3.325775,1.037383,0.094222,3.19616,...,4.919406,0.984401,0.02359,0.934302,5.103019,4.374694,4.933947,5.900623,4.090548,1.860122
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,11.0,10.0,0.0,0.0,0.0
50%,0.0,0.0,4.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,4.0,13.0,14.0,6.0,0.0,0.0
75%,0.0,0.0,9.0,15.0,15.0,11.0,0.0,0.0,0.0,3.0,...,7.0,0.0,0.0,0.0,10.0,16.0,16.0,12.0,2.0,0.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,15.0,2.0,16.0,...,16.0,13.0,1.0,9.0,16.0,16.0,16.0,16.0,16.0,16.0


---

# Section 2: Building our Logistic Regression from Scratch

In [11]:
def sigmoid(scores):
    return 1 / (1 + np.exp(-scores))

In [12]:
def cost(features, weights, binary_labels):
    m = features.shape[0]
    
    scores = np.dot(features, weights)
    predictions = sigmoid(scores)
    
    total_cost = np.sum(binary_labels * np.log(predictions) + (1 - binary_labels) * np.log(1 - predictions))
    total_cost = -(1 / m) * total_cost
    
    gradient = (1/m) + np.dot((binary_labels - predictions), features)
    return total_cost, gradient

In [13]:
def gradient_ascent(weight, learning_rate, gradient):
    return weight + learning_rate * gradient

In [14]:
def log_likelihood(features, label, weights):
    scores = np.dot(features, weights)
    ll = np.sum( label*scores - np.log(1 + np.exp(scores)) )
    return ll

In [15]:
def logistic_regression(features, label, num_iter, learning_rate):
    b = np.ones((features.shape[0], 1))
    features = np.concatenate((b, features), axis=1)
    
    all_weights = []
    costs = np.zeros(num_iter)
    classes = np.unique(y)
    
    for c in classes:
        binary_label = np.where(label == c, 1, 0)
        weights = np.zeros(features.shape[1])
        
        for i in range(num_iter):
            costs[i], grad = cost(features, weights, binary_label)
            weights = gradient_ascent(weights, learning_rate, grad)
        
        all_weights.append(weights)
        
    return all_weights, classes, costs

In [16]:
def predict(classes, weights, features):
    b = np.ones((features.shape[0], 1))
    features = np.concatenate((b, features), axis=1)
    
    preds = [np.argmax([sigmoid(np.dot(x, w)) for w in weights]) for x in features]
    return [classes[p] for p in preds]

In [17]:
def score(classes, weights, features, labels):
    return (predict(classes, weights, features) != labels).mean()

---

# Section 3: First we do 80-20 train test split

In [18]:
train_test_split_ratio = 0.8
num_iter = 1000
learning_rate = 0.1

In [19]:
def split_train_test(X, y, ratio):
    m = X.shape[0]
    cut = int(m * ratio)
    
    tr_x = X.iloc[0:cut,:]
    tr_y = y.iloc[0:cut]
    te_x = X.iloc[cut:,:]
    te_y = y.iloc[cut:]
    
    return tr_x, tr_y, te_x, te_y

In [20]:
train_X, train_y, test_X, test_y = split_train_test(X, y, train_test_split_ratio)

In [21]:
w, classes, costs = logistic_regression(train_X, train_y, num_iter, learning_rate)

In [22]:
print("Train error: %.4f" % score(classes, w, train_X, train_y))

Train error: 0.0605


In [23]:
print("Test error: %.4f" % score(classes, w, test_X, test_y))

Test error: 0.1028


---

# Section 4: Now we apply same steps for 10-fold cross validation

In [24]:
# global variables
k_fold = 10
seed = 200

In [25]:
# randomise the dataset (with seed for reproducing)
digits = digits.sample(frac=1, random_state=seed).reset_index(drop=True)

# break down indices in 10 folds and save it
folds_indices = []
start_index = 0
steps = sample_size / k_fold
for i in range(k_fold):
    end_index = start_index + steps
    folds_indices.append([round(start_index), round(end_index)])
    start_index = end_index

folds_indices

[[0, 180],
 [180, 359],
 [359, 539],
 [539, 719],
 [719, 898],
 [898, 1078],
 [1078, 1258],
 [1258, 1438],
 [1438, 1617],
 [1617, 1797]]

In [26]:
kfold_train_errors = []
kfold_test_errors = []

for i in range(len(folds_indices)):    
    test = digits.iloc[folds_indices[i][0]:folds_indices[i][1],:]
    train = digits.drop(test.index)
    
    train_x = train.iloc[:,:-1]
    train_y = train.iloc[:,-1]
    test_x = test.iloc[:,:-1]
    test_y = test.iloc[:,-1]
    
    w, classes, costs = logistic_regression(train_x, train_y, num_iter, learning_rate)
    train_error = score(classes, w, train_x, train_y)
    kfold_train_errors.append(train_error)
    
    test_error = score(classes, w, test_x, test_y)
    kfold_test_errors.append(test_error)

In [27]:
kfold_train_errors

[0.04329004329004329,
 0.06983930778739184,
 0.15893630179344465,
 0.08225108225108226,
 0.06613102595797281,
 0.06431663574520717,
 0.05256648113790971,
 0.06431663574520717,
 0.04758961681087762,
 0.05194805194805195]

In [28]:
kfold_test_errors

[0.08333333333333333,
 0.13966480446927373,
 0.2111111111111111,
 0.15,
 0.11731843575418995,
 0.07777777777777778,
 0.1,
 0.12222222222222222,
 0.0446927374301676,
 0.1111111111111111]

In [29]:
np.mean(kfold_train_errors)

0.07011851824671884

In [30]:
np.mean(kfold_test_errors)

0.11572315332091869

---

# Section 7: We explore with Scikit-learn and compare with our code

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [32]:
# Train-test with scikit LogReg
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_test_split_ratio, random_state=0)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("Train:", (1-logreg.score(X_train, y_train)))
print("Test:", (1-logreg.score(X_test, y_test)))

Train: 0.0
Test: 0.0521557719054242


In [33]:
# Kfold with scikit LogReg
from sklearn.model_selection import KFold

kf = KFold(n_splits=k)
kf.get_n_splits(X)
fold_index = 1

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    print("Fold:", fold_index, ", Train error:", (1-logreg.score(X_train, y_train)), ", Test error:", (1-logreg.score(X_test, y_test)))
    fold_index = fold_index + 1
    

Fold: 1 , Train error: 0.0 , Test error: 0.02777777777777779
Fold: 2 , Train error: 0.0 , Test error: 0.02777777777777779
Fold: 3 , Train error: 0.0 , Test error: 0.0444444444444444
Fold: 4 , Train error: 0.0 , Test error: 0.01666666666666672
Fold: 5 , Train error: 0.0 , Test error: 0.03888888888888886
Fold: 6 , Train error: 0.0 , Test error: 0.02777777777777779
Fold: 7 , Train error: 0.0 , Test error: 0.033333333333333326
Fold: 8 , Train error: 0.0 , Test error: 0.05586592178770955
Fold: 9 , Train error: 0.0 , Test error: 0.027932960893854775
Fold: 10 , Train error: 0.0 , Test error: 0.04469273743016755
