In [1]:
import numpy as np
import pandas as pd

In [2]:
class LinearRegression:
    """Linear regression fitted with full-batch gradient descent.

    Parameters
    ----------
    lr : float
        Step size applied to every gradient update.
    n_iters : int
        Number of gradient-descent iterations performed by ``fit``.
    random_state : int or None
        Seed for the random weight initialisation. With ``None`` (the
        default) the weights are drawn from NumPy's global random state.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` has run.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has run.
    """

    def __init__(self, lr=0.001, n_iters=1000, random_state=None):
        self.lr = lr
        self.n_iters = n_iters
        self.random_state = random_state
        self.weights, self.bias = None, None

    def fit(self, X, y):
        """Fit weights and bias to ``X`` and ``y`` by gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with n_samples elements
            A column vector of shape (n_samples, 1) is accepted and flattened.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, contains no samples, or ``y`` does not have
            exactly one target per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        # Flatten the target: an (n, 1) column would otherwise broadcast the
        # residual below into an (n, n) matrix and corrupt both gradients.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f'X must be 2-D (n_samples, n_features), got shape {X.shape}')
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError('Cannot fit on an empty data set')
        if y.shape[0] != n_samples:
            raise ValueError(
                f'X has {n_samples} samples but y has {y.shape[0]} targets'
            )

        # Random weights in [0, 1); seeded only when the caller asked for it.
        if self.random_state is None:
            self.weights = np.random.rand(n_features)
        else:
            self.weights = np.random.default_rng(self.random_state).random(n_features)
        # The bias (intercept) starts at zero.
        self.bias = 0.0

        scale = 1.0 / n_samples
        for _ in range(self.n_iters):
            residual = np.dot(X, self.weights) + self.bias - y

            # Gradients of the halved mean squared error w.r.t. "w" and "b".
            dw = scale * np.dot(X.T, residual)
            db = scale * np.sum(residual)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        """Return ``X @ weights + bias``.

        ``X`` may be a 2-D batch (returns one value per row) or a single
        1-D sample (returns a scalar).
        """
        return np.dot(X, self.weights) + self.bias

In [3]:
class Dataset:
    """Iris measurements with numerically encoded species and random splitting.

    Parameters
    ----------
    path : str
        Location of the header-less Iris CSV file. Defaults to
        ``'data/iris.data'``.

    Attributes
    ----------
    data : pandas.DataFrame
        Columns ``f1``..``f4`` (features) and ``species`` (numeric class code).
    """

    # Column names assigned to the header-less CSV, in file order.
    FEATURE_COLUMNS = ['f1', 'f2', 'f3', 'f4']
    LABEL_COLUMN = 'species'
    # Numeric targets substituted for the class strings so that linear
    # regression can be applied.
    CLASS_CODES = {
        'Iris-setosa': -1.0,
        'Iris-versicolor': 0.0,
        'Iris-virginica': 1.0,
    }

    def __init__(self, path='data/iris.data'):
        self.path = path
        self.data = self.read_data()

    def read_data(self):
        """Read the CSV at ``self.path`` and encode the species column.

        Returns
        -------
        pandas.DataFrame
            The loaded frame with ``species`` replaced by its float code.

        Raises
        ------
        ValueError
            If the file contains a species label missing from ``CLASS_CODES``.
        """
        data = pd.read_csv(self.path, names=self.FEATURE_COLUMNS + [self.LABEL_COLUMN])
        raw_labels = data[self.LABEL_COLUMN]
        # Build a new float column rather than writing floats into the
        # string-typed column in place.
        encoded = raw_labels.map(self.CLASS_CODES)
        # Labels that were present but had no code; fail here instead of
        # later, when the column is cast to float.
        unknown = raw_labels[encoded.isna() & raw_labels.notna()].unique()
        if len(unknown) > 0:
            raise ValueError(f'Unrecognised species label(s): {list(unknown)}')
        data[self.LABEL_COLUMN] = encoded
        return data

    def split_data(self, test_pct=0.2, random_state=None):
        """Randomly split the data into train and test arrays.

        Parameters
        ----------
        test_pct : float
            Fraction of rows held out for testing.
        random_state : int or None
            Seed forwarded to ``DataFrame.sample``; ``None`` (default) gives a
            different split on every call.

        Returns
        -------
        tuple
            ``(X_train, y_train, X_test, y_test)`` as float64 arrays; the
            ``X`` arrays are 2-D and the ``y`` arrays 1-D.
        """
        train_df = self.data.sample(frac=1 - test_pct, random_state=random_state)
        # Everything that was not sampled for training becomes the test set.
        test_df = self.data.drop(train_df.index)

        def to_arrays(frame):
            features = frame[self.FEATURE_COLUMNS].to_numpy().astype('float64')
            labels = frame[self.LABEL_COLUMN].to_numpy().astype('float64').ravel()
            return features, labels

        X_train, y_train = to_arrays(train_df)
        X_test, y_test = to_arrays(test_df)
        return X_train, y_train, X_test, y_test
        

In [4]:
# Load the data once at module level; k_fold reads this `dataset` object
# as a global rather than taking it as an argument.
dataset = Dataset()

In [5]:
def round_class(prediction):
    """Convert a continuous regression output into a class code.

    Returns -1 for values at or below -0.33, 0 for values strictly between
    -0.33 and 0.33, and 1 for everything else (including exactly 0.33).
    """
    lower_cut, upper_cut = -0.33, 0.33
    if prediction <= lower_cut:
        return -1
    # Past the guard above the value is already greater than the lower cut,
    # so only the upper cut still needs checking.
    return 0 if prediction < upper_cut else 1

In [6]:
def test_model(regressor, X_test, y_test):
    """Return the fraction of test samples that ``regressor`` classifies correctly.

    Each sample is predicted on its own, the prediction is turned into a
    class code with ``round_class``, and that code is compared with the
    integer-truncated true label at the same position in ``y_test``.
    """
    hits = [
        round_class(regressor.predict(sample)) == int(y_test[idx])
        for idx, sample in enumerate(X_test)
    ]
    # Each entry is True or False, so the sum is the number of correct predictions.
    return sum(hits) / len(hits)

In [7]:
def k_fold(k=5):
    """Estimate classification accuracy with k-fold cross-validation.

    The rows of the module-level ``dataset`` are shuffled once and divided
    into ``k`` disjoint folds. Each fold serves as the test set exactly once
    while a fresh ``LinearRegression`` is trained on the remaining folds, so
    every row is tested exactly once and no test sets overlap.

    Prints the accuracy of each fold, the list of accuracies, and their mean.

    Parameters
    ----------
    k : int
        Number of folds; must be at least 2 and at most the number of rows.

    Raises
    ------
    ValueError
        If ``k`` is outside the valid range.
    """
    X = dataset.data[['f1', 'f2', 'f3', 'f4']].to_numpy().astype('float64')
    y = dataset.data['species'].to_numpy().astype('float64')

    n_samples = len(X)
    if not 2 <= k <= n_samples:
        raise ValueError(f'k must be between 2 and {n_samples}, got {k}')

    # Shuffle once, then cut the shuffled row positions into k near-equal parts.
    folds = np.array_split(np.random.permutation(n_samples), k)

    accs = []
    for i, test_idx in enumerate(folds):
        # Train on every fold except the one currently held out.
        train_idx = np.concatenate(folds[:i] + folds[i + 1:])
        regressor = LinearRegression(lr=0.001, n_iters=1000)
        regressor.fit(X[train_idx], y[train_idx])
        acc = test_model(regressor, X[test_idx], y[test_idx])
        print(f'{i}. Accuracy: {acc}')
        accs.append(acc)

    print(accs)
    print(f'Mean classification accuracy: {sum(accs)/len(accs)}')
        
    

In [15]:
# Run five evaluation rounds; per-round and mean accuracy are printed below.
k_fold(k=5)

0. Accuracy: 1.0
1. Accuracy: 0.9333333333333333
2. Accuracy: 0.9666666666666667
3. Accuracy: 1.0
4. Accuracy: 1.0
[1.0, 0.9333333333333333, 0.9666666666666667, 1.0, 1.0]
Mean classification accuracy: 0.9800000000000001
