In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [109]:
# >> FEATURE SELECTION << #
def remove_correlated_features(X):
    corr_threshold = 0.9
    corr = X.corr()
    drop_columns = np.full(corr.shape[0], False, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= corr_threshold:
                drop_columns[j] = True
    columns_dropped = X.columns[drop_columns]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped


def remove_less_significant_features(X, Y):
    sl = 0.05
    regression_ols = None
    columns_dropped = np.array([])
    for itr in range(0, len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            break
    regression_ols.summary()
    return columns_dropped


##############################


# >> MODEL TRAINING << #
def compute_cost(W, X, Y):
    # calculate hinge loss
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss = regularization_strength * (np.sum(distances) / N)

    # calculate cost
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost


# I haven't tested it but this same function should work for
# vanilla and mini-batch gradient descent as well
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array
    
    distance = [1 - (Y_batch * np.dot(X_batch, W))]

    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch * X_batch)
        dw += di

    dw = dw  # average
    return dw


def sgd(features, outputs):
    max_epochs = 100
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        # X, Y = shuffle(features, outputs)
        X, Y = features, outputs
        
        
        for ind, x in enumerate(X):
            cost = compute_cost(weights, features, outputs)
            
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            # print(x)
            # print(f"ascent : {ascent}")
            weights = weights - (learning_rate * ascent)
        # print(f"weights : {weights}")
        print(f"cost : {cost}")
        
        # print(f"x : {x}")
        
        

        # convergence check on 2^nth epoch
        # if epoch == 2 ** nth or epoch == max_epochs - 1:
        #     cost = compute_cost(weights, features, outputs)
        #     # print("Epoch is: {} and Cost is: {}".format(epoch, cost))
        #     # stoppage criterion
        #     if abs(prev_cost - cost) < cost_threshold * prev_cost:
        #         return weights
        #     prev_cost = cost
        #     nth += 1
    return weights

In [110]:
print("reading dataset...")
# set hyper-parameters and call init
regularization_strength = 10000
learning_rate = 0.000001

# read data in pandas (pd) data frame
data = pd.read_csv(r'Data/Iris.csv')

## Iris
class_map = {'Iris-setosa':0, 'Iris-versicolor':1,'Iris-virginica':1}
data['class'] = data['class'].map(class_map)

# drop last column (extra column added by pd)
# and unnecessary first column (id)
# data.drop(data.columns[[-1, 0]], axis=1, inplace=True)

print("applying feature engineering...")
# convert categorical labels to numbers
# diag_map = {'M': 1.0, 'B': -1.0}
# data['diagnosis'] = data['diagnosis'].map(diag_map)

# put features & outputs in different data frames
# Y = data.loc[:, 'diagnosis']
Y = data.loc[:, 'class'].values
X = data.iloc[:, 1:-1].values

reading dataset...
applying feature engineering...


In [111]:

# filter features
# remove_correlated_features(X)
# remove_less_significant_features(X, Y)

# normalize data for better convergence and to prevent overflow
X_normalized = MinMaxScaler().fit_transform(X)
X = pd.DataFrame(X_normalized)

# insert 1 in every row for intercept b
X.insert(loc=len(X.columns), column='intercept', value=1)

# split data into train and test set
print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)

splitting dataset into train and test sets...


In [112]:

# train the model
print("training started...")
W = sgd(X_train.to_numpy(), y_train)
print("training finished.")
print("weights are: {}".format(W))

# # testing the model
# print("testing the model...")
# y_train_predicted = np.array([])
# for i in range(X_train.shape[0]):
#     yp = np.sign(np.dot(X_train.to_numpy()[i], W))
#     y_train_predicted = np.append(y_train_predicted, yp)

# y_test_predicted = np.array([])
# for i in range(X_test.shape[0]):
#     yp = np.sign(np.dot(X_test.to_numpy()[i], W))
#     y_test_predicted = np.append(y_test_predicted, yp)

# print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
# print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
# print("precision on test dataset: {}".format(recall_score(y_test, y_test_predicted)))


training started...
cost : 3398.5673052932966
cost : 3350.9696621085204
cost : 3334.684691813188
cost : 3333.740963397126
cost : 3333.7408655776003
cost : 3333.740767781549
cost : 3333.7406700089655
cost : 3333.740572259845
cost : 3333.7404745341814
cost : 3333.740376831969
cost : 3333.740279153202
cost : 3333.7401814978757
cost : 3333.7400838659837
cost : 3333.7399862575207
cost : 3333.739888672481
cost : 3333.7397911108587
cost : 3333.7396935726483
cost : 3333.739596057844
cost : 3333.739498566441
cost : 3333.739401098433
cost : 3333.7393036538147
cost : 3333.740919552825
cost : 3333.749174426288
cost : 3333.749074636351
cost : 3333.7489748703606
cost : 3333.7488751283117
cost : 3333.7487754101976
cost : 3333.748675716013
cost : 3333.748576045752
cost : 3333.7484763994094
cost : 3333.748376776979
cost : 3333.7482771784553
cost : 3333.7481776038317
cost : 3333.7480780531037
cost : 3333.747978526265
cost : 3333.74787902331
cost : 3333.7477795442323
cost : 3333.747680089027
cost : 3333.

In [118]:
X_train.to_numpy()[0]

array([0.08333333, 0.66666667, 0.        , 0.04166667, 1.        ])

In [120]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

print("testing the model...")
ytrain_predicted = np.array([])
for i in range(X_train.shape[0]):
    yp = np.sign(np.dot(X_train.to_numpy()[i], W))
    ytrain_predicted = np.append(ytrain_predicted, yp)

ytest_predicted = np.array([])
for i in range(X_test.shape[0]):
    yp = np.sign(np.dot(X_test.to_numpy()[i], W))
    ytest_predicted = np.append(ytest_predicted, yp)

print("accuracy on test dataset: {}".format(accuracy_score(y_test, ytest_predicted)))
print("recall on test dataset: {}".format(recall_score(y_test, ytest_predicted)))
print("precision on test dataset: {}".format(precision_score(y_test, ytest_predicted)))

testing the model...
accuracy on test dataset: 0.6666666666666666
recall on test dataset: 1.0
precision on test dataset: 0.6666666666666666
