## Data Science Assignment 6
##### Name: Venkata Sai Manoj Boganadham
##### Roll no: 197121
##### Section: A

### 1. Multi-class Logistic Regression from scratch

Here we are going to perform Logistic Regression on the Iris dataset from Kaggle.

Dataset link: https://www.kaggle.com/uciml/iris

First, let us import the required modules and load the dataset.

In [2]:
# Importing required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the Iris data from a local CSV file, print its (rows, columns) shape
# and display the first rows
raw_data = pd.read_csv('Iris.csv')
print(raw_data.shape)
raw_data.head()

(150, 6)


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


#### First, let us replace the 'Species' labels with numbers

In [3]:
# Show which class names occur in the Species column
print("The unique labels in the species column are:")
print(raw_data['Species'].unique())

# Numeric code -> class name; kept so codes can be translated back to names
label_dict = {1:'Iris-setosa', 2:'Iris-versicolor', 3:'Iris-virginica'}

# Swap each class name for its numeric code, one class at a time
for code, species_name in label_dict.items():
    raw_data['Species'] = np.where(raw_data['Species'] == species_name, code, raw_data['Species'])

raw_data.head()


The unique labels in the species column are:
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,1
1,2,4.9,3.0,1.4,0.2,1
2,3,4.7,3.2,1.3,0.2,1
3,4,4.6,3.1,1.5,0.2,1
4,5,5.0,3.6,1.4,0.2,1


### Normalising the data

In [4]:
# Standardise each measurement column: subtract its mean and divide by its
# standard deviation. The Species column is left untouched.
for feature in ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'):
    column = raw_data[feature]
    raw_data[feature] = (column - column.mean())/column.std()

raw_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,-0.897674,1.028611,-1.336794,-1.308593,1
1,2,-1.1392,-0.12454,-1.336794,-1.308593,1
2,3,-1.380727,0.33672,-1.39347,-1.308593,1
3,4,-1.50149,0.10609,-1.280118,-1.308593,1
4,5,-1.018437,1.259242,-1.336794,-1.308593,1


#### Splitting the data into training and testing datasets

In [5]:
# Draw 80% of the rows at random (fixed seed, so the draw is repeatable) for training;
# the rows that were not drawn form the test set
train_data = raw_data.sample(frac=0.8, random_state=25)
test_data = raw_data.drop(train_data.index)

# Report the size of each partition
print("The training dataset has {} rows and {} columns".format(*train_data.shape))
print("The testing dataset has {} rows and {} columns".format(*test_data.shape))

The training dataset has 120 rows and 6 columns
The testing dataset has 30 rows and 6 columns


#### Dividing the dependent and independent columns

In [6]:
# Every column except the last one is treated as a feature; the last column is the target.
# NOTE(review): the feature slice still contains the 'Id' column next to the four
# measurements - confirm that using it as a feature is intended.
X_train = train_data.iloc[:, :-1]
Y_train = train_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]
Y_test = test_data.iloc[:, -1]

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(120, 5) (30, 5) (120,) (30,)


#### Generating the numpy arrays out of data

In [7]:
# Convert the pandas objects to plain numpy arrays
X_train, X_test = X_train.values, X_test.values
Y_train, Y_test = Y_train.values, Y_test.values

# Turn the 1-D label vectors into column vectors
Y_train = Y_train.reshape(-1, 1)
Y_test = Y_test.reshape(-1, 1)

print("Shapes of the matrices after splitting from the raw data")
for matrix in (X_train, Y_train, X_test, Y_test):
    print(matrix.shape)

# Prepend a column of ones to each feature matrix, so the first weight row
# multiplies a constant 1
X_train = np.hstack((np.ones(Y_train.shape, dtype=np.float64), X_train))
X_test = np.hstack((np.ones(Y_test.shape, dtype=np.float64), X_test))

print("Shapes of the matrices after concatenating ones")
for matrix in (X_train, Y_train, X_test, Y_test):
    print(matrix.shape)

Shapes of the matrices after splitting from the raw data
(120, 5)
(120, 1)
(30, 5)
(30, 1)
Shapes of the matrices after concatenating ones
(120, 6)
(120, 1)
(30, 6)
(30, 1)


Since the data is split and scaled, we can now proceed to build our model.

### Building the logistic regression model

#### Softmax Hypothesis

In [15]:
# Hypothesis of logistic regression
def softmax(X_train, theta_list):
    """Return the softmax class probabilities for every row of ``X_train``.

    Args:
        X_train: 2-D array of shape (samples, features).
        theta_list: 2-D weight array of shape (features, classes).

    Returns:
        Array of shape (samples, classes). Every entry is non-negative and
        every row sums to 1.
    """
    scores = X_train @ theta_list
    # Subtracting the per-row maximum does not change the result but keeps
    # exp() from overflowing on large scores
    scores = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(scores)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

#### Cost Function

In [25]:
def J(X_train, Y_train, hyp):
    """Return the log-probability that ``hyp`` assigns to each sample's true class.

    Args:
        X_train: feature matrix; only its row count is used, to size the result.
        Y_train: 2-D column of class labels numbered from 1 (label of sample i
            is ``Y_train[i][0]``).
        hyp: predicted probabilities, one row per sample and one column per
            class; column ``j`` belongs to label ``j + 1``.

    Returns:
        Array of shape ``(len(X_train), 1)``. Row ``i`` holds
        ``log(hyp[i][label_i - 1])``, or 0 when the label matches no column.
        Negate and average the rows to obtain the cross-entropy cost.
    """
    hyp = np.asarray(hyp, dtype=np.float64)
    n_labelled = len(Y_train)
    n_classes = hyp.shape[1]

    labels = np.asarray(Y_train)[:, :1]
    is_true_class = labels == np.arange(1, n_classes + 1)

    # Replace the probabilities of the other classes by 1 (log 1 = 0) before
    # taking the log, so a zero probability on a wrong class cannot turn the
    # row into NaN
    selected = np.where(is_true_class, hyp[:n_labelled], 1.0)

    log_likelihood = np.zeros(shape=(len(X_train), 1))
    log_likelihood[:n_labelled, 0] = np.log(selected).sum(axis=1)
    return log_likelihood

In [None]:
# Function to calculate the gradient
def gradient(X_train, Y_train, hyp):
    """Return the gradient of the mean cross-entropy loss w.r.t. the weights.

    Args:
        X_train: design matrix of shape (samples, features), i.e. the same
            orientation that ``softmax`` multiplies by the weights.
        Y_train: class labels numbered from 1, holding one label per sample
            (a (samples, 1) column or a 1-D vector).
        hyp: predicted class probabilities of shape (samples, classes);
            column ``j`` belongs to label ``j + 1``.

    Returns:
        Array of shape (features, classes) equal to
        ``X_train.T @ (hyp - one_hot(Y_train)) / samples``.

    Raises:
        ValueError: if ``hyp`` has no rows.
    """
    X_train = np.asarray(X_train, dtype=np.float64)
    hyp = np.asarray(hyp, dtype=np.float64)
    n_samples, n_classes = hyp.shape
    if n_samples == 0:
        raise ValueError("gradient() needs at least one sample")

    # One-hot encode the 1-based labels: row i has a 1 in column label_i - 1
    labels = np.asarray(Y_train).reshape(n_samples, 1)
    one_hot = (labels == np.arange(1, n_classes + 1)).astype(np.float64)

    return (X_train.T @ (hyp - one_hot)) / n_samples

#### Function to perform Binary Logistic regression

In [9]:
def binLogiRegression(X, Y, theta, alpha, epochs):
    """Run ``epochs`` gradient-descent updates on ``theta`` and return the result.

    Each iteration evaluates the hypothesis ``h(X, theta)``, moves ``theta``
    against the averaged gradient ``X.T @ (h - Y) / len(Y)`` scaled by
    ``alpha``, and records ``J(X, Y, theta)[0][0]``. After the loop the
    recorded values are plotted against the iteration number.

    NOTE(review): relies on ``h``, ``J`` and ``plt`` from the enclosing module.
    ``J`` receives ``theta`` as its third argument - confirm this matches the
    ``J`` that is in scope when this runs.
    """
    n_samples = len(Y)
    history = []
    for _ in range(epochs):
        # Current predictions and the resulting parameter update
        predicted = h(X, theta)
        step = (1/n_samples) * (X.T @ (predicted - Y))
        theta = theta - alpha * step
        # Cost is evaluated with the already-updated theta
        history.append(J(X, Y, theta)[0][0])

    # Cost vs. epoch curve
    plt.plot(list(range(epochs)), history)
    return theta

##### For multi-class logistic regression, we take a combination of several one-vs-rest binary regression models

In [29]:
# Work on the training matrices under shorter names; X_t is the transpose of X
X, Y = X_train, Y_train
X_t = X.T

# Hyperparameters: step size and number of iterations
alpha = 0.0005
epochs = 10000

# Weights for softmax regression, all initialised to 1:
# one row per column of X and one column per class in label_dict
theta_list = np.ones((X.shape[1], len(label_dict)))
# print(theta_list.shape)

# Softmax
# softmax(X_train, theta_list)

# Cost
# print(J(X_train, Y_train, theta_list))

# Start softmax regression
# cost_list = [];
# m = len(Y)
# for i in range(epochs):
#     # Calculating the hypothesis
#     g = softmax(X, theta_list)
#     # Calculating the gradient
#     grad = (1/m) * (X.T @ (g - Y))
#     # Updating theta
#     theta = theta - alpha * grad
#     # Calculating the cost
#     cost = J(X, Y, theta)
#     cost_list.append(cost[0][0])
#     print("Iteration: {} => Cost: {}".format(i, cost))

# Plotting cost vs epoch
# plt.plot(list(range(0, epochs)), cost_list)

# for i in label_dict:
#     print("Training for {}".format(label_dict[i]))
    
#     # Modify the Y_train to have the correct label
#     Y_new = np.where(Y == i, 1, 0)
#     Y_new = Y_new.reshape(len(Y_new), 1)
    
#     # Creating theta vector
#     theta = np.zeros((X.shape[1], 1), dtype = np.float64)

#     # Training the model
#     theta = binLogiRegression(X, Y_new, theta, alpha, epochs)

#     # Storing theta values
#     theta_list.append(theta)

#     print("-----------------------------------------------------------")


#### Predicting the labels of the test data

In [11]:
def get_predictions(X_test, theta_list):
    """Evaluate the hypothesis ``h`` on ``X_test`` once per entry of ``theta_list``.

    Returns a list holding ``h(X_test, theta)`` for every ``theta`` obtained by
    iterating over ``theta_list``, in iteration order.
    """
    return [h(X_test, theta) for theta in theta_list]

In [12]:
# Score the test data with each of the three models
predictions_list = get_predictions(X_test, theta_list)

first, second, third = predictions_list[0], predictions_list[1], predictions_list[2]

# Overwrite the first model's score in every row with the number (1-3) of the
# model that scored highest; a row where neither of the first two models is
# strictly the largest gets label 3
for i in range(len(first)):
    a, b, c = first[i][0], second[i][0], third[i][0]
    if a > b and a > c:
        label = 1
    elif b > a and b > c:
        label = 2
    else:
        label = 3
    first[i][0] = label

# The first model's array now holds the predicted labels
predictions = first

print(predictions.shape)


(30, 1)


#### Evaluating the model accuracy

In [13]:
# Accuracy: number of positions where prediction and true label agree,
# divided by the number of test rows
n_correct = (predictions == Y_test).sum()
accuracy = n_correct/len(Y_test)
print("The accuracy of the model is {}".format(accuracy))

The accuracy of the model is 0.7666666666666667


In [28]:
# Scratch check of np.where called with only a condition: it returns one index
# array per axis (row indices, then column indices) of the elements where the
# condition holds
a = np.array([[1,2],[3,4]]) 
print(np.where(a>=3))

(array([1, 1]), array([0, 1]))
