<a href="https://colab.research.google.com/github/aryankr30/ML-projects/blob/main/TASK_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

***LOGISTIC REGRESSION FOR BINARY CLASSIFICATION FROM SCRATCH***

In [507]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

In [508]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, n_iters=1000000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # initializing parameters
        self.weights = np.zeros(n_features)
        self.bias = 0

        # gradient descent
        for i in range(self.n_iters):
            # approximate y with linear combination of weights and x, plus bias
            linear_model = np.dot(X, self.weights) + self.bias
            # apply sigmoid function
            y_predicted = self._sigmoid(linear_model)
            
            #cost function
            cost = -(1/n_samples)*np.sum( y*np.log(y_predicted) + (1-y)*np.log(1-y_predicted))

            # compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)
            # update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

            #keeping track of cost function value
            if(i%(self.n_iters/10) == 0):
               print("cost after ", i, "iteration is : ", cost)

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        y_predicted_cls = [1 if i >= 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)

    def _sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

In [509]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [510]:
def normalize(X): 
    # m-> number of training examples
    # n-> number of features 
    m, n = X.shape
    
    # Normalizing all the n features of X
    for i in range(n):
        X = (X - X.mean(axis=0))/X.std(axis=0)
        
    return X

In [511]:
df=pd.read_csv("crack_detection.csv")
y1=df.iloc[:,24]
x1=df.iloc[:,1:24]
x1=np.array(x1)
y1=np.array(y1)

#splitting of datasets into 60%,20%,20% as train set ,test set, validation set
x_train, x_Combine, y_train, y_Combine = train_test_split(x1,y1,train_size=0.6,random_state=11,shuffle=True)

x_val, x_test, y_val, y_test = train_test_split(x_Combine,y_Combine,test_size=0.5,random_state=1)


#Normalization            
X_train=normalize(x_train)
X_test=normalize(x_test)
X_val=normalize(x_val)

regressor = LogisticRegression(learning_rate=0.01, n_iters=1000000)
regressor.fit(X_train, y_train)
y_test_pred = regressor.predict(X_test)
y_val_pred = regressor.predict(X_val)
y_train_pred=regressor.predict(X_train)

#accuracy using user defined function
print("accuracy of training set", accuracy(y_train, y_train_pred)*100)
print("accuracy of validating set", accuracy(y_val, y_val_pred)*100)
print("accuracy of testing set", accuracy(y_test, y_test_pred)*100)

from sklearn.metrics import f1_score
#f1 score using sklearn
print("f1 score of training set", f1_score(y_train, y_train_pred)*100)
print("f1 score of validating set", f1_score(y_val, y_val_pred)*100)
print("f1 score of testing set", f1_score(y_test, y_test_pred)*100)

cost after  0 iteration is :  0.6931471805599453
cost after  100000 iteration is :  0.28456020033833296
cost after  200000 iteration is :  0.25611149005809125
cost after  300000 iteration is :  0.24028860566616914
cost after  400000 iteration is :  0.23017783608983808
cost after  500000 iteration is :  0.22325777550026796
cost after  600000 iteration is :  0.21829091615864352
cost after  700000 iteration is :  0.21459495407404502
cost after  800000 iteration is :  0.21176469023285033
cost after  900000 iteration is :  0.20954599676049468
accuracy of training set 91.42857142857143
accuracy of validating set 84.28571428571429
accuracy of testing set 79.28571428571428
f1 score of training set 91.96428571428572
f1 score of validating set 84.93150684931507
f1 score of testing set 81.29032258064515
