# Logistic Regression using gradient descent

In [1]:
import numpy as np

class LogisticRegression:
    def __init__(self, lr=0.01,n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None

    def fit(self, X, y):
        # init parameters
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)

        #gradient descent
        for _ in range(self.n_iters):
            linear_model = np.dot(X, self.weights)
            y_pred = self.sigmoid(linear_model)

            dw = (1/n_samples) * np.dot(X.T, (y_pred - y))
            self.weights -= self.lr * dw

    def predict(self, X):
        linear_model = np.dot(X, self.weights)
        y_pred = self.sigmoid(linear_model)
        y_pred_cls = [1 if i > 0.5 else 0 for i in y_pred]
        return y_pred_cls

    def sigmoid(self, x):
        return 1/(1 + np.exp(-x))


In [2]:
class NaiveBayes:

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)

        #init mean, var, priors
        self.mean = np.zeros((n_classes, n_features))
        self.var = np.zeros((n_classes, n_features))
        self.priors = np.zeros(n_classes)

        for c in range(n_classes):
            X_c = X[c==y]
            self.mean[c,:] = X_c.mean(axis=0)
            self.var[c,:] = X_c.var(axis=0)
            self.priors[c] = X_c.shape[0] / float(n_samples)


    def predict(self, X):
        posteriors = []

        for idx, c in enumerate(self.classes):
            prior = np.log(self.priors[idx])
            llhood = np.sum(np.log(self._pdf(idx, X)))
            posterior = prior + llhood
            posteriors.append(posterior)

        return self.classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(-(x - mean)**2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator/denominator

# Train test split

In [3]:
def train_test_split(X, y, test_size=.2):
    indices = np.random.permutation(X.shape[0])
    n = int(X.shape[0]*(1-.2))
    training_idx, test_idx = indices[:n], indices[n:]
    X_train, X_test = X.iloc[training_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[training_idx], y.iloc[test_idx]
    
    return X_train,X_test,y_train,y_test

Standard scalar

In [4]:
def standard_scalar(x):
    m = np.mean(x)
    s = np.sqrt(np.var(x))
    return ((x - m)/s)

# Data preprocessing

In [5]:
import pandas as pd 

data = pd.read_csv("/home/ibab/Downloads/Assignment/classificationdataset/train.csv")
data2 = pd.read_csv("/home/ibab/Downloads/Assignment/classificationdataset/test.csv")
data = pd.concat([data,data2])
data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
401,402,0.0,3,"Adams, Mr. John",male,26.0,0,0,341826,8.05,,S
34,35,0.0,1,"Meyer, Mr. Edgar Joseph",male,28.0,1,0,PC 17604,82.1708,,C
555,556,0.0,1,"Wright, Mr. George",male,62.0,0,0,113807,26.55,,S
316,317,1.0,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24.0,1,0,244367,26.0,,S
587,588,1.0,1,"Frolicher-Stehli, Mr. Maxmillian",male,60.0,1,1,13567,79.2,B41,C


In [6]:
req_data = data.copy()
print(req_data.count())

#fill na values in Embarked with the mode
req_data['Embarked'].fillna(req_data["Embarked"].mode()[0], inplace=True)

#fill na values in Age with mean
req_data["Age"].fillna(int(req_data["Age"].mean()), inplace=True)

#fill missing price row
req_data["Fare"].fillna(int(req_data["Fare"].mean()), inplace=True)
print('-' * 40)
print(req_data.count())

PassengerId    1309
Survived        891
Pclass         1309
Name           1309
Sex            1309
Age            1046
SibSp          1309
Parch          1309
Ticket         1309
Fare           1308
Cabin           295
Embarked       1307
dtype: int64
----------------------------------------
PassengerId    1309
Survived        891
Pclass         1309
Name           1309
Sex            1309
Age            1309
SibSp          1309
Parch          1309
Ticket         1309
Fare           1309
Cabin           295
Embarked       1309
dtype: int64


In [7]:
#one hot encoding

req_data = pd.get_dummies(req_data, columns=['Pclass', 'Sex', 'Embarked' ], drop_first= True)

#drop uninformative columns

req_data.drop(columns=["Cabin","PassengerId","Name","Ticket"], inplace=True)
req_data

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0.0,22.0,1,0,7.2500,0,1,1,0,1
1,1.0,38.0,1,0,71.2833,0,0,0,0,0
2,1.0,26.0,0,0,7.9250,0,1,0,0,1
3,1.0,35.0,1,0,53.1000,0,0,0,0,1
4,0.0,35.0,0,0,8.0500,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
413,,29.0,0,0,8.0500,0,1,1,0,1
414,,39.0,0,0,108.9000,0,0,0,0,0
415,,38.5,0,0,7.2500,0,1,1,0,1
416,,29.0,0,0,8.0500,0,1,1,0,1


In [8]:
#check
print(req_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    float64
 1   Age         1309 non-null   float64
 2   SibSp       1309 non-null   int64  
 3   Parch       1309 non-null   int64  
 4   Fare        1309 non-null   float64
 5   Pclass_2    1309 non-null   uint8  
 6   Pclass_3    1309 non-null   uint8  
 7   Sex_male    1309 non-null   uint8  
 8   Embarked_Q  1309 non-null   uint8  
 9   Embarked_S  1309 non-null   uint8  
dtypes: float64(3), int64(2), uint8(5)
memory usage: 67.8 KB
None


In [9]:
#scale the features

features = ["Age", "Fare"]
req_data[features] = req_data[features].apply(standard_scalar)
req_data

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,0.0,-0.598000,1,0,-0.503590,0,1,1,0,1
1,1.0,0.643936,1,0,0.734507,0,0,0,0,0
2,1.0,-0.287516,0,0,-0.490539,0,1,0,0,1
3,1.0,0.411073,1,0,0.382929,0,0,0,0,1
4,0.0,0.411073,0,0,-0.488122,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
413,,-0.054653,0,0,-0.488122,0,1,1,0,1
414,,0.721557,0,0,1.461834,0,0,0,0,0
415,,0.682746,0,0,-0.503590,0,1,1,0,1
416,,-0.054653,0,0,-0.488122,0,1,1,0,1


In [10]:
train_data = req_data.iloc[:891,:] # split the train data 
test_data = req_data.iloc[891:,:]

X = train_data.drop(columns=["Survived"])
y = train_data["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)


def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred)/len(y_true)
    return accuracy

model = LogisticRegression(lr=.1,n_iters=2000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print("Logistic Regression classification accuracy:", accuracy(y_test, predictions))

model2 = NaiveBayes()
model2.fit(X_train, y_train)
predictions2=[]
for i in range(X_test.shape[0]):
    pred = model2.predict(X_test.iloc[i,])
    predictions2.append(pred)
    


print("Naive bayes classification accuracy:", accuracy(y_test, predictions2))

Logistic Regression classification accuracy: 0.770949720670391
Naive bayes classification accuracy: 0.8268156424581006


In [11]:
#check against the test data

pred = model.predict(test_data.drop(columns="Survived"))
print(pred[:20])

[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0]


In [12]:
#Newton's method 
def newton_method(n_iters, X, y):
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)
    bias = 0
    for _ in range(n_iters):
        y_pred = np.dot(X, weights) + bias

        dw = (2/n_samples) * np.dot(X.T, (y_pred - y))
        db = (2/n_samples) * np.sum(y_pred - y)
        
        ddw = (2/n_samples) * np.sum(X ** 2)
        ddb = (2/n_samples) * n_samples

        weights -= dw/ddw
        bias -= db/ddb
        
    return weights, bias


def predict(X, weights, bias):
    y_pred = np.dot(X, weights) + bias
    return y_pred

        

In [13]:
X = train_data.drop(columns=["Survived"])
y = train_data["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)


In [14]:
weight, bias = newton_method(100,X_train.values,y_train)
weight, bias

(array([-0.07132893, -0.04656809, -0.00676707,  0.05076288, -0.02737781,
        -0.22324986, -0.47887317, -0.01553228, -0.05293599]),
 0.8954055998665815)

In [15]:
#check 
pred = predict(X_test, weight, bias)
print(pred.round()[:10], np.array(y_test)[:10])

[1. 0. 0. 0. 1. 0. 1. 1. 1. 0.] [1. 0. 0. 1. 1. 0. 1. 1. 1. 0.]


In [16]:
print("Logistic Regression classification accuracy:", accuracy(y_test, pred.round()))

Logistic Regression classification accuracy: 0.8324022346368715
