In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Read the training features and the training labels from their CSV files,
# then report both table shapes as a quick sanity check.
X, y = (
    pd.read_csv(csv_path)
    for csv_path in ("Logistic_X_Train.csv", "Logistic_Y_Train.csv")
)
print(X.shape, y.shape)

(3000, 3) (3000, 1)


In [7]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,f1,f2,f3
0,-1.239375,0.749101,-0.528515
1,-1.03607,0.801436,-1.283712
2,-0.615579,1.579521,-1.391927
3,1.335978,1.348651,1.433564
4,0.658925,1.300019,0.571603


## Data preparation

In [8]:
# Build the combined features+label table on a *copy* of X.
# A plain `df = X` would only alias the frame, so adding the label column
# would silently put the target into the feature table X as well.
df = X.copy()
# y is a one-column frame; flatten its values so they fit a single column.
df["label"] = y.values.ravel()

In [9]:
# Preview the combined table (features plus the new "label" column).
df.head()

Unnamed: 0,f1,f2,f3,label
0,-1.239375,0.749101,-0.528515,1
1,-1.03607,0.801436,-1.283712,1
2,-0.615579,1.579521,-1.391927,1
3,1.335978,1.348651,1.433564,0
4,0.658925,1.300019,0.571603,0


In [10]:
# Print the per-column mean followed by the per-column standard deviation
# to see how close the features are to zero mean / unit variance.
# The recorded output shows they are only approximately standardised
# (e.g. f2 is centred near 1 rather than 0).
for column_stat in (df.mean, df.std):
    print(column_stat(axis=0))

f1       0.007453
f2       0.999286
f3       0.029487
label    0.497000
dtype: float64
f1       1.334138
f2       1.023853
f3       1.209558
label    0.500074
dtype: float64


In [11]:
# Count how many rows carry each label value to check the class balance.
df["label"].value_counts()

0    1509
1    1491
Name: label, dtype: int64

In [12]:
# Ordered (unshuffled) hold-out split: the first 80% of the rows are used for
# training and the remaining rows for testing.
train_size = 0.8
size = int(train_size*df.shape[0])
df = np.asarray(df)
# The last column is the label; every column before it is a feature.
train_rows, test_rows = df[:size], df[size:]
X_train, y_train = train_rows[:, :-1], train_rows[:, -1]
X_test, y_test = test_rows[:, :-1], test_rows[:, -1]

In [17]:
# Make sure both label vectors are flat 1-D arrays.
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

In [18]:
# Show the (features, labels) shapes of the training split, then the test split.
for features_part, labels_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape, labels_part.shape)

(2400, 3) (2400,)
(600, 3) (600,)


In [19]:
# Confirm the container type and shape of the training labels.
print(type(y_train),y_train.shape)

<class 'numpy.ndarray'> (2400,)


## Modeling using scikit-learn

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [52]:
# Logistic-regression classifier created with its default settings.
model = LogisticRegression()

In [53]:
# Fit the classifier on the training split.
model.fit(X_train,y_train)

LogisticRegression()

In [65]:
# Show the fitted feature weights followed by the fitted intercept.
for fitted_param in (model.coef_, model.intercept_):
    print(fitted_param)
# Predicted labels for the held-out test split (used for the confusion matrix).
y_pred = model.predict(X_test)

[[-3.95677446  3.14792524 -2.34455887]]
[-4.34570812]


In [66]:
# Score of the fitted model on the held-out test split.
model.score(X_test,y_test)

0.9933333333333333

In [67]:
# Score of the fitted model on the training split, for comparison with the
# test score above it.
model.score(X_train,y_train)

0.9958333333333333

In [70]:
# Confusion matrix of the true test labels against the scikit-learn predictions.
conf = confusion_matrix(y_test,y_pred)
print(conf)

[[267   3]
 [  1 329]]


Generating the solution CSV for the final prediction (scikit-learn model)

In [80]:
# Load the unlabeled feature file for which final predictions are generated.
df2 = pd.read_csv("Logistic_X_Test.csv")

In [81]:
# Convert the loaded table to a NumPy array (note: `df2` is rebound from a
# DataFrame to an ndarray here) and confirm its type and shape.
df2 = np.asarray(df2)
print(type(df2), df2.shape)

<class 'numpy.ndarray'> (1000, 3)


In [74]:
# Predict labels for the unlabeled rows with the fitted scikit-learn model.
output = model.predict(df2)

In [77]:
# Wrap the predictions in a one-column table named "label"
# (note: `output` is rebound from the prediction array to a DataFrame).
output = pd.DataFrame(output,columns=["label"])

In [79]:
# Write the predictions to disk without the row index column.
output.to_csv("output.csv",index=False)

## Modelling using Python from scratch

In [49]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s); converted to a float NumPy array internally.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid with the same shape as ``x`` (a scalar when
        ``x`` is a scalar).
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) always lies in (0, 1], so it cannot overflow the way
    # exp(-x) does for large negative x.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x)  (algebraically equal).
    out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing a 0-d array with () unwraps it, so scalar input yields a scalar.
    return out[()] if out.ndim == 0 else out
def hypothesis(X,theta):
    """Return the model output ``sigmoid(X . theta)`` for every row of ``X``."""
    # Linear score of each row, squashed through the sigmoid.
    linear_score = np.dot(X, theta)
    return sigmoid(linear_score)

In [50]:
def gradient(X,y,theta,learning_rate=0.3):
    """Apply one batch update step to ``theta`` and return the new value.

    Despite its name, this function returns the *updated parameters*, not the
    raw gradient: ``theta + learning_rate * X.T @ (y - h) / m`` where
    ``h = hypothesis(X, theta)`` and ``m`` is the number of rows of ``X``.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix with one row per sample.
    y : array-like
        Labels, one per row of ``X``; any shape holding exactly ``m`` values.
    theta : numpy.ndarray
        Current parameters.
    learning_rate : float, optional
        Step size of the update (default 0.3, the previously hard-coded value).

    Returns
    -------
    numpy.ndarray
        Updated parameters, same shape as ``theta``.
    """
    m = X.shape[0]
    hypo = hypothesis(X,theta)
    # Align the labels with the predictions. Without this, a flat (m,) label
    # vector minus (m, 1) predictions would broadcast to an (m, m) matrix
    # and produce a wrong update.
    y = np.asarray(y).reshape(hypo.shape)
    grad = np.dot(X.T, y - hypo) / m
    return theta + learning_rate * grad


In [51]:


def gradient_descent(X,y,iter_number,return_costs=False):
    """Fit logistic-regression parameters with batch gradient updates.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix of shape (m, n).
    y : array-like
        The m labels (0 or 1); reshaped internally to a column vector.
    iter_number : int
        Number of update steps to perform.
    return_costs : bool, optional
        When True, also return the cross-entropy cost measured before each
        update. Defaults to False, which returns ``theta`` only.

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, list of float)
        ``theta`` of shape (n, 1); plus the per-iteration costs when
        ``return_costs`` is True.
    """
    n = X.shape[1]
    # initializing theta to zeroes
    theta = np.zeros((n,1))
    # Column-vector labels line up element-wise with the (m, 1) predictions.
    y = np.asarray(y, dtype=float).reshape(-1,1)
    # Keep probabilities strictly inside (0, 1) so the logs stay finite.
    eps = np.finfo(float).eps
    costs = []

    for _ in range(iter_number):
        hypo = np.clip(hypothesis(X,theta), eps, 1.0 - eps)
        # Mean cross-entropy. Both terms are (m, 1); transposing (1 - y)
        # would broadcast the product to an (m, m) matrix instead.
        J = -np.mean(y * np.log(hypo) + (1.0 - y) * np.log(1.0 - hypo))
        costs.append(float(J))

        # updating the parameters
        theta = gradient(X,y,theta)

    if return_costs:
        return theta, costs
    return theta
    

In [52]:

# Prepend a column of ones so the first parameter acts as the intercept.
# The row count is taken from X_train instead of being hard-coded.
bias = np.ones((X_train.shape[0],1))
X_train_pyth = np.hstack((bias,X_train))
X_train_pyth.shape

(2400, 4)

In [71]:
# Turn both label vectors into column vectors for the from-scratch model.
# reshape(-1, 1) infers the row count and returns a new view, whereas the
# in-place ndarray.resize needs hard-coded sizes and can raise when the array
# is still referenced elsewhere (e.g. by the notebook's output history).
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)
y_test.shape

(600, 1)

In [54]:
# Train the from-scratch model: 300 update steps on the bias-augmented
# training matrix.
theta = gradient_descent(X_train_pyth,y_train,300)

In [57]:
# Show the learned parameters; the first entry multiplies the bias column.
print(theta,theta.shape)

[[-1.75505745]
 [-2.27010035]
 [ 1.50485148]
 [-2.32108182]] (4, 1)


In [75]:
# Augment the test features with a bias column (row count taken from X_test
# instead of being hard-coded) and compute the predicted probabilities.
bias2 = np.ones((X_test.shape[0],1))
X_test_pyth = np.hstack((bias2,X_test))
y_output = sigmoid(np.dot(X_test_pyth,theta))

In [76]:
# Turn probabilities into hard 0/1 labels with a 0.5 decision threshold.
# Vectorised replacement for the element-by-element Python loop.
y_output = np.where(y_output >= 0.5, 1.0, 0.0)

In [78]:
# Confusion matrix of the true test labels against the from-scratch predictions.
conf2 = confusion_matrix(y_test,y_output)

In [79]:
# Display the from-scratch model's confusion matrix.
conf2

array([[265,   5],
       [  1, 329]], dtype=int64)

In [82]:
# CSV generation for the final submission using the from-scratch model.
# The bias-augmented matrix gets its own name instead of overwriting df2:
# rebinding df2 would add another bias column on every re-run of this cell and
# would break any later model.predict(df2) call, which expects raw features.
bias3 = np.ones((df2.shape[0],1))
X_submit_pyth = np.hstack((bias3,df2))
y_output_csv = sigmoid(np.dot(X_submit_pyth,theta))

In [86]:
# Turn probabilities into hard 0/1 labels with a 0.5 decision threshold.
# Vectorised, so it works for any number of rows rather than exactly 1000.
y_output_csv = np.where(y_output_csv >= 0.5, 1.0, 0.0)

In [88]:
# Store the from-scratch predictions in a one-column table and write it to
# disk without the row index.
# NOTE(review): y_output_csv is a float array, so labels are presumably
# written as 1.0 / 0.0 - confirm the submission format accepts that.
output2 = pd.DataFrame(data=y_output_csv, columns=["label"])
output2.to_csv(path_or_buf="output_scratch.csv", index=False)