In [1]:
#importing various modules
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm

In [2]:
#reading data
data = pd.read_json('data.json')
data.head(20)

Unnamed: 0,0,1,2
0,-3.005382,-1.701191,1
1,-2.908105,-1.699589,1
2,-2.958954,-1.66094,1
3,-2.92284,-1.638722,1
4,-2.92424,-1.508256,1
5,-2.981481,-1.438883,1
6,-3.087897,-1.444972,1
7,-2.93186,-1.490334,1
8,-2.988094,-1.436734,1
9,-2.94321,-1.37708,1


In [3]:
#assigning column names
data.columns=['x','y','label']
data.head()

Unnamed: 0,x,y,label
0,-3.005382,-1.701191,1
1,-2.908105,-1.699589,1
2,-2.958954,-1.66094,1
3,-2.92284,-1.638722,1
4,-2.92424,-1.508256,1


In [4]:
#checking for missing entries
print(data.shape)
print(data.isnull().sum())

(6000, 3)
x        0
y        0
label    0
dtype: int64


In [5]:
#dropping duplicate rows 
data.drop_duplicates(keep = 'last')

Unnamed: 0,x,y,label
0,-3.005382,-1.701191,1
1,-2.908105,-1.699589,1
2,-2.958954,-1.660940,1
3,-2.922840,-1.638722,1
4,-2.924240,-1.508256,1
...,...,...,...
5995,2.977109,1.471810,1
5996,2.987465,1.498655,1
5997,2.863268,1.543203,1
5998,2.929324,1.621393,1


In [6]:
data['label'].value_counts()

1    4679
0    1321
Name: label, dtype: int64

In [7]:
#splitting training and test data 
data_test = data.sample(n=2000, random_state=400)
data_train = data.drop(data_test.index)
print(data_test.shape)
print(data_train.shape)
data_train.reset_index(drop = True, inplace= True)
data_test.reset_index(drop = True, inplace= True)

(2000, 3)
(4000, 3)


In [8]:
data_train.head()

Unnamed: 0,x,y,label
0,-3.005382,-1.701191,1
1,-2.908105,-1.699589,1
2,-2.958954,-1.66094,1
3,-2.92284,-1.638722,1
4,-2.92424,-1.508256,1


In [9]:
data_test.head()

Unnamed: 0,x,y,label
0,-1.429134,1.587175,0
1,-0.598626,0.459701,1
2,-2.211451,-1.435652,1
3,-0.288557,-0.224374,0
4,-1.250891,-1.514649,0


In [10]:
data_train['label'].value_counts()

1    3145
0     855
Name: label, dtype: int64

In [11]:
data_test['label'].value_counts()

1    1534
0     466
Name: label, dtype: int64

In [12]:
def polynomial(x,y,weights):
    sum = 0
    for i in range(5):
        sum += weights[i]*(x**(4-i))*(y**i)
    return sum 
def sigmoid(result):
    return 1/(1+np.exp(-1*result))
#guess weight    
weights = [0.001]*5
result = polynomial(data_train['x'], data_train['y'], weights)
print(result)
result = sigmoid(result)
print(result)

0       0.177075
1       0.160372
2       0.165009
3       0.156917
4       0.145498
          ...   
3995    0.138183
3996    0.142758
3997    0.154758
3998    0.139156
3999    0.162048
Length: 4000, dtype: float64
0       0.544153
1       0.540007
2       0.541159
3       0.539149
4       0.536311
          ...   
3995    0.534491
3996    0.535629
3997    0.538613
3998    0.534733
3999    0.540424
Length: 4000, dtype: float64


log loss = $\sum$ -ylog(y') - (1-y)log(1-y')

In [13]:
loss = -data_train['label']*np.log(result) - (1-data_train['label'])*np.log(1-result)
loss = loss.sum()
loss

2734.0531738058

Loss is high and we need to employ gradient descent algorithm to get appropriate weights

In [14]:
alpha = 0.0005
def sgn(a):
  b = abs(a)
  return a/b

x,y,label = data['x'],data['y'],data['label']
def grad1(data,weights):
  value = weights[0]*x**4+weights[1]*(x**3)*y+weights[2]*(x**2)*(y**2)+weights[3]*(x**1)*(y**3)+weights[4]*(y**4)
  gradient = -(label/value)*(x**4)+((1-label)/(1-value))*(x**4)
  return gradient.sum()
def grad2(data,weights):
  value = weights[0]*x**4+weights[1]*(x**3)*y+weights[2]*(x**2)*(y**2)+weights[3]*(x**1)*(y**3)+weights[4]*(y**4)
  grad = -(label/value)*(x**3*y)+((1-label)/(1-value))*(x**3*y)
  return grad.sum()
def grad3(data,weights):
  value = weights[0]*x**4+weights[1]*(x**3)*y+weights[2]*(x**2)*(y**2)+weights[3]*(x**1)*(y**3)+weights[4]*(y**4)
  grad = -(label/value)*(x**2*y**2)+((1-label)/(1-value))*(x**2*y**2)
  return grad.sum()
def grad4(data,weights):
  value = weights[0]*x**4+weights[1]*(x**3)*y+weights[2]*(x**2)*(y**2)+weights[3]*(x**1)*(y**3)+weights[4]*(y**4)
  grad = -(label/value)*(x*(y**3))+((1-label)/(1-value))*(x*(y**3))
  return grad.sum()
def grad5(data, weights):
  value = weights[0]*x**4+weights[1]*(x**3)*y+weights[2]*(x**2)*(y**2)+weights[3]*(x**1)*(y**3)+weights[4]*(y**4)
  grad = -(label/value)*(y**4)+((1-label)/(1-value))*(y**4)
  return grad.sum()
  
for i in range(750):
  weights[0] = weights[0] - alpha*sgn(grad1(data,weights))
  weights[1] = weights[1] - alpha*sgn(grad2(data,weights))
  weights[2] = weights[2] - alpha*sgn(grad3(data,weights))
  weights[3] = weights[3] - alpha*sgn(grad4(data,weights))
  weights[4] = weights[4] - alpha*sgn(grad5(data,weights))
result = polynomial(data_train['x'], data_train['y'],weights)
result = sigmoid(result)
loss = -data_train['label']*np.log(result) - (1-data_train['label'])*np.log(1-result)
loss = loss.sum()
print(loss)

1236.6133359076587


In [15]:
#defining threshold for classification 
#0.525 is the best possible threshold ,checked through trial and error
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:]])
print(result.shape)
scores = score_model(result, 0.525)
unique, counts = np.unique(scores, return_counts=True)
print(np.asarray((unique, counts)))
data_train['label'].value_counts()

(4000,)
[[   0    1]
 [ 925 3075]]


1    3145
0     855
Name: label, dtype: int64

In [16]:
result = polynomial(data_test['x'], data_test['y'],weights)
result = sigmoid(result)
loss = -data_train['label']*np.log(result) - (1-data_train['label'])*np.log(1-result)
loss = loss.sum()
print(loss)
scores = score_model(result, 0.525)
unique, counts = np.unique(scores, return_counts=True)
print(np.asarray((unique, counts)))
data_test['label'].value_counts()

3174.0811116439495
[[   0    1]
 [ 466 1534]]


1    1534
0     466
Name: label, dtype: int64

# Checking Various Metrics

In [17]:
def print_metrics(labels, scores):
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy  %0.2f' % sklm.accuracy_score(labels, scores))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])


    
print_metrics(data_test['label'], scores)

                 Confusion matrix
                 Score positive    Score negative
Actual positive       321               145
Actual negative       145              1389

Accuracy  0.85
 
           Positive      Negative
Num case      466          1534
Precision    0.69          0.91
Recall       0.69          0.91
F1           0.69          0.91


In [18]:
print(weights)

[0.25400000000000017, 0.017000000000000008, 0.2460000000000002, -0.01800000000000001, -0.038000000000000034]


These are the required weights which will be used to classify test data