In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
data = pd.read_csv("college_student_placement_dataset.csv")

In [3]:
data.head()

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,CLG0030,107,6.61,6.28,8,No,8,8,4,No
1,CLG0061,97,5.52,5.37,8,No,7,8,0,No
2,CLG0036,109,5.36,5.83,9,No,3,1,1,No
3,CLG0055,122,5.47,5.75,6,Yes,1,6,1,No
4,CLG0004,96,7.91,7.69,7,No,8,10,2,No


### **Problem Statement:** Given some set of Input features of a student ($\vec{x}$), we have to train a Bernoulli Linear Regression Model also called Binary Logistic Regression model which will output $P(y=1|\vec{x})$, that is the Probability of a student getting Placed ($y=1 (\text{Placed}), y=0 (\text{Not Placed})$), on a condition that the input feature vector, $\vec{x}$ of the student is given. 

In [4]:
data.drop(labels=data.columns[0],axis=1,inplace=True)

In [5]:
data

Unnamed: 0,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,107,6.61,6.28,8,No,8,8,4,No
1,97,5.52,5.37,8,No,7,8,0,No
2,109,5.36,5.83,9,No,3,1,1,No
3,122,5.47,5.75,6,Yes,1,6,1,No
4,96,7.91,7.69,7,No,8,10,2,No
...,...,...,...,...,...,...,...,...,...
9995,119,8.41,8.29,4,No,1,8,0,Yes
9996,70,9.25,9.34,7,No,0,7,2,No
9997,89,6.08,6.25,3,Yes,3,9,5,No
9998,107,8.77,8.92,3,No,7,5,1,No


In [6]:
data[data.columns[4]].replace(to_replace=["No","Yes"],value=[0,1],inplace=True)
data[data.columns[-1]].replace(to_replace=["No","Yes"],value=[0,1],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[data.columns[4]].replace(to_replace=["No","Yes"],value=[0,1],inplace=True)
  data[data.columns[4]].replace(to_replace=["No","Yes"],value=[0,1],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[data.columns[-1]].replace(to_replace=["No","Yes"],value=[0,1],inp

In [7]:
data

Unnamed: 0,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,107,6.61,6.28,8,0,8,8,4,0
1,97,5.52,5.37,8,0,7,8,0,0
2,109,5.36,5.83,9,0,3,1,1,0
3,122,5.47,5.75,6,1,1,6,1,0
4,96,7.91,7.69,7,0,8,10,2,0
...,...,...,...,...,...,...,...,...,...
9995,119,8.41,8.29,4,0,1,8,0,1
9996,70,9.25,9.34,7,0,0,7,2,0
9997,89,6.08,6.25,3,1,3,9,5,0
9998,107,8.77,8.92,3,0,7,5,1,0


In [12]:
X = np.array(data.iloc[:,0:-1])
y = np.array(data[data.columns[-1]])
y = y.reshape(y.shape[0],1)

In [13]:
print("The shape of X matrix is {}".format(X.shape))
print("The shape of y matrix is {}".format(y.shape))

The shape of X matrix is (10000, 8)
The shape of y matrix is (10000, 1)


In [68]:
X_train = X[0:int(0.8 * X.shape[0])]
X_test = X[int(0.8*X.shape[0]):]

y_train = y[0:int(0.8*y.shape[0])]
y_test = y[int(0.8*y.shape[0]):]

In [69]:
print(X_train.shape, X_test.shape)
print(y_train.shape,y_test.shape)

(8000, 8) (2000, 8)
(8000, 1) (2000, 1)


In [70]:
X_tensor = torch.tensor(X_train,dtype=torch.float32)
y_tensor = torch.tensor(y_train,dtype=torch.float32)

In [77]:
X_test_tensor = torch.tensor(X_test,dtype=torch.float32)
y_test_tensor = torch.tensor(y_test,dtype=torch.float32)

In [78]:
print("The shape of X matrix is {}".format(X_tensor.shape))
print("The shape of y matrix is {}".format(y_tensor.shape))

The shape of X matrix is torch.Size([8000, 8])
The shape of y matrix is torch.Size([8000, 1])


In [79]:
print("The shape of X matrix of testing data is {}".format(X_test_tensor.shape))
print("The shape of y matrix of testing data is {}".format(y_test_tensor.shape))

The shape of X matrix of testing data is torch.Size([2000, 8])
The shape of y matrix of testing data is torch.Size([2000, 1])


In [81]:
class BernoulliLinearRegression(torch.nn.Module):

    def __init__(self):
        super().__init__()

        self.w0 = torch.nn.Parameter(data=torch.tensor([0.0]))
        self.w_vec = torch.nn.Parameter(data=torch.zeros(X_tensor.shape[1],1))

    
    def forward(self,X,y):

        z = self.w0 + torch.matmul(X,self.w_vec)
        p = 1/(1 + torch.exp(-z))
        f = -torch.mean(y*torch.log(p) + (1-y)*torch.log(1-p))

        return p,f

In [82]:
my_binary_cls_model = BernoulliLinearRegression()

In [None]:
tol = 10**(-4)
optimizer = torch.optim.SGD(params=my_binary_cls_model.parameters(),lr=0.001,)
epoch = 0
trained_parameters = list()

while(True):

    p_initial, loss_func_initial_value = my_binary_cls_model(X_tensor,y_tensor)
    optimizer.zero_grad()
    loss_func_initial_value.backward()
    optimizer.step()
    p_final, loss_func_final_value = my_binary_cls_model(X_tensor,y_tensor)

    epoch += 1

    if torch.abs(loss_func_final_value - loss_func_initial_value) < tol:
        break

    print("Epoch = {}, Binary Cross Entropy Loss = {}".format(epoch,loss_func_initial_value.item()))

print("\n\nParameters of our trained Binary Logistic Regression Model are:")

for param in my_binary_cls_model.parameters():
    trained_parameters.append(param.detach().numpy())

print("The Global Minima, w0 = {}".format(trained_parameters[0][0]))
print("The Global Minima, w_vec = {}".format(trained_parameters[1]))

Epoch = 1, Binary Cross Entropy Loss = 0.6931474208831787
Epoch = 2, Binary Cross Entropy Loss = 0.6264618635177612
Epoch = 3, Binary Cross Entropy Loss = 0.48267507553100586
Epoch = 4, Binary Cross Entropy Loss = 0.47715118527412415
Epoch = 5, Binary Cross Entropy Loss = 0.4764786660671234
Epoch = 6, Binary Cross Entropy Loss = 0.4762101471424103


Parameters of our trained Binary Logistic Regression Model are:
The Global Minima, w0 = -0.00028668573941104114
The Global Minima, w_vec = [[-0.01467878]
 [-0.00095564]
 [-0.0009219 ]
 [-0.00170166]
 [-0.00012193]
 [-0.00144448]
 [ 0.00081405]
 [ 0.00026929]]


In [85]:
p_test, f_test = my_binary_cls_model(X_test_tensor,y_test_tensor)
print("Binary Cross Entropy Loss on Test Data = {}".format(f_test.item()))

Binary Cross Entropy Loss on Test Data = 0.4843500554561615


In [88]:
print("The Conditional Probabilities, P(y=1|x) for the Testing Data are:")
y_test_pred = p_test.detach().numpy() > 0.5
print(y_test_pred)

The Conditional Probabilities, P(y=1|x) for the Testing Data are:
[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [89]:
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test,y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91      1661
           1       0.00      0.00      0.00       339

    accuracy                           0.83      2000
   macro avg       0.42      0.50      0.45      2000
weighted avg       0.69      0.83      0.75      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [91]:
p_train, f_train = my_binary_cls_model(X_tensor,y_tensor)
print("Binary Cross Entropy Loss on Test Data = {}".format(f_train.item()))

Binary Cross Entropy Loss on Test Data = 0.47598546743392944


In [92]:
print("The Conditional Probabilities, P(y=1|x) for the Training Data are:")
y_train_pred = p_train.detach().numpy() > 0.5
print(y_train_pred)

The Conditional Probabilities, P(y=1|x) for the Training Data are:
[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


In [93]:
print(classification_report(y_true=y_train,y_pred=y_train_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91      6680
           1       0.00      0.00      0.00      1320

    accuracy                           0.83      8000
   macro avg       0.42      0.50      0.46      8000
weighted avg       0.70      0.83      0.76      8000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
