> Scratch Implementation.

In [3]:
import pandas as pd
import numpy as np

# Load the placement data; the path is relative to the working directory.
DATA_FILE = 'placement.csv'
df = pd.read_csv(DATA_FILE)
df  # bare last expression -> rich display of the loaded frame

Unnamed: 0,cgpa,package
0,6.89,3.26
1,5.12,1.98
2,7.82,3.25
3,7.42,3.67
4,6.94,3.57
...,...,...
195,6.93,2.46
196,5.89,2.57
197,7.21,3.24
198,7.63,3.96


In [4]:
# Sigmoid function
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    The textbook form calls ``exp(-z)``, which overflows (and emits a
    RuntimeWarning) once ``z`` drops below roughly -709.  Here the
    exponential is only ever taken of a non-positive number, so it stays
    within (0, 1] for every finite input.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); converted to a float ndarray internally.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Sigmoid of ``z``: a scalar for scalar input, otherwise an array with
        the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) is always in (0, 1], so neither branch below can overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z)          (identical to the textbook form)
    # z <  0: e^z / (1 + e^z)         (same value, rewritten to avoid e^{+big})
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Hand scalars back as scalars rather than 0-d arrays.
    return out if out.ndim else out[()]

# Cost function
def cost_function(X, y, theta, eps=1e-15):
    """Binary cross-entropy cost and its gradient for logistic regression.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix; it is multiplied as ``X @ theta`` and ``X.T @ (h - y)``.
    y : array-like of length m
        Binary labels (0 or 1).
    theta : array of length n
        Current parameter vector.
    eps : float, optional
        Predicted probabilities are clipped to ``[eps, 1 - eps]`` before the
        logarithms are taken, so a saturated prediction cannot turn the cost
        into ``inf`` or ``NaN``.  The gradient uses the unclipped values.

    Returns
    -------
    cost : float
        Mean negative log-likelihood over the m samples.
    gradient : array of length n
        Gradient of the cost with respect to ``theta``.
    """
    m = len(y)
    h = sigmoid(X @ theta)
    # log(0) is -inf and 0 * -inf is NaN; clip only the values fed to log().
    h_safe = np.clip(h, eps, 1 - eps)
    cost = (1 / m) * (-y.T @ np.log(h_safe) - (1 - y).T @ np.log(1 - h_safe))
    # The gradient formula has no logarithm, so the raw probabilities are used.
    gradient = (1 / m) * X.T @ (h - y)
    return cost, gradient

# Gradient descent
def gradient_descent(X, y, theta, learning_rate, iterations):
    """Run batch gradient descent on the logistic-regression cost.

    Parameters
    ----------
    X, y
        Training data, passed through unchanged to ``cost_function``.
    theta : array-like
        Initial parameter vector.  It is copied, so the caller's array is
        left untouched.
    learning_rate : float
        Step size applied to the gradient on each iteration.
    iterations : int
        Number of update steps to perform.

    Returns
    -------
    theta : numpy.ndarray
        Parameter vector after the final update (float dtype).
    cost_history : list
        Cost evaluated before each update, one entry per iteration.
    """
    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's array, and would fail on an integer-dtype theta.
    theta = np.array(theta, dtype=float)
    cost_history = []
    for _ in range(iterations):
        cost, gradient = cost_function(X, y, theta)
        theta -= learning_rate * gradient
        cost_history.append(cost)
    return theta, cost_history

# Design matrix: a leading column of ones (intercept term) beside the CGPA feature
cgpa_column = df['cgpa'].values.reshape(-1, 1)
X = np.column_stack((np.ones(len(cgpa_column)), cgpa_column))

# Binary target: 1 when the package exceeds 3, otherwise 0
y = df['package'].gt(3).astype(int)

# Hyperparameters, and a zero starting point with one weight per column of X
learning_rate, iterations = 0.01, 1000
theta = np.zeros(X.shape[1])

# Fit by batch gradient descent.
# NOTE(review): the recorded metrics further down (accuracy ~0.555 here versus
# ~0.81 from scikit-learn) suggest these settings may stop before convergence;
# consider more iterations or scaling the feature -- TODO confirm.
theta, cost_history = gradient_descent(X, y, theta, learning_rate, iterations)


In [5]:
# Predicted probabilities, thresholded at 0.5 to obtain 0/1 class labels
predictions = sigmoid(X @ theta)
predicted_labels = np.where(predictions > 0.5, 1, 0)

# Side-by-side table of the feature, the predicted label and the true label
results_df = (
    df[['cgpa']]
    .rename(columns={'cgpa': 'CGPA'})
    .assign(**{
        'Predicted Output': predicted_labels,
        'Actual Output': y,
    })
)

print(results_df)


     CGPA  Predicted Output  Actual Output
0    6.89                 1              1
1    5.12                 0              0
2    7.82                 1              1
3    7.42                 1              1
4    6.94                 1              1
..    ...               ...            ...
195  6.93                 1              0
196  5.89                 1              0
197  7.21                 1              1
198  7.63                 1              1
199  6.22                 1              0

[200 rows x 3 columns]


In [6]:
# Count the four confusion-matrix cells with vectorized boolean masks instead
# of a row-by-row iterrows() loop.
actual = results_df['Actual Output']
predicted = results_df['Predicted Output']

TP = int(((actual == 1) & (predicted == 1)).sum())  # true positives
TN = int(((actual == 0) & (predicted == 0)).sum())  # true negatives
FP = int(((actual == 0) & (predicted == 1)).sum())  # false positives
FN = int(((actual == 1) & (predicted == 0)).sum())  # false negatives

# Every row must land in exactly one cell; a mismatch means a label is not 0/1.
assert TP + TN + FP + FN == len(results_df), "labels other than 0/1 present"

# Create the confusion matrix DataFrame (rows: actual class, columns: predicted)
confusion_matrix = pd.DataFrame({
    'Predicted Positive': [TP, FP],
    'Predicted Negative': [FN, TN]
}, index=['Actual Positive', 'Actual Negative'])

print(confusion_matrix)


                 Predicted Positive  Predicted Negative
Actual Positive                  98                   0
Actual Negative                  89                  13


The F1 score is usually more useful than accuracy, especially when the class distribution is uneven.
Accuracy works best when false positives ("FP") and false negatives ("FN") have similar costs.

If the costs of FP and FN are very different, it is better to look at "Precision" and "Recall" separately.

In [7]:
# Calculate Accuracy: share of all samples that were classified correctly.
# Guarded like the other metrics so an empty result set yields 0 instead of
# raising ZeroDivisionError.
total = TP + FP + FN + TN
accuracy = (TP + TN) / total if total > 0 else 0

# Calculate Precision: share of predicted positives that are truly positive
precision = TP / (TP + FP) if (TP + FP) > 0 else 0

# Calculate Recall: share of actual positives that were found
recall = TP / (TP + FN) if (TP + FN) > 0 else 0

# Calculate F1 Score: harmonic mean of precision and recall
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Display the metrics
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [accuracy, precision, recall, f1_score]
})

print(metrics_df)


      Metric     Value
0   Accuracy  0.555000
1  Precision  0.524064
2     Recall  1.000000
3   F1 Score  0.687719


> Using Library Functions.

In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load the dataset
df = pd.read_csv('placement.csv')

# Prepare the data: a single feature column (CGPA) and a binary target that is
# 1 when the package exceeds 3 -- the same labelling as the scratch section.
X = df['cgpa'].values.reshape(-1, 1)
y = (df['package'] > 3).astype(int)

# Initialize and train the logistic regression model.
# NOTE(review): per the scikit-learn docs LogisticRegression is regularized by
# default, so its coefficients are not expected to match an unregularized
# scratch fit exactly.
model = LogisticRegression(solver='liblinear')
model.fit(X, y)

# Make predictions on the training data itself (no train/test split is made here)
predictions = model.predict(X)

# Create DataFrame for predicted and actual output
results_df = pd.DataFrame({
    'CGPA': df['cgpa'],
    'Predicted Output': predictions,
    'Actual Output': y
})

print(results_df)

# Calculate evaluation metrics using library functions
conf_matrix = confusion_matrix(y, predictions)
accuracy = accuracy_score(y, predictions)
precision = precision_score(y, predictions)
recall = recall_score(y, predictions)
f1 = f1_score(y, predictions)

# Display the metrics
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'Value': [accuracy, precision, recall, f1]
})

print(metrics_df)

# Show the confusion matrix as well (it was computed but never displayed).
# labels=[1, 0] puts the positive class first, giving rows/columns in the
# order [[TP, FN], [FP, TN]] to match the scratch section's layout.
conf_matrix_df = pd.DataFrame(
    confusion_matrix(y, predictions, labels=[1, 0]),
    index=['Actual Positive', 'Actual Negative'],
    columns=['Predicted Positive', 'Predicted Negative'],
)

print(conf_matrix_df)


     CGPA  Predicted Output  Actual Output
0    6.89                 0              1
1    5.12                 0              0
2    7.82                 1              1
3    7.42                 1              1
4    6.94                 1              1
..    ...               ...            ...
195  6.93                 1              0
196  5.89                 0              0
197  7.21                 1              1
198  7.63                 1              1
199  6.22                 0              0

[200 rows x 3 columns]
      Metric     Value
0   Accuracy  0.810000
1  Precision  0.772727
2     Recall  0.867347
3   F1 Score  0.817308
