# Logistic Regression Model Solution

This notebook demonstrates logistic regression analysis to predict exam pass/fail based on hours of study.

Converted from an R script to Python.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

## 2. Load and Explore Data

In [None]:
# Load data
data = pd.read_csv('PassFail.csv')

# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
data.info()

# Preview the first ten rows (rendered as the cell's rich output).
data.head(10)

In [None]:
# Report each column's dtype, followed by the (rows, columns) shape of the frame.
print(
    "Data types:",
    data.dtypes,
    f"\nDataset shape: {data.shape}",
    sep="\n",
)

## 3. Build the Logistic Regression Model

In [None]:
# Separate the response from the predictor.
# The double brackets keep X as a one-column DataFrame (2-D), which is the
# shape scikit-learn estimators expect for the feature matrix.
y = data['Pass']
X = data[['Hours']]

# Create the classifier and fit it on the full dataset in one step
# (fit() returns the estimator itself).
# NOTE(review): scikit-learn's LogisticRegression applies L2 regularisation
# by default, so the coefficients may differ from an unpenalised fit in the
# original R script -- confirm this is intended.
model = LogisticRegression().fit(X, y)

# Show the fitted parameters: a single intercept and a single slope for Hours.
print(
    "Model Coefficients:",
    f"Intercept: {model.intercept_[0]:.4f}",
    f"Hours coefficient: {model.coef_[0, 0]:.4f}",
    sep="\n",
)

## 4. Make Predictions

### Single Prediction
What is the probability of passing with 2 hours of study?

In [None]:
# Study time to evaluate, as a 2-D array (one row, one feature).
hours_test = np.array([[2.0]])

# The model was fitted on a DataFrame with a 'Hours' column, so the input is
# wrapped in a frame with the same column name. Passing the bare array makes
# scikit-learn warn that X does not have valid feature names.
hours_test_frame = pd.DataFrame(hours_test, columns=['Hours'])

# Column 1 of predict_proba holds the probability of the positive class (Pass = 1).
prob = model.predict_proba(hours_test_frame)[:, 1]
print(f"Probability of passing with 2 hours: {prob[0]:.4f} ({prob[0]*100:.2f}%)")

### Multiple Predictions

In [None]:
# Predicting for several values at the same time
hours_multiple = np.array([[1.0], [1.5], [2.0], [2.5], [3.0], [3.5], [4.0], [4.5]])

# Wrap the array in a frame carrying the training column name; predicting on a
# bare array makes scikit-learn warn that X does not have valid feature names.
hours_multiple_frame = pd.DataFrame(hours_multiple, columns=['Hours'])

# Column 1 of predict_proba holds the probability of the positive class (Pass = 1).
prob_pass = model.predict_proba(hours_multiple_frame)[:, 1]

print("Probabilities for multiple hour values:")
# Distinct loop names are used so the loop does not overwrite other
# notebook-level variables such as `prob`.
for hours_value, pass_probability in zip(hours_multiple.flatten(), prob_pass):
    print(f"{hours_value} hours: {pass_probability:.4f} ({pass_probability*100:.1f}%)")

## 5. Classification with Threshold

Classification means converting probabilities into classes (making decisions).

We need to define a threshold: an observation is classified as a pass when its predicted probability is greater than 0.5.

In [None]:
print("Classification with threshold 0.5:")

# Evaluate the decision rule once; the three encodings below all derive from
# this boolean mask (True where the predicted probability exceeds 0.5).
above_threshold = prob_pass > 0.5

# Binary format (1/0): cast the mask to integers.
classifications_binary = above_threshold.astype(int)
print("Binary (1/0):", classifications_binary)

# Boolean format: the mask itself (copied so the two names stay independent).
classifications_bool = above_threshold.copy()
print("Boolean:", classifications_bool)

# Text format: map True/False to human-readable labels.
classifications_text = np.where(above_threshold, 'PASS', 'FAIL')
print("Text:", classifications_text)

## 6. In-Sample Predictions and Visualisation

In [None]:
# Score the training observations themselves.
# predict_proba returns one column per class; the second column (index 1)
# is taken as the probability of passing.
insample_proba = model.predict_proba(X)
prob_pass_insample = insample_proba[:, 1]

print("In-sample probabilities:", prob_pass_insample, sep="\n")

In [None]:
# Plot the results
# The connecting line is drawn through the points in the order given, so the
# observations are sorted by study hours first; otherwise an unsorted CSV
# would produce a zig-zag instead of the fitted curve.
hours_values = data['Hours'].to_numpy()
sort_order = np.argsort(hours_values, kind='stable')
hours_sorted = hours_values[sort_order]
prob_sorted = prob_pass_insample[sort_order]

fig, ax = plt.subplots(figsize=(10, 6))
# Points: one marker per training observation.
ax.scatter(hours_values, prob_pass_insample, color='blue', alpha=0.6, s=50)
# Line: the same predictions joined left to right.
ax.plot(hours_sorted, prob_sorted, 'b-', alpha=0.3)
ax.set_xlabel('Hours of Study', fontsize=12)
ax.set_ylabel('Probability of Passing', fontsize=12)
ax.set_title('Logistic Regression: Hours vs Probability of Passing', fontsize=14)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Model Predictions on Training Data

In [None]:
# Turn the in-sample probabilities into 0/1 class labels:
# 1 where the probability exceeds 0.5, otherwise 0.
exceeds_half = prob_pass_insample > 0.5
classes = exceeds_half.astype(int)

print("Predicted classes for in-sample data:", classes, sep="\n")

In [None]:
# Attach the in-sample class predictions as a 'PredictedPass' column so they
# sit next to the observed outcomes. assign() returns a new frame, which is
# bound back to `data` for the cells that follow.
data = data.assign(PredictedPass=classes)

print("Dataset with predictions:")
data

## 8. Model Accuracy

In [None]:
# Accuracy: the share of observations whose predicted class equals the
# observed class, computed with scikit-learn's helper.
accuracy = accuracy_score(data['Pass'], data['PredictedPass'])
print(f"Model Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Cross-check by hand: count the matching rows and divide by the row count.
matches = data['PredictedPass'].eq(data['Pass'])
accuracy_alt = matches.sum() / len(data)
print(f"Alternative calculation: {accuracy_alt:.4f} ({accuracy_alt*100:.2f}%)")

## 9. Confusion Matrix

In [None]:
# Build confusion matrix
# Rows are the actual classes and columns the predicted classes.
# The label order is pinned to [0, 1] so the result is always a 2x2 matrix
# whose positions match the printed header (0 = Fail, 1 = Pass), even if one
# class happens to be absent from both the actual and predicted values.
cm = confusion_matrix(data['Pass'], data['PredictedPass'], labels=[0, 1])
print("Confusion Matrix:")
print("                Predicted")
print("              0 (Fail)  1 (Pass)")
print(f"Actual 0  [{cm[0,0]:>8}  {cm[0,1]:>8}]")
print(f"Actual 1  [{cm[1,0]:>8}  {cm[1,1]:>8}]")

## 10. Performance Metrics

Calculate True Positives, False Positives, True Negatives, and False Negatives manually.

In [None]:
# Build the boolean masks once, then combine them for each of the four cells
# of the confusion matrix.
is_correct = data['PredictedPass'] == data['Pass']
actually_passed = data['Pass'] == 1
actually_failed = data['Pass'] == 0

TP = np.sum(is_correct & actually_passed)    # predicted pass, did pass
TN = np.sum(is_correct & actually_failed)    # predicted fail, did fail
FP = np.sum(~is_correct & actually_failed)   # predicted pass, did fail
FN = np.sum(~is_correct & actually_passed)   # predicted fail, did pass

print(
    "Performance Metrics:",
    f"True Positives (TP):  {TP}",
    f"True Negatives (TN):  {TN}",
    f"False Positives (FP): {FP}",
    f"False Negatives (FN): {FN}",
    sep="\n",
)

### Calculate Rates

In [None]:
# Denominators: the number of actual positives and actual negatives.
actual_positives = TP + FN
actual_negatives = FP + TN

# True positive rate (sensitivity): share of actual passes predicted as pass.
# Falls back to 0 when there are no actual positives, avoiding division by zero.
if actual_positives > 0:
    TPR = TP / actual_positives
else:
    TPR = 0

# False positive rate: share of actual fails predicted as pass.
# Falls back to 0 when there are no actual negatives.
if actual_negatives > 0:
    FPR = FP / actual_negatives
else:
    FPR = 0

print(f"True Positive Rate (Sensitivity): {TPR:.4f}")
print(f"False Positive Rate: {FPR:.4f}")

## Summary

This notebook demonstrated:
1. Loading and exploring data
2. Building a logistic regression model
3. Making predictions with the model
4. Classifying probabilities using a threshold
5. Visualising the model predictions
6. Evaluating model performance using accuracy and confusion matrix
7. Calculating performance metrics (TP, TN, FP, FN, TPR, FPR)