In [1]:
# Requirements
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import PyQt5 as qt
import copy 
import math

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Center column headers when DataFrames are printed as plain text.
pd.set_option('display.colheader_justify', 'center')
# Print NumPy floats in fixed-point notation with 3 digits of precision.
np.set_printoptions(suppress=True, precision=3)
# Apply seaborn's default theme to figures drawn after this point.
sns.set_theme()

In [2]:
# Logistic Function
def logistic_funct(X1, X2, b0, b1, b2):
    """
    Evaluate the logistic (sigmoid) function of a two-feature linear model.

    The linear predictor is z = b0 + b1*X1 + b2*X2 and the result is
    p = exp(z) / (1 + exp(z)), i.e. a probability (not odds).

    Args:
        X1: Values of the first feature (scalar, NumPy array or pandas Series).
        X2: Values of the second feature, same shape as X1.
        b0: Intercept.
        b1: Coefficient applied to X1.
        b2: Coefficient applied to X2.

    Returns:
        np.ndarray with the probability for every observation.
    """
    z = b0 + b1*X1 + b2*X2

    # sigmoid(z) == 0.5 * (1 + tanh(z / 2)). Unlike exp(z) / (1 + exp(z)),
    # this form never evaluates inf / inf, so very large |z| yields 1.0 or
    # 0.0 instead of nan (and no overflow warning).
    probability = 0.5 * (1.0 + np.tanh(0.5 * z))

    return np.array(probability)

## Data Preparation

In [3]:
# Load the raw CSV once and work on an independent (deep) copy of it,
# so `df_main` always holds the data exactly as read from disk.
df_main = pd.read_csv('binary_train_dataset.csv')
df = df_main.copy(deep=True)
df.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


In [4]:
# Encode the two categorical columns as 0/1 integers.
# The source of the mapping is the untouched `df_main`, not `df`, so this
# cell is idempotent: re-running it on already-encoded 0/1 values would
# otherwise turn every entry into NaN.
df['Admitted'] = df_main['Admitted'].map({'Yes':1,'No':0})
df['Gender'] = df_main['Gender'].map({'Female':1,'Male':0})

# `.map` silently produces NaN for any label missing from the dictionaries;
# fail loudly here instead of letting NaN reach the models.
assert df[['Admitted', 'Gender']].notna().all().all(), \
    "Unexpected label in 'Admitted' or 'Gender'"
df.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


In [5]:
# Inspect the column dtypes after the mapping step.
df.dtypes

SAT         int64
Admitted    int64
Gender      int64
dtype: object

In [6]:
# Separate the feature matrix (everything except 'Admitted') from the target
X = df.drop(columns=['Admitted'])
y = df['Admitted']

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Logistic Regression using Scikit-Learn

## Modeling

In [8]:
# Fit scikit-learn's LogisticRegression with all default hyper-parameters.
# NOTE(review): the scikit-learn defaults presumably include L2 regularisation,
# whereas the statsmodels Logit fitted later is plain maximum likelihood, so
# the two models are not expected to give identical predictions; confirm.
log_reg_sk = LogisticRegression()
log_reg_sk.fit(X_train, y_train)

## Evaluation

In [9]:
# Hard class labels for the held-out rows
y_hat_sk = log_reg_sk.predict(X_test)
# Per-class probabilities for the same rows
y_hat_sk_proba = log_reg_sk.predict_proba(X_test)

In [10]:
conf_matrix_sk = confusion_matrix(y_test, y_hat_sk)
acc_score_sk = accuracy_score(y_test, y_hat_sk)

# Report whole percentages, rounded to the nearest integer. Truncating with
# math.floor under-reports, and binary float error can cost a full point
# (0.57 * 100 == 56.99999999999999, which floors to 56).
classification_rate_sk = int(round(acc_score_sk * 100))
# The two rates are complementary by definition.
misclassification_rate_sk = 100 - classification_rate_sk

In [11]:
# Wrap the raw matrix in a labelled frame: rows carry the actual class,
# columns the predicted class.
conf_matrix_sk_df = pd.DataFrame(
    conf_matrix_sk,
    index=['Actual Non-Admitted', 'Actual Admitted'],
    columns=['Predicted Non-Admitted', 'Predicted Admitted'],
)

print("\n-------- Confusion Matrix Logistic Regression: Scikit-Learn --------\n")
print(conf_matrix_sk_df)


-------- Confusion Matrix Logistic Regression: Scikit-Learn --------

                     Predicted Non-Admitted  Predicted Admitted
Actual Non-Admitted             9                     3        
Actual Admitted                 0                    22        


In [12]:
# One-row summary of the scikit-learn classification / misclassification rates
accuracy_dict_sk = dict(
    classification_rate=[classification_rate_sk],
    misclassification_rate=[misclassification_rate_sk],
)
accuracy_df_sk = pd.DataFrame.from_dict(accuracy_dict_sk)

print("\n-------- Accuracy Logistic Regression: Scikit-Learn --------\n")
print(accuracy_df_sk)


-------- Accuracy Logistic Regression: Scikit-Learn --------

   classification_rate  misclassification_rate
0          91                      9          


# Logistic Regression Using Statsmodels

## Modeling

In [13]:
# Add the constant column that acts as the intercept term of the Logit model
# (it appears as `const` in the fitted parameters).
X_train_const = sm.add_constant(X_train)

In [14]:
# Keep the unfitted model and the fitted results under separate names;
# `log_reg_sm` ends up holding the fitted results, as the later cells expect.
logit_model_sm = sm.Logit(y_train, X_train_const)
log_reg_sm = logit_model_sm.fit()

Optimization terminated successfully.
         Current function value: 0.119533
         Iterations 10


In [15]:
# Show the fit summary: coefficients, standard errors, p-values and fit statistics
log_reg_sm.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,134.0
Model:,Logit,Df Residuals:,131.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 13 Jan 2025",Pseudo R-squ.:,0.8269
Time:,20:07:02,Log-Likelihood:,-16.017
converged:,True,LL-Null:,-92.508
Covariance Type:,nonrobust,LLR p-value:,6.032e-34

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.4855,18.076,-3.789,0.000,-103.915,-33.056
SAT,0.0408,0.011,3.767,0.000,0.020,0.062
Gender,2.0074,0.973,2.064,0.039,0.101,3.913


In [16]:
# Fitted coefficients as a Series indexed by term name (const, SAT, Gender)
log_reg_sm.params

const    -68.485509
SAT        0.040838
Gender     2.007392
dtype: float64

## Evaluation

In [17]:
# Test-set features in the order expected by logistic_funct
X1 = X_test["SAT"]
X2 = X_test["Gender"]

# Look the coefficients up by label rather than by position, so a change in
# the column order of the design matrix cannot silently pair a coefficient
# with the wrong feature.
b0 = log_reg_sm.params["const"]
b1 = log_reg_sm.params["SAT"]
b2 = log_reg_sm.params["Gender"]

In [18]:
# Probabilities for the test rows, computed from the fitted statsmodels coefficients
y_hat_sm_proba = logistic_funct(X1, X2, b0, b1, b2)
# Round to the nearest integer to obtain hard 0/1 labels
# (np.round rounds halves to even, so a probability of exactly 0.5 becomes 0).
y_hat_sm = np.round(y_hat_sm_proba, decimals=0).astype(int)

In [19]:
conf_matrix_sm = confusion_matrix(y_test, y_hat_sm)
acc_score_sm = accuracy_score(y_test, y_hat_sm)

# Report whole percentages, rounded to the nearest integer. Truncating with
# math.floor under-reports, and binary float error can cost a full point
# (0.57 * 100 == 56.99999999999999, which floors to 56).
classification_rate_sm = int(round(acc_score_sm * 100))
# The two rates are complementary by definition.
misclassification_rate_sm = 100 - classification_rate_sm

In [20]:
# Wrap the raw matrix in a labelled frame: rows carry the actual class,
# columns the predicted class.
conf_matrix_sm_df = pd.DataFrame(
    conf_matrix_sm,
    index=['Actual Non-Admitted', 'Actual Admitted'],
    columns=['Predicted Non-Admitted', 'Predicted Admitted'],
)

print("\n-------- Confusion Matrix for Logistic Regression: StatsModel --------\n")
print(conf_matrix_sm_df)


-------- Confusion Matrix for Logistic Regression: StatsModel --------

                     Predicted Non-Admitted  Predicted Admitted
Actual Non-Admitted            10                     2        
Actual Admitted                 0                    22        


In [21]:
# One-row summary of the statsmodels classification / misclassification rates
accuracy_dict_sm = dict(
    classification_rate=[classification_rate_sm],
    misclassification_rate=[misclassification_rate_sm],
)
accuracy_df_sm = pd.DataFrame.from_dict(accuracy_dict_sm)

print("\n-------- Accuracy Logistic Regression: StatsModel --------\n")
print(accuracy_df_sm)


-------- Accuracy Logistic Regression: StatsModel --------

   classification_rate  misclassification_rate
0          94                      6          


# Comparing Both Models

## Accuracy

In [22]:
# Side-by-side rates; within each list the order is scikit-learn, then statsmodels
accuracy_dict = dict(
    classification_rate=[classification_rate_sk, classification_rate_sm],
    misclassification_rate=[misclassification_rate_sk, misclassification_rate_sm],
)

In [29]:
# Combine both models' rates into one table, best classification rate first.
# The index labels follow the order of the lists in `accuracy_dict`
# (scikit-learn first, statsmodels second).
accuracy_df = pd.DataFrame(
    accuracy_dict,
    index=["Scikit-Learn", "Statsmodel"]
).sort_values(by="classification_rate", ascending=False)

print(f"\n-------- Accuracy: Scikit-Learn vs StatsModel --------\n")
accuracy_df


-------- Accuracy: Scikit-Learn vs StatsModel --------



Unnamed: 0,classification_rate,misclassification_rate
Statsmodel,94,6
Scitkit-Learn,91,9


In [25]:
# Columns of the comparison table: the test features, the true label and
# the hard prediction of each model.
prediction_dict = dict(
    SAT=X_test["SAT"],
    Gender=X_test["Gender"],
    Admitted=y_test,
    sk_prediction=y_hat_sk,
    sm_prediction=y_hat_sm,
)

In [28]:
# Build the comparison table, then swap the index inherited from the test
# split for a fresh 0..n-1 range.
prediction_raw = pd.DataFrame(prediction_dict)
prediction_df = prediction_raw.reset_index(drop=True)

In [None]:
def highlight_mismatch(s, reference=None):
    """
    Build CSS styles that mark, in red, every cell of a column whose value
    differs from the corresponding value of a reference column.

    Intended for use with ``DataFrame.style.apply``.

    Args:
        s: Series representing the column currently being styled.
        reference: Series of expected values to compare against. It must be
            labelled identically to ``s``. When omitted, the notebook-level
            ``prediction_df['Admitted']`` column is used.

    Returns:
        A list with one CSS string per element of ``s``:
        'background-color: red' where the values differ, '' where they match.
    """
    if reference is None:
        # Backward-compatible default: compare against the true labels of the
        # notebook's prediction table.
        reference = prediction_df['Admitted']

    is_match = (s == reference)
    return ['' if match else 'background-color: red' for match in is_match]

In [None]:
# Render the prediction table, shading in red every model prediction that
# disagrees with the true 'Admitted' label. (`data` was never defined in this
# notebook; the table built above is `prediction_df`.)
prediction_df.style.apply(highlight_mismatch, subset=['sk_prediction', "sm_prediction"])