In [1]:
# Requirements
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import PyQt5 as qt
import copy 
import math

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

pd.set_option('display.colheader_justify', 'center')
np.set_printoptions(suppress=True, precision=3)
sns.set_theme()

In [None]:
# Logistic function
def logistic_funct(X1, X2, b0, b1, b2):
    """Evaluate the two-feature logistic (sigmoid) function element-wise.

    Computes ``p = exp(z) / (1 + exp(z))`` with ``z = b0 + b1*X1 + b2*X2``.
    The result is a probability in [0, 1] (not an odds value).

    The textbook formula overflows for large positive ``z`` (``inf / inf``
    evaluates to NaN). Here ``exp`` is only ever applied to ``-|z|``, which
    cannot overflow, and the algebraically equivalent branch is chosen
    according to the sign of ``z``.

    Parameters
    ----------
    X1, X2 : scalar or array-like
        Values of the first and second feature.
    b0, b1, b2 : float
        Intercept and the coefficients of ``X1`` and ``X2``.

    Returns
    -------
    numpy.ndarray
        Probabilities, one per input element.
    """
    z = b0 + b1 * np.asarray(X1, dtype=float) + b2 * np.asarray(X2, dtype=float)

    # exp(-|z|) lies in (0, 1], so neither branch below can overflow
    decay = np.exp(-np.abs(z))
    proba = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    return np.array(proba)

## Data Preparation

In [3]:
# Load the raw training data; `df_main` stays untouched and `df` is an
# independent deep copy used for all further processing
df_main = pd.read_csv('binary_train_dataset.csv')
df = df_main.copy(deep=True)
df.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


In [4]:
# Encoding the two categorical columns as 0/1.
# The mapping always reads from the untouched `df_main`, so this cell is
# idempotent: mapping the already-encoded `df` a second time would find none
# of the string keys and turn both columns into NaN.
df['Admitted'] = df_main['Admitted'].map({'Yes':1,'No':0})
df['Gender'] = df_main['Gender'].map({'Female':1,'Male':0})
df.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


In [5]:
# Check the column dtypes after the 0/1 encoding
df.dtypes

SAT         int64
Admitted    int64
Gender      int64
dtype: object

In [6]:
# Separate the feature matrix (everything except 'Admitted') from the target column
X = df.drop(columns=['Admitted'])
y = df['Admitted']

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Logistic Regression using Scikit-Learn

## Modeling

In [None]:
# Training the model
# NOTE: per the scikit-learn documentation, LogisticRegression() applies L2
# regularisation by default (C=1.0). Its coefficients are therefore not
# expected to coincide with an unpenalised maximum-likelihood fit on the same
# data, which matters when the two libraries are compared further down.
log_reg_sk = LogisticRegression()
log_reg_sk.fit(X_train, y_train)

## Evaluation

In [None]:
# Prediction (Integer): hard class labels for the held-out rows
y_hat_sk = log_reg_sk.predict(X_test)
# Prediction (Probability:Float): predicted class probabilities for the same rows
y_hat_sk_proba = log_reg_sk.predict_proba(X_test)

In [None]:
# Confusion matrix and accuracy of the scikit-learn model on the test split
conf_matrix_sk = confusion_matrix(y_test, y_hat_sk)
acc_score_sk = accuracy_score(y_test, y_hat_sk)

# Rates as whole percentages: the accuracy is floored to an integer and the
# misclassification rate is its distance from 100
classification_rate_sk = math.floor(100 * acc_score_sk)
misclassification_rate_sk = abs(classification_rate_sk - 100)

In [None]:
# Wrap the raw confusion matrix in a labelled frame:
# rows are the actual classes, columns the predicted classes
conf_matrix_sk_df = pd.DataFrame(
    conf_matrix_sk,
    index=['Actual Non-Admitted', 'Actual Admitted'],
    columns=['Predicted Non-Admitted', 'Predicted Admitted'],
)

print("\n-------- Confusion Matrix: Logistic Regression: Scikit-Learn --------\n")
print(conf_matrix_sk_df)


-------- Confusion Matrix Logistic Regression: Scikit-Learn --------

                     Predicted Non-Admitted  Predicted Admitted
Actual Non-Admitted             9                     3        
Actual Admitted                 0                    22        


In [None]:
# One-row summary frame with the scikit-learn classification / misclassification rates
accuracy_dict_sk = {
    "classification_rate": [classification_rate_sk],
    "misclassification_rate": [misclassification_rate_sk],
}
accuracy_df_sk = pd.DataFrame.from_dict(accuracy_dict_sk)

print("\n-------- Accuracy: Logistic Regression: Scikit-Learn --------\n")
print(accuracy_df_sk)


-------- Accuracy Logistic Regression: Scikit-Learn --------

   classification_rate  misclassification_rate
0          91                      9          


# Logistic Regression Using Statsmodels

## Modeling

In [None]:
# Adding a constant (B0): prepend a constant column to the training features
# so that the fitted model includes an intercept term (reported as `const`)
X_train_const = sm.add_constant(X_train)

In [None]:
# Build the Logit model and fit it in one step; `log_reg_sm` ends up holding
# the fitted results object
log_reg_sm = sm.Logit(y_train, X_train_const).fit()

Optimization terminated successfully.
         Current function value: 0.119533
         Iterations 10


In [None]:
# Statistics: full fit report (coefficients, standard errors, z-scores,
# p-values, confidence intervals and log-likelihood figures)
log_reg_sm.summary()

0,1,2,3
Dep. Variable:,Admitted,No. Observations:,134.0
Model:,Logit,Df Residuals:,131.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 13 Jan 2025",Pseudo R-squ.:,0.8269
Time:,20:07:02,Log-Likelihood:,-16.017
converged:,True,LL-Null:,-92.508
Covariance Type:,nonrobust,LLR p-value:,6.032e-34

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.4855,18.076,-3.789,0.000,-103.915,-33.056
SAT,0.0408,0.011,3.767,0.000,0.020,0.062
Gender,2.0074,0.973,2.064,0.039,0.101,3.913


In [None]:
# Coefficients/Weights to set up the logistic function
# (a Series labelled const, SAT, Gender, in that order)
log_reg_sm.params

const    -68.485509
SAT        0.040838
Gender     2.007392
dtype: float64

## Evaluation

In [None]:
# Inputs for the hand-written logistic function: the two test-set features ...
X1, X2 = X_test["SAT"], X_test["Gender"]
# ... and the fitted coefficients, read positionally (const, SAT, Gender)
b0, b1, b2 = (log_reg_sm.params.iloc[i] for i in range(3))

In [None]:
# Predictions
# Probabilities from the hand-written logistic function with the fitted coefficients
y_hat_sm_proba = logistic_funct(X1, X2, b0, b1, b2)
# Rounding to zero decimals converts each probability into a 0/1 class label
y_hat_sm = np.round(y_hat_sm_proba, decimals=0).astype(int)

In [None]:
# Confusion matrix and accuracy of the statsmodels model on the test split
conf_matrix_sm = confusion_matrix(y_test, y_hat_sm)
acc_score_sm = accuracy_score(y_test, y_hat_sm)

# Rates as whole percentages: the accuracy is floored to an integer and the
# misclassification rate is its distance from 100
classification_rate_sm = math.floor(100 * acc_score_sm)
misclassification_rate_sm = abs(classification_rate_sm - 100)

In [None]:
# Wrap the raw confusion matrix in a labelled frame:
# rows are the actual classes, columns the predicted classes
conf_matrix_sm_df = pd.DataFrame(
    conf_matrix_sm,
    index=['Actual Non-Admitted', 'Actual Admitted'],
    columns=['Predicted Non-Admitted', 'Predicted Admitted'],
)

print("\n-------- Confusion Matrix for Logistic Regression: StatsModel --------\n")
print(conf_matrix_sm_df)


-------- Confusion Matrix for Logistic Regression: StatsModel --------

                     Predicted Non-Admitted  Predicted Admitted
Actual Non-Admitted            10                     2        
Actual Admitted                 0                    22        


In [None]:
# One-row summary frame with the statsmodels classification / misclassification rates
accuracy_dict_sm = {
    "classification_rate": [classification_rate_sm],
    "misclassification_rate": [misclassification_rate_sm],
}
accuracy_df_sm = pd.DataFrame.from_dict(accuracy_dict_sm)

print("\n-------- Accuracy Logistic Regression: StatsModel --------\n")
print(accuracy_df_sm)


-------- Accuracy Logistic Regression: StatsModel --------

   classification_rate  misclassification_rate
0          94                      6          


# Comparing: Scikit-Learn vs Statsmodels

## Accuracy

In [None]:
# Accuracy dataframe Scikit-Learn vs Statsmodels:
# one row per library, best classification rate first
accuracy_dict = {
    "classification_rate" : [classification_rate_sk, classification_rate_sm],
    "misclassification_rate" : [misclassification_rate_sk, misclassification_rate_sm]
}

accuracy_df = pd.DataFrame(
    data = accuracy_dict,
    # row label spelling fixed (was "Scitkit-Learn")
    index = ["Scikit-Learn", "Statsmodel"]
).sort_values(by="classification_rate", ascending=False)

print("\n-------- Accuracy: Scikit-Learn vs StatsModel --------\n")
accuracy_df

## Predictions

In [None]:
# Side-by-side table of the test features, the true label and both models'
# predicted labels; the original row index is replaced by 0..n-1
prediction_dict = {
    "SAT": X_test["SAT"],
    "Gender": X_test["Gender"],
    "Admitted": y_test,
    "sk_prediction": y_hat_sk,
    "sm_prediction": y_hat_sm,
}
prediction_df = pd.DataFrame(prediction_dict)
prediction_df = prediction_df.reset_index(drop=True)

In [None]:
# Styling function
def highlight_mismatch(s, reference=None):
    """
    Colour each cell of a prediction column by whether it matches the
    reference (true) labels.

    Args:
        s: The column (pandas Series) being styled.
        reference: Series of true labels to compare against. When omitted,
            the global ``prediction_df['Admitted']`` is used, which is the
            behaviour when the function is passed to ``Styler.apply``
            without extra arguments.

    Returns:
        A list of CSS styles, one per cell: 'background-color: green' where
        the value equals the reference value and 'background-color: red'
        where it does not.
    """
    if reference is None:
        # Fall back to the notebook-level table of predictions
        reference = prediction_df['Admitted']

    is_match = (s == reference)

    return ["background-color: green" if match
            else "background-color: red"
            for match in is_match
    ]

In [None]:
# Render the prediction table with both prediction columns colour-coded
# against the true label
prediction_df.style.apply(highlight_mismatch, subset=["sk_prediction", "sm_prediction"])

Unnamed: 0,SAT,Gender,Admitted,sk_prediction,sm_prediction
0,1587,0,0,0,0
1,1962,1,1,1,1
2,1593,0,0,0,0
3,1402,0,0,0,0
4,1721,1,1,1,1
5,1880,1,1,1,1
6,1907,1,1,1,1
7,1855,1,1,1,1
8,2021,1,1,1,1
9,1693,0,0,1,1


## Graph

In [None]:
def f(x,b0,b1):
    """Single-feature logistic curve ``exp(b0 + b1*x) / (1 + exp(b0 + b1*x))``.

    ``exp`` is applied only to ``-|z|`` (with ``z = b0 + b1*x``), so large
    positive ``z`` yields 1.0 instead of the NaN that ``inf / inf`` produces
    in the textbook form.

    Parameters
    ----------
    x : scalar or array-like
        Feature values.
    b0, b1 : float
        Intercept and slope of the linear predictor.

    Returns
    -------
    numpy.ndarray
        Probabilities in [0, 1], one per element of ``x``.
    """
    z = b0 + np.asarray(x, dtype=float) * b1
    decay = np.exp(-np.abs(z))  # in (0, 1], cannot overflow
    return np.array(np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay)))

# Logistic curve over SAT from the fitted statsmodels coefficients.
# The original referenced `results_log`, which is never defined in this
# notebook; the fitted results object is `log_reg_sm`. Coefficients are read
# with `.iloc` because `params` is indexed by label (const, SAT, Gender).
# Only the intercept and the SAT slope are used, so the curve corresponds to
# Gender = 0.
x_sorted = np.sort(np.array(X['SAT']))
# Evaluate the curve on the sorted x values so each probability stays paired
# with its own SAT score regardless of the sign of the slope
f_sorted = f(x_sorted, log_reg_sm.params.iloc[0], log_reg_sm.params.iloc[1])

sns.scatterplot(y=y, x=X['SAT'], hue=X['Gender'], alpha=0.6)
plt.xlabel('SAT', fontsize = 20)
plt.ylabel('Admitted', fontsize = 20)
plt.plot(x_sorted,f_sorted,color='C10')
plt.show()

pd.DataFrame(f_sorted.round(2),x_sorted)

In [None]:
# Two stacked axes sharing both scales; only the first one is drawn on here.
# NOTE(review): the commented-out lines below refer to fraud-transaction data
# (`df_fraud_1`, 'Amount') that is not defined anywhere in the visible part of
# this notebook. They look like leftovers from another analysis, and ax[1]
# stays empty. Confirm and remove or replace.
fig, ax = plt.subplots(2,1, sharex=True, sharey=True, figsize=(8, 8))

# ax[0].set_title("Non-Fraud Transations", color="blue")
# Admitted (0/1) against SAT score for the full dataset
ax[0].scatter(y=y, x=X['SAT'], alpha=0.6)
# ax[0].set_xlabel('Amount ($)')
# ax[0].set_ylabel('Count')

# ax[1].set_title("Fraud Transations", color="red")
# ax[1].hist(x=df_fraud_1['Amount'], bins=10)
# ax[1].set_xlabel('Amount ($)')
# ax[1].set_ylabel('Count')

In [43]:
# NOTE(review): interactive documentation lookup left over from development;
# it prints the full scatter() docstring and can be dropped from the final notebook
help(ax[0].scatter)

Help on method scatter in module matplotlib.axes._axes:

scatter(x, y, s=None, c=None, *, marker=None, cmap=None, norm=None, vmin=None, vmax=None, alpha=None, linewidths=None, edgecolors=None, colorizer=None, plotnonfinite=False, data=None, **kwargs) method of matplotlib.axes._axes.Axes instance
    A scatter plot of *y* vs. *x* with varying marker size and/or color.

    Parameters
    ----------
    x, y : float or array-like, shape (n, )
        The data positions.

    s : float or array-like, shape (n, ), optional
        The marker size in points**2 (typographic points are 1/72 in.).
        Default is ``rcParams['lines.markersize'] ** 2``.

        The linewidth and edgecolor can visually interact with the marker
        size, and can lead to artifacts if the marker size is smaller than
        the linewidth.

        If the linewidth is greater than 0 and the edgecolor is anything
        but *'none'*, then the effective size of the marker will be
        increased by half the 