# **Logistic Regression**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer

In [None]:
df = pd.read_excel(r"diabetes.xls")

# Force the three modelling columns to numeric dtypes. Entries that cannot be
# parsed as numbers (the sheet reportedly marks missing values with dots)
# are coerced to NaN.
for column_name in ('Age', 'BMI', 'Diabetes'):
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
df

Unnamed: 0,Diabetes,BMI,Age
0,1,33.6,50
1,0,26.6,31
2,1,23.3,32
3,0,28.1,21
4,1,43.1,33
...,...,...,...
763,0,32.9,63
764,0,36.8,27
765,0,26.2,30
766,1,30.1,47


### There are 10 missing BMI values (listed below). Handling them by removing those rows from the data set:

In [None]:
# Row labels of the records whose BMI is missing
missing_indices = df.index[df['BMI'].isna()]

# Age and Diabetes status recorded for those same rows
missing_bmi = df.loc[missing_indices, ['Age', 'Diabetes']]
print("Age and Diabetes status for rows with missing BMI values:")
missing_bmi

Age and Diabetes status for rows with missing BMI values:


Unnamed: 0,Age,Diabetes
9,54,1
49,24,0
60,21,0
81,22,0
145,21,0
371,21,0
426,25,0
494,22,0
522,26,0
684,69,0


In [None]:
# Keep only the rows that actually have a BMI value
df = df.dropna(subset=['BMI'])
df

Unnamed: 0,Diabetes,BMI,Age
0,1,33.6,50
1,0,26.6,31
2,1,23.3,32
3,0,28.1,21
4,1,43.1,33
...,...,...,...
763,0,32.9,63
764,0,36.8,27
765,0,26.2,30
766,1,30.1,47


# Training the model:

In [None]:
# Predictors (Age, BMI) and the binary response (Diabetes)
X = df[['Age', 'BMI']]
y = df['Diabetes']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# sm.Logit does not add an intercept by itself, so prepend a constant column
X_train_const = sm.add_constant(X_train)

# Fit the logistic regression on the training rows and show the coefficient table
logit_model = sm.Logit(y_train, X_train_const)
log_reg = logit_model.fit()
print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.549537
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               Diabetes   No. Observations:                  614
Model:                          Logit   Df Residuals:                      611
Method:                           MLE   Df Model:                            2
Date:                Thu, 14 Dec 2023   Pseudo R-squ.:                  0.1487
Time:                        00:39:04   Log-Likelihood:                -337.42
converged:                       True   LL-Null:                       -396.34
Covariance Type:            nonrobust   LLR p-value:                 2.559e-26
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.3713      0.626    -10.185      0.000      -7.597      -5.145
Age            0.0575      0.

Let **a** represent Age and **b** represent BMI.
Let P(y=1) represent the probability of a Pima Indian from the dataset having diabetes.

---
The logistic regression equation is given by : logit[P(y=1)] = α + (β1 * a) + (β2 * b)

From the results above, we get logit[P(y=1)] = - 6.37 + (0.06 * a) + (0.12 * b).


---


The antilog (exponential) of 0.06 is the estimated odds ratio relating a person's age to their diabetes status while controlling for their BMI. Since e^0.06 ≈ 1.06, **a one-unit increase in age, with BMI held constant, multiplies the odds of having diabetes by 1.06.**

Similarly, the antilog (exponential) of 0.12 is the estimated odds ratio relating a person's BMI to their diabetes status while controlling for their age. Since e^0.12 ≈ 1.13, **a one-unit increase in BMI, with age held constant, multiplies the odds of having diabetes by 1.13.**

---



# Wald test for Age only:

In [None]:
# Wald test of H0: the Age coefficient equals zero
age_hypothesis = 'Age = 0'
wald_test_result = log_reg.wald_test(age_hypothesis)
print("Wald Test Results:", wald_test_result, sep="\n")

Wald Test Results:
<Wald test (chi2): statistic=[[49.59119566]], p-value=1.8936159362422495e-12, df_denom=1>


Wald statistic = 49.59 and its p-value << 0.05. In general, a higher test statistic suggests stronger evidence against the null hypothesis. **This suggests strong evidence against H0: β1=0. So Age has a statistically significant impact on whether a person has Diabetes or not.**

# Wald test for BMI only:

In [None]:
# Wald test of H0: the BMI coefficient equals zero
bmi_hypothesis = 'BMI = 0'
wald_test_result = log_reg.wald_test(bmi_hypothesis)
print("Wald Test Results:", wald_test_result, sep="\n")

Wald Test Results:
<Wald test (chi2): statistic=[[59.12117894]], p-value=1.482549570689794e-14, df_denom=1>


Wald statistic = 59.12 and its p-value << 0.05. **This suggests strong evidence against H0: β2=0. So BMI has a statistically significant impact on whether a person has Diabetes or not.**

# Wald test for Age and BMI:

In [None]:
# Joint Wald test of H0: the Age and BMI coefficients are both zero
joint_hypothesis = 'Age = 0, BMI = 0'
wald_test_result = log_reg.wald_test(joint_hypothesis)
print("Wald Test Results:", wald_test_result, sep="\n")

Wald Test Results:
<Wald test (chi2): statistic=[[91.9084052]], p-value=1.102410624342103e-20, df_denom=2>


Wald statistic = 91.908 and its p-value << 0.05. **This suggests strong evidence against H0: β1=β2=0. So the joint significance of Age and BMI on Diabetes status cannot be ignored.**


# Testing the model:

In [None]:
# Give the test predictors the same intercept column the model was trained with
X_test_const = sm.add_constant(X_test)

# Predicted probabilities, each rounded to a hard 0/1 class label
pred = log_reg.predict(X_test_const)
Pred = [round(probability) for probability in pred]

# Fraction of test rows whose predicted label matches the actual label
test_accuracy = accuracy_score(y_test, Pred)
print('\nModel accuracy with Testing data =', test_accuracy)

# Counts of actual (rows) versus predicted (columns) classes
Conf_mat = confusion_matrix(y_test, Pred)
print ("\nConfusion Matrix = \n", Conf_mat)


Model accuracy with Testing data = 0.6428571428571429

Confusion Matrix = 
 [[79 20]
 [35 20]]


"For a binary classification model like logistic regression, the confusion matrix will be a 2x2 matrix with each row representing the counts of actual conditions and each column representing the counts of predicted conditions."

The top left matrix element will show the count of true negatives, the bottom right element will show the count of true positives, the top right element will show the count of false positives, the bottom left element will show the count of false negatives.

The classification threshold for the probabilities is 0.5 by default.


---


In our case, out of 154 testing data points:

**79 of them were classified as true negatives (no diabetes) and 20 were classified as true positives (yes diabetes).**

**35 were classified as false negatives and 20 were classified as false positives.**


---


"The accuracy score is a metric given as the fraction of correct predictions generated by the given model." **In our case the accuracy is around 64.29%**

---



# Tackling missing values by imputing them with most frequent BMI value:

In [None]:
# Start again from the raw spreadsheet so the previously dropped rows are back
df = pd.read_excel(r"diabetes.xls")

# Coerce the modelling columns to numeric dtypes; unparseable entries become NaN
for column_name in ('Age', 'BMI', 'Diabetes'):
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
df

Unnamed: 0,Diabetes,BMI,Age
0,1,33.6,50
1,0,26.6,31
2,1,23.3,32
3,0,28.1,21
4,1,43.1,33
...,...,...,...
763,0,32.9,63
764,0,36.8,27
765,0,26.2,30
766,1,30.1,47


In [None]:
# Replace every missing BMI with the most frequent (modal) BMI value
imputer = SimpleImputer(strategy='most_frequent')
bmi_imputed = imputer.fit_transform(df[['BMI']])  # 2-D result with a single column
df['BMI'] = bmi_imputed[:, 0]
df

Unnamed: 0,Diabetes,BMI,Age
0,1,33.6,50
1,0,26.6,31
2,1,23.3,32
3,0,28.1,21
4,1,43.1,33
...,...,...,...
763,0,32.9,63
764,0,36.8,27
765,0,26.2,30
766,1,30.1,47


# Training the model:

In [None]:
# Predictors (Age, BMI) and the binary response (Diabetes) from the imputed frame
X = df[['Age', 'BMI']]
y = df['Diabetes']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# sm.Logit does not add an intercept by itself, so prepend a constant column
X_train_const = sm.add_constant(X_train)

# Fit the logistic regression on the training rows and show the coefficient table
logit_model = sm.Logit(y_train, X_train_const)
log_reg = logit_model.fit()
print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.549537
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               Diabetes   No. Observations:                  614
Model:                          Logit   Df Residuals:                      611
Method:                           MLE   Df Model:                            2
Date:                Thu, 14 Dec 2023   Pseudo R-squ.:                  0.1487
Time:                        00:42:42   Log-Likelihood:                -337.42
converged:                       True   LL-Null:                       -396.34
Covariance Type:            nonrobust   LLR p-value:                 2.559e-26
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -6.3713      0.626    -10.185      0.000      -7.597      -5.145
Age            0.0575      0.

# Testing the model:

In [None]:
# Give the test predictors the same intercept column the model was trained with
X_test_const = sm.add_constant(X_test)

# Predicted probabilities, each rounded to a hard 0/1 class label
pred = log_reg.predict(X_test_const)
Pred = [round(probability) for probability in pred]

# Fraction of test rows whose predicted label matches the actual label
test_accuracy = accuracy_score(y_test, Pred)
print('\nModel accuracy with Testing data =', test_accuracy)

# Counts of actual (rows) versus predicted (columns) classes
Conf_mat = confusion_matrix(y_test, Pred)
print ("\nConfusion Matrix = \n", Conf_mat)


Model accuracy with Testing data = 0.6428571428571429

Confusion Matrix = 
 [[79 20]
 [35 20]]


---

The confusion matrices and the accuracy scores are the same for tackling missing BMI data by replacing them with mean/median/most frequent value as they were when missing values were just dropped.

---