In [None]:
# Importing required libraries
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Load the CSV into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
url = 'preprocessed_diabetes.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

In [None]:
# Show the column labels of the loaded frame
df.columns

In [None]:
#### Encoding smoking history column
# Each label is replaced by its position in this explicit list (Never -> 0, ...).
# NOTE(review): the resulting integers are consumed as a numeric feature below,
# so this ordering is treated as meaningful by the models — confirm it is intended.
# NOTE(review): assumes 'smoking_history' contains only these five labels; with the
# encoder's default settings any other value makes fit_transform raise — verify.
smoking_levels = ['Never', 'Current', 'Former', 'Ever', 'Not current']
encoder = OrdinalEncoder(categories=[smoking_levels])
smoking_codes = encoder.fit_transform(df[['smoking_history']])
df['smoking_encoded'] = smoking_codes

## 1. LOGISTIC REGRESSION

#### Split the Data into training and test dataset

In [None]:
# Separating target and features
feature_columns = ['gender', 'age', 'hypertension', 'heart_disease', 'smoking_encoded',
                   'bmi', 'HbA1c_level', 'blood_glucose_level']
X = df[feature_columns]

# Select the target as a 1-D Series rather than a single-column DataFrame:
# scikit-learn estimators expect y with shape (n_samples,), and a column-vector
# y makes every subsequent fit() emit a DataConversionWarning.
y = df['diabetes']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Fitting the Logistic Regression Model

In [None]:
# Creating a logistic regression model.
# max_iter is raised from the library default of 100: no feature scaling is
# applied in this notebook, and with features on different scales the solver can
# reach the iteration cap before converging (ConvergenceWarning), leaving
# coefficients that are not the actual optimum.
model = LogisticRegression(max_iter=1000)

# Fitting the model to the training data
model.fit(X_train, y_train)

#### Making Predictions

In [None]:
# Predicted class labels for the held-out test rows (logistic regression)
y_pred = model.predict(X_test)

In [None]:
# predict_proba returns one column per class; keep the second column,
# i.e. the estimated probability of class 1, for threshold analysis.
class_probabilities = model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

#### Model Evaluation

In [None]:
# Fraction of test rows whose predicted label equals the true label,
# reported on a 0-100 scale with two decimals.
accuracy = accuracy_score(y_test, y_pred)
accuracy_pct = accuracy * 100
print(f"Accuracy: {accuracy_pct:.2f}")

#### Confusion Matrix

In [None]:
# Confusion matrix for the test set: rows are actual classes, columns are
# predicted classes (scikit-learn convention).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

#### Classification Report (Precision, Recall, F1-score)

In [None]:
# Per-class precision, recall and F1 on the test set
report = classification_report(y_test, y_pred)
print(report)

#### Interpretation of Model Coefficients

In [None]:
# Learned weights of the fitted logistic regression; the values follow the
# column order of the feature frame the model was trained on.
coefficients = model.coef_
print("Model Coefficients: {}".format(coefficients))

#### Tuning and Cross-Validation

In [None]:
# Candidate inverse-regularisation strengths (smaller C = stronger regularisation)
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}

# Using GridSearchCV to search for the best parameters (5-fold cross-validation).
# max_iter is raised from the default of 100: the weakly regularised candidates
# (large C) are the slowest to converge, and comparing unconverged fits would
# make the cross-validated ranking of C unreliable.
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best hyperparameters
print(grid_search.best_params_)


## Visualization

#### Confusion matrix

In [None]:
# Annotated heatmap of the confusion matrix held in `cm`
class_labels = ["Negative", "Positive"]
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
plt.show()

#### Actual vs Predicted Values

In [None]:
# Overlay actual and predicted labels per test row; the predicted points are
# drawn nearly transparent so the actual values stay visible underneath.
sample_index = range(len(y_test))
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(sample_index, y_test, color='blue', alpha=0.9, label='Actual Values')
ax.scatter(sample_index, y_pred, color='orange', alpha=0.2, label='Predicted Values')
ax.set_title('Actual vs Predicted Values (Logistic Regression)')
ax.set_xlabel('Data Point Index')
ax.set_ylabel('Class (0 or 1)')
ax.legend()
ax.grid(True)
plt.show()


#### Precision Recall vs Threshold

In [None]:
# Precision and recall at every candidate decision threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# precision_recall_curve returns one more precision/recall value than
# thresholds, so the final element is dropped to align the arrays for plotting.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(thresholds, precision[:-1], label="Precision", color="blue")
ax.plot(thresholds, recall[:-1], label="Recall", color="orange")
ax.set_title('Precision and Recall vs Threshold')
ax.set_xlabel('Decision Threshold')
ax.set_ylabel('Score')
ax.legend(loc="best")
ax.grid(True)
plt.show()


## 2. Decision Tree

In [None]:
# Importing required libraries
from sklearn.tree import DecisionTreeClassifier, plot_tree

#### Training the model

In [None]:
# Decision tree with default growth settings; the fixed seed makes
# tie-breaking between equally good splits reproducible.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train on the same training split used for the logistic regression
dt_classifier.fit(X=X_train, y=y_train)

#### Predictions

In [None]:
# Making predictions on the test set
# NOTE: this rebinds `y_pred`, replacing the logistic-regression predictions
# assigned earlier in the notebook. Re-running an earlier cell that reads
# `y_pred` after this one will silently use the decision-tree output instead.
y_pred = dt_classifier.predict(X_test)

#### Model Evaluation

In [None]:
# Calculate accuracy; printed on a 0-100 scale with two decimals so it is
# directly comparable with the accuracy printed in the logistic-regression section
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy*100:.2f}')

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix (rows: actual class, columns: predicted class).
# The label and the matrix are printed separately so the matrix rows line up.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

## Visualization

#### Confusion Matrix

In [None]:
# Annotated heatmap of the decision-tree confusion matrix (no colour bar)
outcome_labels = ['No Diabetes', 'Diabetes']
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=outcome_labels,
    yticklabels=outcome_labels,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

#### Trained Decision tree

In [None]:
# Plot the decision tree
# feature_names is passed as a plain list: some scikit-learn releases validate
# this argument as a list and reject a pandas Index such as X.columns.
# The tree was grown without a depth limit, so the full plot may be dense; pass
# max_depth=<k> to plot_tree to draw only the top levels if it is unreadable.
plt.figure(figsize=(20, 10))
plot_tree(
    dt_classifier,
    feature_names=list(X.columns),
    class_names=['No Diabetes', 'Diabetes'],
    filled=True,
)
plt.title('Decision Tree Visualization')
plt.show()