Q1. Import the dataset and examine the variables. Use descriptive statistics and visualizations to
understand the distribution and relationships between the variables.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler

# Load the dataset (expects 'diabetes.csv' in the notebook's working directory)
df = pd.read_csv('diabetes.csv')

### Question 1: Examine Variables
# Display the first few rows of the dataset
print("Dataset Preview:")
print(df.head())

# Summary statistics.
# A bare `df.describe()` in the middle of a cell is evaluated and discarded
# (only the last expression of a cell is displayed), so print it explicitly.
print("\nSummary Statistics:")
print(df.describe())

# Count of NaN entries per column.
# NOTE(review): if this file encodes missing measurements as 0 rather than NaN,
# this check will report nothing missing -- compare against the `min` row above.
print("\nMissing Values:")
print(df.isnull().sum())

# Pairwise scatter plots / per-feature distributions, coloured by the target class
sns.pairplot(df, hue='Outcome')
plt.show()

# Pairwise correlation of all columns, annotated with the coefficient values
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

Q2. Preprocess the data by cleaning missing values, removing outliers, and transforming categorical
variables into dummy variables if necessary.

In [None]:
# Impute missing values (if any) with the column median.
# The result is assigned back to the column instead of calling
# `df[column].fillna(..., inplace=True)`: that chained in-place form operates on
# an intermediate object, triggers a warning in recent pandas releases, and does
# not update `df` when copy-on-write is enabled.
for column in df.columns:
    if df[column].isnull().any():
        df[column] = df[column].fillna(df[column].median())

# Remove outliers using z-scores computed on the feature columns only
# (the 'Outcome' target is excluded). A row is kept only when every feature
# lies within 3 standard deviations of its column mean.
# NOTE: `df` is reassigned from itself, so re-running this cell filters the
# already-filtered data again; re-run the loading cell first for a clean result.
z_scores = np.abs(zscore(df.drop('Outcome', axis=1)))
df = df[(z_scores < 3).all(axis=1)]

# Check dataset after preprocessing.
# `DataFrame.info()` writes its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
print("\nDataset after preprocessing:")
df.info()

Q3. Split the dataset into a training set and a test set. Use a random seed to ensure reproducibility.

In [None]:
# Separate the feature matrix from the target column
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split BEFORE scaling so that the test rows play no part in estimating the
# scaling statistics. Fitting the scaler on the full dataset leaks test-set
# information into preprocessing.
# The fixed random_state keeps the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features: learn mean/std on the training rows only, then
# apply that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with any code that still expects the full
# scaled matrix; it now uses the statistics learned from the training rows.
X_scaled = scaler.transform(X)


Q4. Use a decision tree algorithm, such as ID3 or C4.5, to train a decision tree model on the training set. Use
cross-validation to optimize the hyperparameters and avoid overfitting.

In [None]:
# Hyperparameter search for the decision tree: every combination of depth limit
# and split/leaf size limits below is scored by accuracy with 5-fold
# cross-validation on the training set.
params = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and keep the corresponding
# estimator for the evaluation steps that follow.
print("\nBest Parameters:", grid_search.best_params_)
best_dt = grid_search.best_estimator_

Q5. Evaluate the performance of the decision tree model on the test set using metrics such as accuracy,
precision, recall, and F1 score. Use confusion matrices and ROC curves to visualize the results.

In [None]:
# Class predictions for the held-out test split
y_pred = best_dt.predict(X_test)

# Score the predictions with each metric in turn (same order as the names on
# the left-hand side).
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\nEvaluation Metrics:\nAccuracy: {accuracy:.2f}\nPrecision: {precision:.2f}\nRecall: {recall:.2f}\nF1 Score: {f1:.2f}")

# Confusion matrix drawn as an annotated heatmap; rows are the actual classes,
# columns the predicted classes.
class_labels = ['Non-Diabetic', 'Diabetic']
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

# ROC curve built from the predicted probability of the positive class
# (column 1 of predict_proba), with the area under the curve in the legend.
probs = best_dt.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, probs)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Diagonal reference line for a classifier that guesses at random
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()


Q6. Interpret the decision tree by examining the splits, branches, and leaves. Identify the most important
variables and their thresholds. Use domain knowledge and common sense to explain the patterns and
trends.

In [None]:
# Visualize the tree.
# `feature_names` is given as a plain list: scikit-learn releases that validate
# this parameter as a list raise InvalidParameterError when handed a pandas
# Index such as `X.columns`. A list works on every version.
# NOTE: the model was trained on standardized features, so the split thresholds
# shown in the plot are in standardized units, not the original measurement
# units.
feature_names = list(X.columns)

plt.figure(figsize=(20, 10))
plot_tree(best_dt, feature_names=feature_names, class_names=['Non-Diabetic', 'Diabetic'], filled=True)
plt.title('Decision Tree Visualization')
plt.show()

# Feature importance as reported by the fitted tree, most important first
importance = pd.DataFrame(
    {'Feature': feature_names, 'Importance': best_dt.feature_importances_}
).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(importance)


Q7. Validate the decision tree model by applying it to new data or testing its robustness to changes in the
dataset or the environment. Use sensitivity analysis and scenario testing to explore the uncertainty and
risks.

In [None]:
# Sensitivity analysis: add small Gaussian noise to the test features and check
# how much the accuracy changes.
# The noise comes from a seeded generator so the reported figure is the same on
# every run; the unseeded global `np.random.normal` gave a different result on
# each execution. The seed matches the random_state=42 used elsewhere.
# The noise standard deviation (0.1) is in standardized-feature units, because
# X_test has been passed through the StandardScaler.
rng = np.random.default_rng(42)
X_test_perturbed = X_test + rng.normal(0, 0.1, X_test.shape)
y_pred_perturbed = best_dt.predict(X_test_perturbed)
perturbed_accuracy = accuracy_score(y_test, y_pred_perturbed)

print(f"\nValidation Metrics:\nAccuracy with Perturbed Data: {perturbed_accuracy:.2f}")
