In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset
file_path = '/content/drive/My Drive/Colab Notebooks/IndianLiver.csv'
column_names = ['Age', 'Gender', 'TB', 'DB', 'Alkphos', 'sgpt', 'sgot', 'TP', 'ALB', 'A/G ratio', 'Selector']
data = pd.read_csv(file_path, header=None, names=column_names)

# Checking for NaN values
print(data.isnull().sum())

# Handling NaN values for numerical columns
numerical_cols = ['Age', 'TB', 'DB', 'Alkphos', 'sgpt', 'sgot', 'TP', 'ALB', 'A/G ratio']
imputer = SimpleImputer(strategy='median')
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# If 'Gender' had NaN values or special handling was needed:
# data['Gender'].fillna('Unknown', inplace=True)

# Convert 'Gender' to a binary variable (ensure no NaN values beforehand)
data['Gender'] = pd.get_dummies(data['Gender'], drop_first=True)

# Split the dataset into features (X) and the label (y)
X = data.drop('Selector', axis=1)
y = data['Selector']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create decision tree models with varying depths
models = [DecisionTreeClassifier(max_depth=1),  # Low complexity
          DecisionTreeClassifier(max_depth=5),  # Medium complexity
          DecisionTreeClassifier(max_depth=None)]  # High complexity

# Fit models and evaluate their accuracy
for model in models:
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    print(f"Model: {model}, Train Accuracy: {accuracy_score(y_train, y_pred_train)}, Test Accuracy: {accuracy_score(y_test, y_pred_test)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Age          0
Gender       0
TB           0
DB           0
Alkphos      0
sgpt         0
sgot         0
TP           0
ALB          0
A/G ratio    4
Selector     0
dtype: int64
Model: DecisionTreeClassifier(max_depth=1), Train Accuracy: 0.7060085836909872, Test Accuracy: 0.7435897435897436
Model: DecisionTreeClassifier(max_depth=5), Train Accuracy: 0.776824034334764, Test Accuracy: 0.7435897435897436
Model: DecisionTreeClassifier(), Train Accuracy: 1.0, Test Accuracy: 0.7435897435897436


In [8]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Imputing missing values in 'A/G ratio'
imputer = SimpleImputer(strategy='median')
data['A/G ratio'] = imputer.fit_transform(data[['A/G ratio']])

# Parameters to be tested in the grid search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 11)  # Testing depths from 1 to 10
}

# Decision tree classifier for the grid search
dt = DecisionTreeClassifier()

# Grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score}")

# Best model according to grid search
best_model = DecisionTreeClassifier(criterion=best_params['criterion'], max_depth=best_params['max_depth'])

# Perform cross-validation to get a new set of scores for the best model
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5)

# Compute mean and standard deviation of the new scores
mean_score = np.mean(cv_scores)
std_dev = np.std(cv_scores, ddof=1)

# Compute 95% Confidence Interval for the mean score
confidence_interval = (mean_score - 1.96 * std_dev, mean_score + 1.96 * std_dev)

print(f"95% Confidence Interval: {confidence_interval}")

Best Parameters: {'criterion': 'gini', 'max_depth': 1}
Best Cross-Validation Score: 0.7060169297643561
95% Confidence Interval: (0.6959451585923992, 0.716088700936313)


# Low Comp Model
The low complexity model had a training accuracy of approximately 70.6% and a test accuracy of approximately 74.4%. It scores decently in both training and testing, suggesting a balance between learning and generalizing to new data.

# Medium Comp Model
The medium complexity model improved the training accuracy to about 77.7% without changing the test accuracy, indicating a better fit to the training data while maintaining performance. It improves on training accuracy but doesn't see a boost in testing performance, indicating a better fit to the training data without necessarily overfitting.

# High Comp Interval
The high complexity model achieved perfect training accuracy (100%) but did not improve on test accuracy, suggesting it might be overfitting the training data.

#Overfitting:
The high complexity model, despite its perfect score on training data, fails to improve on unseen data, suggesting it's too tailored to the training examples.

# Underfitting:
The low complexity model, while not explicitly underperforming, could be seen as too simplistic as it might not capture all the patterns in the data.

# Best Model Parameters
Using 5-fold cross-validation and grid search, the analysis concluded that a simple decision tree (max_depth=1) with gini criterion is optimal, suggesting that beyond a certain point, adding complexity doesn't yield better generalization.

# Confidence Interval
The confidence interval calculated for the best model's accuracy (approximately 69.6% to 71.6%) provides an estimate of where we expect the model's true accuracy to lie 95% of the time, giving us confidence in its stability.

# Medium vs Best
Between the medium complexity and the best (simple) model, the simple model is favored for its comparable performance in test scenarios but with far less complexity. This suggests that for this dataset, a simpler model achieves effective generalization without the need for intricate decision-making processes. The choice leans towards the simpler model, emphasizing efficiency and generalizability. Despite the higher training accuracy of the medium complexity model, its similar test performance to the simpler model indicates no significant benefit from the added complexity.