# 1. What is a Decision Tree?
# A supervised machine learning algorithm used for classification and regression.
# It splits the data into branches based on feature thresholds, leading to decisions.

# 2. How does it work?
# At each node, it picks the best feature to split using an impurity metric (like Gini or Entropy).
# The process repeats recursively until stopping criteria are met.

# 3. Gini Impurity Formula:
# Gini(D) = 1 - sum(p_i^2) for each class i

# 4. Entropy Formula:
# Entropy(D) = -sum(p_i * log2(p_i)) for each class i

# 5. Information Gain:
# IG(D, A) = Entropy(D) - sum_v [ (|D_v| / |D|) * Entropy(D_v) ], summed over the values v of attribute A

# 6. Gini vs Entropy:
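# Gini is simpler and faster to compute (no logarithm); Entropy may yield slightly more balanced splits.
# In practice the two criteria usually produce very similar trees.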

# 7. Mathematical View:
# Decision Trees use a greedy approach to minimize impurity or maximize Information Gain.

# 8. Pre-Pruning:
# Stop tree early using parameters like max_depth, min_samples_split.

# 9. Post-Pruning:
# Grow full tree, then prune unnecessary nodes using validation or cost-complexity pruning.

# 10. Pre-Pruning vs Post-Pruning:
# Pre: Happens during training. Post: Happens after full tree is built.

# 11. Decision Tree Regressor:
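# Predicts continuous outputs; each split is chosen to minimize the MSE (variance) of the child nodes,
# and each leaf predicts the mean target of its training samples.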

# 12. Advantages:
# Interpretable, works with mixed feature types, no scaling needed.

# 13. Disadvantages:
# Prone to overfitting, unstable with small changes in data.

# 14. Missing Values Handling:
# Some implementations (e.g. classic CART) use surrogate splits; otherwise impute missing values before training.

# 15. Categorical Features:
# One-hot encode or use split-by-category logic.

# 16. Applications:
# Loan approval, fraud detection, medical diagnosis, customer churn, etc.


In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Baseline: fit a default decision tree on the iris data and report its
# accuracy on a held-out 20% test split.
iris = load_iris()

# Seed the split so the printed accuracy is reproducible from run to run;
# stratify so every class keeps the same proportion in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# The tree itself is seeded too, so refitting on the same data yields the same tree.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))


Accuracy: 1.0


In [3]:
# Refit with the Gini criterion spelled out explicitly, then show how much
# each input feature contributed to the fitted tree.
model = DecisionTreeClassifier(criterion='gini').fit(X_train, y_train)

print("Feature Importances:", model.feature_importances_)


Feature Importances: [0.0167399  0.0167399  0.40268917 0.56383102]


In [4]:
# Same training data, but candidate splits are scored with entropy.
# `model` is rebound to this entropy tree from here on.
model = DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=model.predict(X_test)))


Accuracy: 0.9


In [5]:
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Regression example: an unconstrained tree on the California housing data.
# NOTE: this rebinds X_train / X_test / y_train / y_test (and pred), so from
# here on they hold the housing split rather than the iris split created earlier.
data = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
)

reg = DecisionTreeRegressor().fit(X_train, y_train)
pred = reg.predict(X_test)

print("MSE:", mean_squared_error(y_true=y_test, y_pred=pred))


MSE: 0.5477097968744914


In [6]:
from sklearn.tree import export_graphviz
import graphviz

# Render the fitted iris classifier `model` as a Graphviz diagram.
# out_file=None makes export_graphviz return the DOT source as a string
# instead of writing it to a file.
dot_data = export_graphviz(
    model,
    out_file=None,
    filled=True,
    rounded=True,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
)
graph = graphviz.Source(dot_data)

# Writes the rendered "iris_tree" output and opens it in the default viewer.
graph.render(filename="iris_tree", view=True)


'iris_tree.pdf'

In [None]:
# Pre-pruning with max_depth: compare an unrestricted tree with a depth-3 tree.
#
# X_train / X_test / y_train / y_test were rebound to the California-housing
# regression split earlier in the file, so a classifier cannot be fitted or
# scored on them here. Build a dedicated, reproducible iris split instead.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# Both trees use the same criterion and seed so max_depth is the only difference.
full_tree = DecisionTreeClassifier(random_state=42).fit(X_train_iris, y_train_iris)
short_tree = DecisionTreeClassifier(max_depth=3, random_state=42).fit(X_train_iris, y_train_iris)

print("Full Tree Accuracy:", accuracy_score(y_test_iris, full_tree.predict(X_test_iris)))
print("Depth-3 Accuracy:", accuracy_score(y_test_iris, short_tree.predict(X_test_iris)))


In [None]:
# Pre-pruning with min_samples_split: compare the default tree with one that
# refuses to split nodes holding fewer than 5 samples.
#
# X_train / X_test / y_train / y_test were rebound to the California-housing
# regression split earlier in the file, so a classifier cannot be fitted or
# scored on them here. Build a dedicated, reproducible iris split instead.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# Both trees share criterion and seed so min_samples_split is the only difference.
default_tree = DecisionTreeClassifier(random_state=42).fit(X_train_iris, y_train_iris)
custom_tree = DecisionTreeClassifier(min_samples_split=5, random_state=42).fit(X_train_iris, y_train_iris)

print("Default Accuracy:", accuracy_score(y_test_iris, default_tree.predict(X_test_iris)))
print("min_samples_split=5 Accuracy:", accuracy_score(y_test_iris, custom_tree.predict(X_test_iris)))


In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Check whether feature scaling changes a decision tree's accuracy by
# comparing a StandardScaler + tree pipeline with a plain tree.
#
# X_train / X_test / y_train / y_test were rebound to the California-housing
# regression split earlier in the file, so a classifier cannot be fitted or
# scored on them here. Build a dedicated, reproducible iris split instead.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# Same tree settings and seed on both sides so scaling is the only difference.
pipe = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=42))
pipe.fit(X_train_iris, y_train_iris)
unscaled_tree = DecisionTreeClassifier(random_state=42).fit(X_train_iris, y_train_iris)

print("Scaled Accuracy:", accuracy_score(y_test_iris, pipe.predict(X_test_iris)))
print("Unscaled Accuracy:", accuracy_score(y_test_iris, unscaled_tree.predict(X_test_iris)))


In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Multiclass via One-vs-Rest: wrap a decision tree in OneVsRestClassifier
# and score it on held-out iris data.
#
# X_train / X_test / y_train / y_test were rebound to the California-housing
# regression split earlier in the file, so a classifier cannot be fitted or
# scored on them here. Build a dedicated, reproducible iris split instead.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

ovr_model = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
ovr_model.fit(X_train_iris, y_train_iris)

print("OvR Accuracy:", accuracy_score(y_test_iris, ovr_model.predict(X_test_iris)))

In [11]:
import pandas as pd

# Tabulate the feature importances of the fitted iris tree `model`,
# most important feature first.
importance_df = pd.DataFrame(
    {
        'Feature': iris.feature_names,
        'Importance': model.feature_importances_,
    }
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)


             Feature  Importance
3   petal width (cm)    0.661784
2  petal length (cm)    0.323675
0  sepal length (cm)    0.014542
1   sepal width (cm)    0.000000


In [12]:
# Pre-pruning for regression: cap the tree depth at 5 and compare its test
# MSE with the unrestricted regressor `reg` on the same split.
limited_reg = DecisionTreeRegressor(max_depth=5).fit(X_train, y_train)

print("Unrestricted MSE:", mean_squared_error(y_true=y_test, y_pred=reg.predict(X_test)))
print("max_depth=5 MSE:", mean_squared_error(y_true=y_test, y_pred=limited_reg.predict(X_test)))


Unrestricted MSE: 0.5477097968744914
max_depth=5 MSE: 0.5398376312027791


In [17]:
# Post-pruning: compute the cost-complexity pruning path on iris, then fit
# one tree per candidate alpha and record its held-out accuracy.
#
# X_train / X_test / y_train / y_test were rebound to the California-housing
# regression split earlier in the file, so a classifier cannot be fitted or
# scored on them here. Build a dedicated, reproducible iris split instead.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# Take the alphas from a tree configured exactly like the pruned trees below
# (same criterion and seed), so the candidate alphas match the trees they are
# applied to. cost_complexity_pruning_path fits on the data it is given.
path = DecisionTreeClassifier(random_state=42).cost_complexity_pruning_path(
    X_train_iris, y_train_iris
)
ccp_alphas = path.ccp_alphas

accuracies = []
for alpha in ccp_alphas:
    pruned_tree = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42)
    pruned_tree.fit(X_train_iris, y_train_iris)
    accuracies.append(accuracy_score(y_test_iris, pruned_tree.predict(X_test_iris)))

print("CCP Alpha Range:", ccp_alphas)
print("Accuracies:", accuracies)

In [17]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for an iris decision tree.
#
# X_test / y_test were rebound to the California-housing regression split
# earlier in the file, so the iris classifier cannot be scored on them here.
# Build a dedicated, reproducible iris split and fit the tree on it.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# Same criterion as the most recent `model` (entropy), refitted on the split above.
iris_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
iris_tree.fit(X_train_iris, y_train_iris)

print(classification_report(y_test_iris, iris_tree.predict(X_test_iris), target_names=iris.target_names))


In [17]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Plot the confusion matrix of an iris decision tree as an annotated heatmap.
#
# X_test / y_test were rebound to the California-housing regression split
# earlier in the file, so the iris classifier cannot be scored on them here.
# Build a dedicated, reproducible iris split and fit the tree on it.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

# Same criterion as the most recent `model` (entropy), refitted on the split above.
iris_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
iris_tree.fit(X_train_iris, y_train_iris)

cm = confusion_matrix(y_test_iris, iris_tree.predict(X_test_iris))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cells are integer counts
    cmap='Blues',
    xticklabels=iris.target_names,
    yticklabels=iris.target_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search: 5-fold cross-validated grid over max_depth and
# min_samples_split for an iris decision tree.
#
# X_train / y_train were rebound to the California-housing regression split
# earlier in the file, so a classifier cannot be fitted on them here.
# Build a dedicated, reproducible iris split instead; the test part is left
# untouched by the search.
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
    stratify=iris.target,
)

params = {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10]}

# Seed the estimator so the best parameters and score are reproducible.
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid=params, cv=5)
grid.fit(X_train_iris, y_train_iris)

print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)