In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split # For splitting data
from sklearn.metrics import accuracy_score, mean_squared_error # For evaluating performance
import matplotlib.pyplot as plt # For plotting the tree

In [None]:
# Read the Wisconsin breast-cancer CSV; its first column becomes the row index.
breast_cancer_csv = r"D:\workspace\MachineLearning\Cases\Wisconsin\BreastCancer.csv"
df = pd.read_csv(breast_cancer_csv, index_col=0)

In [None]:
# Target is the 'Class' column; every other column is a predictor.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 30% of the rows for testing. The split is stratified on the target
# and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=25,
)

In [None]:
# Baseline: a tree with default growth limits, scored on the held-out split.
dtc = DecisionTreeClassifier(random_state=25).fit(X_train, y_train)
y_pred = dtc.predict(X_test)
print(accuracy_score(y_test, y_pred))

In [None]:
# Sweep max_depth over every integer from 2 to 20, each exactly once.
# The previous np.linspace(2, 20, 20).astype(int) generated 20 fractional
# points that truncate to 2, 2, 3, ..., 20, so depth 2 was fitted and
# reported twice.
depths = np.arange(2, 21)
scores = []

for d in depths:
    dtc = DecisionTreeClassifier(random_state=25, max_depth=d)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    scores.append([d, accuracy_score(y_test, y_pred)])

# NOTE(review): the depth is ranked by accuracy on the test split itself, so
# the top score is an optimistic estimate; consider cross-validating on the
# training split instead.
scores_df = pd.DataFrame(scores, columns=['Depth', 'Scores'])
scores_df.sort_values('Scores', ascending=False)

In [None]:
# Draw the most recently fitted tree held in `dtc`.
plt.figure(figsize=(12, 8))  # Adjust figure size as needed
plot_tree(
    dtc,
    # Passed as a plain list, as the other plot_tree calls in this notebook do.
    feature_names=list(X_train.columns),
    # class_names labels the target classes, so it must come from the fitted
    # classes (in the estimator's own order), not from the feature columns.
    class_names=[str(label) for label in dtc.classes_],
    filled=True,
    rounded=True,
)
plt.show()

In [None]:
# Sweep min_samples_split over every integer from 2 to 20, each exactly once.
# The previous np.linspace(2, 20, 20).astype(int) truncated to
# 2, 2, 3, ..., 20, so the value 2 was evaluated twice.
min_samples = np.arange(2, 21)
print(min_samples)
scores = []

for min_sample in min_samples:
    dtc = DecisionTreeClassifier(random_state=25, min_samples_split=min_sample)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    scores.append([min_sample, accuracy_score(y_test, y_pred)])

# One row per candidate value, shown best-first.
scores_df = pd.DataFrame(scores, columns=['Min Sample Split', 'Scores'])
scores_df.sort_values('Scores', ascending=False)

In [None]:
# Disabled exhaustive sweep: 20 depths x 20 leaf sizes x 20 split sizes, i.e.
# 8000 tree fits.
# NOTE(review): presumably commented out because of run time and superseded by
# a coarser grid -- confirm, and delete this cell if it is no longer needed.
# depths = np.linspace(2,20,20).astype(int)
# min_samples = np.linspace(2,20,20).astype(int)
# min_sample_leaf = np.linspace(1,20,20).astype(int)
# scores=[]

# for d in depths:
#     for leaf in min_sample_leaf:
#         for split in min_samples:
#             dtc = DecisionTreeClassifier(random_state=25, max_depth=d, min_samples_leaf=leaf, min_samples_split= split)
#             dtc.fit(X_train, y_train)
#             y_pred = dtc.predict(X_test)
#             scores.append([d,leaf,split,accuracy_score(y_test, y_pred)])

# scores_df = pd.DataFrame(scores, columns=['Depth','Leaf','Split', 'Scores'])
# scores_df.sort_values('Scores', ascending=False)

In [None]:
# Coarse grid over tree depth, minimum leaf size and minimum split size.
# Every combination is fitted on the training split and scored on the
# held-out split.
depths = [2, 3, 4, 5, 6, None]
min_samples = [2, 10, 20, 50, 75]
min_sample_leaf = [1, 10, 20, 50, 75]
scores = []

for d in depths:
    for leaf in min_sample_leaf:
        for split in min_samples:
            dtc = DecisionTreeClassifier(
                max_depth=d,
                min_samples_leaf=leaf,
                min_samples_split=split,
                random_state=25,
            ).fit(X_train, y_train)
            y_pred = dtc.predict(X_test)
            scores.append([d, leaf, split, accuracy_score(y_test, y_pred)])

# Show only the top-ranked row of the grid.
scores_df = pd.DataFrame(scores, columns=['Depth', 'Leaf', 'Split', 'Scores'])
scores_df.sort_values('Scores', ascending=False).iloc[0]

In [None]:
# Refit a tree on the complete dataset.
# NOTE(review): these settings are hard-coded, presumably copied from the top
# row of the grid search -- confirm they still match after a re-run.
best_tree = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=1,
    min_samples_split=20,
    random_state=25,
)

best_tree.fit(X, y)

In [None]:
# Show each feature's importance next to the running total.
# Previously the bare `best_tree.feature_importances_` expression was not the
# last one in the cell, so only the cumulative sum was ever displayed.
pd.DataFrame(
    {
        'Importance': best_tree.feature_importances_,
        'Cumulative': np.cumsum(best_tree.feature_importances_),
    },
    index=X.columns,  # best_tree was fitted on X, so the lengths match
)

In [None]:
# Column labels of the feature frame X, the frame best_tree was fitted on.
X.columns

In [None]:
# Pair every feature name with its importance in the fitted tree.
df_imp = pd.DataFrame(
    {
        'Feature': X.columns.tolist(),
        'Importance': best_tree.feature_importances_,
    }
)

# Displayed most-important first; df_imp itself keeps the original column order.
df_imp.sort_values(by='Importance', ascending=False)

In [None]:
# Horizontal bar per feature, bar length equal to its importance.
feature_labels = df_imp['Feature']
importance_values = df_imp['Importance']
plt.barh(feature_labels, importance_values)
plt.show()

In [None]:
# Render the refitted tree on a 35 x 15 inch canvas.
plt.figure(figsize=(35, 15))
plot_tree(
    best_tree,
    feature_names=X.columns.tolist(),
    class_names=['Benign', 'Malignant'],
    rounded=True,
    filled=True,
)
plt.show()

In [None]:
# Repeat the coarse grid search with four features removed.
depths = [2, 3, 4, 5, 6, None]
min_samples = [2, 10, 20, 50, 75]
min_sample_leaf = [1, 10, 20, 50, 75]
scores = []

# NOTE(review): presumably the features with the lowest importance in the
# refitted tree -- confirm against the importance table.
dropped_features = ['SEpith', 'Mitoses', 'BChromatin', 'MargAdh']

# Write the reduced frames to new names instead of overwriting X_train/X_test.
# Overwriting made this cell fail with a KeyError on a second run (the columns
# were already gone) and silently changed the inputs of earlier cells.
X_train_reduced = X_train.drop(columns=dropped_features)
X_test_reduced = X_test.drop(columns=dropped_features)

for d in depths:
    for leaf in min_sample_leaf:
        for split in min_samples:
            dtc = DecisionTreeClassifier(random_state=25, max_depth=d, min_samples_leaf=leaf, min_samples_split=split)
            dtc.fit(X_train_reduced, y_train)
            y_pred = dtc.predict(X_test_reduced)
            scores.append([d, leaf, split, accuracy_score(y_test, y_pred)])

# Show only the top-ranked row of the grid.
scores_df = pd.DataFrame(scores, columns=['Depth', 'Leaf', 'Split', 'Scores'])
scores_df.sort_values('Scores', ascending=False).iloc[0]

In [None]:
# NOTE(review): hard-coded figures -- presumably the best grid-search accuracy
# with all features minus the best accuracy after dropping four features.
# Confirm, and derive the values from the score tables instead of pasting them.
0.961905 - 0.952381

In [None]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import OneHotEncoder


# Load the HR analytics data and separate the 'left' target from the predictors.
hr_csv = r'D:\workspace\MachineLearning\Cases\human-resources-analytics\HR_comma_sep.csv'
df = pd.read_csv(hr_csv)
y = df['left']
X = df.drop(columns='left')

# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=25,
)

# One-hot encode the object-dtype columns (dropping the first level of each)
# and pass every other column through unchanged. Outputs stay as DataFrames.
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.set_output(transform='pandas')
object_columns = make_column_selector(dtype_include=object)
other_columns = make_column_selector(dtype_exclude=object)
ct = make_column_transformer(
    (ohe, object_columns),
    ("passthrough", other_columns),
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# The encoder is fitted on the training rows only, then applied to the test rows.
X_train_ohe = ct.fit_transform(X_train)
X_test_ohe = ct.transform(X_test)

# Grid over depth, minimum leaf size and minimum split size.
depths = [2, 3, 4, 5, 6, None]
min_samples = [2, 10, 20, 50, 75, 100, 500, 1000]
min_sample_leaf = [1, 10, 20, 50, 75, 100, 500]
scores = []

for d in depths:
    for leaf in min_sample_leaf:
        for split in min_samples:
            dtc = DecisionTreeClassifier(
                max_depth=d,
                min_samples_leaf=leaf,
                min_samples_split=split,
                random_state=25,
            ).fit(X_train_ohe, y_train)
            y_pred = dtc.predict(X_test_ohe)
            scores.append([d, leaf, split, accuracy_score(y_test, y_pred)])

# Full table of results, best accuracy first.
scores_df = pd.DataFrame(scores, columns=['Depth', 'Leaf', 'Split', 'Scores'])
scores_df.sort_values('Scores', ascending=False)

In [None]:
# Encode the complete feature frame with the already-fitted transformer, then
# refit a tree on all rows.
# NOTE(review): the hyperparameters are hard-coded, presumably taken from the
# grid-search table -- confirm they still match after a re-run.
X_ct = ct.transform(X)
best_tree = DecisionTreeClassifier(
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=75,
    random_state=25,
)
best_tree.fit(X_ct, y)

In [None]:
# Importance values of the tree fitted on the encoded frame X_ct.
best_tree.feature_importances_

In [None]:
# Column labels of the raw (pre-encoding) feature frame.
# NOTE(review): best_tree was fitted on X_ct, so feature_importances_ lines up
# with X_ct.columns, which may differ from these labels after one-hot encoding.
X.columns

In [None]:
# Pair every encoded feature name with its importance in the fitted tree.
df_imp = pd.DataFrame(
    {
        'Feature': X_ct.columns.tolist(),
        'Importance': best_tree.feature_importances_,
    }
)

# Displayed most-important first; df_imp itself keeps the encoded column order.
df_imp.sort_values(by='Importance', ascending=False)

In [None]:
# Horizontal bar per encoded feature, bar length equal to its importance.
feature_labels = df_imp['Feature']
importance_values = df_imp['Importance']
plt.barh(feature_labels, importance_values)
plt.show()

In [None]:
# Render the refitted tree on a 150 x 150 inch canvas.
# NOTE(review): a figure this large is expensive to rasterise; consider
# plot_tree's max_depth argument if only the top levels are needed.
plt.figure(figsize=(150, 150))
plot_tree(
    best_tree,
    feature_names=X_ct.columns.tolist(),
    class_names=['0', '1'],
    rounded=True,
    filled=True,
)
plt.show()