In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
import numpy as np

In [10]:
# Fixed seed so the split, the fitted tree and the printed report are
# reproducible under "Restart Kernel -> Run All".
RANDOM_STATE = 42

# Load the data. '?' is used as a missing-value marker in this file, so it is
# parsed as NaN directly instead of being replaced afterwards.
df = pd.read_csv('ov4-breast-cancer.csv', na_values='?')

# Clean up: force every column to a numeric dtype; anything that still cannot
# be parsed as a number becomes NaN and is imputed below.
df = df.apply(pd.to_numeric, errors='coerce')

# A missing label cannot be imputed (a column mean is not a valid class), so
# rows without a target are dropped rather than filled.
df = df.dropna(subset=['classes'])

y = df['classes']
X = df.drop(columns=['classes'])

# Split data (80 % train / 20 % test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Impute missing feature values with the TRAINING means only. Computing the
# means on the full data set before splitting would leak test-set statistics
# into the model.
column_means = X_train.mean()
X_train = X_train.fillna(column_means)
X_test = X_test.fillna(column_means)
# Keep the full feature matrix NaN-free as well, for any later cell that uses X.
X = X.fillna(column_means)

# Train the model
classifier = DecisionTreeClassifier(
    min_impurity_decrease=0.01,
    max_depth=5,
    random_state=RANDOM_STATE,
)
classifier.fit(X_train, y_train)

# Predict result
y_pred = classifier.predict(X_test)

# Evaluate result (classification_report returns preformatted text, so print it)
report = classification_report(y_test, y_pred)
print(report)



# Generate the tree file (Graphviz DOT format).
# With class_names=True the classes are shown symbolically as y[0], y[1], ...
# in ascending order of the class labels, so y[0] is the lowest label and y[1]
# the next one.
# NOTE(review): the earlier comment described y[0] as "person not surviving";
# confirm what labels 0 and 1 actually encode in this data set.
# The `with` block guarantees the file is closed even if the export raises.
with open("./dtree1.dot", 'w', encoding="utf-8") as dotfile:
    tree.export_graphviz(
        classifier,
        out_file=dotfile,
        # Pass a plain list: some scikit-learn releases reject a pandas Index here.
        feature_names=list(X.columns),
        class_names=True,
    )

# Screenshot of the tree is in the project's folder. The file is called tree.png.

# Answer: This method seems better because the fitted tree can be visualized, so you can
# see how each feature influences the prediction at every step of the decision process.

              precision    recall  f1-score   support

           0       0.97      0.98      0.97        96
           1       0.95      0.93      0.94        44

    accuracy                           0.96       140
   macro avg       0.96      0.96      0.96       140
weighted avg       0.96      0.96      0.96       140

