### Import libraries

In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Customize

In [5]:
# Customize: file names read and written by the cells below
my_train_data = "0703 Train Data.csv"          # labelled cases used to fit and evaluate the tree
my_new_cases = "0703 New Cases.csv"            # cases whose 'Success' is to be predicted
my_results = "0703 predicted_new_cases.csv"    # CSV that receives the predictions
my_decision_rules = "0703 decision_rules.txt"  # text file that receives the extracted rules

### Load and prepare the training data

In [7]:
# Load the training data from the file named in the Customize cell
train_data_path = my_train_data
train_data = pd.read_csv(train_data_path)

# Drop the identifier and separate the target variable
X = train_data.drop(columns=['Code', 'Success'])
y = train_data['Success']

# Split BEFORE imputing, so the held-out rows cannot influence the fill values.
# Computing the means over all rows would leak test-set information into the
# training features and bias the accuracy estimate. The row partition depends
# only on the number of rows and random_state, so it is the same partition
# that splitting an already-imputed frame would give.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Per-column means learned from the training rows only
train_means = X_train_raw.mean()

# Fill missing numerical values in both partitions with the training means
X_train = X_train_raw.fillna(train_means)
X_test = X_test_raw.fillna(train_means)

# Full feature table with the same imputation; kept because later cells
# read the feature names from X_filled.columns
X_filled = X.fillna(train_means)


### Train the decision tree model, evaluate, and predict

In [9]:
# Train the decision tree model
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Evaluate the model on the testing data
y_pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Load the new cases from the file named in the Customize cell
new_cases_path = my_new_cases
new_cases_data = pd.read_csv(new_cases_path)

# Prepare the new cases by selecting exactly the columns the model was fitted
# on, in the same order. This replaces dropping 'Code' and 'Success' by name:
#  - an unlabelled new-cases file without a 'Success' column no longer raises
#  - extra or reordered columns no longer break predict()
#  - a genuinely missing feature still fails, with a KeyError naming it
# Missing values are then filled with the column means of the training file.
model_features = list(X_train.columns)
new_cases_filled = new_cases_data[model_features].fillna(X.mean())

# Predict the 'Success' for the new cases
new_cases_predictions = decision_tree.predict(new_cases_filled)
new_cases_data['Predicted Success'] = new_cases_predictions

### Save predictions

In [11]:
# Write each case identifier with its predicted label to the results CSV
output_csv_path = my_results
predictions_table = new_cases_data[['Code', 'Predicted Success']]
predictions_table.to_csv(output_csv_path, index=False)  # index=False: omit the row index
print(f"Predictions saved to {output_csv_path}")


Predictions saved to 0703 predicted_new_cases.csv


### Extract decision tree and save decision rules 

In [13]:
# Extract decision rules in text statement form

def tree_to_rules(decision_tree, feature_names, precision=2):
    """
    Traverse a fitted decision tree and return its decision rules as text.

    Every root-to-leaf path becomes one statement of the form
    "If [condition1] and [condition2] and ..., then predict class X."
    A tree that consists of a single leaf has no conditions, so it yields
    "Always predict class X." instead of a rule with an empty condition.

    Parameters
    ----------
    decision_tree : object
        Fitted estimator exposing ``tree_`` with the arrays ``feature``,
        ``threshold``, ``children_left``, ``children_right`` and ``value``.
        When it also exposes ``classes_`` (single-output classifier), the
        reported class is the real label taken from ``classes_``; otherwise
        the positional index of the largest entry in ``value`` is reported.
    feature_names : sequence of str
        Names indexed by the feature indices stored in ``tree_.feature``.
    precision : int, default 2
        Number of decimals used when printing split thresholds.

    Returns
    -------
    list of str
        One rule per leaf, in depth-first order with the left
        (``<=``) branch visited before the right (``>``) branch.
    """
    leaf_marker = -2  # value stored in tree_.feature for leaf nodes
    tree_ = decision_tree.tree_

    # classes_ maps a position in the value array to the actual label. It is
    # only a flat array for single-output estimators, so it is ignored otherwise.
    single_output = getattr(decision_tree, "n_outputs_", 1) == 1
    class_labels = getattr(decision_tree, "classes_", None) if single_output else None

    rules = []

    def recurse(node, path):
        feature_idx = tree_.feature[node]

        if feature_idx != leaf_marker:
            # Internal node: extend the path with the split condition of each branch
            name = feature_names[feature_idx]
            threshold = f"{tree_.threshold[node]:.{precision}f}"
            recurse(tree_.children_left[node], path + [f"{name} <= {threshold}"])
            recurse(tree_.children_right[node], path + [f"{name} > {threshold}"])
            return

        # Leaf node: the predicted class is the one with the largest entry in value
        class_idx = int(tree_.value[node].argmax())
        label = class_idx if class_labels is None else class_labels[class_idx]

        if path:
            rules.append(f"If {' and '.join(path)}, then predict class {label}.")
        else:
            # Root is itself a leaf: there are no conditions to join
            rules.append(f"Always predict class {label}.")

    recurse(0, [])
    return rules

# Extract the decision rules using our custom function
rules_list = tree_to_rules(decision_tree, list(X_filled.columns))

# Save the decision rules as a text file in plain text statement form.
# The encoding is pinned to UTF-8 so that feature names containing non-ASCII
# characters are written the same way on every platform, instead of depending
# on the locale's default encoding.
output_txt_path = my_decision_rules
with open(output_txt_path, "w", encoding="utf-8") as f:
    f.write("Decision Tree Rules (Text Statement Form):\n\n")
    f.writelines(rule + "\n" for rule in rules_list)  # one rule per line
print(f"Accuracy of the model on test data: {accuracy:.2f}")
print(f"Decision rules saved to {output_txt_path}")


Accuracy of the model on test data: 0.83
Decision rules saved to 0703 decision_rules.txt


### Display the prediction results 

In [15]:
# Show the identifier and the predicted label for the new cases
results_view = new_cases_data[['Code', 'Predicted Success']]
print("Prediction Results:")
print(results_view)

Prediction Results:
        Code Predicted Success
0    Proj_90                 1
1    Proj_91                 1
2    Proj_92                 1
3    Proj_93                 1
4    Proj_94                 0
5    Proj_95                 1
6    Proj_96                 1
7    Proj_97                 1
8    Proj_98                 1
9    Proj_99                 1
10  Proj_100                 1
