# Decision Tree Implementation

In [138]:
import os
import pandas as pd
import numpy as np

In [140]:
# Set working directory
# NOTE: this changes into the directory the process is already in, so it is
# effectively a no-op; the relative data paths used later resolve against it.
notebook_dir = os.getcwd()
os.chdir(notebook_dir)

## Iris dataset

### Import dataset

In [144]:
# Column labels are supplied explicitly because the file is read with
# header=None (its first line is treated as data, not as a header).
column_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'type',
]
data = pd.read_csv(
    './Data/iris/iris.data',
    names=column_names,
    header=None,
)

# Preview the first rows
print(data.head())

   sepal_length  sepal_width  petal_length  petal_width         type
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


### Preprocessing

In [147]:
# Separate the label column from the measurements
y = data['type'].values  # targets: the 'type' column
X = data.drop(columns='type').values  # features: every remaining column

### Train-Test Split

In [150]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

### Run performance functions

In [153]:
from performance_data import get_tree_depth, count_nodes, collect_performance_data

In [155]:
# Gather the performance measurements for the Iris test split.
# NOTE(review): only X_test / y_test are passed; X_train / y_train are not
# used in the visible cells - confirm collect_performance_data obtains its
# training data (or fitted model) elsewhere.
results_df = collect_performance_data(X_test, y_test)

# Write the results to a CSV file, omitting the index
output_csv = "performance_data_iris.csv"
results_df.to_csv(output_csv, index=False)

## Breast Cancer dataset

### Import dataset

In [159]:
# Column labels, grouped by the statistic suffix they carry
# (*_Mean, *_Se, *_Worst); the concatenated order matches the file's columns
# as originally listed.
mean_columns = [
    "Radius_Mean", "Texture_Mean", "Perimeter_Mean", "Area_Mean",
    "Smoothness_Mean", "Compactness_Mean", "Concavity_Mean",
    "Concave_Points_Mean", "Symmetry_Mean", "Fractal_Dimension_Mean",
]
se_columns = [
    "Radius_Se", "Texture_Se", "Perimeter_Se", "Area_Se",
    "Smoothness_Se", "Compactness_Se", "Concavity_Se",
    "Concave_Points_Se", "Symmetry_Se", "Fractal_Dimension_Se",
]
worst_columns = [
    "Radius_Worst", "Texture_Worst", "Perimeter_Worst", "Area_Worst",
    "Smoothness_Worst", "Compactness_Worst", "Concavity_Worst",
    "Concave_Points_Worst", "Symmetry_Worst", "Fractal_Dimension_Worst",
]
column_names = ["ID", "Diagnosis"] + mean_columns + se_columns + worst_columns

# The file is read with header=None, so the labels above are applied as-is
data = pd.read_csv(
    './Data/breast_cancer/wdbc.data',
    names=column_names,
    header=None,
)

# Preview the first rows
print(data.head())

         ID Diagnosis  Radius_Mean  Texture_Mean  Perimeter_Mean  Area_Mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   Smoothness_Mean  Compactness_Mean  Concavity_Mean  Concave_Points_Mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  Radius_Worst  Texture_Worst  Perimeter_Worst  Area_Wor

### Preprocess the Dataset

In [162]:
# Build the feature matrix and target vector WITHOUT reassigning `data`.
# Previously this cell replaced `data` with a copy lacking "ID", so running
# the cell a second time raised KeyError on the drop. Deriving X and y from
# the untouched frame makes the cell safe to re-run.
diagnosis_codes = {"M": 1, "B": 0}  # Map diagnostic labels to numeric codes

y_encoded = data["Diagnosis"].map(diagnosis_codes)
# Series.map yields NaN for any label missing from the mapping; fail loudly
# instead of silently passing NaN targets downstream.
if y_encoded.isna().any():
    raise ValueError("Unexpected label in 'Diagnosis' column; expected only 'M' or 'B'")

X = data.drop(columns=["ID", "Diagnosis"]).values  # Define features (ID is not a feature)
y = y_encoded.values  # Define targets

### Train-Test Split

In [165]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

### Run performance function

In [168]:
# Gather the performance measurements for the breast-cancer test split.
# NOTE(review): only X_test / y_test are passed; X_train / y_train are not
# used in the visible cells - confirm collect_performance_data obtains its
# training data (or fitted model) elsewhere.
results_df = collect_performance_data(X_test, y_test)

# Write the results to a CSV file, omitting the index
output_csv = "performance_data_wdbc.csv"
results_df.to_csv(output_csv, index=False)

## Covertype dataset

### Import the dataset

In [172]:
# Column names for the ten leading columns of the file
column_names = [
    "elevation",
    "aspect",
    "slope",
    "horizontal_distance_to_hydrology",
    "vertical_distance_to_hydrology",
    "horizontal_distance_to_roadways",
    "hillshade_9am",
    "hillshade_noon",
    "hillshade_3pm",
    "horizontal_distance_to_fire_points"
]

# Add wilderness area and soil type columns (numbered from 0)
wilderness_names = [f"wilderness_area_{i}" for i in range(4)]  # 4 wilderness areas
soil_type_names = [f"soil_type_{i}" for i in range(40)]        # 40 soil types

# Add target column (10 + 4 + 40 + 1 = 55 columns in total)
column_names += wilderness_names + soil_type_names + ["cover_type"]

# Load the data.
# The directory is spelled "Data" (capital D) to match the Iris and breast
# cancer cells; the previous lowercase "./data/..." only resolved on
# case-insensitive file systems.
file_path = './Data/covertype/covtype.data'
data = pd.read_csv(file_path, header=None, names=column_names)

# Quick summary
print(data.head())

   elevation  aspect  slope  horizontal_distance_to_hydrology  \
0       2596      51      3                               258   
1       2590      56      2                               212   
2       2804     139      9                               268   
3       2785     155     18                               242   
4       2595      45      2                               153   

   vertical_distance_to_hydrology  horizontal_distance_to_roadways  \
0                               0                              510   
1                              -6                              390   
2                              65                             3180   
3                             118                             3090   
4                              -1                              391   

   hillshade_9am  hillshade_noon  hillshade_3pm  \
0            221             232            148   
1            220             235            151   
2            234             238   

### Preprocess the Dataset

In [175]:
# Reduce dataset to a more manageable size: draw a reproducible 10% sample.
# The sample is bound to a new name instead of overwriting `data`; otherwise
# every re-run of this cell would sample the already-reduced frame again
# (10% of 10%, and so on) and silently change X and y.
data_sample = data.sample(frac=0.1, random_state=0)  # Use 10% of the data

X = data_sample.drop(columns="cover_type").values  # Define features
y = data_sample["cover_type"].values  # Define targets

### Train-Test Split

In [178]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

### Run performance function

In [181]:
from performance_data import get_tree_depth, count_nodes, collect_performance_data

In [49]:
# Gather the performance measurements for the covertype test split.
# NOTE(review): only X_test / y_test are passed; X_train / y_train are not
# used in the visible cells - confirm collect_performance_data obtains its
# training data (or fitted model) elsewhere.
results_df = collect_performance_data(X_test, y_test)

# Write the results to a CSV file, omitting the index
output_csv = "performance_data_covertype.csv"
results_df.to_csv(output_csv, index=False)