<a href="https://colab.research.google.com/github/benmanjackson/CS290/blob/main/Attribute_Selection_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Khush Dakwala and Benjamin Jackson

In [None]:
import numpy as np
from collections import Counter

# Shannon entropy of a collection of labels
def calculate_entropy(y):
    """Return the Shannon entropy (base 2) of the labels in ``y``.

    Each distinct label contributes ``-p * log2(p)``, where ``p`` is the
    fraction of entries in ``y`` carrying that label. An empty ``y`` has no
    labels to sum over, so the result is 0.
    """
    n_samples = len(y)
    label_counts = Counter(y)

    # Accumulate p * log2(p) per distinct label; the sign is flipped at the end.
    weighted_logs = 0
    for occurrences in label_counts.values():
        proportion = occurrences / n_samples
        weighted_logs += proportion * np.log2(proportion)
    return -weighted_logs

# Gini impurity of a collection of labels
def calculate_gini(y):
    """Return the Gini impurity of the labels in ``y``.

    The impurity is ``1 - sum(p ** 2)`` over the distinct labels, where ``p``
    is the fraction of entries in ``y`` carrying that label. With an empty
    ``y`` there is nothing to subtract, so the result is 1.
    """
    n_samples = len(y)

    squared_share_total = 0
    for occurrences in Counter(y).values():
        squared_share_total += (occurrences / n_samples) ** 2
    return 1 - squared_share_total

# Mean squared deviation from the mean, used as the regression impurity
def calculate_mse(y):
    """Return the mean squared error of ``y`` around its own mean.

    ``y`` is averaged with ``np.mean``; the result is the mean of the squared
    differences between each value and that average.
    """
    center = np.mean(y)
    squared_deviations = (y - center) ** 2
    return np.mean(squared_deviations)

# Partition row indices by comparing one feature against a threshold
def split_dataset(X, y, feature_idx, threshold):
    """Return the row indices falling on each side of a threshold split.

    Row ``i`` goes to the first list when ``X[i][feature_idx] <= threshold``
    and to the second list when ``X[i][feature_idx] > threshold``. A value for
    which neither comparison holds (e.g. NaN) lands in neither list. ``y`` is
    accepted for signature compatibility but is not used.

    Returns:
        A ``(left_indices, right_indices)`` tuple of lists, each in ascending
        row order.
    """
    at_or_below = []
    above = []
    for row_number in range(len(X)):
        value = X[row_number][feature_idx]
        if value <= threshold:
            at_or_below.append(row_number)
        elif value > threshold:
            above.append(row_number)
    return at_or_below, above

# Size-weighted impurity of the two sides of a split
def calculate_weighted_impurity(y, left_indices, right_indices, impurity_func):
    """Return the impurity of a split, weighting each side by its share of ``y``.

    Args:
        y: Indexable collection of target values.
        left_indices: Positions in ``y`` that belong to the left side.
        right_indices: Positions in ``y`` that belong to the right side.
        impurity_func: Callable taking a list of target values and returning
            that side's impurity.

    Returns:
        ``(len(left) / len(y)) * impurity(left) + (len(right) / len(y)) * impurity(right)``.
    """
    left_labels = [y[i] for i in left_indices]
    right_labels = [y[i] for i in right_indices]
    impurity_left = impurity_func(left_labels)
    impurity_right = impurity_func(right_labels)

    n_total = len(y)
    share_left = len(left_indices) / n_total
    share_right = len(right_indices) / n_total
    return share_left * impurity_left + share_right * impurity_right

# Main function to find the best attribute and threshold
def Attribute_selection_method(task, dataset, target_var, criterion=None):
    """Find the single feature/threshold split with the lowest weighted impurity.

    Every column of ``dataset`` except ``target_var`` is treated as a feature.
    For each feature, each of its unique values is tried as a threshold: rows
    with ``value <= threshold`` go left, rows with ``value > threshold`` go
    right. Splits that leave one side empty are ignored. The first split that
    achieves the lowest weighted impurity wins (ties keep the earlier one).

    Args:
        task: ``"classification"`` or ``"regression"``.
        dataset: pandas DataFrame holding the feature columns and the target.
        target_var: Name of the target column in ``dataset``.
        criterion: ``"entropy"`` or ``"gini"`` when ``task`` is
            ``"classification"``; not consulted for regression, which always
            uses mean squared error.

    Returns:
        ``(feature_name, threshold)`` for the best split, or ``(None, None)``
        when no threshold produces two non-empty sides.

    Raises:
        ValueError: If ``task`` is not recognised, or ``criterion`` is not
            recognised for a classification task.
    """
    # Separate features and target. The feature column names are kept next to
    # the matrix because column positions in X are relative to the frame with
    # the target removed, not to the original dataset.
    features = dataset.drop(columns=[target_var])
    feature_names = features.columns
    X = features.values
    y = dataset[target_var].values

    best_feature = None
    best_threshold = None
    best_impurity = float('inf')

    # Define impurity function based on task
    if task == "classification":
        if criterion == "entropy":
            impurity_func = calculate_entropy
        elif criterion == "gini":
            impurity_func = calculate_gini
        else:
            raise ValueError("Invalid criterion for classification. Choose 'entropy' or 'gini'.")
    elif task == "regression":
        impurity_func = calculate_mse
    else:
        raise ValueError("Invalid task type. Choose 'classification' or 'regression'.")

    # Loop over each feature
    for feature_idx in range(X.shape[1]):
        # Try each unique value of the feature as a threshold
        for threshold in np.unique(X[:, feature_idx]):
            left_indices, right_indices = split_dataset(X, y, feature_idx, threshold)

            # Skip if one side of the split is empty
            if not left_indices or not right_indices:
                continue

            # Calculate weighted impurity for the split
            weighted_impurity = calculate_weighted_impurity(y, left_indices, right_indices, impurity_func)

            # Strict '<' keeps the earliest split among equally good ones
            if weighted_impurity < best_impurity:
                best_impurity = weighted_impurity
                best_feature = feature_idx
                best_threshold = threshold

    if best_feature is None:
        return None, None

    # Look the name up in the feature-only columns: indexing dataset.columns
    # here would be off by one for every feature located after the target.
    return feature_names[best_feature], best_threshold


## Applying the method to categorical and numerical targets, and comparing the split chosen by our method with the one chosen by sklearn

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
# Load the dataset
url = 'https://raw.githubusercontent.com/dreamingv-oid/CS290/main/train.csv'
df = pd.read_csv(url)

# Strip any whitespace characters from column names
df.columns = df.columns.str.strip()

# Drop rows with missing values
df.dropna(inplace=True)


def _root_split_feature(fitted_pipeline):
    """Return ``(index, name)`` of the feature used by the tree's first split.

    Node 0 of a fitted sklearn tree is the root, so ``tree_.feature[0]`` is the
    feature of the first split. That index refers to the columns produced by
    the pipeline's 'preprocessor' step (one-hot columns first, passthrough
    columns after), so the name is taken from the preprocessor's output names
    rather than from the raw input columns. ``name`` is ``None`` when the root
    is a leaf (sklearn marks leaves with a negative feature index).
    """
    root_feature = fitted_pipeline.named_steps['model'].tree_.feature[0]
    if root_feature < 0:
        return root_feature, None
    output_names = fitted_pipeline.named_steps['preprocessor'].get_feature_names_out()
    return root_feature, output_names[root_feature]


# Apply the method to the classification dataset
# Create a copy of the DataFrame for classification
df_classification = df.copy()
# NOTE: the result is a (feature name, threshold) tuple
best_feature_classification = Attribute_selection_method("classification", df_classification, "satisfaction", criterion="gini")

# Fit sklearn's DecisionTreeClassifier to compare first split
X_classification = df_classification.drop(columns=["satisfaction"])
y_classification = df_classification["satisfaction"]

# Create and fit the classifier within a pipeline
classifier = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), X_classification.select_dtypes(include=['object']).columns)
        ], remainder='passthrough'
    )),
    ('model', DecisionTreeClassifier(criterion="gini", random_state=42))
])

classifier.fit(X_classification, y_classification)

# Get the first (root) split chosen by sklearn's DecisionTreeClassifier
first_split_classification, first_split_classification_name = _root_split_feature(classifier)

# Apply the method to the regression dataset
# Create a copy of the DataFrame for regression
df_regression = df.copy()
# NOTE: the result is a (feature name, threshold) tuple
best_feature_regression = Attribute_selection_method("regression", df_regression, "Flight Distance")

# Fit sklearn's DecisionTreeRegressor to compare first split
X_regression = df_regression.drop(columns=["Flight Distance"])
y_regression = df_regression["Flight Distance"]

# Create and fit the regressor within a pipeline
regressor = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), X_regression.select_dtypes(include=['object']).columns)
        ], remainder='passthrough'
    )),
    ('model', DecisionTreeRegressor(criterion="squared_error", random_state=42))
])

regressor.fit(X_regression, y_regression)

# Get the first (root) split chosen by sklearn's DecisionTreeRegressor
first_split_regression, first_split_regression_name = _root_split_feature(regressor)

# Print the results together
print(f"Best splitting criterion for classification (Gini) via our method: {best_feature_classification}")
print(f"Best splitting criterion for regression via our method: {best_feature_regression}")

# Print sklearn's first split features (names are those of the preprocessed columns)
print(f"First split feature chosen by sklearn's DecisionTreeClassifier: {first_split_classification_name}")
print(f"First split feature chosen by sklearn's DecisionTreeRegressor: {first_split_regression_name}")