In [37]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [38]:
# Load data
# File names of the three data splits and of the feature-name list.
validation = 'pa2validation.txt'
test = 'pa2test.txt'
train = 'pa2train.txt'
features = 'pa2features.txt'

# The files have no header row and their columns are separated by runs of
# whitespace. sep=r'\s+' is the documented equivalent of the
# delim_whitespace=True flag, which is deprecated since pandas 2.2.
validation_data = pd.read_csv(validation, header=None, sep=r'\s+')
test_data = pd.read_csv(test, header=None, sep=r'\s+')
train_data = pd.read_csv(train, header=None, sep=r'\s+')

In [39]:
# Read one feature name per line of the features file, stripping the
# surrounding whitespace (this also removes the trailing newline).
with open(features, 'r') as file:
    feature_names = [raw_name.strip() for raw_name in file]

# Every split gets the same header: the feature names followed by a final
# 'target' column.
for split in (train_data, validation_data, test_data):
    split.columns = feature_names + ['target']


In [40]:

# Separate each split into its feature matrix (every column except the last)
# and its label vector (the 'target' column).
X_train = train_data.iloc[:, :-1]
y_train = train_data['target']

X_validation = validation_data.iloc[:, :-1]
y_validation = validation_data['target']

X_test = test_data.iloc[:, :-1]
y_test = test_data['target']

In [41]:
def bag(data, n_samples=100, random_state=None):
    """Draw bootstrap samples from ``data``.

    Each sample has as many rows as ``data`` and is built by picking row
    positions uniformly at random with replacement.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to resample; rows are selected by position via ``.iloc``.
    n_samples : int, default 100
        Number of bootstrap samples to draw.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. When None the global
        ``np.random`` state is used, so a prior ``np.random.seed`` call still
        controls the draws.

    Returns
    -------
    list of pandas.DataFrame
        ``n_samples`` frames, each with ``len(data)`` rows.
    """
    n_rows = len(data)
    # Fall back to the module-level generator so unseeded calls behave
    # exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    return [
        data.iloc[rng.choice(n_rows, size=n_rows, replace=True), :]
        for _ in range(n_samples)
    ]


In [42]:
# Seed NumPy's global generator before the first random draw: the bootstrap
# sampling and the per-tree feature selection both draw from np.random, so
# without a seed the reported accuracies change on every run.
np.random.seed(42)
samples = bag(train_data)

In [43]:
def create(samples):
    """Fit one decision tree per bootstrap sample on a random feature subset.

    For every sample the last column is used as the label and the remaining
    columns as candidate features. A random subset of those features is drawn
    without replacement and a gini ``DecisionTreeClassifier`` is fit on it.

    Parameters
    ----------
    samples : iterable of pandas.DataFrame
        Training samples; the label must be the last column of each frame.

    Returns
    -------
    trees : list of (DecisionTreeClassifier, numpy.ndarray)
        Each fitted tree paired with the column names it was trained on.
    valdata : list of pandas.DataFrame
        ``X_validation`` restricted to each tree's features, in tree order.
    testdata : list of pandas.DataFrame
        ``X_test`` restricted to each tree's features, in tree order.

    Raises
    ------
    ValueError
        If a sample has no feature columns.

    Notes
    -----
    Reads the module-level ``X_validation`` and ``X_test`` frames.
    """
    trees = []
    valdata = []
    testdata = []

    for sample in samples:
        x = sample.iloc[:, :-1]
        y = sample.iloc[:, -1]

        n_features = len(x.columns)
        if n_features == 0:
            raise ValueError("each sample needs at least one feature column")

        # Subset size is drawn from [n_features // 2, n_features). The bounds
        # are clamped so at least one feature is always selected; with a
        # single feature column the unclamped range would yield 0 features
        # and the fit would fail. Draws are unchanged for >= 2 features.
        low = max(1, n_features // 2)
        high = max(low + 1, n_features)
        n_selected_features = np.random.randint(low, high)
        selected_features = np.random.choice(
            x.columns, size=n_selected_features, replace=False
        )

        # Creating decision tree on the selected columns only
        tree = DecisionTreeClassifier(criterion='gini')
        tree.fit(x[selected_features], y)
        trees.append((tree, selected_features))

        # Recording the selected features for validation and test data
        valdata.append(X_validation[selected_features])
        testdata.append(X_test[selected_features])

    return trees, valdata, testdata

trees, valdata, testdata = create(samples)

In [44]:
# Function to make predictions using the forest
def predict_forest(trees, data):
    """Predict labels by majority vote over all trees in the forest.

    Parameters
    ----------
    trees : iterable of (estimator, features)
        Each estimator must expose ``predict``; ``features`` are the column
        names of ``data`` that the estimator expects.
    data : pandas.DataFrame
        Rows to classify; must contain every column named in ``features``.

    Returns
    -------
    numpy.ndarray
        One predicted label per row of ``data``, in the dtype of the labels
        the trees predict. Ties go to the smallest label, so for 0/1 labels
        the decisions match rounding the mean vote (an exact 50/50 split
        yields 0).

    Raises
    ------
    ValueError
        If ``trees`` is empty.
    """
    per_tree = [tree.predict(data[features]) for tree, features in trees]
    if not per_tree:
        raise ValueError("predict_forest requires at least one fitted tree")

    # votes[i, j] is the label tree i assigns to row j.
    votes = np.array(per_tree)
    labels = np.unique(votes)  # sorted, so argmax below breaks ties low

    # counts[k, j] is how many trees voted labels[k] for row j. Counting
    # votes per label (instead of rounding the mean) stays correct for
    # labels other than 0/1.
    counts = np.stack([(votes == label).sum(axis=0) for label in labels])
    return labels[counts.argmax(axis=0)]

# Score the forest on the validation split: predict with every tree, take
# the combined prediction, and compare against the true labels.
y_pred_validation = predict_forest(trees, X_validation)
accuracy_validation = accuracy_score(
    y_true=y_validation,
    y_pred=y_pred_validation,
)
print("Validation Accuracy:", accuracy_validation)

# Same procedure on the test split.
y_pred_test = predict_forest(trees, X_test)
accuracy_test = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_test,
)
print("Test Accuracy:", accuracy_test)


Validation Accuracy: 0.888
Test Accuracy: 0.897


In [45]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [46]:
# Hyperparameter grid for the single decision tree: every combination of
# these values is a candidate setting for the search.
param_grid = dict(
    max_depth=[None, 5, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
)

In [47]:
# Creating a GridSearchCV object: 5-fold cross-validation over param_grid,
# ranking candidates by accuracy. The base tree gets a fixed random_state so
# the search picks the same winner on every run.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)

# Fitting the model on training data
grid_search.fit(X_train, y_train)


In [48]:

# Take the estimator that the grid search selected as best and report the
# hyperparameters it was built with.
best_tree = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")

# Accuracy of that tree on the validation split.
y_pred_validation_tree = best_tree.predict(X_validation)
accuracy_validation_tree = accuracy_score(
    y_true=y_validation,
    y_pred=y_pred_validation_tree,
)
print(f"Best Decision Tree Accuracy on Validation Data: {accuracy_validation_tree}")

# Accuracy of that tree on the test split.
y_pred_test_tree = best_tree.predict(X_test)
accuracy_test_tree = accuracy_score(
    y_true=y_test,
    y_pred=y_pred_test_tree,
)
print(f"Best Decision Tree Accuracy on Test Data: {accuracy_test_tree}")


Best parameters: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best Decision Tree Accuracy on Validation Data: 0.886
Best Decision Tree Accuracy on Test Data: 0.89


In [49]:
print(f"Random Forest Accuracy on Test Data: {accuracy_test}")

# Comparison of the tuned single tree against the forest on the test split.
# An exact tie is reported as such instead of being credited to the forest.
if accuracy_test_tree > accuracy_test:
    print("The Decision Tree performs better on the test data.")
elif accuracy_test_tree < accuracy_test:
    print("The Random Forest performs better on the test data.")
else:
    print("The Decision Tree and the Random Forest perform equally well on the test data.")

Random Forest Accuracy on Test Data: 0.897
The Random Forest performs better on the test data.
