In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Load the three feature tables (x1-x3) and the three label tables (c1-c3).
# NOTE(review): read_csv treats the first row of each file as a header by
# default -- confirm these CSVs really have a header row.
x1, x2, x3 = (
    pd.read_csv(feature_path)
    for feature_path in (
        'HW4 Data/p2/x1.csv',
        'HW4 Data/p2/x2.csv',
        'HW4 Data/p2/x3.csv',
    )
)

y1, y2, y3 = (
    pd.read_csv(label_path)
    for label_path in (
        'HW4 Data/p2/c1.csv',
        'HW4 Data/p2/c2.csv',
        'HW4 Data/p2/c3.csv',
    )
)

In [3]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

# Give every feature table the same positional column names so the frames
# stack row-wise instead of being aligned on whatever header each CSV had.
common_columns = [f'feature_{i}' for i in range(x1.shape[1])]
x1.columns = common_columns
x2.columns = common_columns
x3.columns = common_columns

# Combine datasets x1 and x2 for training
X_train = pd.concat([x1, x2], ignore_index=True)

# Stack the labels by position as well. Concatenating the raw label frames
# aligns them on their column names; if the two files carry different
# headers that yields two half-empty columns, and keeping only the first one
# would turn every y2 label into NaN and silently drop all x2 rows below.
y_train = pd.concat(
    [y1.iloc[:, 0], y2.iloc[:, 0]], ignore_index=True
).to_frame(name='label')

# Convert y_train to a 1D array for easier handling of NaNs
y_train_array = y_train.iloc[:, 0].to_numpy()

# Features and labels are matched purely by row position, so a length
# mismatch means the inputs are inconsistent and must not be trained on.
if len(X_train) != len(y_train_array):
    raise ValueError(
        f"Feature/label row count mismatch: {len(X_train)} feature rows "
        f"vs {len(y_train_array)} labels"
    )

# Keep only rows whose label is actually present. pd.notna is used rather
# than np.isnan so that non-float label dtypes do not raise.
valid_train_indices = pd.notna(y_train_array)
X_train_clean = X_train[valid_train_indices]
y_train_clean = y_train_array[valid_train_indices]

# Model under tuning: mean-impute missing feature values, then a random forest.
imputation_step = ('imputer', SimpleImputer(strategy='mean'))
forest_step = ('classifier', RandomForestClassifier(random_state=42))
pipeline = Pipeline([imputation_step, forest_step])

# Shuffled, stratified 5-fold splitter; reused later so that every model is
# scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameter candidates for the forest (3 x 3 x 3 x 3 combinations).
# The 'classifier__' prefix routes each setting to the pipeline's forest step.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid, scored by accuracy on the folds above,
# using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_clean, y_train_clean)

# Report the winning configuration and its mean cross-validated accuracy.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Pipeline with the best parameters, as exposed by the search object.
best_model = grid_search.best_estimator_

# Try bagging with the best model.
# The wrapped estimator is passed positionally: the keyword was named
# `base_estimator` in older scikit-learn releases and `estimator` in newer
# ones (the old name has since been removed), while the first positional
# slot has the same meaning in both.
bagging_classifier = BaggingClassifier(
    best_model,
    n_estimators=10,
    max_samples=0.8,         # each member sees 80% of the rows ...
    max_features=0.8,        # ... and 80% of the features
    bootstrap=True,          # rows are drawn with replacement
    bootstrap_features=False,  # features are drawn without replacement
    random_state=42
)
# Fit on all labelled rows so the ensemble is ready to use if it is selected.
bagging_classifier.fit(X_train_clean, y_train_clean)

# Evaluate the bagging classifier on the same folds used by the grid search
# (cross_val_score fits fresh copies, so the fit above is unaffected).
bagging_scores = cross_val_score(bagging_classifier, X_train_clean, y_train_clean, cv=cv)
print(f"Bagging cross-validation scores: {bagging_scores}")
print(f"Mean bagging accuracy: {bagging_scores.mean():.4f}")

# Choose the final model based on cross-validation results
if bagging_scores.mean() > grid_search.best_score_:
    print("Using bagging classifier as final model")
    final_model = bagging_classifier
else:
    print("Using best model from grid search as final model")
    final_model = best_model

# Sanity check on a holdout split.
# The check is run on a *clone* of the selected model. Calling
# final_model.fit(...) here would refit the selected model in place on only
# 80% of the rows, and that weaker model would then be the one used for the
# test-set predictions. Both candidates were already fitted on every labelled
# row earlier in this cell, and final_model is left in that state.
# NOTE: this holdout is carved out of the same rows used for model
# selection above, so the reported accuracy is an optimistic estimate.
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(
    X_train_clean, y_train_clean, test_size=0.2, random_state=42
)
holdout_model = clone(final_model)
holdout_model.fit(X_train_final, y_train_final)
val_preds = holdout_model.predict(X_val_final)
val_accuracy = accuracy_score(y_val_final, val_preds)
print(f"Final validation accuracy: {val_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val_final, val_preds))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best cross-validation score: 0.8856




Bagging cross-validation scores: [0.84322034 0.86440678 0.86440678 0.86864407 0.87288136]
Mean bagging accuracy: 0.8627
Using best model from grid search as final model
Final validation accuracy: 0.8771

Classification Report:
              precision    recall  f1-score   support

         1.0       0.85      0.73      0.79        15
         2.0       0.94      0.76      0.84        21
         3.0       0.89      0.76      0.82        21
         4.0       0.92      0.79      0.85        14
         5.0       0.80      0.97      0.88        34
         6.0       0.95      1.00      0.98        41
         7.0       0.91      0.88      0.89        24
         8.0       0.91      0.88      0.89        33
         9.0       0.79      0.95      0.86        20
        10.0       0.77      0.77      0.77        13

    accuracy                           0.88       236
   macro avg       0.87      0.85      0.86       236
weighted avg       0.88      0.88      0.88       236



In [None]:
# this is where to replace test file
x3_test = pd.read_csv('x3.csv')

# The model was fitted on frames whose columns are named feature_0,
# feature_1, ...; a freshly read CSV carries its own header names instead.
# Rename positionally so the feature names match what the model saw in fit
# (scikit-learn warns or raises on mismatched feature names, depending on
# the installed version).
x3_test.columns = [f'feature_{i}' for i in range(x3_test.shape[1])]

# No separate imputation step is needed here: the mean imputer is part of
# the fitted pipeline (used directly, or as the bagged estimator).

# Make predictions with the final model
y_test_pred = final_model.predict(x3_test)

# Convert predictions to integers
y_test_pred_int = y_test_pred.astype(int)

# Save predictions as plain text, one integer label per line
np.savetxt('x3_predictions.csv', y_test_pred_int, fmt='%d')

print(f"Made predictions for {len(x3_test)} samples")
print("Predictions saved to 'x3_predictions.csv'")

Made predictions for 2246 samples
Predictions saved to 'x3_predictions.csv'
