## Assignment

The task is to build and train a classifier given a labeled dataset and then use it to infer the labels of a given unlabeled evaluation dataset. 

You will find the training and evaluation data on canvas.

Here's the training data: TrainOnMe-2.csv 

Here's the evaluation data: EvaluateOnMe-2.csv 

Here's the ground truth: EvaluationGT-2.csv

You can use whatever python libraries you like! The steps below are suggestions, but feel free to try any other techniques we discussed in class.

You can submit the predicted labels by uploading them in csv format, which will then be compared to the ground truth.

In [1]:
# Import packages
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# For feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# For min-max scaling
from sklearn.preprocessing import MinMaxScaler

# For encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Some models you can try
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

# Packages I am importing:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.base import BaseEstimator, TransformerMixin

## Load the training and evaluation datasets

In [2]:
# Read datasets
df = pd.read_csv('TrainOnMe-2.csv')
eval_df = pd.read_csv('EvaluateOnMe-2.csv')

# Split your training dataset into features and labels
X = df.drop('y', axis=1)
y = df['y'].apply(lambda x: x.strip() if isinstance(x, str) else x)

## Data pre-processing

In [3]:
# Do some data pre-processing

numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
dunno_numeric_cols = X.select_dtypes(include=['object']).columns

for col in dunno_numeric_cols:
    converted = pd.to_numeric(X[col], errors='coerce')
    if not converted.isna().all():  # Only convert if we get valid numbers
        X[col] = converted
        numeric_cols.append(col)

# Check the dtypes of all features
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Convert text columns to category
numeric_imputer = SimpleImputer(strategy='median')
X[numeric_cols] = numeric_imputer.fit_transform(X[numeric_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
X[categorical_cols] = X[categorical_cols].astype('category')

# Remove NA values and noise
X = X.dropna()
y = y[X.index]

# Change categories to encoded labels using LabelEncoder()
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)])

le = LabelEncoder()
y = le.fit_transform(y)

# encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# encoded_features = encoder.fit_transform(X[categorical_cols])

## Dealing with outliers

In [4]:
# Try to remove outliers from training data to improve performance
# There are different ways to do this but one way could be to use stats.zscore

from scipy import stats

for i, col1 in enumerate(numeric_cols):
    for col2 in numeric_cols[i+1:]:
        X[f'{col1}_x_{col2}'] = X[col1] * X[col2]

numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
z_scores = stats.zscore(X[numeric_cols])
abs_z_scores = np.abs(z_scores)

# I selected 3 standard devs:
threshold = 3

# Filtering w/ |Z-score| < 3:
filter_mask = (abs_z_scores < threshold).all(axis=1)
X_filtered = X[filter_mask]
y_filtered = y[filter_mask]

print(f"The original size of {len(X)}, filtered to {len(X_filtered)}")

The original size of 1004, filtered to 759


## Scaling the features

In [5]:
# Debugging:
##print(y.unique())

y = pd.to_numeric(y, errors='ignore')
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

In [6]:
class InteractionCreator(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.interaction_pairs = []

    def fit(self, X, y=None):
        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        self.interaction_pairs = [(col1, col2) for i, col1 in enumerate(numeric_cols)
                                 for col2 in numeric_cols[i+1:]]
        return self
    
    def transform(self, X):
        X = X.copy()
        interaction_features = {}

        for col1, col2 in self.interaction_pairs:
            new_feature_name = f'{col1}_x_{col2}'
            if new_feature_name in X.columns:
                new_feature_name += "_dup"  # Add a suffix if the column already exists
            interaction_features[new_feature_name] = X[col1] * X[col2]

        X = pd.concat([X, pd.DataFrame(interaction_features, index=X.index)], axis=1)
        return X

In [7]:
# Scale your features
# You can try both standardscaler and minmaxscaler and see which works better

import warnings
from sklearn.exceptions import FitFailedWarning

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, 
                        module="sklearn.model_selection._split")
warnings.filterwarnings("ignore", category=FitFailedWarning)

scalers = {
    'NoScaling': None,
    'RobustScaler': RobustScaler(),
    'QuantileTransformer': QuantileTransformer(output_distribution='normal'),
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler()}

cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
best_score = -np.inf
best_scaler = None

for scaler_name, scaler in scalers.items():
    num_transformer = scaler if scaler else 'passthrough'
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ('num', num_transformer, numeric_cols)
        ],
        remainder='drop'
    )

    model = Pipeline([
        ('preprocessor', preprocessor),
        ('feature_selector', SelectKBest(k=10)),  # Keep top 10 features
        ('classifier', RandomForestClassifier(
            n_estimators=200,
            max_depth=10,
            class_weight='balanced',
            random_state=42
        ))])

    scores = cross_val_score(model, X, y, cv=cv_strategy, scoring='accuracy')
    avg_score = np.mean(scores)
    std_score = np.std(scores)
    
    print(f"{scaler_name}: {avg_score:.3f} ± {std_score:.3f}")
    
    if avg_score > best_score:
        best_score = avg_score
        best_scaler = scaler_name

print(f"\nThe best one is {best_scaler} and I got a {best_score:.4f} accuracy.")

# Will use hyperparameter tuning:

if best_scaler:
    from sklearn.model_selection import GridSearchCV
    
    param_grid = {
        'classifier__n_estimators': [200, 300],
        'classifier__max_depth': [5, 10, None],
        'feature_selector__k': [5, 10, 15]
    }

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X, y)
    
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Tuned accuracy: {grid_search.best_score_:.3f}")

NoScaling: 0.486 ± 0.057
RobustScaler: 0.486 ± 0.057




QuantileTransformer: 0.514 ± 0.055
MinMaxScaler: 0.486 ± 0.057
StandardScaler: 0.486 ± 0.057

The best one is QuantileTransformer and I got a 0.5140 accuracy.
Best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 300, 'feature_selector__k': 10}
Tuned accuracy: 0.516


In [8]:
# I will use QuantileTransformer:

# The best one is QuantileTransformer and I got a 51% (± 4.4%) accuracy.
# HP Tuning got me 52.3%.

## Feature selection

In [9]:
# You could try to apply SelectKBest class to extract the most useful features (this is optional but MIGHT improve accuracy)
# Remove whichever features that are not useful

# Will try this since I can't get more than 53%...

## Split your data to train and test set

In [10]:
# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state = 0)


X_train.shape[0]
X_test.shape[0]

101

## Fit the model

* You can try models other than the models listed below
* You can try different hyperparameters
* Evaluate your model using cross-validation

In [11]:
# Try linear SVM classifier
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Use the same preprocessor from earlier
    ('classifier', SVC(kernel='linear', C=0.5))])
svm_pipeline.fit(X_train, y_train)

test_score = svm_pipeline.score(X_test, y_test)
print(test_score)

scores = cross_val_score(svm_pipeline, X_train, y_train, cv=5)
print(scores)

# linear = SVC(kernel='linear', C=0.5).fit(X_train, y_train)
# print(linear.score(X_test,y_test))
# # Evaluate using cross-validation
# scores = cross_val_score(linear,X_test,y_test,cv=5)
# print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.49504950495049505
[0.46961326 0.44751381 0.45856354 0.41666667 0.50555556]


In [12]:
#Try decision tree classifier
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(criterion='gini', random_state=0))
])

decision_tree = decision_tree_pipeline.fit(X_train, y_train)
dttest_score = decision_tree_pipeline.score(X_test, y_test)
dttest_score

# decision_tree = DecisionTreeClassifier(criterion = "gini").fit(X_train, y_train)
# print(decision_tree.score(X_test,y_test))
# # Evaluate using cross-validation
# scores = cross_val_score(decision_tree,X_test,y_test,cv=10)
# print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.42574257425742573

In [13]:
randomforest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=300, random_state=0, min_samples_split=5, max_features='sqrt'))])

randomforest_pipeline.fit(X_train, y_train)
randomforest_score = randomforest_pipeline.score(X_test, y_test)
randomforest_score

scores = cross_val_score(randomforest_pipeline, X_train, y_train, cv=10)

# #Try random forest classifier
# random_forest = RandomForestClassifier().fit(X_train, y_train)
# print(random_forest.score(X_test,y_test))
# scores = cross_val_score(random_forest,X_test,y_test,cv=10)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.53 accuracy with a standard deviation of 0.04


In [14]:
scores = cross_val_score(randomforest_pipeline, X, y, cv=10, scoring='accuracy')
print(f"Cross-val accuracy: {scores.mean():.2f} (±{scores.std():.2f})")

Cross-val accuracy: 0.48 (±0.05)


In [15]:
# Using the suggested Select K Best:

best_fit_randomf = Pipeline(steps=[
    ('interactions', InteractionCreator()),
    ('preprocessor', preprocessor),
    ('feature_selector', SelectKBest(score_func=mutual_info_classif, k=15)),  # Keep top 15 features
    ('classifier', RandomForestClassifier(
        n_estimators=300, 
        max_depth=10,
        class_weight='balanced',
        random_state=0,
        min_samples_split=5,
        max_features='sqrt'
    ))
])

scores = cross_val_score(best_fit_randomf, X, y, cv=5, scoring='accuracy')
print(f"Cross-val accuracy with feature selection: {scores.mean():.2f} (±{scores.std():.2f})")

param_grid = {
    'feature_selector__k': [10, 15, 20, 'all'],
    'feature_selector__score_func': [mutual_info_classif, chi2]
}

grid_search = GridSearchCV(best_fit_randomf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_:.2f}")

Cross-val accuracy with feature selection: 0.51 (±0.03)


 0.48008955        nan]


Best parameters: {'feature_selector__k': 20, 'feature_selector__score_func': <function mutual_info_classif at 0x7fffd22fa0e0>}
Best accuracy: 0.52


In [16]:
def verify_features(pipe, X, eval_df):
    pipe.fit(X, y)
    
    # Access steps using SAME NAMES as pipeline definition
    interaction_step = pipe.named_steps['interactions']  # Now exists
    preprocessor_step = pipe.named_steps['preprocessor']
    
    # Get transformed features
    X_trans = interaction_step.transform(X)
    X_trans = preprocessor_step.transform(X_trans)
    
    print(f"Final feature count: {X_trans.shape[1]}")
    return X_trans

# Execute
_ = verify_features(best_fit_randomf, X, eval_df)

Final feature count: 96


In [17]:
# At the moment, my Random Forest prediction looks better
# amongst the others. 57% accuracy in the best case (52%).

# Update: Now 58% best case (54% normal).
# Best cross-val accuracy I got: 53%.

In [18]:
eval_df = eval_df.reindex(columns=X.columns, fill_value=0)
best_fit_randomf.fit(X, y)

train_transformed = best_fit_randomf[:-1].transform(X)
eval_transformed = best_fit_randomf[:-1].transform(eval_df)

train_features = pd.DataFrame(train_transformed, columns=[f'feature_{i}' for i in range(train_transformed.shape[1])])
eval_features = pd.DataFrame(eval_transformed, columns=[f'feature_{i}' for i in range(eval_transformed.shape[1])])

eval_predictions = best_fit_randomf.predict(eval_df)

In [19]:
train_transformed

array([[ 0.03157549, -0.663927  ,  1.85067221, ..., -0.03157545,
        -0.01612756, -0.03157545],
       [ 0.03157523, -1.20924518, -0.3563153 , ..., -0.03157545,
        -0.01612756, -0.03157545],
       [ 0.03157532,  0.81407278, -0.48656889, ..., -0.03157545,
         2.27243253, -0.03157545],
       ...,
       [ 0.03157554, -0.85182855,  0.86976355, ..., -0.03157545,
         2.60132795, -0.03157545],
       [ 0.0315755 , -1.19434517, -0.74480325, ..., -0.03157545,
        -0.01612756, -0.03157545],
       [ 0.03157551, -1.64024435,  1.36251227, ..., -0.03157545,
        -0.01612756, -0.03157545]])

In [20]:
eval_transformed

array([[ 0.03157545, -1.39835781,  1.46497449, ..., -0.03157545,
        -0.01612756, -0.03157545],
       [ 0.03157559, -1.72942418,  0.08136806, ..., -0.03157545,
        -0.01612756, -0.03157545],
       [ 0.03157543, -1.25222032, -1.99458977, ..., -0.03157545,
        -0.01612756, -0.03157545],
       ...,
       [ 0.03157553, -0.92801675, -1.60230145, ..., -0.03157545,
        -0.01612756, -0.03157545],
       [ 0.03157534,  0.82144939, -0.13044262, ..., -0.03157545,
        -0.01612756, -0.03157545],
       [ 0.0315753 , -0.89983152,  2.07477097, ..., -0.03157545,
        -0.01612756, -0.03157545]])

In [21]:
# Use your best model to predict the labels for the evaluation set

y_pred = best_fit_randomf.predict(eval_df)
# y_pred_final = [val for val in y_pred if val in label_map] # removed error '5' key. (No need anymore).

print(set(y_pred))

{0, 1, 2, 3}


In [22]:
# Final processing before saving. Was not getting keys but the actual values:

label_map = {
    0: 'Jorg',
    1: 'Bob',
    2: 'Shoogee',
    3: 'Atsuto',
    5: 'ERROR'
}

y_pred_filtered = [val for val in y_pred if val in label_map]
# names_only = [label_map[val] for val in y_pred_filtered]
names_only = [label_map.get(val, 'Unknown') for val in y_pred]

predictions_df = pd.DataFrame(names_only, columns=['y'])
predictions_df.to_csv("ccw_assignment_results.csv", index=False, header=True)

In [23]:
# Save your predictions to a .csv and upload it to canvas

pd.DataFrame(y_pred).to_csv("ccw_assignment_results.csv")