## Assignment

The task is to build and train a classifier given a labeled dataset and then use it to infer the labels of a given unlabeled evaluation dataset. 

You will find the training and evaluation data on canvas.

Here's the training data: TrainOnMe-2.csv 

Here's the evaluation data: EvaluateOnMe-2.csv 

Here's the ground truth: EvaluationGT-2.csv

You can use whatever python libraries you like! The steps below are suggestions, but feel free to try any other techniques we discussed in class.

You can submit the predicted labels by uploading them in csv format, which will then be compared to the ground truth.


In [24]:
# Import packages 
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# For feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# For min-max scaling
from sklearn.preprocessing import MinMaxScaler

# For encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Some models you can try
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

# Packages I am importing:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

## Load the training and evaluation datasets

In [5]:
# Read datasets
df = pd.read_csv('TrainOnMe-2.csv')
eval_df = pd.read_csv('EvaluateOnMe-2.csv')

# Split your training dataset into features and labels
X = df.drop('y', axis=1) #Features -- all except "y"
y = df['y'] # Labels -- targeting "y"

## Data pre-processing

In [17]:
# Do some data pre-processing
for col in X.columns:
    if X[col].dtype == 'object':
        print(f"Column: {col}")
        print(X[col].unique())

        X[col] = pd.to_numeric(X[col], errors='coerce')

# Remove NA values and noise
X = X.dropna()
y = y[X.index]

# Check the dtypes of all features
text_columns = X.select_dtypes(include=['object']).columns
X[text_columns] = X[text_columns].astype('category')

# Convert text columns to category
categorical_cols = X.select_dtypes(include=['category']).columns

# Change categories to encoded labels using LabelEncoder()
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(X[categorical_cols])

## Dealing with outliers

In [7]:
# Try to remove outliers from training data to improve performance
# There are different ways to do this but one way could be to use stats.zscore

from scipy import stats

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
z_scores = stats.zscore(X[numerical_cols])
abs_z_scores = np.abs(z_scores)

# I selected 3 standard devs:
threshold = 3

# Filtering w/ |Z-score| < 3:
filter_mask = (abs_z_scores < threshold).all(axis=1)
X_filtered = X[filter_mask]
y_filtered = y[filter_mask]

print(f"The original size of {len(X)}, filtered to {len(X_filtered)}")

The original size of 1000, filtered to 978


## Scaling the features

In [31]:
# Debugging:
##print(y.unique())

y = pd.to_numeric(y, errors='ignore')
if y.dtype == 'object':
    le = LabelEncoder()
    y = le.fit_transform(y)

In [29]:
# Scale your features
# You can try both standardscaler and minmaxscaler and see which works better

scalers = {'MinMaxScaler': MinMaxScaler(),
           'StandardScaler': StandardScaler()}

scalers = {
    'MinMaxScaler': MinMaxScaler(),
    'StandardScaler': StandardScaler()
}

best_scaler = None
best_score = -np.inf

for scaler_name, scaler in scalers.items():
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ('num', scaler, numerical_cols)
        ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])

    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    avg_score = np.mean(scores)
    print(f"{scaler_name} accuracy of {avg_score:.4f}")
    
    # Track best scaler
    if avg_score > best_score:
        best_score = avg_score
        best_scaler = scaler_name

print(f"\nThe best one is {best_scaler} and I got a {best_score:.4f} accuracy.")



MinMaxScaler with an accuracy of 0.4540




StandardScaler with an accuracy of 0.4450

The best scaler is MinMaxScaler and I got a 0.4540 accuracy.


## Feature selection

In [None]:
# You could try to apply SelectKBest class to extract the most useful features (this is optional but MIGHT improve accuracy)
# Remove whichever features that are not useful

## Split your data to train and test set

In [33]:
# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state = 0)


X_train.shape[0]
X_test.shape[0]

100

## Fit the model

* You can try models other than the models listed below
* You can try different hyperparameters
* Evaluate your model using cross-validation

In [38]:
# Try linear SVM classifier
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Use the same preprocessor from earlier
    ('classifier', SVC(kernel='linear', C=0.5))])
svm_pipeline.fit(X_train, y_train)

test_score = svm_pipeline.score(X_test, y_test)
print(test_score)

scores = cross_val_score(svm_pipeline, X_train, y_train, cv=5)
print(scores)

# linear = SVC(kernel='linear', C=0.5).fit(X_train, y_train)
# print(linear.score(X_test,y_test))
# # Evaluate using cross-validation
# scores = cross_val_score(linear,X_test,y_test,cv=5)
# print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.45




[0.48888889 0.46111111 0.47777778 0.51111111 0.48888889]


In [43]:
#Try decision tree classifier
decision_tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(criterion='gini', random_state=0))
])

decision_tree = decision_tree_pipeline.fit(X_train, y_train)
dttest_score = decision_tree_pipeline.score(X_test, y_test)
dttest_score

# decision_tree = DecisionTreeClassifier(criterion = "gini").fit(X_train, y_train)
# print(decision_tree.score(X_test,y_test))
# # Evaluate using cross-validation
# scores = cross_val_score(decision_tree,X_test,y_test,cv=10)
# print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.39

In [48]:
randomforest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=50, random_state=0))])

randomforest_pipeline.fit(X_train, y_train)
randomforest_score = randomforest_pipeline.score(X_test, y_test)
randomforest_score

scores = cross_val_score(randomforest_pipeline, X_train, y_train, cv=10)

# #Try random forest classifier
# random_forest = RandomForestClassifier().fit(X_train, y_train)
# print(random_forest.score(X_test,y_test))
# scores = cross_val_score(random_forest,X_test,y_test,cv=10)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))



0.51 accuracy with a standard deviation of 0.04


In [52]:
# Use your best model to predict the labels for the evaluation set
eval_csv = pd.read_csv('EvaluateOnMe-2.csv')

y_pred = randomforest_pipeline.predict(eval_csv)

print(y_pred)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [599]:
# Save your predictions to a csv and upload it to canvas

pd.DataFrame(y_pred).to_csv("file.txt",index = False,header=False)