In [None]:
# Baseline System
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load training data
# The relative path means the CSV is read from the current working directory.
Data_train = pd.read_csv('mushroom_train.csv')

In [None]:
# Load test data
# The relative path means the CSV is read from the current working directory.
Data_test = pd.read_csv('mushroom_test.csv')

In [None]:
# Separate training data features and labels
X = Data_train.drop(columns='class')
y = Data_train['class']

# Define column transformer
# Only the columns listed below are used; ColumnTransformer drops any other
# column by default (remainder='drop').
numerical_features = ['cap-diameter', 'stem-height', 'stem-width']
categorical_features = ['cap-shape', 'cap-surface', 'cap-color', 'does-bruise-or-bleed',
                        'gill-attachment', 'gill-spacing', 'gill-color', 'stem-color', 'has-ring',
                        'ring-type', 'habitat', 'season']

column_transformer = ColumnTransformer(
    transformers=[
        # Standardise numeric columns to zero mean / unit variance.
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category seen only at transform time
        # (e.g. in the test CSV) is encoded as all zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    # sparse_threshold=0 forces a dense ndarray. With the default (0.3) the
    # mostly one-hot output can be returned as a SciPy sparse matrix, which the
    # NumPy-based code further down would then have to handle row by row.
    sparse_threshold=0,
)

# Create the pipeline
transformer = Pipeline(steps=[('transformer', column_transformer)])

# Fit and transform training data
# NOTE: the transformer is fitted on the full training set *before* the
# train/validation split below, so validation rows contribute to the scaler
# statistics and to the set of known categories.
X_data = transformer.fit_transform(X)

# Encode the class labels as integers
le = LabelEncoder()
y_data = le.fit_transform(y)

# Split data into training data and validation data (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

In [None]:
# Test set: pull out the label column, then the remaining feature columns
ytest = Data_test['class']
Xtest = Data_test.drop(columns=['class'])

# Encode the labels and transform the features with the already-fitted
# label encoder and pipeline (transform only, no refitting)
y_test = le.transform(ytest)
X_test = transformer.transform(Xtest)

In [None]:
# Trivial system
# Feature-blind baseline: each test label is drawn at random with probability
# equal to that class's relative frequency in the training labels.
unique_labels, counts = np.unique(y_data, return_counts=True)

probability = counts / counts.sum()

# Use a seeded generator so the baseline scores are reproducible across runs
# (same seed value as the train/validation split).
rng = np.random.default_rng(42)
y_pred = rng.choice(unique_labels, size=len(y_test), p=probability)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary', pos_label=1)

print("Trivial system accuracy:", accuracy)
print("Trivial system F1-score:", f1)

In [None]:
# Nearest Mean Classifier
def classifier(X, y):
    """Fit a nearest-mean classifier by computing one centroid per class.

    Parameters
    ----------
    X : array-like or SciPy sparse matrix, shape (n_samples, n_features)
        Feature matrix. Dense array-likes are converted with ``np.asarray``;
        sparse inputs are converted to CSR so rows can be selected by mask.
    y : array-like, shape (n_samples,)
        Class label of each row of ``X``.

    Returns
    -------
    dict
        Maps every distinct label in ``y`` to the mean of the rows of ``X``
        carrying that label, as a 1-D ``numpy.ndarray`` of length n_features.
        Empty when ``y`` is empty.
    """
    y = np.asarray(y)
    if hasattr(X, "tocsr"):
        # SciPy sparse input: CSR supports boolean row selection.
        X = X.tocsr()
    else:
        X = np.asarray(X)

    means = {}
    for label in np.unique(y):
        class_data = X[y == label]
        # A sparse .mean() returns a (1, n_features) np.matrix; flatten so the
        # centroid is always a plain 1-D ndarray regardless of input type.
        means[label] = np.asarray(class_data.mean(axis=0)).ravel()
    return means

def predict_nearest_mean(X, means):
    """Assign each sample to the class whose centroid is closest (Euclidean).

    Parameters
    ----------
    X : array-like or SciPy sparse matrix, shape (n_samples, n_features)
        Samples to classify. Sparse input is converted to a dense array.
    means : dict
        Maps class label -> centroid, as returned by ``classifier``. Each
        centroid is flattened to 1-D, so ``(1, n_features)`` values also work.

    Returns
    -------
    numpy.ndarray
        One predicted label per row of ``X``. On an exact distance tie the
        class that comes first in ``means`` wins. With no samples an empty
        array is returned; with an empty ``means`` every prediction is
        ``None``.
    """
    X_dense = X.toarray() if hasattr(X, "toarray") else np.asarray(X)
    n_samples = X_dense.shape[0]

    if n_samples == 0:
        return np.array([])
    if not means:
        # No classes to choose from: keep the "no prediction" marker per row.
        return np.array([None] * n_samples)

    labels = np.array(list(means.keys()))
    # One column of distances per class, computed for all rows at once instead
    # of looping over samples in Python.
    distances = np.column_stack([
        np.linalg.norm(X_dense - np.asarray(center).ravel(), axis=1)
        for center in means.values()
    ])
    # argmin returns the first minimum, so ties go to the earliest class.
    return labels[np.argmin(distances, axis=1)]

print("Nearest Mean Classifier")

# --- Validation: centroids from the training split, scored on the held-out split ---
means = classifier(X=X_train, y=y_train)
y_pred_val = predict_nearest_mean(X=X_val, means=means)

accuracy_val = accuracy_score(y_true=y_val, y_pred=y_pred_val)
f1_val = f1_score(y_true=y_val, y_pred=y_pred_val, pos_label=1, average='binary')

print("Validation accuracy:", accuracy_val)
print("Validation F1-score:", f1_val)
print()

# --- Test: centroids from the full training set, scored on the test set ---
class_means_full = classifier(X=X_data, y=y_data)
y_pred_nc = predict_nearest_mean(X=X_test, means=class_means_full)

accuracy_test = accuracy_score(y_true=y_test, y_pred=y_pred_nc)
f1_test = f1_score(y_true=y_test, y_pred=y_pred_nc, pos_label=1, average='binary')

print("Test accuracy:", accuracy_test)
print("Test F1-score:", f1_test)