In [1]:
import os
import json
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch import optim
from torch.optim import Adam
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.utils import save_image, make_grid
from mpl_toolkits.axes_grid1 import ImageGrid
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, multilabel_confusion_matrix
from scipy.io import arff

# Run PyTorch on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Polars table printing: a negative row limit asks Polars to show every row.
pl.Config.set_tbl_rows(-1)


polars.config.Config

In [2]:
# Load the Madelon train/test ARFF files; each loadarff call yields (records, metadata).
(train, trameta), (test, tstmeta) = (
    arff.loadarff('../../data/madelon.trn.arff'),
    arff.loadarff('../../data/madelon.tst.arff'),
)

# Convert both record arrays to Polars frames and show their shapes.
train, test = pl.from_numpy(train), pl.from_numpy(test)
(train.shape, test.shape)

((2000, 501), (600, 501))

In [3]:
# Class-balance check: rows per label in each split, most frequent label first.
print(
    train.get_column('class')
    .cast(pl.Utf8)
    .value_counts()
    .sort(by='counts', descending=True)
)
print(
    test.get_column('class')
    .cast(pl.Utf8)
    .value_counts()
    .sort(by='counts', descending=True)
)

shape: (2, 2)
┌───────┬────────┐
│ class ┆ counts │
│ ---   ┆ ---    │
│ str   ┆ u32    │
╞═══════╪════════╡
│ -1    ┆ 1000   │
│ 1     ┆ 1000   │
└───────┴────────┘
shape: (2, 2)
┌───────┬────────┐
│ class ┆ counts │
│ ---   ┆ ---    │
│ str   ┆ u32    │
╞═══════╪════════╡
│ -1    ┆ 300    │
│ 1     ┆ 300    │
└───────┴────────┘


In [4]:
# Features: every column except the 'class' target, converted to pandas.
X_train = train.drop('class').to_pandas()
X_test = test.drop('class').to_pandas()

# Labels: the 'class' column cast to strings, kept as one-column pandas frames.
y_train = train.select(pl.col('class').cast(pl.Utf8)).to_pandas()
y_test = test.select(pl.col('class').cast(pl.Utf8)).to_pandas()

In [5]:
# Peek at the first training row to sanity-check the feature columns.
X_train.iloc[:1]

Unnamed: 0,att_1,att_2,att_3,att_4,att_5,att_6,att_7,att_8,att_9,att_10,...,att_491,att_492,att_493,att_494,att_495,att_496,att_497,att_498,att_499,att_500
0,485.0,477.0,537.0,479.0,452.0,471.0,491.0,476.0,475.0,473.0,...,477.0,481.0,477.0,485.0,511.0,485.0,481.0,479.0,475.0,496.0


In [6]:
# Display the training labels (a one-column 'class' frame; pandas truncates the rendered rows).
y_train

Unnamed: 0,class
0,-1
1,-1
2,-1
3,1
4,1
...,...
1995,1
1996,-1
1997,-1
1998,1


In [7]:
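# NOTE: multi-label encoding is disabled. The 'class' column holds a single
# label per row (-1 or 1), so MultiLabelBinarizer is not needed for this task.
# # Convert labels to a list of labels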
# y_train = [labels.split(',') for labels in y_train['class']]
# y_test = [labels.split(',') for labels in y_test['class']]

# # One-hot encode the labels
# mlb = MultiLabelBinarizer()
# y_train_encoded = mlb.fit_transform(y_train)
# y_test_encoded = mlb.transform(y_test)


In [8]:
# Build the model pipeline: standardise every feature, then train an MLP classifier.
# random_state is fixed so the weight initialisation is repeatable across runs.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("mlp", MLPClassifier(max_iter=1000, random_state=1))  # Adjust parameters as needed
])

# Fit the pipeline on the training data.
# y_train is a one-column frame; scikit-learn expects a 1-D target of shape
# (n_samples,), so flatten it to avoid the column-vector DataConversionWarning.
pipeline.fit(X_train, y_train.to_numpy().ravel())


  y = column_or_1d(y, warn=True)


In [11]:
# Predict labels for the held-out test features with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Per-class precision/recall/F1 of the predictions against the true test labels.
report = classification_report(y_true=y_test, y_pred=y_pred)

# Emit the heading and the report on consecutive lines.
print("Evaluation Report for MLP Classifier with StandardScaler:", report, sep="\n")


Evaluation Report for MLP Classifier with StandardScaler:
              precision    recall  f1-score   support

          -1       0.56      0.58      0.57       300
           1       0.57      0.55      0.56       300

    accuracy                           0.57       600
   macro avg       0.57      0.57      0.57       600
weighted avg       0.57      0.57      0.57       600

