# Mandelon

MADELON is an artificial dataset containing data points grouped in 32 clusters placed on the vertices of a five dimensional hypercube and randomly labeled +1 or -1. The five dimensions constitute 5 informative features. 15 linear combinations of those features were added to form a set of 20 (redundant) informative features. Based on those 20 features one must separate the examples into the 2 classes (corresponding to the +-1 labels). We added a number of distractor feature called 'probes' having no predictive power. The order of the features and patterns were randomized.


Sources:
- https://github.com/dstrodtman/madelon

## Importamos librerías

In [1]:
import math
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, binarize, PolynomialFeatures, RobustScaler
from sklearn.decomposition import PCA
from scipy.stats import boxcox
from itertools import combinations
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from deap import base  # Estructura que permite agrupar todos los componentes de nuestro algoritmo en una misma bolsa
from deap import creator  # Permite crear los componentes de nuestro algoritmo
from deap import tools  # Contiene funciones precargadas
from joblib import Parallel, delayed
from scipy.io import arff
scaler = StandardScaler()

In [2]:
tra, trameta = arff.loadarff('/home/sebacastillo/ealab/data/madelon.trn.arff')
tst, tstmeta = arff.loadarff('/home/sebacastillo/ealab/data/madelon.tst.arff')
train = pl.DataFrame(tra)


In [3]:
train = (
    train.with_columns(
        pl.col('class').cast(pl.datatypes.Utf8)
    )   
)
test =  pl.DataFrame(tst)
test = (
    test.with_columns(
        pl.col('class').cast(pl.datatypes.Utf8)
    )   
)


In [4]:
train.groupby('class').agg(pl.count())

class,count
str,u32
"""1""",1000
"""-1""",1000


In [5]:
print(train.shape), print(test.shape)

(2000, 501)
(600, 501)


(None, None)

# PARTICIONO LOS DATOS

# Leemos los datasets

In [6]:
TRAIN = train
TEST = test

# Normalizamos los datos

In [7]:

TRAIN = TRAIN.to_numpy()
TEST = TEST.to_numpy()

X_TRAIN = TRAIN[:,:-1]
y_train = TRAIN[:,-1]
y_train = np.where(np.array(y_train) == '1', 1, 0).astype('int64')
X_TEST = TEST[:,:-1]
y_test = TEST[:,-1]
y_test = np.where(np.array(y_test) == '1', 1, 0).astype('int64')
scaler.fit(X_TRAIN)
Xtrain = scaler.transform(X_TRAIN)
Xtest = scaler.transform(X_TEST)


# Clasify with SVM

In [10]:
svc_pipe = Pipeline([
    ('scaler1', StandardScaler()),
    ('pca', PCA(5)),
    ('scaler2', StandardScaler()),
    ('clf', SVC(decision_function_shape='ovo'))
])

In [11]:
svc_params = {'clf__C':(100.0, 10.0, 1.0, 0.1, .01, .001),
              'clf__gamma':(.5, .6, .7, .8, .9, 1.0)}

In [12]:
gs_svc = GridSearchCV(svc_pipe, svc_params, cv=5, n_jobs=-1, verbose=1)

In [13]:
gs_svc.fit(Xtrain, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [14]:
gs_svc.best_score_

0.8320000000000001

In [15]:
gs_svc.best_params_

{'clf__C': 1.0, 'clf__gamma': 0.6}

In [16]:
gs_svc.score(Xtrain, y_train)

0.898

In [17]:
gs_svc.score(Xtest, y_test)

0.845

# Random Forest

In [75]:
from sklearn.ensemble import RandomForestClassifier

In [76]:
rfc=RandomForestestimator(random_state=42)

In [77]:
param_grid = { 
    'n_estimators': [10, 200],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth' : [2,4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [78]:
gs_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
gs_rfc.fit(Xtrain, y_train)

In [79]:
gs_rfc.best_score_

0.8160000000000001

In [80]:
gs_rfc.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': None,
 'n_estimators': 200}

In [81]:
gs_rfc.score(Xtrain, y_train)

0.999

In [82]:
gs_rfc.score(Xtest, y_test)

0.8416666666666667

# MLP

In [9]:
# MLP skitlear
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

In [11]:
mlp_pipe = Pipeline([    
    ('estimator', MLPClassifier(random_state=42, verbose=1))
])

In [12]:
'''
params = {    
    'estimator__solver': ['adam'],
    'estimator__learning_rate_init': [0.001, 0.005],
    'estimator__max_iter': [300],
    'estimator__hidden_layer_sizes': [(100, 200, 2), (200, 20,2)],
    'estimator__activation': ['logistic', 'tanh', 'relu'],
    'estimator__alpha': [0.0001, 0.001, 0.005],
    'estimator__early_stopping': [True, False]
}
'''
params = {
    "estimator__hidden_layer_sizes": [(5,5), (10,10), (5,5,5)],
    "estimator__activation": ["tanh", "relu"],
    "estimator__learning_rate": ["constant", "invscaling", "adaptive"],
    "estimator__max_iter": [1000, 2000],
    "estimator__alpha": [0.0001, 0.001, 0.005],
    "estimator__solver": ['adam'],
}


In [13]:
gs_mlp = GridSearchCV(estimator=mlp_pipe, param_grid=params, cv= 5)

In [None]:
gs_mlp.fit(Xtrain, y_train)

In [20]:
gs_mplc.best_score_

0.5780000000000001

In [21]:
gs_mplc.best_params_

{'estimator__activation': 'logistic',
 'estimator__alpha': 0.005,
 'estimator__early_stopping': True,
 'estimator__hidden_layer_sizes': (100, 200, 2),
 'estimator__learning_rate_init': 0.005,
 'estimator__max_iter': 300,
 'estimator__solver': 'adam'}

In [22]:
gs_mplc.score(Xtrain, y_train)

0.724

In [23]:
gs_mplc.score(Xtest, y_test)

0.5666666666666667

In [None]:
#  corregir mlpc

# MLP with scorch
https://github.com/skorch-dev/skorch