In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from jcopml.plot import plot_missing_value
from jcopml.pipeline import cat_pipe, num_pipe
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from jcopml.tuning import random_search_params as rsp

In [2]:
# Load the dataset from its CSV file into a DataFrame.
# NOTE(review): the recorded preview of this cell shows an "Unnamed: 0"
# column — confirm whether that is a stored index column in the CSV or just a
# rendering artifact, since only "label" is dropped when building features.
df = pd.read_csv(filepath_or_buffer="dataset/mnist_kw.csv")

# Preview the first five rows as the cell's rich output.
df.head(5)

Unnamed: 0,feat_0001,feat_0002,feat_0003,feat_0004,feat_0005,feat_0006,feat_0007,feat_0008,feat_0009,feat_0010,...,feat_3128,feat_3129,feat_3130,feat_3131,feat_3132,feat_3133,feat_3134,feat_3135,feat_3136,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Dataset Splitting


In [7]:
# Features: every column except the target, as a NumPy array divided by 255.
# NOTE(review): the /255 presumably rescales 0-255 pixel intensities to
# [0, 1] — confirm against the raw value range of the CSV.
X = df.drop(columns=["label"]).values / 255
# Target: the "label" column, kept as a pandas Series.
y = df["label"]

# Hold out 30% of the rows for testing; stratify on the label so both parts
# keep the same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Display the shapes of the four resulting pieces as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((14007, 3136), (6003, 3136), (14007,), (6003,))

In [8]:
# Single-step pipeline: a random forest with n_jobs=-1 (parallel tree fitting)
# and a fixed seed so the fitted forest itself is repeatable.
pipeline = Pipeline([
    ("algo", RandomForestClassifier(n_jobs=-1, random_state=42))
])

# Randomized hyperparameter search over rsp.rf_params: 50 sampled candidates,
# each evaluated with 3-fold cross-validation.
# random_state seeds the candidate sampling; without it every run draws a
# different set of 50 configurations, so the reported best model and scores
# would not be reproducible under Restart & Run All.
model = RandomizedSearchCV(
    pipeline,
    rsp.rf_params,
    n_iter=50,
    cv=3,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

# Report the refitted best pipeline, then (in order) the score on the training
# data, the mean cross-validated score of the best candidate, and the score on
# the held-out test data.
print(model.best_estimator_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Pipeline(steps=[('algo',
                 RandomForestClassifier(max_depth=45,
                                        max_features=0.13651794933077863,
                                        n_estimators=188, n_jobs=-1,
                                        random_state=42))])
1.0 0.9987149282501607 0.9998334166250208


In [9]:
# Show the hyperparameter combination selected by the randomized search.
chosen_params = model.best_params_
print(chosen_params)

{'algo__max_depth': 45, 'algo__max_features': 0.13651794933077863, 'algo__min_samples_leaf': 1, 'algo__n_estimators': 188}
