In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plts
import seaborn as sns; sns.set()

In [None]:
# Synthetic 1-D classification problem.
# A locally seeded generator makes the samples identical on every
# Restart & Run All, matching the fixed random_state=0 used by the
# scikit-learn calls in the rest of the notebook.
rng = np.random.default_rng(0)
X = rng.normal(loc=5, scale=3, size=(10000, 1))  # shape (n_samples, 1)
# Label is True when the sample rounds to an even integer; flattened to
# shape (n_samples,).
y = (np.round(X) % 2 == 0).reshape(-1)

In [None]:
# Distribution of the raw, unscaled feature values.
sns.distplot(a=X, label='Input')

In [None]:
# Each sample plotted against its boolean label.
sns.scatterplot(
    x=X.reshape(-1),  # flatten the (n, 1) column to (n,)
    y=y,
    label='truth',
)

# Pre-processing

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the per-feature statistics from X, then report what was learned.
fit = StandardScaler()
fit.fit(X)
for caption, learned in (
    ('Fit mean      =', fit.mean_),
    ('Fit scale     =', fit.scale_),
    ('Fit var       =', fit.var_),
    ('Fit n_samples =', fit.n_samples_seen_),
):
    print(caption, learned)

In [None]:
# Apply the fitted scaler, then overlay the standardized and raw
# distributions on one shared Axes so the effect of scaling is visible.
X_scaled = fit.transform(X)
ax = sns.distplot(X_scaled, label='Scaled')
sns.distplot(X, label='Input', ax=ax)
# Call legend() on the Axes itself: pyplot is imported under the alias
# `plts`, so the bare name `plt` is undefined on a fresh kernel.
ax.legend()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test subset of the standardized data; the fixed seed keeps the
# partition repeatable between runs.
split = train_test_split(X_scaled, y, random_state=0)
X_train, X_test, y_train, y_test = split
print(f'{len(X_train)} training + {len(X_test)} testing = {len(X)} total')

# Fitting and predicting

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a pinned seed so repeated fits are reproducible.
clf = RandomForestClassifier(
    random_state=0,
)

In [None]:
# Train on the training split and predict the held-out samples in one chain
# (fit() returns the estimator itself).
pred = clf.fit(X_train, y_train).predict(X_test)
# 1-D views of the test inputs and labels, reused by the plotting cells.
tmp_x, tmp_y = X_test.reshape(-1), y_test.reshape(-1)
print(f'{tmp_x.shape=}; {tmp_y.shape=}; {pred.shape=}')

In [None]:
# Ground truth first, then the predictions drawn onto the same Axes.
truth_ax = sns.scatterplot(x=tmp_x, y=tmp_y, label='truth')
sns.scatterplot(x=tmp_x, y=pred, label='pred', ax=truth_ax)

# Pipelines

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
# Bundle scaling and classification into a single estimator, refit it on the
# training split and predict the held-out samples.
# NOTE(review): X_train / X_test were split from the already standardized
# X_scaled, so the scaler inside the pipeline standardizes them a second time.
pipe = make_pipeline(StandardScaler(), clf)
pred = pipe.fit(X_train, y_train).predict(X_test)
# Ground truth and pipeline predictions overlaid on one Axes.
pipe_ax = sns.scatterplot(x=tmp_x, y=tmp_y, label='truth')
sns.scatterplot(x=tmp_x, y=pred, label='pred', ax=pipe_ax)

# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

In [None]:
# Fraction of held-out labels predicted correctly. Keywords keep the
# ground truth and the predictions in their documented roles.
accuracy_score(y_true=y_test, y_pred=pred)

In [None]:
# Cross-validate the whole pipeline on the raw feature. The fold count is
# spelled out so it does not depend on the installed library's default.
result = cross_validate(pipe, X, y, cv=5)
# No `scoring` is passed, so each fold reports the classifier's own
# `score`, i.e. mean accuracy (not R^2).
print(result['test_score'])

# Parameter Searches

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [None]:
# Random search over forest size and tree depth.
# scipy's randint(low, high) excludes `high`: 1-4 trees, depth 5-9.
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)
# NOTE(review): the original asks "Can't use pipe?". A pipeline should work
# as the estimator too if the keys are prefixed with the step name (e.g.
# 'randomforestclassifier__max_depth' for make_pipeline); verify before
# switching.
search = RandomizedSearchCV(
    clf,
    param_distributions,
    n_iter=5,
    random_state=0,
)
search.fit(X_train, y_train)
print(search.best_params_)

In [None]:
# Score of the best estimator found by the search on the held-out split.
search.score(X=X_test, y=y_test)