### Fitting and predicting: estimator basics

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
# toy training set: two samples (rows) with three features (columns) each
X = [[1, 2, 3],
     [11, 12, 13]]
# class label of each sample, in the same order as the rows of X
y = [0, 1]

# seed the forest so repeated runs build the same trees
clf = RandomForestClassifier(random_state=0)
clf.fit(X, y)

In [7]:
# predict classes for the same samples the classifier was fitted on
clf.predict(X)

array([0, 1])

In [8]:
# predict classes for new samples that were not part of the training data
clf.predict([[4,5,6],[14,15,16]])

array([0, 1])

### Transformers and pre-processors

In [9]:
from sklearn.preprocessing import StandardScaler
# toy data for the scaler: two rows, two columns
# NOTE: this rebinds X, replacing the training data used in the previous section
X = [[0,15],[1,-10]]


In [10]:
# fit() computes the scaling values from X; transform() then scales X with them
StandardScaler().fit(X).transform(X)

array([[-1.,  1.],
       [ 1., -1.]])

### Pipelines: chaining pre-processors and estimators

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
# create a pipeline object: standardize the features, then fit a logistic regression
pipe = make_pipeline(StandardScaler(), LogisticRegression())

# load the iris dataset and split it into train and test sets
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# fit the whole pipeline (scaler and classifier) on the training split only
pipe.fit(X_train, y_train)

# use it like any other estimator;
# accuracy_score expects (y_true, y_pred), so the ground-truth labels go first
accuracy_score(y_test, pipe.predict(X_test))

0.9736842105263158

## Model Evaluation

In [15]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

# estimator to evaluate; it does not depend on the data, so it can be built first
lr = LinearRegression()
# synthetic regression problem with 1000 samples, seeded for reproducibility
X, y = make_regression(n_samples=1000, random_state=0)

# cross-validate with the default splitting strategy
result = cross_validate(estimator=lr, X=X, y=y)
# one test score per cross-validation fold
result["test_score"]

array([1., 1., 1., 1., 1.])

### Automatic parameter searches

In [16]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

# load the California housing data and hold out a test split
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# parameter space to sample from; scipy's randint(low, high) draws integers
# from low up to, but not including, high
param_distributions = {
    'n_estimators': randint(1, 5),
    'max_depth': randint(5, 10),
}

# build the randomized search (5 sampled candidates) and fit it on the training split
search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_distributions=param_distributions,
    n_iter=5,
    random_state=0,
)
search.fit(X_train, y_train)

In [17]:
# parameter combination selected by the randomized search
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [18]:
# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4 (the best parameters found above);
# here it is scored on the held-out test split
search.score(X_test, y_test)

0.735363411343253