### [Installation](https://scikit-learn.org/stable/install.html)

In [1]:
# verify installation
import sklearn

In [2]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.3.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /home/bjpcjp/.local/lib/python3.11/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: fastai


In [3]:
# see all packages in this active venv
!pip freeze

aiohttp==3.8.5
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.0.0
appdirs==1.4.4
argcomplete==2.0.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
async-timeout==4.0.3
attrs==23.1.0
Babel==2.10.3
backoff==2.2.1
bcrypt==3.2.2
beautifulsoup4==4.12.2
beniget==0.4.1
bleach==6.1.0
blinker==1.6.2
blis==0.7.11
Brlapi==0.8.4
Brotli==1.0.9
catalogue==2.0.10
certifi==2022.9.24
cffi==1.16.0
chardet==5.1.0
charset-normalizer==3.2.0
chart-studio==1.1.0
click==8.1.6
cloudpathlib==0.16.0
colorama==0.4.6
comm==0.2.0
command-not-found==0.3
confection==0.1.4
contourpy==1.0.7
cryptography==38.0.4
cupshelpers==1.0
cvxopt==1.3.2
cycler==0.11.0
cymem==2.0.8
dbus-python==1.3.2
debugpy==1.8.0
decorator==5.1.1
defer==1.0.6
defusedxml==0.7.1
distro==1.8.0
distro-info==1.5
duplicity==1.2.2
executing==2.0.1
fastai==2.7.13
fastcore==1.5.29
fastdownload==0.0.7
fasteners==0.17.3
fastjsonschema==2.18.1
fastprogress==1.0.3
filelock==3.13.1
fonttools==4.38.0
fqdn==1.5.

In [4]:
# print the system, Python-dependency and threadpoolctl details of this sklearn install
sklearn.show_versions()


System:
    python: 3.11.6 (main, Oct  8 2023, 05:06:43) [GCC 13.2.0]
executable: /usr/bin/python3
   machine: Linux-6.5.0-14-generic-x86_64-with-glibc2.38

Python dependencies:
      sklearn: 1.3.2
          pip: 23.2
   setuptools: 68.1.2
        numpy: 1.24.2
        scipy: 1.10.1
       Cython: None
       pandas: 2.1.2
   matplotlib: 3.6.3
       joblib: 1.2.0
threadpoolctl: 3.1.0

Built with OpenMP: True

threadpoolctl info:
       user_api: openmp
   internal_api: openmp
         prefix: libgomp
       filepath: /home/bjpcjp/.local/lib/python3.11/site-packages/scikit_learn.libs/libgomp-a34b3233.so.1.0.0
        version: None
    num_threads: 16

       user_api: blas
   internal_api: openblas
         prefix: libopenblas
       filepath: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.23.so
        version: 0.3.23
threading_layer: pthreads
   architecture: Haswell
    num_threads: 16


### Estimator basics
scikit-learn estimators are fitted to data using the fit() method.

In [5]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [6]:
# create an (unfitted) random forest classifier; random_state=0 makes runs repeatable
clf = RFC(random_state=0)

In [7]:
# toy training set: 2 samples with 3 features each
X = [
    [1, 2, 3],
    [11, 12, 13],
]
# one class label per sample
y = [0, 1]

clf.fit(X, y)

In [8]:
# fit() generally accepts 2 inputs: X (the samples) and y (the target values).
# y is not needed for unsupervised learning tasks.
# X & y are expected to be numpy arrays or equivalent array-like types (e.g. nested lists).
# once the estimator is fitted, it can be used to make predictions.

In [9]:
# predict the classes of the same samples the classifier was fitted on
clf.predict(X)

array([0, 1])

In [10]:
clf.predict([[4,5,6],[14,15,16]]) # new data: 2 unseen samples, 3 features each

array([0, 1])

### Transformers & pre-processors
Typical workflows have multiple steps, often a data pre-processing step and a final predictor.

In [11]:
from sklearn.preprocessing import StandardScaler as SS

In [12]:
X = [[0, 15], [1, -10]]

# standardize each column to zero mean and unit variance;
# fit_transform(X) is the one-call form of fit(X).transform(X)
SS().fit_transform(X)

array([[-1.,  1.],
       [ 1., -1.]])

### Pipelines: chaining pre-processors & estimators

In [13]:
from sklearn.preprocessing import StandardScaler as SS
from sklearn.linear_model  import LogisticRegression as LR
from sklearn.pipeline      import make_pipeline as MP
from sklearn.datasets      import load_iris     as LI
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics         import accuracy_score   as AS

In [18]:
# make a pipeline with standard scaling (preprocessing) & logistic regression (model)
# (SS = StandardScaler, LR = LogisticRegression, MP = make_pipeline, per the import cell above)
pipe = MP(SS(), LR())

In [19]:
# load iris data as a feature matrix X and a target vector y
# (the split into training & test sets happens in the next cell)
X,y = LI(return_X_y=True)

In [20]:
# hold out a test set (train_test_split's default test_size is 0.25);
# random_state=0 keeps the shuffle reproducible
X_train, X_test, y_train, y_test = TTS(X,y,random_state=0)

In [21]:
# fit entire pipeline on the training split only:
# the scaler is fitted first, then the classifier is trained on the scaled data
pipe.fit(X_train,y_train)

In [22]:
# ready to use - accuracy of the fitted pipeline on the held-out test data
# (accuracy_score's signature is (y_true, y_pred): the ground truth goes first)
AS(y_test, pipe.predict(X_test))

0.9736842105263158

### Model Evaluation
Fitting a model does not guarantee good predictions on unknown data.
Below: 5-fold cross validation example.

In [23]:
from sklearn.datasets import make_regression as MR
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import cross_validate as CV

In [30]:
X,y    = MR(n_samples = 1000, random_state = 0)
lr     = LR()
result = CV(lr,X,y) # default: 5-fold

In [31]:
# inspect the generated feature matrix
X

array([[ 0.41929687, -1.5489299 ,  0.65218686, ..., -0.81368398,
        -2.03884275,  0.90000294],
       [-2.06947249,  0.72712806,  0.0975975 , ..., -0.35978104,
        -0.74513907, -0.55050613],
       [-0.37595997,  0.66414405,  1.02239232, ...,  0.50481546,
        -2.83201187, -0.79978614],
       ...,
       [-0.7719197 , -1.33667649, -0.72733814, ..., -0.59830311,
        -0.60986158,  1.69242973],
       [ 0.67198393, -1.50733364,  1.17622157, ...,  2.05921537,
        -1.11140442,  0.01787532],
       [ 1.10334268, -0.59531919, -0.29831814, ..., -0.89706521,
        -0.11546748, -1.299286  ]])

In [32]:
# R^2 score on each of the 5 test folds.
# make_regression adds no noise by default, so a linear model fits this data
# essentially perfectly and every fold scores ~1.
result['test_score']

array([1., 1., 1., 1., 1.])

### Automatic parameter searches
An estimator's ability to generalize usually depends on a few key parameters (often called hyperparameters).
It's usually not clear in advance which parameter values are optimal.
scikit-learn provides tools to automatically search for the best values using cross-validation.

In [33]:
from sklearn.datasets import fetch_california_housing as FCH
from sklearn.ensemble import RandomForestRegressor    as RFR
from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.model_selection import train_test_split as TTS
from scipy.stats             import randint

In [34]:
# load the California housing data as (features, target) arrays
X,y = FCH(return_X_y=True)
# hold out a test split; random_state=0 keeps the shuffle reproducible
X_train, X_test, y_train, y_test = TTS(X,y,random_state=0)

In [35]:
# search space: one sampling distribution per hyperparameter
# (scipy's randint includes the lower bound and excludes the upper bound)
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)

In [36]:
# build the randomized search: 5 candidate settings are drawn from
# param_distributions and each one is evaluated with cross-validation
searcher = RSCV(
    RFR(random_state=0),
    param_distributions,
    n_iter=5,
    random_state=0,
)

In [38]:
# fit searcher to training data: samples n_iter parameter settings,
# cross-validates each one, then refits the best setting on all of X_train
# (refit=True is RandomizedSearchCV's default)
searcher.fit(X_train, y_train)

In [39]:
# which parameter values did the search find to be best?
searcher.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [40]:
# searcher now acts like a normal random forest estimator:
# score() evaluates the refit best model on the held-out test split
# (for a regressor the default score is R^2)
searcher.score(X_test,y_test)

0.735363411343253

In [41]:
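# note:
# - you usually want to search over a pipeline - not a single estimator.
# - applying preprocessing to the entire dataset before cross-validation
#   leaks information from the test folds into training (data leakage), which
#   breaks the assumption of independence between training & testing data.
#   Putting the preprocessing inside the pipeline re-fits it on each training fold.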