# Librerie sklearn per il pre-processing di dati

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

## Esempio

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Relative path of the FIFA 2019 CSV file
data_file = 'data/fifa/fifa_2019.csv'
# Load the table, using the first CSV column as the DataFrame index
D = pd.read_csv(data_file, index_col=0)

In [None]:
def _weight_to_kg(weight):
    """Convert a weight string such as ``'159lbs'`` to kilograms (NaN if missing)."""
    if pd.isnull(weight):
        return np.nan
    return int(weight.replace('lbs', '')) * 0.45359237


def _height_to_m(height):
    """Convert a ``feet'inches`` string such as ``5'7`` to metres (NaN if missing).

    The part after the apostrophe is a count of inches, not a decimal fraction
    of a foot: reading ``5'10`` as 5.10 ft would make it shorter than ``5'9``.
    """
    if pd.isnull(height):
        return np.nan
    # assumes the feet'inches format implied by the apostrophe -- TODO confirm on the data
    feet, _, inches = height.partition("'")
    return int(feet) * 0.3048 + int(inches or 0) * 0.0254


# Two-column feature matrix: height in metres, weight in kilograms
w = [_weight_to_kg(x) for x in D.Weight.values]
h = [_height_to_m(x) for x in D.Height.values]
X = np.array([h, w]).T

## Manage missing values
[SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html#sklearn.impute.SimpleImputer), [IterativeImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.IterativeImputer.html#sklearn.impute.IterativeImputer), [KNNImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html#sklearn.impute.KNNImputer)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer, KNNImputer

In [None]:
# Row positions whose first column (height) is missing, as a plain list of ints
examples = np.flatnonzero(pd.isnull(X[:, 0])).tolist()

In [None]:
# Fill the NaN entries of X with three different imputers, one result per strategy.

# Column mean
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
Xi = imp.fit_transform(X)

# IterativeImputer with its default settings
itp = IterativeImputer()
Xp = itp.fit_transform(X)

# KNNImputer with its default settings
itn = KNNImputer()
Xn = itn.fit_transform(X)

In [None]:
# First two rows whose height was missing, as filled in by SimpleImputer (mean)
Xi[examples][:2]

In [None]:
# The same two rows, as filled in by IterativeImputer
Xp[examples][:2]

In [None]:
# The same two rows, as filled in by KNNImputer
Xn[examples][:2]

In [None]:
# Two-cluster k-means on the mean-imputed data in its original units.
# The seed is fixed so cluster labels (and therefore plot colours) are the
# same on every run of the notebook.
kmeans = KMeans(n_clusters=2, random_state=0)
clusters = kmeans.fit_predict(Xi)

In [None]:
# Two panels over the same points: plain on the left, coloured by cluster on the right.
# X.T unpacks into (first column, second column), i.e. the x and y coordinates.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(6, 3))
ax[0].scatter(*X.T, alpha=.4)
ax[1].scatter(*X.T, c=clusters, alpha=.4)
plt.tight_layout()
plt.show()

## Scaling
[StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler)

In [None]:
from sklearn import preprocessing as pre

In [None]:
# Fit a standard scaler on the mean-imputed data and show the learned parameters
std = pre.StandardScaler()
std.fit(Xi)
print(std.mean_, std.scale_)

# Sanity check: per-column mean and standard deviation after the transform
Xistd = std.transform(Xi)
print(np.mean(Xistd, axis=0), np.std(Xistd, axis=0))

In [None]:
# Two-cluster k-means on the standardised data.
# The seed is fixed so cluster labels (and therefore plot colours) are the
# same on every run of the notebook.
kmeans = KMeans(n_clusters=2, random_state=0)
clusters = kmeans.fit_predict(Xistd)

In [None]:
# Points are drawn in their original units; only the colouring comes from `clusters`.
fig, ax = plt.subplots(1, 2, figsize=(6, 3))
x_values, y_values = X.T
for panel, extra in zip(ax, ({}, {'c': clusters})):
    panel.scatter(x_values, y_values, alpha=.4, **extra)
plt.tight_layout()
plt.show()

### Altri metodi di scaling
[Standardization, or mean removal and variance scaling (guida utente scikit-learn)](https://scikit-learn.org/stable/modules/preprocessing.html#standardization-or-mean-removal-and-variance-scaling)

In [None]:
from IPython.display import display

In [None]:
# Apply each scaler to the mean-imputed data and show summary statistics of the result
scalers = [
    pre.StandardScaler(),
    pre.MinMaxScaler(),
    pre.MaxAbsScaler(),
    pre.RobustScaler(),
]
for scaler in scalers:
    Xt = scaler.fit_transform(Xi)
    display(type(scaler).__name__)
    # One row per column of Xt, one column per statistic
    display(pd.DataFrame(Xt).describe().transpose())

## Trasformazioni non lineari

[QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html#sklearn.preprocessing.QuantileTransformer)


[PowerTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer)

In [None]:
# Minimum, quartiles and maximum of the first column (height) of the mean-imputed data
np.percentile(Xi[:,0], [0, 25, 50, 75, 100])

In [None]:
# Quantile transform of the mean-imputed data with a normal output distribution
Xiq = pre.QuantileTransformer(output_distribution='normal').fit_transform(Xi)
# Minimum, quartiles and maximum of the first column after the transform
np.percentile(Xiq[:,0], [0, 25, 50, 75, 100])

In [None]:
# Power transform of the mean-imputed data, using PowerTransformer's default settings
Xip = pre.PowerTransformer().fit_transform(Xi)

In [None]:
def scatter_hist(x, y, ax, ax_histx, ax_histy, colors, binwidth=0.25):
    """Draw a scatter plot of *x* against *y* with a histogram of each variable.

    Args:
        x: Numeric array of horizontal coordinates.
        y: Numeric array of vertical coordinates.
        ax: Axes that receives the scatter plot.
        ax_histx: Axes that receives the histogram of *x*.
        ax_histy: Axes that receives the histogram of *y*, drawn horizontally.
        colors: Passed to ``ax.scatter`` as ``c``.
        binwidth: Width of every histogram bin; must be positive.

    Raises:
        ValueError: If *binwidth* is not positive.
    """
    if binwidth <= 0:
        raise ValueError(f"binwidth must be positive, got {binwidth!r}")

    # Hide the tick labels on the histogram axes
    ax_histx.tick_params(axis="x", labelbottom=False)
    ax_histy.tick_params(axis="y", labelleft=False)
    ax.scatter(x, y, alpha=0.4, c=colors)

    # Symmetric limit around zero, rounded up to a whole number of bins, so that
    # both histograms are built on the same bin edges.
    xymax = max(np.max(np.abs(x)), np.max(np.abs(y)))
    lim = (int(xymax / binwidth) + 1) * binwidth

    bins = np.arange(-lim, lim + binwidth, binwidth)
    ax_histx.hist(x, bins=bins)
    ax_histy.hist(y, bins=bins, orientation='horizontal')

In [None]:
def make_plot(data, colors):
    """Show a 4x4 figure with a scatter plot of *data* and a histogram per column.

    Args:
        data: 2-D array; column 0 is plotted on x, column 1 on y.
        colors: Forwarded to ``scatter_hist`` for the scatter points.
    """
    # Rectangles are [left, bottom, width, height] as passed to fig.add_axes
    origin, extent = 0.1, 0.65
    gap, hist_size = 0.005, 0.2
    beyond = origin + extent + gap  # where the histogram axes start

    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_axes([origin, origin, extent, extent])
    # Histogram axes share the matching axis with the scatter plot
    ax_histx = fig.add_axes([origin, beyond, extent, hist_size], sharex=ax)
    ax_histy = fig.add_axes([beyond, origin, hist_size, extent], sharey=ax)

    xs, ys = data[:, 0], data[:, 1]
    scatter_hist(xs, ys, ax, ax_histx, ax_histy, colors)
    plt.show()

In [None]:
# Power-transformed data, coloured by the most recent k-means labels
make_plot(Xip, colors=clusters)

### Sommario
Si veda l'esempio scikit-learn [Compare the effect of different scalers on data with outliers](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py)

## Encoding categorical features

In [None]:
# Four columns of D as a NumPy array, after dropping rows with a missing value in any of them
Y = D[['Age', 'Nationality', 'Overall', 'Club']].dropna().to_numpy()

In [None]:
# Ordinal encoding of every column of Y (all four columns are passed to the encoder)
ordn = pre.OrdinalEncoder()
Yo = ordn.fit_transform(Y)

In [None]:
# Categories found by the ordinal encoder during fit
ordn.categories_

In [None]:
# Ordinal-encoded version of Y
Yo

In [None]:
# One-hot encoding of every column of Y (all four columns are passed to the encoder)
oneh = pre.OneHotEncoder()
Yh = oneh.fit_transform(Y)

In [None]:
# Categories found by the one-hot encoder during fit
oneh.categories_

In [None]:
# Encoder output as returned by fit_transform
Yh

In [None]:
# The same one-hot matrix converted to a dense array
Yh.toarray()

## Discretizzazione

In [None]:
# Discretiser configured for 3 bins, the 'uniform' strategy and one-hot encoded output
est = pre.KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')

In [None]:
# Bin the Age column; reshape(-1, 1) turns the 1-D values into a single-column 2-D array
age = est.fit_transform(D.Age.values.reshape(-1, 1))

In [None]:
# Bin edges computed by the discretiser during fit
est.bin_edges_

In [None]:
# One-hot bin membership of each row, converted to a dense array
age.toarray()

## Trasformazioni personalizzate

In [None]:
# Custom transformer computing 2*x + 10**4 on its input; validate=True enables input checking
transformer = pre.FunctionTransformer(lambda x: 2*x + 10**4, validate=True)

In [None]:
# Apply the custom function to the mean-imputed data
transformer.transform(Xi)