# Обработка данных

In [None]:
import numpy as np  # conventional alias, so that `np.` references resolve
import pandas as pd

from sklearn.preprocessing import QuantileTransformer, PowerTransformer # distribution-reshaping transforms
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder  # encoding of categorical variables
from sklearn.preprocessing import StandardScaler  # z-score standardization
# from sklearn.preprocessing import MinMaxScaler, RobustScaler # other scaling methods

import matplotlib.pyplot as plt

# Do not show warnings. `Warning` is the base class of every warning category,
# so this filter silences all of them (deprecation warnings included).
import warnings
warnings.simplefilter(action='ignore', category=Warning)

In [None]:
# Load the diamonds dataset from the local CSV file, then display the dtype
# pandas inferred for each column.
DIAMOND_CSV = './datasets/Diamond.csv'
df = pd.read_csv(DIAMOND_CSV)
df.dtypes

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

## Преобразование количественных переменных

In [None]:
# Keep only the two quantitative columns; the transformers below are fitted on them.
numeric_columns = ['carat', 'price']
X = df[numeric_columns]

### Квантильное преобразование

In [None]:
# Map each column of X onto a normal distribution through its empirical quantiles.
# `n_quantiles` defaults to 1000; when the data has fewer rows scikit-learn warns
# and clamps it to the row count, so cap it explicitly (the output is the same).
quantile_transformer = QuantileTransformer(
    n_quantiles=min(1000, len(X)),
    output_distribution='normal',
    random_state=0,
)
# Alternative target distribution: pass output_distribution='uniform' instead.
X_qtr = quantile_transformer.fit_transform(X)

In [None]:
# Compare the raw distributions (top row) with the quantile-transformed ones
# (bottom row) for carat and price, as 30-bin histograms.
fig, axs = plt.subplots(nrows=2, ncols=2)

# (axes, data, title) for each panel; the columns of X_qtr follow the column order of X.
panels = [
    (axs[0, 0], X['carat'], 'carat (levels)'),
    (axs[0, 1], X['price'], 'price (levels)'),
    (axs[1, 0], X_qtr[:, 0], 'carat (transformed)'),
    (axs[1, 1], X_qtr[:, 1], 'price (transformed)'),
]
for ax, values, title in panels:
    ax.hist(values, bins=30)
    ax.set_title(title)

# With the default subplot spacing the bottom-row titles overlap the top-row
# tick labels; tight_layout() adjusts the padding so they no longer collide.
fig.tight_layout()
plt.show()

### Нелинейные преобразования

In [None]:
# Fit a Yeo-Johnson power transform on X, apply it, and display the
# per-column lambda parameters that were estimated.
# Alternative: method='box-cox' (scikit-learn requires strictly positive data for it).
pt = PowerTransformer(method='yeo-johnson').fit(X)
X_ptr = pt.transform(X)
pt.lambdas_

In [None]:
# Compare the raw distributions (top row) with the power-transformed ones
# (bottom row) for carat and price, as 30-bin histograms.
fig, axs = plt.subplots(nrows=2, ncols=2)

# (axes, data, title) for each panel; the columns of X_ptr follow the column order of X.
panels = [
    (axs[0, 0], X['carat'], 'carat (levels)'),
    (axs[0, 1], X['price'], 'price (levels)'),
    (axs[1, 0], X_ptr[:, 0], 'carat (transformed)'),
    (axs[1, 1], X_ptr[:, 1], 'price (transformed)'),
]
for ax, values, title in panels:
    ax.hist(values, bins=30)
    ax.set_title(title)

# With the default subplot spacing the bottom-row titles overlap the top-row
# tick labels; tight_layout() adjusts the padding so they no longer collide.
fig.tight_layout()
plt.show()

## Оцифровка категориальных признаков

### Преобразование в дамми

In [None]:
# One-hot encode the 'certification' column into integer 0/1 indicator columns.
# The prefix is left at its default (None), as in a call that passes prefix=None.
pd.get_dummies(
    df,
    columns=['certification'],
    dtype=int,
)

### Оцифровка

Оцифровка признаков

In [None]:
# Fit an ordinal encoder on the two categorical feature columns and display
# the categories it learned for each column.
# NOTE(review): no explicit `categories=` is given, so the code order is chosen by
# the encoder; it may not match the real quality ranking of the grades — confirm.
enc = OrdinalEncoder().fit(df[['colour', 'clarity']])
enc.categories_

In [None]:
# Refit the encoder on the same two columns and display their encoded values.
enc.fit_transform(df[['colour', 'clarity']])

Оцифровка таргетной переменной

In [None]:
# Fit a label encoder on the 'colour' column (used here as a stand-in target)
# and display the classes it found.
le = LabelEncoder().fit(df['colour'])
le.classes_

In [None]:
# Refit the label encoder on 'colour' and display the encoded labels.
le.fit_transform(df['colour'])