<a href="https://colab.research.google.com/github/alexnodejs/bandit/blob/main/HT_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import metrics


In [13]:
# Location of the passenger data set. The CSV's "Unnamed: 0" column holds the
# row labels, so it is used as the frame index rather than as a feature.
DATA_URL = "https://raw.githubusercontent.com/HalyshAnton/IT-Step-Pyton-AI/main/module2/data/ship_passengers.csv"
df = pd.read_csv(DATA_URL, index_col="Unnamed: 0")

In [14]:
# Quick look at the first rows to confirm the columns and index loaded as expected.
df.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [15]:
# Keep only the columns used in this exercise: three predictors plus `fare`.
selected_columns = ['pclass', 'age', 'embarked', 'fare']
df = df.loc[:, selected_columns]
df.head()

Unnamed: 0,pclass,age,embarked,fare
0,3,22.0,S,7.25
1,1,38.0,C,71.2833
2,3,26.0,S,7.925
3,1,35.0,S,53.1
4,3,35.0,S,8.05


# Завдання 1
Очистіть дані від викидів (лише `fare`) та розділіть їх на тренувальну та тестову частини

In [16]:
# Treat fares of 200 and above as outliers and keep only the rows below that cut-off.
# Rows with a missing fare are dropped as well, because the comparison is False for NaN.
fare_below_cutoff = df['fare'].lt(200)
df = df.loc[fare_below_cutoff]
df

Unnamed: 0,pclass,age,embarked,fare
0,3,22.0,S,7.2500
1,1,38.0,C,71.2833
2,3,26.0,S,7.9250
3,1,35.0,S,53.1000
4,3,35.0,S,8.0500
...,...,...,...,...
886,2,27.0,S,13.0000
887,1,19.0,S,30.0000
888,3,,S,23.4500
889,1,26.0,C,30.0000


In [17]:
# Separate the regression target (`fare`) from the remaining feature columns.
target_column = 'fare'
X = df.drop(columns=[target_column])
y = df[target_column]

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# `train_test_split` is already imported in the notebook's import cell, so the
# duplicate mid-notebook import has been removed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Завдання 2
Створіть Pipeline для обробки даних

In [22]:
# Column groups are derived from dtypes: numeric features vs. object (string) features.
num_columns = X.select_dtypes(include=["number"]).columns
cat_columns = X.select_dtypes(include=["object"]).columns

# Numeric branch: replace missing values with the most frequent value of each column.
num_imputer = SimpleImputer(strategy="most_frequent")
num_transformer = Pipeline(steps=[("imputer", num_imputer)])

num_transformer

In [23]:
# Categorical branch: fill gaps with the most frequent category, then map each
# category to an integer code.
# Categories unseen during fit are encoded as -1. OrdinalEncoder rejects an
# `unknown_value` that coincides with a learned code (0 .. n_categories - 1), so a
# non-negative sentinel such as 3 fails at fit time once a column has four or
# more categories; -1 can never collide.
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value",
                                   unknown_value=-1)),
    ]
)
cat_transformer



In [24]:
# Route each column group through its own transformer and ask for DataFrame
# output so the transformed features keep readable column names.
column_branches = [
    ("num", num_transformer, num_columns),
    ("cat", cat_transformer, cat_columns),
]
preprocessor = ColumnTransformer(transformers=column_branches)
preprocessor.set_output(transform='pandas')

preprocessor

# Завдання 3
Об'єднайте дерево рішень з препроцесором та натренуйте їх

In [25]:
# Full estimator: the preprocessor followed by a regression tree whose depth and
# leaf/split sizes are capped; the seed makes the fitted tree reproducible.
regressor = tree.DecisionTreeRegressor(
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=6,
    random_state=42,
)
model = Pipeline(steps=[("prep", preprocessor), ("tree", regressor)])

model

In [27]:
# Fit the preprocessing steps and the tree together on the training split.
model.fit(X_train, y_train)

# Display the preprocessed training features, i.e. what the tree step receives.
fitted_prep = model.named_steps['prep']
fitted_prep.set_output(transform='pandas')
fitted_prep.transform(X_train)

Unnamed: 0,num__pclass,num__age,cat__embarked
676,3.0,24.5,2.0
467,1.0,56.0,2.0
533,3.0,22.0,0.0
763,1.0,36.0,2.0
372,3.0,19.0,2.0
...,...,...,...
108,3.0,38.0,2.0
274,3.0,22.0,1.0
880,2.0,25.0,2.0
445,1.0,4.0,2.0


# Завдання 4
Виведіть основні метрики моделі

In [28]:
def get_metrics(df, y_true, y_pred, name="model"):
    """Summarise regression quality as a one-column DataFrame.

    Parameters
    ----------
    df : any
        Unused. Kept only so existing calls that pass a frame as the first
        argument keep working; it is neither read nor modified.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    name : str, default "model"
        Label of the single column in the returned frame.

    Returns
    -------
    pandas.DataFrame
        Rows ``MAE``, ``RMSE`` and ``R2`` (in that order) and one column called
        ``name``; values are rounded to two decimals.
    """
    # Build the result under its own name instead of rebinding the `df`
    # parameter, which previously hid the argument behind an empty frame.
    scores = {
        "MAE": metrics.mean_absolute_error(y_true, y_pred),
        # RMSE is the square root of the mean squared error.
        "RMSE": metrics.mean_squared_error(y_true, y_pred) ** 0.5,
        "R2": metrics.r2_score(y_true, y_pred),
    }
    result = pd.DataFrame({name: list(scores.values())}, index=list(scores))

    return result.round(2)



In [29]:
# Score the fitted model on both splits so train and test errors sit side by side.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

df_metrics = pd.DataFrame()
df_metrics["train"] = get_metrics(df, y_train, train_predictions)
df_metrics["test"] = get_metrics(df, y_test, test_predictions)

df_metrics

Unnamed: 0,train,test
MAE,11.09,14.56
RMSE,18.06,23.65
R2,0.61,0.42


# Завдання 5
Змініть параметри моделі та спробуйте покращити результат

Спробуйте:
* не видаляти викиди
* змінити обробку даних: imputer (SimpleImputer vs KNNImputer) та кодування категоріальних даних (OneHotEncoder vs OrdinalEncoder)
* змінити параметри дерева(глибина, кількість точок у листках тощо)

In [80]:
# Experiment: keep the fare outliers, use every column except `fare` as a feature,
# impute numeric gaps with KNN, one-hot encode the categoricals and grow a
# shallower tree.
# NOTE: this cell rebinds X, y, the train/test splits and the transformer /
# preprocessor names that the baseline cells earlier in the notebook also use.
df_experiments = pd.read_csv("https://raw.githubusercontent.com/HalyshAnton/IT-Step-Pyton-AI/main/module2/data/ship_passengers.csv",
                 index_col="Unnamed: 0")
y = df_experiments['fare']
X = df_experiments.drop(columns='fare')

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.8,
                                                    random_state=42)

num_columns = X.select_dtypes(include="number").columns
cat_columns = X.select_dtypes(include="object").columns

# numeric: fill each missing value from the 5 nearest rows
num_transformer = Pipeline(
    steps=[("imputer", KNNImputer(n_neighbors=5))]
)

# categorical: most-frequent imputation, then one-hot encoding.
# handle_unknown="ignore" encodes a category that was not seen during fit as an
# all-zero row instead of raising an error at predict time.
cat_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="most_frequent")),
           ("encoder", OneHotEncoder(sparse_output=False,
                                     handle_unknown="ignore"))])

# combine
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_columns),
        ("cat", cat_transformer, cat_columns),
    ]
)

preprocessor.set_output(transform='pandas')

# tree
model_improved = Pipeline(
    [("prep", preprocessor),
     ("tree", tree.DecisionTreeRegressor(max_depth=6,
                                         min_samples_leaf=2,
                                         min_samples_split=2,
                                         random_state=42)
     )
    ]
)

model_improved


In [81]:
# Fit the experimental pipeline (preprocessing + tree) on the training split.
model_improved.fit(X_train, y_train)



In [82]:
# Train/test metric table for the experimental pipeline.
improved_train_predictions = model_improved.predict(X_train)
improved_test_predictions = model_improved.predict(X_test)

improved_metrics = pd.DataFrame()
improved_metrics["train"] = get_metrics(df_experiments, y_train, improved_train_predictions)
improved_metrics["test"] = get_metrics(df_experiments, y_test, improved_test_predictions)

improved_metrics

Unnamed: 0,train,test
MAE,8.79,18.6
RMSE,20.06,55.0
R2,0.85,-0.96


# Завдання 6
Збережіть модель