## Imports

In [1]:
# Reference video: https://vkvideo.ru/video-145052891_456248547

In [2]:
import pandas as pd
import seaborn as sns
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, f1_score, roc_auc_score, recall_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

## Load source data

In [3]:
# Load the 'titanic' example dataset through seaborn.
# NOTE(review): seaborn presumably fetches example datasets online on first use — confirm network access.
df = sns.load_dataset('titanic')

In [4]:
# Keep a CSV snapshot of the freshly loaded data under ./data
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)  # no error if the directory already exists
df.to_csv(f'{data_dir}/titanic.csv', index=False)  # index=False: do not write the row index

In [5]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Pipeline

In [6]:
# Number of columns in the raw frame (second component of the shape tuple)
df.shape[1]

15

### Features

In [7]:
# Initial cleaning:
#   - drop the 'deck' column (many missing values)
#   - drop the rows where 'embarked' is missing
# Done by reassignment instead of inplace=True so the cell can be re-run safely:
# errors='ignore' avoids a KeyError when 'deck' has already been removed.
df = (
    df
    .drop(columns=['deck'], errors='ignore')
    .dropna(subset=['embarked'])
)

In [8]:
# Feature matrix (selected input columns) and target vector ('survived')
feature_columns = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
X = df[feature_columns]
y = df['survived']


In [9]:
# Column groups; each group is routed through its own preprocessing branch.
numeric_features = [
    'age',
    'sibsp',
    'parch',
    'fare',
]
# 'pclass' holds integer codes but is handled as a category here.
categorical_features = [
    'pclass',
    'sex',
    'embarked',
]

In [10]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category
# (rows with a missing 'embarked' were already dropped; kept as an example),
# then one-hot encode.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Final pipeline: preprocessing followed by the random forest classifier.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Display the assembled pipeline as the cell output.
model_pipeline

In [11]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Fit the model (the Pipeline applies all preprocessing steps to X_train before training the classifier)
model_pipeline.fit(X_train, y_train)

In [13]:
# Predict class labels for the held-out test set
y_pred = model_pipeline.predict(X_test)

In [14]:
# Quality assessment on the test set: compute everything first, then print.
acc = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc:.3f}")

print("\nClassification Report:")
print(report_text)

print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.758

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.77      0.80       109
           1       0.67      0.74      0.70        69

    accuracy                           0.76       178
   macro avg       0.75      0.75      0.75       178
weighted avg       0.76      0.76      0.76       178

Confusion Matrix:
[[84 25]
 [18 51]]


In [15]:
# Persist the headline test-set metrics to metrics.csv (columns: metrics, values).
#
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from continuous scores. Passing the hard 0/1 predictions
# collapses the ROC curve to a single threshold and misreports the metric.
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_score = model_pipeline.predict_proba(X_test)[:, 1]

metric_values = {
    'accuracy_score': accuracy_score(y_test, y_pred),
    'precision_score': precision_score(y_test, y_pred),
    'recall_score': recall_score(y_test, y_pred),
    'f1_score': f1_score(y_test, y_pred),
    'roc_auc_score': roc_auc_score(y_test, y_score),
}

pd.DataFrame({
    'metrics': list(metric_values.keys()),
    'values': list(metric_values.values()),
}).to_csv('metrics.csv', index=False)


In [16]:
# Disabled experiment: same preprocessing, but with LogisticRegression as the classifier.
# # Swap in a different final Pipeline
# #    Pipeline steps:
# #      - Preprocessing (preprocessor)
# #      - Classifier training (LogisticRegression)
# from sklearn.linear_model import LogisticRegression
# from sklearn.pipeline import Pipeline
# new_model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(max_iter=1000, random_state=42))
# ])

# new_model_pipeline.fit(X_train, y_train)

# # Predict on the test set
# new_y_pred = new_model_pipeline.predict(X_test)

# # Quality assessment
# acc = accuracy_score(y_test, new_y_pred)
# print(f"Accuracy: {acc:.3f}")

# print("\nClassification Report:")
# print(classification_report(y_test, new_y_pred))

# print("Confusion Matrix:")
# print(confusion_matrix(y_test, new_y_pred))