# Spaceship Titanic

Описание набора: персональные данные пассажиров. \
Цель: предсказать, переместится ли пассажир в альтернативное измерение во время столкновения космического корабля с пространственно-временной аномалией.

Метрика оценки: Accuracy \
Бинарная классификация

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
import joblib

## Загрузка и подготовка данных

In [6]:
# Load the training split; its 'Transported' column is used as the target below.
train_data = pd.read_csv('/kaggle/input/competitions/spaceship-titanic/train.csv')

In [7]:
# Load the test split that the final predictions are generated for.
test_data = pd.read_csv('/kaggle/input/competitions/spaceship-titanic/test.csv')

In [8]:
# Keep an independent copy of the passenger IDs for the submission file.
test_passenger_ids = test_data['PassengerId'].copy()

Функция подготовки признаков для модели

In [9]:
def prepare_features(df):
    """Build the numeric feature matrix for the model from raw passenger rows.

    The input frame is not modified: imputation and cabin parsing are done on
    intermediate Series instead of being written back into ``df``.

    The ``Age`` and cabin-number medians, as well as the mean/std used for
    ``CabinNum_scale``, are computed from ``df`` itself, so calls on different
    frames use different statistics.

    Args:
        df: DataFrame containing the columns ``Age``, ``RoomService``,
            ``FoodCourt``, ``ShoppingMall``, ``Spa``, ``VRDeck``,
            ``CryoSleep``, ``VIP``, ``HomePlanet`` and ``Cabin``.

    Returns:
        A new DataFrame on the same index as ``df`` with the columns
        ``Age``, ``TotalSpent``, ``LogTotalSpent``, ``HasSpent``,
        ``CryoSleep``, ``VIP``, one ``Planet_*`` indicator per ``HomePlanet``
        value (plus one for missing), ``Deck_B``, ``Deck_C``, ``Deck_F``,
        ``Deck_T``, ``Side_P``, ``Side_S``, ``CabinNum`` and
        ``CabinNum_scale``. Any remaining NaN is replaced by 0.
    """
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    def extract_deck(cabin):
        # First '/'-separated component of the cabin string.
        if pd.isna(cabin):
            return 'Unknown'
        return str(cabin).split('/')[0]

    def extract_num(cabin):
        # Second '/'-separated component as a float; NaN if absent or non-numeric.
        if pd.isna(cabin):
            return np.nan
        parts = str(cabin).split('/')
        if len(parts) < 2:
            return np.nan
        try:
            return float(parts[1])
        except ValueError:
            return np.nan

    def extract_side(cabin):
        # Third '/'-separated component of the cabin string.
        if pd.isna(cabin):
            return 'Unknown'
        parts = str(cabin).split('/')
        return parts[2] if len(parts) > 2 else 'Unknown'

    features = pd.DataFrame()

    features['Age'] = df['Age'].fillna(df['Age'].median())

    # Missing spending is treated as zero; work on a filled copy so the
    # caller's frame keeps its original NaNs.
    spending = df[spending_cols].fillna(0)
    features['TotalSpent'] = spending.sum(axis=1)
    features['LogTotalSpent'] = np.log1p(features['TotalSpent'])
    features['HasSpent'] = (features['TotalSpent'] > 0).astype(int)

    # Missing flags are treated as False (0).
    features['CryoSleep'] = df['CryoSleep'].fillna(0).astype(int)
    features['VIP'] = df['VIP'].fillna(0).astype(int)

    # dummy_na=True adds an explicit indicator column for missing HomePlanet.
    planet_dummies = pd.get_dummies(df['HomePlanet'], prefix='Planet', dummy_na=True)
    features = pd.concat([features, planet_dummies], axis=1)

    # Parsed cabin parts are kept local instead of being added to ``df``.
    deck = df['Cabin'].apply(extract_deck)
    cabin_num = df['Cabin'].apply(extract_num)
    side = df['Cabin'].apply(extract_side)

    # Only a fixed subset of decks gets an indicator column.
    for deck_code in ('B', 'C', 'F', 'T'):
        features[f'Deck_{deck_code}'] = (deck == deck_code).astype(int)

    for side_code in ('P', 'S'):
        features[f'Side_{side_code}'] = (side == side_code).astype(int)

    features['CabinNum'] = cabin_num.fillna(cabin_num.median())
    # A zero or undefined std yields NaN here; the final fillna maps it to 0.
    features['CabinNum_scale'] = (
        (features['CabinNum'] - features['CabinNum'].mean())
        / features['CabinNum'].std()
    )

    features = features.fillna(0)

    return features

Подготовка тренировочных данных

In [10]:
# Feature matrix derived from the raw training frame.
X = prepare_features(train_data)
# Target cast to integers for the classifier.
y = train_data['Transported'].astype(int)

Создаем лучшую модель - Stacking classifier

In [12]:
# Stacking ensemble: three tree-based base estimators whose predictions are
# combined by a logistic-regression final estimator.
final_model = StackingClassifier(
    estimators=[
        # All base estimators share random_state=42 for reproducibility.
        ('cat', CatBoostClassifier(iterations=100, random_state=42, verbose=0)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('dt', DecisionTreeClassifier(max_depth=5, random_state=42))
    ],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    # Internal 5-fold CV; per the scikit-learn docs it produces the base-estimator
    # predictions that the final estimator is trained on.
    cv=5
)

Обучаем на тренировочных данных

In [13]:
# Fit on the full training set; this fitted instance is the one used for the test predictions.
final_model.fit(X, y)

Оценка качества на кросс-валидации

In [14]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Shuffled, stratified 5-fold split with a fixed seed so the estimate is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Per the scikit-learn docs, cross_val_score fits clones of the estimator per fold,
# so the already-fitted final_model is not reused for scoring.
cv_scores = cross_val_score(final_model, X, y, cv=cv, scoring='accuracy')
print(f"\n Оценка модели на кросс-валидации:")
# Reports the mean fold accuracy and twice the standard deviation across folds.
print(f"   Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")


 Оценка модели на кросс-валидации:
   Accuracy: 0.7566 (+/- 0.0062)


Подготовка тестовых признаков

In [15]:
# Apply the same feature engineering to the test split.
# NOTE: prepare_features computes its medians and scaling statistics from the
# frame it is given, so here they come from test_data, not from the training data.
X_test = prepare_features(test_data)

In [16]:
# Add any training feature column that the test matrix lacks, filled with
# zeros, so both matrices expose the same set of features.
for col in X.columns:
    if col in X_test.columns:
        continue
    X_test[col] = 0

In [17]:
# Select the training feature columns in training order; columns that exist
# only in the test matrix are dropped.
X_test = X_test[X.columns]

Делаем предсказания и создаем submission

In [18]:
# Class probabilities are computed alongside the hard labels; only the labels
# go into the submission file.
probabilities = final_model.predict_proba(X_test)
predictions = final_model.predict(X_test)

# The submission has one row per test passenger, with the 0/1 labels
# converted to False/True.
submission = pd.DataFrame(
    {
        'PassengerId': test_passenger_ids,
        'Transported': predictions.astype(bool),
    }
)
submission.to_csv('submission.csv', index=False)
print(submission.head(10))

  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01        False
4     0023_01        False
5     0027_01         True
6     0029_01         True
7     0032_01         True
8     0032_02         True
9     0033_01        False
