In [357]:
import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


kaggle = False

input_path, output_path = '', ''
if kaggle:
    input_path = '/kaggle/input/spaceship-titanic/'
    output_path = '/kaggle/working/'

df = pd.read_csv(input_path + 'train.csv', index_col='PassengerId')

In [358]:
df = df.drop(['Name'], axis = 1)
print(df.shape)
df.head()

(8693, 12)


Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [359]:
class CabinSplitter(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=['Cabin'])
        
        # Заполнение NaN значений перед разделением
        X['Cabin'] = X['Cabin'].fillna('Unknown/0/Unknown')
        
        cabins_split = X['Cabin'].str.split('/', expand=True)
        cabins_split.columns = ['Deck', 'Number', 'Side']
        
        # Заполнение NaN значений после разделения (на всякий случай)
        cabins_split['Deck'] = cabins_split['Deck'].fillna('Unknown')
        cabins_split['Number'] = cabins_split['Number'].fillna(0).astype('float64')
        cabins_split['Side'] = cabins_split['Side'].fillna('Unknown')
        
        # Преобразование категориальных переменных в dummy-переменные
        cabins_split = pd.get_dummies(cabins_split, columns=['Deck', 'Side'], drop_first=False)
        cabins_split = cabins_split.astype('float64')
        
        # Убедимся, что все возможные столбцы присутствуют
        expected_columns = [
            'Number',
            'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_Unknown',
            'Side_P', 'Side_S', 'Side_Unknown'
        ]
        
        for col in expected_columns:
            if col not in cabins_split.columns:
                cabins_split[col] = 0.0
        
        cabins_split = cabins_split[expected_columns]
        
        return cabins_split
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input should be a DataFrame")
        
        return X.drop(columns=self.columns_to_drop)
    
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=1.5, top=0.85, bottom=0.25):
        self.threshold = threshold
        self.top = top
        self.bottom = bottom

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_out = X.copy()
        for column in X_out.select_dtypes(include=[np.number]).columns:
            # Replace NaNs with mean
            mean_val = X_out[column].mean()
            X_out[column] = X_out[column].fillna(mean_val)
            
            Q1 = X_out[column].quantile(self.bottom)
            Q3 = X_out[column].quantile(self.top)
            IQR = Q3 - Q1
            lower_bound = Q1 - self.threshold * IQR
            upper_bound = Q3 + self.threshold * IQR
            X_out[column] = X_out[column].clip(lower=lower_bound, upper=upper_bound)
        return X_out
outlier = OutlierRemover()

In [360]:
y, df = df['Transported'].astype('int64'), df.drop('Transported', axis = 1)
num_cols = df.select_dtypes(exclude='object').columns.tolist()
cat_cols = df.select_dtypes(include='object').columns.tolist()
cat_cols = [c for c in cat_cols if c != 'Cabin']
print('Категориальные:', cat_cols)
print('Вещественные:', num_cols) 

Категориальные: ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
Вещественные: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


In [361]:
df[cat_cols] = df[cat_cols].astype('str')
df.dtypes

HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
dtype: object

In [362]:
from sklearn.tree import DecisionTreeClassifier


one_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NaN')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
])
cabin_trans = Pipeline([
    ('splitter', CabinSplitter()),
    ('dropper', DropColumns(['Number']))
])
num_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('outlier', outlier, num_cols),
    ('num', num_trans, num_cols),
    ('cabin', cabin_trans, ['Cabin']),
    ('onehot', one_trans, cat_cols),
])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
    #('model', LogisticRegression(penalty = 'l2', max_iter=1_000_0))
])


In [363]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    #'model__C': [2.6842105263157894],
    'model__max_depth': [10],
    'model__min_samples_split': [2],
    'model__min_samples_leaf': [4],
}
search = GridSearchCV(model, param_grid = param_grid, scoring='accuracy',
                      cv = 5, return_train_score=True)
search.fit(df, y)

In [364]:
print(search.best_params_)
print(search.best_score_)

{'model__max_depth': 10, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2}
0.7692408835150554


с = 2.963265306122449, acc = 0.7875304312955808

{'model__C': 1, 'model__l1_ratio': 0, 'model__penalty': 'elasticnet', 'model__solver': 'saga'}
acc = 0.7838518095991838

0.7877606470657912 = mean

In [365]:
df_test = pd.read_csv(input_path + 'test.csv')
df_sub = pd.read_csv(input_path + 'sample_submission.csv')
df_test = df_test.drop('Name', axis = 1)

preds = search.predict(df_test)
df_sub['Transported'] = preds.astype(bool)
df_sub.to_csv(output_path + 'submission.csv', index=False)
df_sub.head()



Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
