In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
kaggle = False
p1, p2 =  '', ''
if kaggle:
    p1 = '/kaggle/input/titanic/'
    p2 = '/kaggle/working/'
df = pd.read_csv(p1 + 'train.csv', index_col='PassengerId')
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input should be a DataFrame")
        
        return X.drop(columns=self.columns_to_drop)

In [5]:
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=1.5, top=0.85, bottom=0.25):
        self.threshold = threshold
        self.top = top
        self.bottom = bottom

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_out = X.copy()
        for column in X_out.select_dtypes(include=[np.number]).columns:
            # Replace NaNs with mean
            mean_val = X_out[column].mean()
            X_out[column] = X_out[column].fillna(mean_val)
            
            Q1 = X_out[column].quantile(self.bottom)
            Q3 = X_out[column].quantile(self.top)
            IQR = Q3 - Q1
            lower_bound = Q1 - self.threshold * IQR
            upper_bound = Q3 + self.threshold * IQR
            X_out[column] = X_out[column].clip(lower=lower_bound, upper=upper_bound)
        return X_out
outlier = OutlierRemover()


In [6]:
class CorrelationThreshold(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.9):
        self.threshold = threshold
        self.to_drop_ = []

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        corr_matrix = X.corr().abs()
        upper_triangle = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        upper_corr_matrix = corr_matrix.where(upper_triangle)

        # Находим признаки с корреляцией выше порогового значения
        self.to_drop_ = [column for column in upper_corr_matrix.columns if any(upper_corr_matrix[column] > self.threshold)]
        
        return self

    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X.drop(columns=self.to_drop_)


In [7]:
def ticket_number(x):
    if isinstance(x, str):
        return x.split()[-1]
    else:
        return x
    
def ticket_sym(x):
    return x.split()[0]
def rem(x):
    try:
        x = x.replace('.', '')
        x = x.replace('/', '')
        return x.replace(' ', '')
    except:
        return np.nan

def has_letters(input_string):
    if not isinstance(input_string, str):
        return False
    return any(char.isalpha() for char in input_string)

class TicketTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        X['Ticket_num'] = X['Ticket'].apply(ticket_number)
        X['Ticket_num'] = X['Ticket_num'].apply(lambda x : -1 if has_letters(x) else x)
        X['Ticket_num'] = X['Ticket_num'].astype('int64')
        X['Ticket'] = X['Ticket'].apply(ticket_sym)
        X['Ticket'][X['Ticket'] == X['Ticket_num'].astype(str)] = np.nan
        X['Ticket'] = X['Ticket'].apply(rem)
        return X
        

Самая сложная колонка - ticket.
10% точек, 10% слешей

Для колонки Parch удаляем выбросы на top = 0.8, примерно

Для SibSp выбросы на 0.9 или около того

для Fare ставим на 0.7 но надо тестить

In [8]:
onehot_cols = ['Embarked', 'Sex']
drop_cols = ['Name', 'Cabin', 'Ticket']

In [9]:
y = df['Survived']
X = df.drop('Survived', axis = 1)

In [10]:
cat_cols = X.select_dtypes(['object']).columns
num_cols = X.select_dtypes(exclude=['object']).columns

In [11]:
cat_cols = [i for i in cat_cols if i not in drop_cols]
cat_cols = [i for i in cat_cols if i not in ['Ticket']]
num_cols = [i for i in num_cols if i not in drop_cols]
all_cols = [i for i in df.columns if i not in drop_cols]

In [12]:
num_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
cat_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop = 'first')),
])

processor = ColumnTransformer([
    #('ticket', TicketTransformer(), ['Ticket']),
    #('drop_trans', DropTransformer(), drop_cols),
    ('outlier', outlier, num_cols),
    ('dropper', DropColumns(drop_cols), drop_cols),
    ('num_trans', num_trans, num_cols),
    ('cat_trans', cat_trans, cat_cols),
])
pipe = Pipeline([
    ('processor', processor),
    ('corr', CorrelationThreshold()),
    ('model', RandomForestClassifier()),#(penalty = 'l2', max_iter=10000))
])


In [15]:
from sklearn.model_selection import train_test_split


param_grid={
    #'model__C': [7.0536912751677 ],
    #'model__solver':  [ 'newton-cg' ],
    #'model__intercept_scaling': [0.00000000001],
    'model__oob_score': [True],
    'model__min_samples_split': [0.01365306122448979],
    'model__n_estimators': [100],
    'model__ccp_alpha': np.linspace(0.01, 1, 101)
}
model = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy',
                      cv = 5, return_train_score=True)
model.fit(X, y)

In [None]:
print(model.best_score_)
print(f'Accuracy на тесте: \t{np.mean(model.cv_results_["mean_test_score"]):.7f}')
print(f'Accuracy на трейне: \t{np.mean(model.cv_results_["mean_train_score"]):.7f}')
print(f'Параметры: {model.best_params_}')

0.8283033080158182
Accuracy на тесте: 	0.8283033
Accuracy на трейне: 	0.9079668
Параметры: {'model__min_samples_split': 0.01365306122448979, 'model__n_estimators': 100, 'model__oob_score': True}


In [252]:
df_test = pd.read_csv(p1 + 'test.csv', index_col='PassengerId')
df_preds = pd.read_csv(p1+'gender_submission.csv')
df_preds['Survived'] = model.predict(df_test)
df_preds.to_csv(p2 + 'submission.csv', index = False)