In [1]:
# !pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# from google.colab import files

# uploaded = files.upload()

# for fn in uploaded.keys():
  
#   print(
#       'User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn]))
#   )
  
# # Then move kaggle.json into the folder where the API expects to find it.
# !mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 66 bytes


In [3]:
!kaggle competitions download "titanic"

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 40.4MB/s]


In [4]:
!unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [5]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

import string
import warnings
warnings.filterwarnings('ignore')

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

SEED = 42

Feature Analysis

In [6]:
print(train_df.info())
train_df.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
677,678,1,3,"Turja, Miss. Anna Sofia",female,18.0,0,0,4138,9.8417,,S
159,160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
773,774,0,3,"Elias, Mr. Dibo",male,,0,0,2674,7.225,,C


In [7]:
print(test_df.info())
test_df.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
102,994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q
312,1204,3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S
254,1146,3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S


Feature Engineering

In [10]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

target = 'Survived'


def concat_df(train_data, test_data):
    # Returns a concatenated df of training and test set
    return pd.concat([train_data, test_data], sort=True).reset_index(drop=True)


def divide_df(all_data):
    # Returns divided dfs of training and test set
    return all_data.loc[:890], all_data.loc[891:].drop(['Survived'], axis=1)


def generate_features(data_df):
    data_df['family_size'] = data_df['SibSp'] + data_df['Parch']
    data_df['name_length'] = data_df['Name'].apply(len)
    data_df['is_alone'] = 0
    data_df.loc[data_df['family_size'] == 1, 'is_alone'] = 1

    data_df['cabin'] = data_df['Cabin'].str[:1]

    data_df['title'] = 0
    data_df['title'] = data_df.Name.str.extract('([A-Za-z]+)\.')
    data_df['title'].replace(
        ['Mlle', 'Mme', 'Ms', 'Dr', 'Major', 'Lady', 'Countess', 'Jonkheer', 'Col', 'Rev', 'Capt', 'Sir', 'Don',
         'Dona'],
        ['Miss', 'Miss', 'Miss', 'Mr', 'Mr', 'Mrs', 'Mrs', 'Other', 'Other', 'Other', 'Mr', 'Mr', 'Mr', 'Mrs'],
        inplace=True
    )

    data_df.loc[(data_df.Age.isnull()) & (data_df.title == 'Mr'), 'Age'] = data_df.Age[data_df.title == 'Mr'].mean()
    data_df.loc[(data_df.Age.isnull()) & (data_df.title == 'Mrs'), 'Age'] = data_df.Age[data_df.title == 'Mrs'].mean()
    data_df.loc[(data_df.Age.isnull()) & (data_df.title == 'Master'), 'Age'] = data_df.Age[
        data_df.title == 'Master'].mean()
    data_df.loc[(data_df.Age.isnull()) & (data_df.title == 'Miss'), 'Age'] = data_df.Age[data_df.title == 'Miss'].mean()
    data_df.loc[(data_df.Age.isnull()) & (data_df.title == 'Other'), 'Age'] = data_df.Age[
        data_df.title == 'Other'].mean()

    data_df.loc[data_df.Ticket.str.isdigit(), 'ticket_class'] = 1
    data_df.loc[~data_df.Ticket.str.isdigit(), 'ticket_class'] = 0
    data_df['ticket_class'] = data_df['ticket_class'].apply(int)

    data_df = data_df.drop(columns=['Name', 'Ticket', 'PassengerId', 'Cabin'])
    return data_df


def get_feature_transformer(X: pd.DataFrame):
    X = X.copy()
    X.drop(target, axis='columns', inplace=True)

    categorical_pipeline = Pipeline(
        steps=[
            ("impute", SimpleImputer(strategy="most_frequent")),
            ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
        ]
    )
    numeric_pipeline = Pipeline(
        steps=[("impute", SimpleImputer(strategy="mean")),
               ("scale", StandardScaler())]
    )
    full_transformer = ColumnTransformer(
        transformers=[
            ("numeric", numeric_pipeline, numerical_features),
            ("categorical", categorical_pipeline, categorical_features),
        ]
    )

    full_transformer.fit(X)
    return full_transformer


def prepare_features(X: pd.DataFrame, feature_transformer: ColumnTransformer, is_debug=True):
    if is_debug:
        print(feature_transformer.get_feature_names_out())

    X_processed = feature_transformer.transform(X)

    y = X[target]
    y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
        y.values.reshape(-1, 1)
    )

    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y_processed, stratify=y_processed, random_state=SEED
    )

    return X_train, X_test, y_train, y_test


def generate_and_save_submission(test_df: pd.DataFrame, rez_preds, rez_file_name: str):
    submission_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
    submission_df['PassengerId'] = test_df['PassengerId']
    submission_df['Survived'] = rez_preds
    submission_df.to_csv(rez_file_name, header=True, index=False)
    submission_df.head(10)


Model Engineering

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

categorical_features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Ticket', 'Cabin']  # Embarked - nulls (2)
numerical_features = ['Age', 'Fare']  # Embarked - nulls
drop_features = ['PassengerId', 'Name']
target = 'Survived'

all_df = concat_df(train_df, test_df)
generate_features(all_df)
categorical_features = ['Embarked', 'Parch', 'Pclass', 'Sex', 'SibSp', 'family_size', 'is_alone', 'cabin', 'title', 'ticket_class']
df_train, df_test = divide_df(all_df)

# X_train, X_test, y_train, y_test = prepare_features_1(train_df)
feature_transformer = get_feature_transformer(df_train)
X_train, X_test, y_train, y_test = prepare_features(df_train, feature_transformer)

model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)
print(model)

preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
print('accuracy_score: ', acc)

X_rez = feature_transformer.transform(df_test)
rez_preds = model.predict(X_rez).astype(int)

generate_and_save_submission(df_test, rez_preds, 'submissions_13.csv')

['numeric__Age' 'numeric__Fare' 'categorical__Embarked_C'
 'categorical__Embarked_Q' 'categorical__Embarked_S'
 'categorical__Parch_0' 'categorical__Parch_1' 'categorical__Parch_2'
 'categorical__Parch_3' 'categorical__Parch_4' 'categorical__Parch_5'
 'categorical__Parch_6' 'categorical__Pclass_1' 'categorical__Pclass_2'
 'categorical__Pclass_3' 'categorical__Sex_female' 'categorical__Sex_male'
 'categorical__SibSp_0' 'categorical__SibSp_1' 'categorical__SibSp_2'
 'categorical__SibSp_3' 'categorical__SibSp_4' 'categorical__SibSp_5'
 'categorical__SibSp_8' 'categorical__family_size_0'
 'categorical__family_size_1' 'categorical__family_size_2'
 'categorical__family_size_3' 'categorical__family_size_4'
 'categorical__family_size_5' 'categorical__family_size_6'
 'categorical__family_size_7' 'categorical__family_size_10'
 'categorical__is_alone_0' 'categorical__is_alone_1'
 'categorical__cabin_A' 'categorical__cabin_B' 'categorical__cabin_C'
 'categorical__cabin_D' 'categorical__cabin_E' 'c

Predictions