In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from pandas.api.types import is_numeric_dtype, is_object_dtype
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# LOADING DATA 

In [2]:
# Load the train and test CSV files (paths are relative to the working directory).
titanic_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
# Seed the preview so the same five rows are shown after Restart & Run All.
titanic_data.sample(5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
534,535,0,3,"Cacic, Miss. Marija",female,30.0,0,0,315084,8.6625,,S
168,169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S
715,716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19.0,0,0,348124,7.65,F G73,S
838,839,1,3,"Chip, Mr. Chang",male,32.0,0,0,1601,56.4958,,S
847,848,0,3,"Markoff, Mr. Marin",male,35.0,0,0,349213,7.8958,,C


# Total Categorical Features

In [3]:
def get_uniques_values_with_column(data:pd.DataFrame, column:str) -> list:
    """Summarise the distinct values found in one column of ``data``.

    Args:
        data: Frame that holds the column.
        column: Label of the column to inspect.

    Returns:
        The column's unique values as a list when ``nunique()`` reports at
        most 10 of them; otherwise a one-element list with a placeholder
        message.
    """
    series = data[column]
    too_many = series.nunique() > 10
    if too_many:
        return ["uniques values more than 10"]
    return list(series.unique())

In [4]:
# Object-dtype columns are treated as the categorical features.
cat_cols = list(titanic_data.select_dtypes(include='object').columns)
print(f"Categorical Features --> {len(cat_cols)} \n")
# Plain loop: a list comprehension used only for printing builds a throwaway
# list of None values that then has to be hidden with a trailing ';'.
for col in cat_cols:
    print(f"{col} => {get_uniques_values_with_column(titanic_data, col)}")

Categorical Features --> 5 

Name => ['uniques values more than 10']
Sex => ['male', 'female']
Ticket => ['uniques values more than 10']
Cabin => ['uniques values more than 10']
Embarked => ['S', 'C', 'Q', nan]


# Total Numerical Features

In [5]:
# Every column that is not object-dtype is counted as a numerical feature.
num_cols = titanic_data.select_dtypes(exclude='object').columns.tolist()
print(f"Numerical Features --> {len(num_cols)} \n")
titanic_data[num_cols].dtypes

Numerical Features --> 7 



PassengerId      int64
Survived         int64
Pclass           int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
dtype: object

In [6]:
# Summary statistics for the numerical columns selected above.
titanic_data[num_cols].describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Missing data

In [7]:
# Count the missing values per column, then display only the columns that have any.
missing_val_count_by_column = titanic_data.isnull().sum()
missing_val_count_by_column[missing_val_count_by_column > 0]

Age         177
Cabin       687
Embarked      2
dtype: int64

In [8]:
def get_null_df(features:pd.DataFrame) -> pd.DataFrame:
    """Build a per-column report of missing values.

    Args:
        features: Frame to inspect.

    Returns:
        A frame with one row per column of ``features`` that contains at
        least one missing value, and the columns ``'Feature'`` (column label),
        ``'Type'`` (``"Numerical"`` or ``"Categorical"``), ``'Total NaN'``
        (count of missing entries) and ``'Missing %'`` (share of rows that are
        missing, in percent). When nothing is missing the result is empty but
        still has those four columns.
    """
    report_columns = ['Feature', 'Type', 'Total NaN', 'Missing %']
    total_rows = len(features)
    col_null = features.columns[features.isna().any()].to_list()

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = []
    for col in col_null:
        dtype = "Numerical" if is_numeric_dtype(features[col]) else "Categorical"
        nulls = int(features[col].isna().sum())
        records.append({'Feature': col,
                        'Type': dtype,
                        'Total NaN': nulls,
                        'Missing %': (nulls / total_rows) * 100})

    # Passing `columns` keeps the header stable even when `records` is empty.
    return pd.DataFrame(records, columns=report_columns)

In [9]:
# Missing-value report for the training frame.
get_null_df(titanic_data)

Unnamed: 0,Feature,Type,Total NaN,Missing %
0,Age,Numerical,177,19.86532
1,Cabin,Categorical,687,77.104377
2,Embarked,Categorical,2,0.224467


# PREPROCESSING

In [10]:
# Columns left out of the feature matrix; 'Survived' is the target and is kept separately as y.
drops_col = ['PassengerId', 'Survived', 'Ticket', 'Cabin', 'Name']
X = titanic_data.drop(columns=drops_col)
y = titanic_data['Survived']

In [11]:
# Keep only the categorical / numerical columns that survive the drop list.
best_cat_cols = [name for name in cat_cols if name not in drops_col]
best_num_cols = [name for name in num_cols if name not in drops_col]

In [12]:
# Preprocessing for numerical data: standardise first, then fill gaps with a
# distance-weighted KNN imputer (7 neighbours).
# NOTE(review): the scaler runs before imputation, so it must tolerate NaN
# inputs — confirm against the installed scikit-learn version.
numerical_transformer = Pipeline([
    ('norm', StandardScaler()),
    ('knn_imputer', KNNImputer(weights='distance', n_neighbors=7)),
])

In [13]:
# Preprocessing for categorical data: fill gaps with the most frequent value,
# one-hot encode (unknown categories are ignored), then scale without centring.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('norm', StandardScaler(with_mean=False)),
])

In [14]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, best_num_cols),
        ('cat', categorical_transformer, best_cat_cols)
    ])

In [15]:
# Earlier experiment, kept for reference (`tree` is not imported in this notebook):
# model = tree.DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth = 6)
# Default-configured gradient boosting; n_estimators, max_depth and random_state
# are supplied later through the grid-search parameter grid.
model = GradientBoostingClassifier()

In [16]:
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

In [17]:
# Grid of hyper-parameters; the 'model__' prefix addresses the pipeline step named 'model'.
params = {
    'model__n_estimators': [120],
    'model__max_depth': [1, 2, 3],
    'model__random_state': [42],
}

In [18]:
# 10-fold cross-validated search over `params`, scored by accuracy.
grid_search = GridSearchCV(estimator=clf, param_grid=params, scoring='accuracy', cv=10)
grid_search.fit(X, y)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('norm',
                                                                                          StandardScaler()),
                                                                                         ('knn_imputer',
                                                                                          KNNImputer(n_neighbors=7,
                                                                                                     weights='distance'))]),
                                                                         ['Pclass',
                                                                          'Age',
                                                                          'SibSp',
                                           

In [19]:
# Best mean cross-validated accuracy found by the grid search.
grid_search.best_score_

0.8305493133583021

In [20]:
# Select exactly the feature columns the pipeline was fitted on (same labels and
# order as X) instead of repeating a hand-written drop list that can drift from
# `drops_col`.
test_pred = grid_search.predict(test_data[X.columns])

In [21]:
# Write the predictions next to their passenger ids, without the index column.
pd.DataFrame(
    {'PassengerId': test_data['PassengerId'], 'Survived': test_pred}
).to_csv('save.csv', index=False)