In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

import os
# List every file available under the Kaggle input directory, one path per line.
input_paths = [
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
]
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the labelled training set. Show its shape plus a short preview
# instead of dumping the whole frame into the cell output.
raw_train = pd.read_csv("/kaggle/input/titanic/train.csv")
print(f"raw_train shape: {raw_train.shape}")
raw_train.head()

In [None]:
# Load the unlabelled test set. Show its shape plus a short preview
# instead of dumping the whole frame into the cell output.
raw_test = pd.read_csv("/kaggle/input/titanic/test.csv")
print(f"raw_test shape: {raw_test.shape}")
raw_test.head()

# Eksplorasi Data

In [None]:
# Column dtypes of the training frame.
raw_train.dtypes

In [None]:
# Summary statistics for the numeric columns of the training frame.
raw_train.describe()

In [None]:
# Summary (count / unique / top / freq) for the object-typed columns of the training frame.
raw_train.describe(include='object')

In [None]:
# Column dtypes of the test frame.
raw_test.dtypes

In [None]:
# Summary statistics for the numeric columns of the test frame.
raw_test.describe()

In [None]:
# Summary (count / unique / top / freq) for the object-typed columns of the test frame.
raw_test.describe(include='object')

## Cek Data Duplikat

In [None]:
# Number of fully duplicated rows in the training frame.
raw_train.duplicated().sum()

In [None]:
# Number of fully duplicated rows in the test frame.
raw_test.duplicated().sum()

Tidak ada data duplikat pada `raw_train` dan `raw_test`.

## Cek Missing Value

In [None]:
# Missing-value count per column in the training frame.
raw_train.isna().sum()

Terdapat 177 _missing data_ pada kolom `Age`, 687 _missing data_ pada kolom `Cabin`, dan 2 _missing data_ pada kolom `Embarked` di dalam _dataframe_ `raw_train`.

In [None]:
# Missing-value count per column in the test frame.
raw_test.isna().sum()

Terdapat 86 _missing data_ pada kolom `Age`, 1 _missing data_ pada kolom `Fare`, dan 327 _missing data_ pada kolom `Cabin` dalam _dataframe_ `raw_test`.

## Target
Tujuan proyek ini adalah memprediksi apakah penumpang Titanic selamat atau tidak, sehingga targetnya adalah kolom `Survived`. Penumpang yang selamat bernilai 1 dan penumpang yang tidak selamat bernilai 0.

## Cek Apakah Target pada Train Data termasuk _Imbalance_

In [None]:
# Row count per target class (Survived = 0 / 1) to check class balance.
raw_train.Survived.value_counts()

Jumlah yang selamat (1) dan tidak selamat (0) relatif tidak jauh berbeda, sehingga data tersebut bukan termasuk _imbalance data_.

## Hapus kolom yang tidak relevan
Kolom `Cabin` dan `Ticket` memiliki banyak data yang unik, sehingga kedua kolom tersebut tidak memberikan perbedaan signifikan pada hasil prediksi. Kolom `PassengerId` akan digunakan dalam _submission_. Sedangkan pada `Name` terdapat gelar seperti Mr., Mrs., dsb. Gelar tersebut mungkin akan berguna dalam pemodelan.

In [None]:
# Drop the high-cardinality Cabin and Ticket columns from both frames.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError for the columns
# that were already removed.
raw_train = raw_train.drop(columns=['Cabin', 'Ticket'], errors='ignore')
raw_test = raw_test.drop(columns=['Cabin', 'Ticket'], errors='ignore')

## Korelasi Antar Numerikal Kolom

In [None]:
# Lower-triangle heatmap of pairwise correlations between numeric columns.
# Only numeric columns are correlated: since pandas 2.0, DataFrame.corr() no
# longer drops text columns (Name, Sex, Embarked) silently and raises instead.
fig, ax = plt.subplots(figsize=(10, 15))
data = raw_train.select_dtypes(include='number').corr()
# Hide the upper triangle (including the diagonal): it mirrors the lower one.
mask = np.triu(np.ones_like(data, dtype=bool))
sns.heatmap(data,
            mask=mask,
            vmax=1,
            center=0,
            annot=True,
            square=True,
            linewidths=.5,
            cbar_kws={"shrink": .5},
            ax=ax)
plt.show()

# Data Cleaning


In [None]:
# Stack train and test into one frame so cleaning and feature engineering are
# applied identically to both. ignore_index=True gives every row a unique
# label; without it the test rows reuse labels 0..417 from the train rows,
# which makes any later label-based alignment ambiguous.
all_data = pd.concat([raw_train, raw_test], ignore_index=True)

## Menangani _Missing Values_

In [None]:
# Missing-value count per column in the combined frame, before imputation.
all_data.isna().sum()

### Feature Age
Berdasarkan _heatmap_ korelasi antar numerikal kolom, fitur `Age` terlihat memiliki korelasi dengan `Pclass`,`SibSp`, dan `Parch`. Fitur `Age` memiliki korelasi negatif kuat dengan `Pclass` jika dibandingkan fitur `SibSp` dan `Parch`. Oleh karena itu, dalam mengisi missing data pada kolom `Age` akan memperhatikan `Pclass`-nya. Untuk kategorikal kolomnya lebih masuk akal `Age` juga berhubungan dengan `Sex`.

In [None]:
# Median Age for each (Pclass, Sex) group.
all_data.groupby(['Pclass', 'Sex'])['Age'].median()

In [None]:
def _fill_age_gaps(group_ages):
    """Replace missing ages in one (Pclass, Sex) group with that group's median."""
    return group_ages.fillna(group_ages.median())


# Impute Age group-wise so the fill value reflects passenger class and sex.
ages_by_class_and_sex = all_data.groupby(['Pclass', 'Sex'])['Age']
all_data['Age'] = ages_by_class_and_sex.transform(_fill_age_gaps)

### Feature Fare
Berdasarkan _heatmap_ korelasi antar numerikal kolom, fitur `Fare` memiliki korelasi negatif kuat dengan `Pclass`. Untuk kategorikal kolomnya juga akan memperhitungkan `Sex`.

In [None]:
# Median Fare for each (Pclass, Sex) group.
all_data.groupby(['Pclass', 'Sex'])['Fare'].median()

In [None]:
def _fill_fare_gaps(group_fares):
    """Replace missing fares in one (Pclass, Sex) group with that group's median."""
    return group_fares.fillna(group_fares.median())


# Impute Fare group-wise so the fill value reflects passenger class and sex.
fares_by_class_and_sex = all_data.groupby(['Pclass', 'Sex'])['Fare']
all_data['Fare'] = fares_by_class_and_sex.transform(_fill_fare_gaps)

### Feature Embarked
`Embarked` adalah _categorical feature_. Pada _dataframe_, fitur `Embarked` didominasi oleh nilai S. Oleh karena itu, _missing value_ pada fitur `Embarked` akan diisi dengan modus fitur tersebut, yaitu S.

In [None]:
# Frequency of each embarkation port in the combined frame.
all_data.Embarked.value_counts()

In [None]:
# Fill missing Embarked values with 'S', the most frequent port.
# Assign the result back to the column: calling fillna(inplace=True) on the
# column accessor is a chained in-place update, which pandas 2.2 warns about
# and which leaves all_data untouched under Copy-on-Write.
all_data['Embarked'] = all_data['Embarked'].fillna('S')

In [None]:
# Missing-value count per column after imputation.
# Survived is still missing for the test rows, which carry no label.
all_data.isna().sum()

# _Data Preprocessing_

## Rekayasa Fitur

### Fitur `Family_size`
fitur yang mengindikasi jumlah keluarga setiap penumpang yang berada dalam kapal.

formula: ```Family_size = SibSp + Parch + 1```

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
relatives_aboard = all_data['SibSp'].add(all_data['Parch'])
all_data['Family_size'] = relatives_aboard.add(1)

### Fitur `is_Alone`

In [None]:
# Flag passengers travelling without any family member (1 = alone, 0 = not alone).
travels_alone = all_data['Family_size'].eq(1)
all_data['is_Alone'] = travels_alone.astype('int64')

### Fitur `is_female`

In [None]:
# Encode Sex as a float flag: 1.0 for 'female', 0.0 for 'male'.
# Any other value stays NaN.
sex_to_flag = {'female': 1.0, 'male': 0.0}
all_data['is_female'] = all_data['Sex'].map(sex_to_flag)

### Fitur `Title`

In [None]:
# Names look like "Surname, Title. Given names": take the text after the
# first ", " and keep the part before the first ".".
after_surname = all_data['Name'].str.split(', ').str.get(1)
all_data['Title'] = after_surname.str.split('.').str.get(0)
all_data['Title'].unique()

In [None]:
# Collapse the raw titles into three groups; titles not listed here are kept as-is.
title_groups = {
    'Miss/Mrs/Ms': ['Miss', 'Mrs', 'Ms', 'Mlle', 'Lady', 'Mme', 'the Countess', 'Dona'],
    'Mr': ['Don', 'Rev', 'Sir', 'Master', 'Jonkheer'],
    'Other': ['Dr', 'Major', 'Col', 'Capt'],
}
# Invert to a raw-title -> group lookup so a single replace covers all groups.
raw_title_to_group = {
    raw_title: group
    for group, raw_titles in title_groups.items()
    for raw_title in raw_titles
}
all_data['Title'] = all_data['Title'].replace(raw_title_to_group)

In [None]:
# Distinct Title values remaining after grouping.
all_data['Title'].unique()

## Menghapus Fitur-fitur yang Sudah tidak digunakan

In [None]:
# These raw columns have been replaced by engineered features
# (Title, is_female, Family_size / is_Alone), so remove them in place.
consumed_columns = ['Name', 'Sex', 'Parch', 'SibSp']
all_data.drop(columns=consumed_columns, inplace=True)
all_data.info()

## Menangani Fitur Non-Numerik

In [None]:
# Encode the grouped titles as integers with an explicit mapping.
# Chained replace(str -> int) only yields an integer column through pandas'
# silent downcasting, which is deprecated; without it Title would stay
# non-numeric and be one-hot encoded by the later get_dummies call.
title_codes = {'Mr': 0, 'Other': 1, 'Miss/Mrs/Ms': 2}

# Skip when the column is already integer-coded so the cell can be re-run.
if not pd.api.types.is_integer_dtype(all_data['Title']):
    unmapped_titles = set(all_data['Title'].unique()) - set(title_codes)
    if unmapped_titles:
        # Fail loudly instead of letting an unknown title become NaN.
        raise ValueError(f'Unmapped Title values: {sorted(unmapped_titles, key=str)}')
    all_data['Title'] = all_data['Title'].map(title_codes).astype('int64')

In [None]:
# One-hot encode the remaining non-numeric columns (numeric columns pass through unchanged).
all_data = pd.get_dummies(all_data)

In [None]:
# Check column names, dtypes and non-null counts after encoding.
all_data.info()

## Memisahkan Train & Test

In [None]:
# The first len(raw_train) rows of all_data are the training rows, because
# train was concatenated first. Deriving the boundary from raw_train avoids a
# hard-coded 891.
n_train_rows = len(raw_train)
train_rows = all_data.iloc[:n_train_rows]
# Survived is float in the combined frame because test rows are NaN; cast back to int.
y_train = train_rows['Survived'].astype('int')
# PassengerId is an identifier, not a feature; Survived is the target.
X_train = train_rows.drop(columns=['PassengerId', 'Survived'])
X_train.tail()

In [None]:
# Every row after the first len(raw_train) rows belongs to the test set,
# because train was concatenated first. Deriving the boundary from raw_train
# avoids a hard-coded 891.
test_rows = all_data.iloc[len(raw_train):]
passengerID = test_rows['PassengerId']  # kept for the submission file
# Survived is all-NaN for the test rows; PassengerId is an identifier, not a feature.
X_test = test_rows.drop(columns=['PassengerId', 'Survived'])
X_test.head()

# Pemodelan
Model yang akan digunakan ialah Random Forest, K-Nearest Neighbors (KNN), Logistic Regression, Support Vector Machine (SVM), dan Gradient Boosting, yang kemudian digabungkan dengan _Stacking_.

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import mode

In [None]:
# get a stacking ensemble of models
def get_stacking(models, final):
    """Build a stacking ensemble from named base models and a meta learner.

    Parameters
    ----------
    models : dict
        Maps an estimator name to a base (level-0) estimator. Iteration order
        of the dict sets the order of the base estimators.
    final : estimator
        Meta learner (level-1) trained on the base models' outputs.

    Returns
    -------
    StackingClassifier
        Unfitted ensemble that uses 5-fold cross-validation internally.
    """
    base_estimators = [(label, estimator) for label, estimator in models.items()]
    return StackingClassifier(estimators=base_estimators, final_estimator=final, cv=5)

In [None]:
def accuracy(model):
    """Print the 5-fold cross-validated accuracy of ``model``.

    Scores are computed on the module-level ``X_train`` / ``y_train``. The
    per-fold scores are printed first, then their mean with the standard
    deviation in parentheses. Nothing is returned.
    """
    fold_scores = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=5
    )
    mean_score = np.mean(fold_scores)
    score_spread = np.std(fold_scores)
    print(f'accuracy score : {fold_scores}')
    print(f'mean accuracy score : {mean_score} ({score_spread})')

In [None]:
def feature_importances(model, feature_names=None):
    """Rank features by a fitted model's importance scores.

    Parameters
    ----------
    model : estimator
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Feature labels in the order the model was trained on. Defaults to the
        columns of the module-level ``X_train``, so existing one-argument
        calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'`` and ``'<model> importance'`` (``<model>`` is the
        model's string representation), sorted by importance in descending
        order. The index holds each feature's original position.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances.
    """
    importances = model.feature_importances_
    if feature_names is None:
        feature_names = X_train.columns.values
    feature_names = list(feature_names)

    if len(feature_names) != len(importances):
        raise ValueError(
            f'Got {len(feature_names)} feature names for '
            f'{len(importances)} importance values'
        )

    score_column = f'{model} importance'
    ranking = pd.DataFrame({'feature': feature_names, score_column: importances})
    return ranking.sort_values(by=score_column, ascending=False)

## Random Forest

In [None]:
# Grid-search the number of trees (50..200 in steps of 25) and
# min_samples_split (10..20 in steps of 2) for a seeded random forest.
parameter = dict(
    n_estimators=[*range(50, 225, 25)],
    min_samples_split=[*range(10, 21, 2)],
)

rf_candidate = RandomForestClassifier(random_state=0)
model_rf = GridSearchCV(rf_candidate, parameter, n_jobs=-1, verbose=1)
model_rf.fit(X_train, y_train)

print(f'Best parameter = {model_rf.best_params_}')

In [None]:
# Rebuild the forest with the tuned n_estimators / min_samples_split and
# report its cross-validated accuracy.
randfore = RandomForestClassifier(random_state=0, **model_rf.best_params_)
accuracy(randfore)

In [None]:
# Fit on the full training set so feature_importances_ is available, then rank the features.
randfore.fit(X_train, y_train)
feature_importances(randfore)

## K-Nearest Neighbors

In [None]:
# Grid-search odd neighbour counts from 1 to 15 on standardized features.
# NOTE(review): the scaler is fitted on all training rows before the CV split,
# so each validation fold influences the scaling; tuning a scaler+KNN pipeline
# would avoid that.
parameter = {'n_neighbors': [*range(1, 16, 2)]}

X_train_scaled = StandardScaler().fit_transform(X_train)
model_knn = GridSearchCV(KNeighborsClassifier(), parameter, n_jobs=-1, verbose=1)
model_knn.fit(X_train_scaled, y_train)

print(f'Best parameter = {model_knn.best_params_}')

In [None]:
# Wrap the tuned KNN in a pipeline so scaling is re-fitted inside every CV fold.
knn = KNeighborsClassifier(**model_knn.best_params_)
knn_pl = make_pipeline(StandardScaler(), knn)
accuracy(knn_pl)

## Logistic Regression

In [None]:
# Logistic regression behind a RobustScaler.
# Score the pipeline: the original cell built `model_logreg` but then scored
# the bare `logreg`, so the scaler was never part of the reported accuracy.
logreg = LogisticRegression(max_iter=1000)
model_logreg = make_pipeline(RobustScaler(), logreg)
accuracy(model_logreg)

## Support Vector Machine (SV Classifier)

In [None]:
# Grid-search C (even values 2..24) and the gamma heuristic for an SVC on
# standardized features.
# NOTE(review): the scaler is fitted on all training rows before the CV split,
# so each validation fold influences the scaling; tuning a scaler+SVC pipeline
# would avoid that.
parameter = dict(
    C=[*range(2, 25, 2)],
    gamma=['scale', 'auto'],
)

model_sv = GridSearchCV(SVC(), parameter, n_jobs=-1, verbose=1)

scaled_features = StandardScaler().fit_transform(X_train)
model_sv.fit(scaled_features, y_train)

print(f'Best parameter = {model_sv.best_params_}')

In [None]:
# RBF-kernel SVC with the tuned C / gamma, wrapped in a pipeline so scaling is
# re-fitted inside every CV fold.
sv = SVC(kernel='rbf', **model_sv.best_params_)
sv_pl = make_pipeline(StandardScaler(), sv)

accuracy(sv_pl)

## Gradient Boosting

In [None]:
# Grid-search learning rate, min_samples_split and max_leaf_nodes for
# gradient boosting.
# The learning rates are listed explicitly: np.arange(0.1, 0.6) uses a default
# step of 1 and therefore produced only [0.1], so no other rate was searched.
parameter = {'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5],
             'min_samples_split' : [10, 15, 20],
             'max_leaf_nodes' : [8, 16, 32]
            }

# Seeded so the search is reproducible, like the random forest search.
model_gb = GridSearchCV(GradientBoostingClassifier(random_state=0),
                       parameter, n_jobs=-1, verbose=1)
model_gb.fit(X_train, y_train)

print(f'Best parameter = {model_gb.best_params_}')

In [None]:
# Rebuild gradient boosting with the tuned learning_rate / min_samples_split /
# max_leaf_nodes. random_state is fixed so the reported accuracy, the feature
# ranking and the stacked predictions are reproducible, like the random forest.
gb = GradientBoostingClassifier(
    learning_rate = model_gb.best_params_['learning_rate'],
    min_samples_split = model_gb.best_params_['min_samples_split'],
    max_leaf_nodes = model_gb.best_params_['max_leaf_nodes'],
    random_state = 0
)
accuracy(gb)

In [None]:
# Fit on the full training set so feature_importances_ is available, then rank the features.
gb.fit(X_train, y_train)
feature_importances(gb)

## Stacking

In [None]:
# Base learners for the stack, in the order they are passed to the ensemble.
# The plain logistic regression acts as the meta learner.
models = dict([
    ('Random Forest', randfore),
    ('K-Nearest Neighbors', knn_pl),
    ('Gradient Boosting', gb),
    ('Support Vector Machine', sv_pl),
])
model_stack = get_stacking(models, logreg)
accuracy(model_stack)

In [None]:
# Train the stacked ensemble on all training rows and predict the test set.
model_stack.fit(X_train, y_train)
y_pred = model_stack.predict(X_test)

In [None]:
# Assemble the submission table: one row per test passenger with its predicted label.
d_stack = dict(PassengerId=passengerID, Survived=y_pred)
sub_stack = pd.DataFrame(d_stack)

In [None]:
# Preview the first rows of the submission instead of dumping the whole frame.
sub_stack.head()

In [None]:
# Write the submission file; index=False keeps only the PassengerId and Survived columns.
sub_stack.to_csv('submission1.csv', index=False)