In [75]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, make_scorer, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [76]:
df_train = pd.read_csv('../datasets/spaceship_titanic/train.csv')
df_test = pd.read_csv('../datasets/spaceship_titanic/test.csv')

In [77]:
target = df_train.pop('Transported')
target

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [78]:
df_full = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)
df_full.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [79]:
df_full.shape

(12970, 13)

### Preprocessing

In [80]:
round(df_full.isna().sum() * 100 / df_full.shape[0], 3)

PassengerId     0.000
HomePlanet      2.221
CryoSleep       2.390
Cabin           2.305
Destination     2.113
Age             2.082
VIP             2.282
RoomService     2.028
FoodCourt       2.228
ShoppingMall    2.359
Spa             2.190
VRDeck          2.066
Name            2.267
dtype: float64

In [81]:
df_full_1 = df_full.copy()

list_missing_cat_columns = list((df_full_1.select_dtypes(['object', 'category']).isna().sum() > 0). index)
list_missing_cat_columns

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'VIP',
 'Name']

In [82]:
for col in list_missing_cat_columns:
    most_frequent_value = df_full_1[col].mode()[0]
    df_full_1[col] = df_full_1[col].fillna(most_frequent_value)

  df_full_1[col] = df_full_1[col].fillna(most_frequent_value)


In [83]:
list_missing_num_columns = list((df_full_1.select_dtypes(np.number).isna().sum() > 0).index)
list_missing_num_columns

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [84]:
for col in list_missing_num_columns:
    df_full_1[col] = df_full_1[col].fillna(df_full_1[col].mode()[0])

In [85]:
df_full_1['CryoSleep'] = df_full_1['CryoSleep'].astype(bool)
df_full_1['VIP'] = df_full_1['VIP'].astype(bool)

In [86]:
def extract_features(df):
    df['PassengerGroup'] = (df['PassengerId'].str.split('_', expand=True))[0]
    
    df['CabinDeck'] = df['Cabin'].str.split('/', expand=True)[0]
    
    df['DeckPosition'] = df['CabinDeck'].apply(lambda deck: 'Lower' if deck in ('A', 'B', 'C', 'D') else 'Higher')
    
    df['CabinSide'] = df['Cabin'].str.split('/', expand=True)[2]
    
    df["FamilyName"] = df["Name"].str.split(' ', expand = True)[1]
    
    # Membuat fitur NoRelatives (Jumlah Anggota Keluarga)
    NoRelatives = df.groupby('FamilyName')['PassengerId'].count().reset_index()
    NoRelatives = NoRelatives.rename(columns = {"PassengerId": "NoRelatives"})
    
    df = df.merge(NoRelatives[["FamilyName", "NoRelatives"]], how = 'left', on = ['FamilyName'])
    
    df["FamilySizeCat"] = pd.cut(df.NoRelatives, bins = [0, 2, 5, 10, 300], labels = ['0 - 2', '3 - 5', '6 - 10', '11 - 208'])

    return df

In [87]:
df_full_2 = df_full_1.copy()
df_full_2 = extract_features(df_full_1)

In [88]:
df_full_2.nunique()

PassengerId       12970
HomePlanet            3
CryoSleep             2
Cabin              9825
Destination           3
Age                  80
VIP                   2
RoomService        1578
FoodCourt          1953
ShoppingMall       1367
Spa                1679
VRDeck             1642
Name              12629
PassengerGroup     9280
CabinDeck             8
DeckPosition          2
CabinSide             2
FamilyName         2406
NoRelatives          20
FamilySizeCat         4
dtype: int64

In [89]:
delete_columns = ['Cabin', 'PassengerId', 'Name', 'FamilyName', 'PassengerGroup']
df_full_2 = df_full_2.drop(delete_columns, axis=1)
df_full_2.shape

(12970, 15)

In [90]:
from sklearn.preprocessing import StandardScaler

df_full_3 = df_full_2.copy()
column = df_full_3.select_dtypes(include=np.number).columns
scaler = StandardScaler()
scaled = scaler.fit_transform(df_full_2[column])
df_full_3.loc[:, column] = scaled

 -0.11318102]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_full_3.loc[:, column] = scaled


In [91]:
df_full_3.select_dtypes(include=['object', 'category']).columns

Index(['HomePlanet', 'Destination', 'CabinDeck', 'DeckPosition', 'CabinSide',
       'FamilySizeCat'],
      dtype='object')

In [92]:
df_full_3.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,DeckPosition,CabinSide,NoRelatives,FamilySizeCat
0,Europa,False,TRAPPIST-1e,0.729959,False,-0.34029,-0.281822,-0.292365,-0.269707,-0.2571,B,Lower,P,-0.249099,3 - 5
1,Earth,False,TRAPPIST-1e,-0.317601,False,-0.170439,-0.276082,-0.249566,0.22104,-0.219449,F,Higher,S,-0.226446,3 - 5
2,Europa,False,TRAPPIST-1e,2.056868,True,-0.273285,1.998823,-0.292365,5.732776,-0.21517,A,Lower,S,-0.158487,6 - 10
3,Europa,False,TRAPPIST-1e,0.310935,False,-0.34029,0.536429,0.342766,2.706059,-0.091947,A,Lower,S,-0.158487,6 - 10
4,Earth,False,TRAPPIST-1e,-0.876299,False,0.131863,-0.237179,-0.033861,0.235342,-0.255389,F,Higher,S,-0.113181,6 - 10


In [93]:
encoded = df_full_3.copy()

encoded = pd.get_dummies(data=df_full_3,
                         columns=['HomePlanet', 'Destination', 'CabinDeck', 'DeckPosition', 'CabinSide', 'FamilySizeCat'],
                         dtype=int)
encoded

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,NoRelatives,HomePlanet_Earth,...,CabinDeck_G,CabinDeck_T,DeckPosition_Higher,DeckPosition_Lower,CabinSide_P,CabinSide_S,FamilySizeCat_0 - 2,FamilySizeCat_3 - 5,FamilySizeCat_6 - 10,FamilySizeCat_11 - 208
0,False,0.729959,False,-0.340290,-0.281822,-0.292365,-0.269707,-0.257100,-0.249099,0,...,0,0,0,1,1,0,0,1,0,0
1,False,-0.317601,False,-0.170439,-0.276082,-0.249566,0.221040,-0.219449,-0.226446,1,...,0,0,1,0,0,1,0,1,0,0
2,False,2.056868,True,-0.273285,1.998823,-0.292365,5.732776,-0.215170,-0.158487,0,...,0,0,0,1,0,1,0,0,1,0
3,False,0.310935,False,-0.340290,0.536429,0.342766,2.706059,-0.091947,-0.158487,0,...,0,0,0,1,0,1,0,0,1,0
4,False,-0.876299,False,0.131863,-0.237179,-0.033861,0.235342,-0.255389,-0.113181,1,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,True,0.380772,False,-0.340290,-0.281822,-0.292365,-0.269707,-0.257100,-0.090528,1,...,1,0,1,0,0,1,0,0,1,0
12966,False,0.939471,False,-0.340290,0.258364,-0.263262,-0.260768,-0.133877,-0.181140,1,...,1,0,1,0,1,0,0,0,1,0
12967,True,-0.736625,False,-0.340290,-0.281822,-0.292365,-0.269707,-0.257100,-0.226446,0,...,0,0,0,1,1,0,0,1,0,0
12968,False,-0.736625,False,-0.340290,1.427386,-0.292365,-0.269707,0.190440,-0.158487,0,...,0,0,0,1,1,0,0,0,1,0


In [94]:
data_4 = encoded.copy()

train_data_final = data_4.loc[:df_train.index.max(), :].copy()
test_data_final = data_4.loc[df_train.index.max()+1:, :].reset_index(drop=True).copy()

print(train_data_final.shape)
print(test_data_final.shape)

(8693, 31)
(4277, 31)


In [95]:
X = train_data_final.copy()
y = target.astype(int)

### Cross Validation

In [96]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', GaussianNB()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('SVM', SVC()))

results = []
names = []

In [97]:
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

KNN: 0.765326 (0.010700)
NB: 0.732539 (0.030739)
DT: 0.723111 (0.011506)
SVM: 0.797767 (0.010221)


Dari hasil K-Fold Cross Validation di atas, dapat disimpulkan bahwa model dengan tingkat akurasi terbaik adalah SVM, sehingga pada proses training dan testing akan menggunakan model SVM.

In [98]:
model = SVC()
kf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

scoring = {
    'accuracy' : 'accuracy',
    'precision' : make_scorer(precision_score),
    'recall' : make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

results = cross_validate(model, X, y, cv=kf, scoring=scoring)

print(f"Accuracy: {results['test_accuracy'].mean()}")
print(f"Precision: {results['test_precision'].mean()}")
print(f"Recall: {results['test_recall'].mean()}")
print(f"F1-Score: {results['test_f1'].mean()}")

Accuracy: 0.797767152097139
Precision: 0.7899702538992437
Recall: 0.8154362977127153
F1-Score: 0.8024164103881043


### Training the Model

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [100]:
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

In [101]:
prediction = svm_classifier.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, prediction)}")

Accuracy: 0.7983128834355828


### Evaluasi Model

In [102]:
from sklearn.metrics import classification_report

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.80      0.78      0.79      1291
           1       0.79      0.81      0.80      1317

    accuracy                           0.80      2608
   macro avg       0.80      0.80      0.80      2608
weighted avg       0.80      0.80      0.80      2608



### Tuning Hyperparameter

In [103]:
list(range(10, 51, 20))

[10, 30, 50]

In [104]:
model = SVC()

param_grid = {
    'C': list(range(10, 51, 20)),
    'gamma': ['auto', 'scale'],
    'kernel': ['rbf']
}

kf = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

scoring = {
    'accuracy' : 'accuracy',
    'precision' : make_scorer(precision_score),
    'recall' : make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kf, scoring='accuracy', refit=True, verbose=2)

grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_}")
print(f"Best estimator: {grid_search.best_estimator_}")

best_model = grid_search.best_estimator_

results = cross_validate(best_model, X, y, cv=kf, scoring=scoring)

print(f"Accuracy: {results['test_accuracy'].mean()}")
print(f"Precision: {results['test_precision'].mean()}")
print(f"Recall: {results['test_recall'].mean()}")
print(f"F1-Score: {results['test_f1'].mean()}")

Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   2.1s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   1.9s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   1.9s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   2.1s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   1.7s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   2.2s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   1.7s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   1.6s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   1.6s
[CV] END .......................C=10, gamma=auto, kernel=rbf; total time=   1.5s
[CV] END ......................C=10, gamma=scale, kernel=rbf; total time=   1.8s
[CV] END ......................C=10, gamma=scale

Ini adalah hasil tuning

Best parameters: {'C': 30, 'gamma': 'auto', 'kernel': 'rbf'}
Best accuracy: 0.8007573773527505
Best estimator: SVC(C=30, gamma='auto')
Accuracy: 0.8007573773527505
Precision: 0.7947098540762301
Recall: 0.8152074647607703
F1-Score: 0.8047310144433307

In [105]:


# clf = GridSearchCV(SVC(), {
#     'gamma': ['auto', 'scale'],
#     'C': list(range(10, 100, 20)),
#     'kernel': ['rbf', 'linear']
# }, cv=5, return_train_score=False)

# clf.fit(X_train, y_train)
# print(f"Best parameters: {clf.best_params_}")
# print(f"Best score: {clf.best_score_}")
# print(f"Best estimator: {clf.best_estimator_}")

In [106]:
best_model = grid_search.best_estimator_
prediction = best_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, prediction)}")

Accuracy: 0.8282208588957055


### Evaluation After Tuning Hyperparameters

In [107]:
from sklearn.metrics import classification_report

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.84      0.81      0.82      1291
           1       0.82      0.85      0.83      1317

    accuracy                           0.83      2608
   macro avg       0.83      0.83      0.83      2608
weighted avg       0.83      0.83      0.83      2608



### Submission

In [108]:
prediction = best_model.predict(test_data_final)
prediction

array([1, 0, 1, ..., 1, 1, 1])

In [109]:
sample_submission = pd.read_csv('../datasets/spaceship_titanic/sample_submission.csv')

In [110]:
pred_dict = {'PassengerId': df_test['PassengerId'], 'Transported': prediction}
pred_df = pd.DataFrame(pred_dict)

pred_df['Transported'] = pred_df['Transported'].map({1: True, 0: False})
pred_df.to_csv('submission_svm_mode_tuning.csv', index=False)

In [111]:
import sklearn
print(sklearn.__version__)

1.4.1.post1
