In [144]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [145]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [146]:
target = df_train.pop('Transported')
target

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [147]:
df_full = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)
df_full.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [148]:
df_full.shape

(12970, 13)

In [149]:
df_full.dtypes

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
dtype: object

### Preprocessing

In [150]:
round(df_full.isna().sum() * 100 / df_full.shape[0], 3)

PassengerId     0.000
HomePlanet      2.221
CryoSleep       2.390
Cabin           2.305
Destination     2.113
Age             2.082
VIP             2.282
RoomService     2.028
FoodCourt       2.228
ShoppingMall    2.359
Spa             2.190
VRDeck          2.066
Name            2.267
dtype: float64

In [151]:
df_full_1 = df_full.copy()

list_missing_cat_columns = list((df_full_1.select_dtypes(['object', 'category']).isna().sum() > 0). index)
list_missing_cat_columns

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'VIP',
 'Name']

In [152]:
for col in list_missing_cat_columns:
    most_frequent_value = df_full_1[col].mode()[0]
    df_full_1[col] = df_full_1[col].fillna(most_frequent_value)

In [153]:
list_missing_num_columns = list((df_full_1.select_dtypes(np.number).isna().sum() > 0).index)
list_missing_num_columns

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [154]:
for col in list_missing_num_columns:
    df_full_1[col] = df_full_1[col].fillna(df_full_1[col].interpolate())

In [156]:
df_full_1.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64

In [157]:
df_full_1['CryoSleep'] = df_full_1['CryoSleep'].astype(bool)
df_full_1['VIP'] = df_full_1['VIP'].astype(bool)

In [158]:
def extract_features(df):
    df['PassengerGroup'] = (df['PassengerId'].str.split('_', expand=True))[0]
    
    df['CabinDeck'] = df['Cabin'].str.split('/', expand=True)[0]
    
    df['DeckPosition'] = df['CabinDeck'].apply(lambda deck: 'Lower' if deck in ('A', 'B', 'C', 'D') else 'Higher')
    
    df['CabinSide'] = df['Cabin'].str.split('/', expand=True)[2]
    
    df["FamilyName"] = df["Name"].str.split(' ', expand = True)[1]
    
    # Membuat fitur NoRelatives (Jumlah Anggota Keluarga)
    NoRelatives = df.groupby('FamilyName')['PassengerId'].count().reset_index()
    NoRelatives = NoRelatives.rename(columns = {"PassengerId": "NoRelatives"})
    
    df = df.merge(NoRelatives[["FamilyName", "NoRelatives"]], how = 'left', on = ['FamilyName'])
    
    df["FamilySizeCat"] = pd.cut(df.NoRelatives, bins = [0, 2, 5, 10, 300], labels = ['0 - 2', '3 - 5', '6 - 10', '11 - 208'])

    return df

In [159]:
df_full_2 = df_full_1.copy()
df_full_2 = extract_features(df_full_1)

In [160]:
df_full_2.nunique()

PassengerId       12970
HomePlanet            3
CryoSleep             2
Cabin              9825
Destination           3
Age                 125
VIP                   2
RoomService        1665
FoodCourt          2056
ShoppingMall       1454
Spa                1775
VRDeck             1715
Name              12629
PassengerGroup     9280
CabinDeck             8
DeckPosition          2
CabinSide             2
FamilyName         2406
NoRelatives          20
FamilySizeCat         4
dtype: int64

In [161]:
delete_columns = ['Cabin', 'PassengerId', 'Name', 'FamilyName', 'PassengerGroup']
df_full_2 = df_full_2.drop(delete_columns, axis=1)
df_full_2.shape

(12970, 15)

In [162]:
from sklearn.preprocessing import StandardScaler

df_full_3 = df_full_2.copy()
column = df_full_3.select_dtypes(include=np.number).columns
scaler = StandardScaler()
scaled = scaler.fit_transform(df_full_2[column])
df_full_3.loc[:, column] = scaled

In [163]:
df_full_3.select_dtypes(include=['object', 'category']).columns

Index(['HomePlanet', 'Destination', 'CabinDeck', 'DeckPosition', 'CabinSide',
       'FamilySizeCat'],
      dtype='object')

In [164]:
df_full_3.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,DeckPosition,CabinSide,NoRelatives,FamilySizeCat
0,Europa,False,TRAPPIST-1e,0.714593,False,-0.346419,-0.287142,-0.29737,-0.27488,-0.261156,B,Lower,P,-0.249099,3 - 5
1,Earth,False,TRAPPIST-1e,-0.333064,False,-0.177029,-0.281426,-0.254628,0.212625,-0.223642,F,Higher,S,-0.226446,3 - 5
2,Europa,False,TRAPPIST-1e,2.041626,True,-0.279595,1.983971,-0.29737,5.687956,-0.219379,A,Lower,S,-0.158487,6 - 10
3,Europa,False,TRAPPIST-1e,0.29553,False,-0.346419,0.52769,0.336922,2.68123,-0.096608,A,Lower,S,-0.158487,6 - 10
4,Earth,False,TRAPPIST-1e,-0.891815,False,0.124454,-0.242685,-0.039208,0.226833,-0.259451,F,Higher,S,-0.113181,6 - 10


In [165]:
from sklearn import preprocessing
encoded = df_full_3.copy()

columns=['HomePlanet', 'Destination', 'CabinDeck', 'DeckPosition', 'CabinSide', 'FamilySizeCat']

for col in columns:

    # Initialize a LabelEncoder object
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(encoded[col].unique())
    encoded[col] = label_encoder.transform(encoded[col])
    print(f"{col}: {encoded[col].unique()}")

encoded

HomePlanet: [1 0 2]
Destination: [2 1 0]
CabinDeck: [1 5 0 6 4 3 2 7]
DeckPosition: [1 0]
CabinSide: [0 1]
FamilySizeCat: [2 3 1 0]


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,CabinDeck,DeckPosition,CabinSide,NoRelatives,FamilySizeCat
0,1,False,2,0.714593,False,-0.346419,-0.287142,-0.297370,-0.274880,-0.261156,1,1,0,-0.249099,2
1,0,False,2,-0.333064,False,-0.177029,-0.281426,-0.254628,0.212625,-0.223642,5,0,1,-0.226446,2
2,1,False,2,2.041626,True,-0.279595,1.983971,-0.297370,5.687956,-0.219379,0,1,1,-0.158487,3
3,1,False,2,0.295530,False,-0.346419,0.527690,0.336922,2.681230,-0.096608,0,1,1,-0.158487,3
4,0,False,2,-0.891815,False,0.124454,-0.242685,-0.039208,0.226833,-0.259451,5,0,1,-0.113181,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,0,True,2,0.365374,False,-0.346419,-0.287142,-0.297370,-0.274880,-0.261156,6,0,1,-0.090528,3
12966,0,False,2,0.924125,False,-0.346419,0.250787,-0.268305,-0.266000,-0.138384,6,0,0,-0.181140,3
12967,2,True,0,0.947406,False,-0.346419,-0.287142,-0.297370,-0.274880,-0.261156,3,1,0,-0.226446,2
12968,1,False,2,0.970687,False,-0.346419,1.414922,-0.297370,-0.274880,0.184744,3,1,0,-0.158487,3


In [167]:
data_4 = encoded.copy()

train_data_final = data_4.loc[:df_train.index.max(), :].copy()
test_data_final = data_4.loc[df_train.index.max()+1:, :].reset_index(drop=True).copy()

print(train_data_final.shape)
print(test_data_final.shape)

(8693, 15)
(4277, 15)


In [168]:
X = train_data_final.copy()
y = target.astype(int)

### Cross Validation

In [169]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

models = []
models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', GaussianNB()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('SVM', SVC()))

results = []
names = []

In [170]:
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

KNN: 0.769928 (0.013240)
NB: 0.707006 (0.011284)
DT: 0.727712 (0.011623)
SVM: 0.787297 (0.012041)


Dari hasil K-Fold Cross Validation di atas, dapat disimpulkan bahwa model dengan tingkat akurasi terbaik adalah SVM, sehingga pada proses training dan testing akan menggunakan model SVM.

### Training the Model

In [171]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [172]:
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

In [173]:
prediction = svm_classifier.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, prediction)}")

Accuracy: 0.7887269938650306


### Evaluasi Model

In [174]:
from sklearn.metrics import classification_report

print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79      1291
           1       0.81      0.76      0.78      1317

    accuracy                           0.79      2608
   macro avg       0.79      0.79      0.79      2608
weighted avg       0.79      0.79      0.79      2608



### Submission

In [178]:
prediction = svm_classifier.predict(test_data_final)
prediction

array([1, 0, 1, ..., 1, 1, 1])

In [180]:
pred_dict = {'PassengerId': df_test['PassengerId'], 'Transported': prediction}
pred_df = pd.DataFrame(pred_dict)

pred_df['Transported'] = pred_df['Transported'].map({1: True, 0: False})
pred_df.to_csv('submission_nontun_inter.csv', index=False)