# Cabin: predicting the deck letter for passengers with a missing Cabin

In [1]:
import jupyter_black
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Activate the jupyter_black extension for this notebook session.
jupyter_black.load()

In [3]:
# Shared encoder; it is fitted further down on the first character of each
# known Cabin value to turn that character into an integer class label.
le = LabelEncoder()

# Prepare

In [4]:
# Load the prepared passenger table, using PassengerId as the row index.
df = pd.read_csv("./datasets/prepared.csv", index_col="PassengerId")
df

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isTest,Name_FirstName,Name_Title,Name_LastName,FamilySize,isAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0.0,3,1,22.0,1,0,0,-0.503595,,2,0,29,6,150,2,0
2,1.0,1,0,38.0,1,0,12,0.734503,C85,0,0,61,7,104,2,0
3,1.0,3,0,26.0,0,0,21,-0.490544,,2,0,175,4,149,1,1
4,1.0,1,0,35.0,1,0,26,0.382925,C123,2,0,88,7,96,2,0
5,0.0,3,1,35.0,0,0,26,-0.488127,,2,0,5,6,182,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,,3,1,,0,0,2,-0.488127,,2,1,175,6,149,1,1
1306,,1,0,39.0,0,0,12,1.461829,C105,0,1,175,9,149,1,1
1307,,3,1,38.5,0,0,19,-0.503595,,2,1,175,6,149,1,1
1308,,3,1,,0,0,26,-0.488127,,2,1,220,6,72,1,1


In [5]:
# Column dtypes and non-null counts; Cabin is the sparsely populated column
# that the rest of this notebook predicts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 1 to 1309
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Survived        891 non-null    float64
 1   Pclass          1309 non-null   int64  
 2   Sex             1309 non-null   int64  
 3   Age             1046 non-null   float64
 4   SibSp           1309 non-null   int64  
 5   Parch           1309 non-null   int64  
 6   Ticket          1309 non-null   int64  
 7   Fare            1309 non-null   float64
 8   Cabin           295 non-null    object 
 9   Embarked        1309 non-null   int64  
 10  isTest          1309 non-null   int64  
 11  Name_FirstName  1309 non-null   int64  
 12  Name_Title      1309 non-null   int64  
 13  Name_LastName   1309 non-null   int64  
 14  FamilySize      1309 non-null   int64  
 15  isAlone         1309 non-null   int64  
dtypes: float64(3), int64(12), object(1)
memory usage: 173.9+ KB


In [6]:
# Ids of the passengers whose Cabin is missing, i.e. the rows to predict for.
# "test" here means "Cabin unknown"; it is not derived from the isTest column.
cabin_is_missing = df["Cabin"].isna().to_numpy()
test_ids = df.index[cabin_is_missing]
test_ids

Index([   1,    3,    5,    6,    8,    9,   10,   13,   14,   15,
       ...
       1295, 1298, 1300, 1301, 1302, 1304, 1305, 1307, 1308, 1309],
      dtype='int64', name='PassengerId', length=1014)

In [7]:
# Training target: the rows whose Cabin is known, reduced to the first
# character of the cabin string and label-encoded to integer class ids.
# A single .loc selection plus an explicit .copy() gives an independent frame,
# so the column assignment below does not write into a chained-indexing
# result (the pattern that triggers pandas' SettingWithCopyWarning).
y_train = df.loc[~df.index.isin(test_ids), ["Cabin"]].copy()
y_train["Cabin"] = le.fit_transform(y_train["Cabin"].str[0])
y_train

Unnamed: 0_level_0,Cabin
PassengerId,Unnamed: 1_level_1
2,2
4,2
7,4
11,6
12,2
...,...
1296,3
1297,3
1299,2
1303,2


In [8]:
# Number of training rows per encoded Cabin class, to see how balanced the
# target is before modelling.
y_train["Cabin"].value_counts()

Cabin
2    94
1    65
3    46
4    41
0    22
5    21
6     5
7     1
Name: count, dtype: int64

In [9]:
# Feature matrix: every column of df except the ones listed here
# (the Cabin target itself plus Survived, Age and isTest).
non_feature_columns = ["Survived", "Age", "Cabin", "isTest"]
X = df.drop(columns=non_feature_columns)
X

Unnamed: 0_level_0,Pclass,Sex,SibSp,Parch,Ticket,Fare,Embarked,Name_FirstName,Name_Title,Name_LastName,FamilySize,isAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,3,1,1,0,0,-0.503595,2,29,6,150,2,0
2,1,0,1,0,12,0.734503,0,61,7,104,2,0
3,3,0,0,0,21,-0.490544,2,175,4,149,1,1
4,1,0,1,0,26,0.382925,2,88,7,96,2,0
5,3,1,0,0,26,-0.488127,2,5,6,182,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,1,0,0,2,-0.488127,2,175,6,149,1,1
1306,1,0,0,0,12,1.461829,0,175,9,149,1,1
1307,3,1,0,0,19,-0.503595,2,175,6,149,1,1
1308,3,1,0,0,26,-0.488127,2,220,6,72,1,1


In [10]:
# Split the features by Cabin availability: rows with a known Cabin are used
# for training, rows listed in test_ids are the ones to predict.
needs_prediction = X.index.isin(test_ids)
X_test = X[needs_prediction]
X_train = X[~needs_prediction]
X_train.shape, X_test.shape

((295, 12), (1014, 12))

In [11]:
# Hold out 10% of the labelled rows (fixed seed) as X_true / y_true.
# NOTE(review): this rebinds X_train / y_train, so re-running this cell on its
# own splits the already-reduced training set again; re-run the cells that
# build X_train / y_train first.
# NOTE(review): no visible cell scores the model on X_true / y_true.
X_train, X_true, y_train, y_true = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)
X_train.shape, X_true.shape, y_train.shape, y_true.shape

((265, 12), (30, 12), (265, 1), (30, 1))

# Train

In [12]:
# Multi-class CatBoost classifier; only the loss function is set explicitly,
# every other hyper-parameter is left at the library default.
model = CatBoostClassifier(loss_function="MultiClass")

In [13]:
# Fit on the training split only, with per-iteration logging silenced and the
# interactive training plot enabled.
train_pool = Pool(data=X_train, label=y_train)
model.fit(train_pool, verbose=False, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f82f5f68a10>

# Predict

In [14]:
# Predict an encoded Cabin class for every row with a missing Cabin and
# flatten the result to a 1-D array.
raw_preds = model.predict(X_test)
y_preds = raw_preds.reshape(-1)
y_preds

array([5, 4, 5, ..., 5, 5, 5])

In [15]:
# Wrap the predictions in a one-column frame that shares X_test's
# PassengerId index, so it lines up with the labelled rows.
y_test = pd.DataFrame({"Cabin": y_preds}, index=X_test.index)
y_test

Unnamed: 0_level_0,Cabin
PassengerId,Unnamed: 1_level_1
1,5
3,4
5,5
6,5
8,5
...,...
1304,4
1305,5
1307,5
1308,5


In [16]:
# Stack the known labels (training and hold-out parts) with the predicted
# ones, then restore PassengerId order.
cabin_parts = [y_train, y_true, y_test]
df1 = pd.concat(cabin_parts).sort_index()
df1

Unnamed: 0_level_0,Cabin
PassengerId,Unnamed: 1_level_1
1,5
2,2
3,4
4,2
5,5
...,...
1305,5
1306,2
1307,5
1308,5


In [17]:
# Flag which Cabin values came from the model (1) rather than the data (0).
was_predicted = df1.index.isin(test_ids)
df1["isCabinPreds"] = was_predicted.astype(int)
df1

Unnamed: 0_level_0,Cabin,isCabinPreds
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5,1
2,2,0
3,4,1
4,2,0
5,5,1
...,...,...
1305,5,1
1306,2,0
1307,5,1
1308,5,1


# Save

In [19]:
# Persist the encoded Cabin class and the isCabinPreds flag, indexed by
# PassengerId. The Cabin column holds LabelEncoder integers, not deck letters.
df1.to_csv("./datasets/prepared_cabin.csv")