In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd

# Preprocessing

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
titanic_dataset = pd.read_csv("titanic/train.csv")
titanic_test = pd.read_csv("titanic/test.csv")

In [5]:
titanic_dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
titanic_dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
titanic_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
titanic_dataset.corr()["Survived"].sort_values

<bound method Series.sort_values of PassengerId   -0.005007
Survived       1.000000
Pclass        -0.338481
Age           -0.077221
SibSp         -0.035322
Parch          0.081629
Fare           0.257307
Name: Survived, dtype: float64>

In [9]:
X_all = titanic_dataset.drop("Survived", axis=1)
y = titanic_dataset["Survived"]

In [10]:
num_features = ["Age", "Fare"]

num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [11]:
# type(X_all)
# X_copy = X_all.copy()
# X_copy['Cabin'] = X_copy['Cabin'].map(lambda a: a if isinstance(a, float) else a[0])
# X_copy['Cabin'].unique()

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

# class GetTitle(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X, y=None):
#         # print(f"{X} in Title")
#         X['Name'] = X['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
#         return X

class GetCabin(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # print(f"{X} in Title")
        X['Cabin'] = X['Cabin'].map(lambda a: a if isinstance(a, float) else a[0])
        return X

class GetFamily(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # print(f"{X} in Title")
        X["Family"] = X["SibSp"].values + X['Parch'].values
        X["Family"] = X["Family"].map(lambda a: int(bool(a)))
        X.pop("SibSp")
        X.pop("Parch")
        return X

In [13]:
# X_all["SibSp"].values + X_all['Parch'].values

In [14]:
# tempobj = GetTitle()
# tempx = tempobj.transform(X_all)
# tempx

In [15]:
# cat_features = ["Pclass", "Sex", "Name", "Cabin", "Embarked", "SibSp", "Parch"]
# cat_features = ["Pclass", "Sex", "Embarked", "SibSp", "Parch"]
cat_features = ["Pclass", "Sex", "Cabin", "Embarked", "SibSp", "Parch"]


cat_pipeline = Pipeline(steps=[
    # ('title', GetTitle()),
    ('cabin', GetCabin()),
    ('familyp', GetFamily()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown= 'ignore')),
])

In [16]:
cat_pipeline

Pipeline(steps=[('cabin', GetCabin()), ('familyp', GetFamily()),
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [17]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)])

X = preprocessor.fit_transform(X_all)
X_test = preprocessor.transform(titanic_test)

In [18]:
X

array([[-0.56573646, -0.50244517,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 0.66386103,  0.78684529,  1.        , ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833709, -0.48885426,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [-0.1046374 , -0.17626324,  0.        , ...,  1.        ,
         0.        ,  1.        ],
       [-0.25833709, -0.04438104,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.20276197, -0.49237783,  0.        , ...,  0.        ,
         1.        ,  0.        ]])

In [19]:
X_test.shape

(418, 20)

In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In [21]:
X_train.shape, y_train.shape

((668, 20), (668,))

# Functional API

In [22]:
# input_1 = keras.layers.Input(shape=X_train.shape[1:])
# hidden_1 = keras.layers.Dense(30, activation="relu")(input_1)
# hidden_2 = keras.layers.Dense(30, activation="relu")(hidden_1)
# hidden_3 = keras.layers.Dense(30, activation="relu")(hidden_2)
# output_1 = keras.layers.Dense(1, activation="sigmoid")(hidden_3)

In [23]:
# model = keras.Model(input_1, output_1)

In [24]:
# model.summary()

In [25]:
# model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3), metrics=['acc'])

In [26]:
# tf.keras.backend.get_value(model.optimizer.learning_rate)

In [27]:
# class CustomCallback (keras.callbacks.Callback):
#     def __init__ (self, step):
#         self.step = step
#         self.lrhistory = []
#         self.losshistory = [] 
#     def on_batch_end (self, batch, logs):
#         self.lrhistory.append(tf.keras.backend.get_value(model.optimizer.learning_rate))
#         tf.keras.backend.set_value(self.model.optimizer.learning_rate, self.model.optimizer.learning_rate * self.step)
#         self.losshistory.append(logs["loss"])

In [28]:
# lrfinder_callback = CustomCallback(1 + 1e-2)

In [29]:

checkpoint_callback = keras.callbacks.ModelCheckpoint("best.h5", save_best_only=True)
earlystopping_callback = keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
# tensorboard_callback = keras.callbacks.TensorBoard(log_dir="./logs")


In [30]:
# history = model.fit(X_train, y_train, epochs=1, validation_data=(X_valid, y_valid),
#     callbacks=[lrfinder_callback])

In [31]:
# import matplotlib.pyplot as plt

# plt.plot(lrfinder_callback.lrhistory, lrfinder_callback.losshistory)

In [32]:

# lrfinder_callback.lrhistory[np.argmin(lrfinder_callback.losshistory)]

In [33]:
# model.history.params

In [34]:
# model_best = keras.models.load_model("best.h5")

In [35]:
# preds = model.predict(X_test)

# Keras Wrapper

In [36]:
X_train.shape[1:]

(20,)

In [37]:
def model_builder (numHidden, numNeurons, learningRate):
    model = keras.Sequential()
    model.add(keras.layers.Input(shape=X_train.shape[1:]))
    for layer in range(numHidden):
        model.add(keras.layers.Dense(numNeurons, activation="relu"))
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    optimizer = keras.optimizers.SGD(learning_rate=learningRate)
    model.compile(loss="mse", optimizer=optimizer, metrics=['acc'])
    return model

In [38]:
keras_wrapper = keras.wrappers.scikit_learn.KerasRegressor(model_builder)

In [39]:
# keras_wrapper.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid))

In [40]:
# keras_wrapper.score(X_valid, y_valid)
# keras_wrapper.predict(X_test)

In [41]:
np.arange(start=3e-4, stop=3e-2, step=1e-3).shape

(30,)

In [42]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    "numHidden": np.arange(5, 15),
    "numNeurons": np.arange(100, 200),
    "learningRate": np.arange(start=3e-4, stop=3e-1, step=1e-3),
}

rnd_search = RandomizedSearchCV(keras_wrapper, params, n_iter=15, cv=3)
rnd_search.fit(X_train, y_train, epochs=120, validation_data=(X_valid, y_valid), callbacks=[earlystopping_callback])

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoc

RandomizedSearchCV(cv=3,
                   estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7fcaca92dbe0>,
                   n_iter=15,
                   param_distributions={'learningRate': array([0.0003, 0.0013, 0.0023, 0.0033, 0.0043, 0.0053, 0.0063, 0.0073,
       0.0083, 0.0093, 0.0103, 0.0113, 0.0123, 0.0133, 0.0143, 0.0153,
       0.0163, 0.0173, 0.0183, 0.0193, 0.0203, 0.0213, 0.0223, 0.0233,
       0.0243, 0.0253, 0.0263, 0.0273,...
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
       152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
       165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177,
       178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
       191, 192, 193, 194, 195, 196, 197, 198, 199])})

In [43]:
rnd_search.best_params_

{'numNeurons': 199, 'numHidden': 14, 'learningRate': 0.25830000000000003}

In [44]:
rnd_search.best_score_

-0.1320397506157557

In [45]:
best_model = model_builder(**rnd_search.best_params_)

In [54]:
import time

tensorboard_callback = keras.callbacks.TensorBoard(log_dir=time.strftime("./logs/%Y-%m-%d-%H-%M-%S"))
history = best_model.fit(X_train, y_train, epochs=120, validation_data=(X_valid, y_valid),
    callbacks=[checkpoint_callback, earlystopping_callback, tensorboard_callback])

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120


In [55]:
model_best = keras.models.load_model("best.h5")

In [56]:
preds = model_best.predict(X_test)

In [57]:
preds

array([[0.06646661],
       [0.09817545],
       [0.02220542],
       [0.09167654],
       [0.48291332],
       [0.10442694],
       [0.70645446],
       [0.06126449],
       [0.4914619 ],
       [0.0928059 ],
       [0.09025544],
       [0.36215493],
       [0.9870029 ],
       [0.01525775],
       [0.98162466],
       [0.9620891 ],
       [0.07059129],
       [0.29208758],
       [0.48447907],
       [0.1869697 ],
       [0.16679452],
       [0.5004552 ],
       [0.87238896],
       [0.44254124],
       [0.99890935],
       [0.01932523],
       [0.9966769 ],
       [0.2229904 ],
       [0.19733536],
       [0.42111862],
       [0.03075758],
       [0.0647126 ],
       [0.48439208],
       [0.47750872],
       [0.1929438 ],
       [0.35014918],
       [0.47421053],
       [0.4864833 ],
       [0.09254694],
       [0.48464713],
       [0.08165333],
       [0.46636033],
       [0.05895022],
       [0.8719626 ],
       [0.9912942 ],
       [0.08933196],
       [0.475985  ],
       [0.084

# Export to csv

In [58]:
import numpy as np

submission = pd.DataFrame({
    "PassengerId": titanic_test['PassengerId'],
    "Survived": np.round(preds).astype(int).reshape(preds.shape[0])
})

submission.to_csv(time.strftime("submission_%Y-%m-%d-%H-%M-%S"), index=False)
