In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

# System
import math
import os
import sys

# Data
import numpy as np
import pandas as pd

# Plot
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output

# Util
from six.moves import urllib
from sklearn.model_selection import train_test_split

# MachineLearning
try:
  # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers

In [None]:
# Request eager execution through the TF1 compatibility API.
tf.compat.v1.enable_eager_execution()

# Notebook-wide pandas display settings: floats with two decimals, and at
# most 15 rows shown before a frame is truncated.
pd.set_option("display.float_format", '{:.2f}'.format)
pd.set_option("display.max_rows", 15)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training split and shuffle its rows.
# The shuffle is seeded so that Restart & Run All reproduces the same row
# order (the original used an unseeded np.random.permutation). As before,
# the original index labels are kept, only their order changes.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data = train_data.sample(frac=1, random_state=42)

In [None]:
# Print the column summary, then show the first rows as the cell output.
train_data.info()
train_data.head(n=5)

In [None]:
# Load the unlabeled competition split and print its column summary.
test_data = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")
test_data.info()

In [None]:
# Column groups used by the exploratory cells below.
label = "Survived"
categorical_columns = [
    "Pclass",
    "Sex",
    "Embarked",
]
numeric_columns = [
    "PassengerId",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
embedding_columns = [
    "Name",
    "Ticket",
    "Cabin",
]

In [None]:
# Filling NaNs usually requires more than a constant 0 or a global median.
# Here the rows are split into groups by the categorical features Sex and
# Pclass, and every missing Age is replaced by the median Age of its group
# (rounded to the nearest 0.5). Each frame is imputed from its own rows.

for dataset in [train_data, test_data]:
    # Fallback for a group in which no age is known at all; computed before
    # any imputation so that filled-in values do not influence it.
    overall_median_age = dataset['Age'].median()

    for sex in ['male', 'female']:
        for pclass in [1, 2, 3]:
            in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass)

            # Series.median() skips NaN; it is NaN only if the group has no known age.
            age_guess = dataset.loc[in_group, 'Age'].median()
            if pd.isna(age_guess):
                # int(nan) below would raise, so use the frame-wide median instead.
                age_guess = overall_median_age

            # Round the group median to the nearest .5 before assigning it.
            dataset.loc[in_group & dataset['Age'].isnull(), 'Age'] = int(age_guess / 0.5 + 0.5) * 0.5

    # Ages are stored as whole years from here on (fractional part truncated).
    dataset['Age'] = dataset['Age'].astype(int)

In [None]:
# Force the numeric columns to numeric dtype; unparsable entries become NaN.
for feature_name in numeric_columns:
    train_data[feature_name] = pd.to_numeric(train_data[feature_name], errors='coerce')

mean_age = np.mean(train_data["Age"])

# Fill the missing text values exactly as is done for test_data. Every column
# of the training frame is later passed to tf.data via dict(dftrain), and an
# object column that mixes strings with float NaN cannot be converted to a
# string tensor.
train_data["Cabin"] = train_data["Cabin"].fillna("")
train_data["Embarked"] = train_data["Embarked"].fillna("")

In [None]:
mean_age = np.mean(test_data["Age"])

# Replace missing text values with the empty string.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and leaves the
# frame unchanged when pandas copy-on-write is active.
test_data["Cabin"] = test_data["Cabin"].fillna("")
test_data["Embarked"] = test_data["Embarked"].fillna("")

In [None]:
# Pearson correlation between the numeric columns and the label, as a heatmap.
colormap = plt.cm.RdBu
pearson_corr = train_data[numeric_columns + [label]].astype(float).corr()

plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    pearson_corr,
    annot=True,
    cmap=colormap,
    linecolor='white',
    linewidths=0.1,
    square=True,
    vmax=1.0,
)

In [None]:
# Single-feature analysis: numeric feature vs. label.
# One 20-bin histogram of Fare per value of Survived.
g = sns.FacetGrid(data=train_data, col='Survived')
g.map(plt.hist, 'Fare', bins=20)

In [None]:
# Bar plot of the mean of Survived for each sex (female first).
sns.set(style="darkgrid")
pal = {'male': "green", 'female': "Pink"}

plt.subplots(figsize=(15, 8))
ax = sns.barplot(
    data=train_data,
    x="Sex",
    y="Survived",
    order=['female', 'male'],
    palette=pal,
    linewidth=5,
    capsize=.05,
)

plt.title("Survived/Non-Survived Passenger Gender Distribution", fontsize=25, loc='center', pad=40)
plt.ylabel("% of passenger survived", fontsize=15)
plt.xlabel("Sex", fontsize=15);

In [None]:
# Single-feature analysis: kernel density plot of a numerical feature.
# One shaded Pclass density curve per outcome, drawn on the same axes.
fig = plt.figure(figsize=(15, 8))

curve_specs = [
    (0, 'gray', 'not survived'),
    (1, 'g', 'survived'),
]
for survived_value, curve_color, curve_label in curve_specs:
    ax = sns.kdeplot(
        train_data.loc[train_data['Survived'] == survived_value, 'Pclass'],
        color=curve_color,
        shade=True,
        label=curve_label,
    )

plt.title('Passenger Class Distribution - Survived vs Non-Survived', fontsize=25, pad=40)
plt.ylabel("Frequency of Passenger Survived", fontsize=15, labelpad=20)
plt.xlabel("Passenger Class", fontsize=15, labelpad=20)

# Show the class codes on the x axis as words.
labels = ['Upper', 'Middle', 'Lower']
plt.xticks(sorted(train_data.Pclass.unique()), labels);

In [None]:
# Survival rate per passenger class, one line per sex.
# - `height` is the current name of FacetGrid's former `size` argument.
# - `order` is passed explicitly: FacetGrid draws each hue subset in a
#   separate pointplot call, and without a fixed order the category positions
#   are not guaranteed to agree between the calls.
# - the stray `bins=20` was dropped; it is a histogram option, not a
#   pointplot one.
grid = sns.FacetGrid(train_data, hue='Sex', height=5.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', alpha=.5, order=[1, 2, 3])
grid.add_legend();

In [None]:
# Objective: find features that push the points towards the ends of the
# Y axis, i.e. survival rate < 0.25 or > 0.75.
#
# One facet per port of embarkation; within a facet, survival rate per class
# with one line per sex. `order` and `hue_order` are fixed so that every
# facet uses the same class positions and the same colour per sex as the
# shared legend. `height` is the current name of the former `size` argument.
grid = sns.FacetGrid(train_data, col='Embarked', height=2.2, aspect=1.6)
grid.map(
    sns.pointplot,
    'Pclass',
    'Survived',
    'Sex',
    palette='deep',
    order=[1, 2, 3],
    hue_order=['male', 'female'],
)
grid.add_legend()

In [None]:
# Fare vs. Age scatter, one facet per sex, coloured by outcome.
# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(train_data, height=5, hue="Survived", col="Sex", margin_titles=True)
g.map(plt.scatter, "Fare", "Age", edgecolor="w").add_legend()
g.fig.suptitle("Survived by Sex, Fare and Age", size=25)
# Leave room above the facets for the figure title.
plt.subplots_adjust(top=0.85)

In [None]:
# Absolute Pearson correlation of every numeric column with Survived,
# strongest first. Only numeric columns are passed to corr(): on recent
# pandas, calling corr() on a frame that still contains string columns
# (Name, Sex, Ticket, ...) raises instead of silently skipping them.
numeric_train_data = train_data.select_dtypes(include="number")
pd.DataFrame(numeric_train_data.corr()['Survived'].abs().sort_values(ascending=False))

-------------------------------------------------------------**FEATURE COLUMN MANAGEMENT**-----------------------------------------------------------------

In [None]:
# Columns used by the modelling cells.
LABEL = 'Survived'
CATEGORICAL_COLUMNS = ['Sex', 'Pclass', 'SibSp', 'Parch']
NUMERIC_COLUMNS = ['Age']

# Hold out 30% of the labelled rows for evaluation. The split is seeded so
# that a re-run reproduces the same partition.
dftrain, dfeval = train_test_split(train_data, test_size=0.3, random_state=42)

# Later cells add columns to both frames; work on explicit copies so those
# assignments never act on (or warn about) a slice of train_data.
dftrain = dftrain.copy()
dfeval = dfeval.copy()

# Separate the label from the features.
ytrain = dftrain.pop(LABEL)
yeval = dfeval.pop(LABEL)


In [None]:
# Show the first feature rows AND the first labels. Only the last expression
# of a cell is rendered automatically, so the first one is displayed explicitly.
from IPython.display import display

display(dftrain.head())
ytrain.head()

In [None]:
from IPython.display import display


def nans(df):
    """Return the rows of `df` that contain at least one missing value."""
    return df[df.isnull().any(axis=1)]


# Rows of the training split with a missing Fare. Displayed explicitly,
# because only the last expression of a cell is rendered automatically.
display(nans(dftrain[["Fare"]]))

dftrain["Fare"].describe()

In [None]:
# Log-transformed fare, log(1 + Fare).
dftrain['LogFare'] = np.log1p(dftrain['Fare'])
dfeval['LogFare'] = np.log1p(dfeval['Fare'])

# The model consumes a 'LogFare' feature column, so the frame used for the
# final predictions needs the column too; without it the prediction input
# lacks a required feature. A missing test Fare is replaced by the training
# median first, so it cannot turn into a NaN model input.
test_data['LogFare'] = np.log1p(test_data['Fare'].fillna(dftrain['Fare'].median()))

In [None]:
# Distribution of every numeric model feature in the training split
# (150-bin histogram per column).
for numeric_name in NUMERIC_COLUMNS:
    numeric_series = dftrain[numeric_name]
    numeric_series.hist(bins=150)

In [None]:
# Feature columns handed to the models, collected by kind.
feature_columns = []
categorical_feature_columns = []
crossed_feature_columns = []
numeric_feature_columns = []


def _vocabulary_column(name):
    """Return a categorical column for `name` whose vocabulary is the set of
    distinct values that column takes in `dftrain`."""
    return tf.feature_column.categorical_column_with_vocabulary_list(
        name,
        dftrain[name].unique()
    )


# One indicator (one-hot) column per categorical feature.
for feature_name in CATEGORICAL_COLUMNS:
    categorical_feature_columns.append(
        feature_column.indicator_column(_vocabulary_column(feature_name))
    )

# Log-transformed fare as a plain numeric input.
numeric_feature_columns.append(
    tf.feature_column.numeric_column('LogFare', dtype=tf.float32)
)

# SibSp x Parch cross. Each side is built from its OWN vocabulary; the
# previous code reused the variable left over from the loop above, which
# gave SibSp the vocabulary of Parch.
crossed_feature = feature_column.crossed_column(
    [_vocabulary_column('SibSp'), _vocabulary_column('Parch')],
    hash_bucket_size=15
)
crossed_feature = feature_column.indicator_column(crossed_feature)
crossed_feature_columns.append(crossed_feature)

# Age split into 5-year buckets. The column is named explicitly: the previous
# code used the leftover loop variable, so the buckets were computed on
# 'Parch'. The raw age is bucketized without normalisation, because the
# boundaries below are expressed in years.
age_buckets = feature_column.bucketized_column(
    tf.feature_column.numeric_column('Age', dtype=tf.float32),
    boundaries=[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70]
)

# Final feature set: age buckets, the first two categorical columns
# (Sex and Pclass), the SibSp x Parch cross and LogFare.
feature_columns.append(age_buckets)
feature_columns.extend(categorical_feature_columns[:2])
feature_columns.extend(crossed_feature_columns)
feature_columns.extend(numeric_feature_columns)
print(len(feature_columns))

In [None]:
# Keras input layer that turns a batch of raw features into one dense tensor
# according to the feature columns defined above.
feature_layer = layers.DenseFeatures(feature_columns=feature_columns)

In [None]:
# Training configuration.
BATCH_SIZE = 10              # rows per batch
NUM_EPOCHS = 50              # times the training dataset is repeated
NUM_TRAINING_STEPS = 10000

# Number of rows in the training split.
print(dftrain.shape[0])

In [None]:
# InputFunctors
def df_to_dataset(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    """Build a tf.data.Dataset of (features, label) batches.

    Args:
        data_df: feature frame; converted with dict() so each column becomes
            one named feature.
        label_df: labels, sliced in step with the rows of `data_df`.
        num_epochs: passed to `repeat()` after batching.
        shuffle: when true, shuffle the rows (buffer of 1000) before batching.
        batch_size: number of rows per batch.

    Returns:
        The batched, repeated dataset.
    """
    features_and_labels = (dict(data_df), label_df)
    dataset = tf.data.Dataset.from_tensor_slices(features_and_labels)
    # Dataset transformations return a new object, hence the re-assignment.
    if shuffle:
        dataset = dataset.shuffle(1000)
    return dataset.batch(batch_size).repeat(num_epochs)

def testdf_to_dataset(data_df, num_epochs=10, shuffle=True, batch_size=32):
    """Build a tf.data.Dataset of feature-only batches (no labels).

    Args:
        data_df: feature frame; converted with dict() so each column becomes
            one named feature.
        num_epochs: passed to `repeat()` after batching.
        shuffle: when true, shuffle the rows (buffer of 1000) before batching.
        batch_size: number of rows per batch.

    Returns:
        The batched, repeated dataset.
    """
    rows = tf.data.Dataset.from_tensor_slices(dict(data_df))
    # Dataset transformations return a new object rather than mutating `rows`.
    maybe_shuffled = rows.shuffle(1000) if shuffle else rows
    return maybe_shuffled.batch(batch_size).repeat(num_epochs)

# Datasets for training, evaluation and the final predictions. Only the
# training data is shuffled and repeated; the other two are a single ordered pass.
train_ds = df_to_dataset(
    dftrain, ytrain, num_epochs=NUM_EPOCHS, shuffle=True, batch_size=BATCH_SIZE
)
eval_ds = df_to_dataset(
    dfeval, yeval, num_epochs=1, shuffle=False, batch_size=BATCH_SIZE
)
test_ds = testdf_to_dataset(
    test_data, num_epochs=1, shuffle=False, batch_size=BATCH_SIZE
)

# Small throw-away dataset used to look at a single batch.
ds = df_to_dataset(dftrain, ytrain, batch_size=5)
for feature_batch, label_batch in ds.take(1):
    feature_keys = list(feature_batch.keys())
    print('Length of batch: ', len(label_batch))
    print('Number of features: ', len(feature_batch))
    print('Some feature keys:', feature_keys)
    print('A batch of Labels:', label_batch.numpy())
    print()

# Inspect what the model actually receives as input.
def demo(example_batch, feature_column):
    """Print the dense tensor that a DenseFeatures layer built from
    `feature_column` produces for `example_batch`."""
    dense_output = layers.DenseFeatures(feature_column)(example_batch)
    print(dense_output.numpy())

# Feature part of the first (features, label) batch of the inspection dataset.
first_features, _first_labels = next(iter(ds))
example_batch = first_features
demo(example_batch, feature_columns)

In [None]:
# Summary statistics of the object-dtype (text) columns of the training split.
dftrain.describe(include=[object])

In [None]:
# Small feed-forward classifier: feature layer -> 6 sigmoid units -> 1 sigmoid output.
model = tf.keras.Sequential()
model.add(feature_layer)
model.add(layers.Dense(6, activation='sigmoid'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# NOTE(review): train_ds is already repeated NUM_EPOCHS times, so each of the
# 5 Keras epochs below iterates over that repeated dataset — confirm this is intended.
model.fit(
    train_ds,
    epochs=5,
    validation_data=eval_ds,
)


# loss, accuracy = model.evaluate(test_ds)
# print("Accuracy", accuracy)

In [None]:
def make_input_fn(data_df, label_df, n_epochs=None, shuffle=True, batch_size=32):
    """Return a zero-argument callable that builds the dataset via
    `df_to_dataset` with the given settings each time it is called."""
    return lambda: df_to_dataset(data_df, label_df, n_epochs, shuffle, batch_size)

# Input functions for the estimator: shuffled/repeated for training,
# a single ordered pass for evaluation.
train_input_fn = make_input_fn(
    dftrain, ytrain, n_epochs=NUM_EPOCHS, shuffle=True, batch_size=BATCH_SIZE
)
eval_input_fn = make_input_fn(
    dfeval, yeval, n_epochs=1, shuffle=False, batch_size=BATCH_SIZE
)

In [None]:
# Gradient-boosted trees on the same feature columns.
n_batches = 5
est = tf.estimator.BoostedTreesClassifier(
    feature_columns=feature_columns,
    n_batches_per_layer=n_batches,
)

# The model will stop training once the specified number of trees is built,
# not based on the number of steps.
est.train(input_fn=train_input_fn, max_steps=100)

# Evaluate on the held-out split, clear the training log output and print the metrics.
result = est.evaluate(input_fn=eval_input_fn)
clear_output()
print(pd.Series(result))

In [None]:
# Threshold the Keras model's outputs at 0.5 to get hard 0/1 labels,
# one per row of test_ds, in order.
predicted_rows = model.predict(test_ds)
predictions = []
for predicted_row in predicted_rows:
    if predicted_row >= 0.5:
        predictions.append(1)
    else:
        predictions.append(0)

In [None]:
# Assemble the submission (passenger id + predicted label) and write it as
# CSV, without the index column, to the Kaggle working directory.
output = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions,
    }
)

output.to_csv("/kaggle/working/predictions_3fcross_83.58.csv", index=False)