In [0]:
!pip install -q sklearn

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf

In [0]:
# Titanic CSVs from the public tf-datasets bucket; each read goes over the network.
train_csv_url = 'https://storage.googleapis.com/tf-datasets/titanic/train.csv'
eval_csv_url = 'https://storage.googleapis.com/tf-datasets/titanic/eval.csv'

dftrain = pd.read_csv(train_csv_url)  # training data
dfeval = pd.read_csv(eval_csv_url)  # evaluation (testing) data

In [0]:
# Preview the first few rows of the evaluation set.
dfeval.head()

In [0]:
# Separate the label from the features: pop() removes the 'survived' column
# from each frame in place and returns it, so the frames keep only the inputs.
# NOTE: because pop() mutates the frames, re-running this cell without
# reloading the CSVs fails (the column is already gone).
y_train, y_eval = dftrain.pop('survived'), dfeval.pop('survived')

dftrain.head()

In [0]:
dftrain.describe() # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the training frame.

In [22]:
dftrain.shape # (number of rows, number of columns) of the training frame

(627, 9)

In [0]:
# Distribution of passenger ages in the training set, in 20 bins.
dftrain['age'].hist(bins=20)

In [0]:
# Count passengers per sex once, then show the numbers and a horizontal bar chart.
sex_counts = dftrain['sex'].value_counts()
print(sex_counts)
sex_counts.plot(kind='barh')

In [0]:
# Count passengers per ticket class once, then show the numbers and a horizontal bar chart.
class_counts = dftrain['class'].value_counts()
print(class_counts)
class_counts.plot(kind='barh')

In [0]:
# Re-attach the label to the features and take the mean of 'survived' within each sex.
# NOTE(review): the mean is presumably a fraction in [0, 1] rather than a percentage,
# even though the axis label below says '%' — confirm the label encoding.
survival_by_sex = pd.concat([dftrain, y_train], axis=1).groupby('sex').survived.mean()
print(survival_by_sex)
survival_by_sex.plot(kind='barh').set_xlabel('% of survival')

We use the plots above to understand our dataset.
Summary:
The majority of the passengers are in their mid-20s.
There are more male passengers than female passengers.
The majority of the passengers are in third class.
Females have a much higher survival rate than males.


In [0]:
# (number of rows, number of columns) of the evaluation frame.
dfeval.shape

In [0]:
# Categorical columns - non-numeric columns (or columns holding some kind of category) whose values will be mapped to integer IDs.
# Numeric columns - columns whose values are used directly as numbers (here 'age' and 'fare').

In [0]:
# List the column names of the training frame.
dftrain.columns

In [0]:
# Columns treated as categories: each distinct value becomes a vocabulary entry.
CATEGORICAL_COLUMNS = [
    'sex',
    'n_siblings_spouses',
    'parch',
    'class',
    'deck',
    'embark_town',
    'alone',
]

# Columns fed to the model as plain numeric values.
NUMERIC_COLUMNS = ['age', 'fare']

In [0]:
# Describe every input column to TensorFlow via the feature_column module.

# Categorical columns: the vocabulary is the set of distinct values seen in the
# training frame for that column.
categorical_feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, dftrain[column_name].unique())
    for column_name in CATEGORICAL_COLUMNS
]

# Numeric columns: passed through as float32 values.
numeric_feature_columns = [
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in NUMERIC_COLUMNS
]

# Same ordering as before: all categorical columns first, then the numeric ones.
feature_columns = categorical_feature_columns + numeric_feature_columns

In [0]:
# Inspect the feature column definitions held in `feature_columns`.
print(feature_columns)

In [0]:
# Preparing our input data for the model

def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  """Create a zero-argument function that builds the tf.data input pipeline.

  Args:
    data_df: Feature table; converted with dict() into a column-name -> values mapping.
    label_df: Labels, sliced row by row alongside the features.
    num_epochs: Number of times the batched dataset is repeated.
    shuffle: When true, the rows are shuffled (buffer of 1000) before batching.
    batch_size: Number of rows per batch.

  Returns:
    A callable taking no arguments that returns the configured tf.data.Dataset.
  """
  def input_function():
    # Pair each row's features with its label.
    features = dict(data_df)
    dataset = tf.data.Dataset.from_tensor_slices((features, label_df))
    if shuffle:
      dataset = dataset.shuffle(1000)  # randomize row order
    # Group rows into batches first, then repeat the batched stream once per epoch.
    batched = dataset.batch(batch_size)
    return batched.repeat(num_epochs)
  return input_function

# Build the two input functions; the estimator calls them later to obtain the datasets.
# Training uses the defaults (shuffled, 10 epochs).
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Evaluation is a single, unshuffled pass so predictions stay in row order.
eval_input_fn = make_input_fn(data_df=dfeval, label_df=y_eval, num_epochs=1, shuffle=False)

In [0]:
# Build the linear classifier over the feature columns, fit it on the training
# pipeline, then score it on the held-out evaluation pipeline.
# NOTE(review): tf.estimator is deprecated in recent TensorFlow releases — confirm
# the TensorFlow version in use still provides it.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
linear_est.train(input_fn=train_input_fn)
result = linear_est.evaluate(input_fn=eval_input_fn)

clear_output()  # clear the output produced so far in this cell
print(result['accuracy'])

In [0]:
# Show every evaluation metric returned by evaluate(), not just the accuracy.
print(result)

In [0]:
# Predict on the evaluation set. predict() yields one dict per passenger; the
# order matches dfeval because eval_input_fn was built with shuffle=False.
# A dedicated name is used so the evaluation metrics held in `result` are not
# overwritten by a list of predictions.
pred_dicts = list(linear_est.predict(eval_input_fn))
print(pred_dicts[0])  # show one prediction rather than dumping the whole list

# Compare the first passenger's features and true label with the model's output.
# iloc (positional) is used so the row always lines up with pred_dicts[0],
# regardless of the frame's index labels.
print(dfeval.iloc[0])
print(y_eval.iloc[0])
print(pred_dicts[0]['probabilities'][1])  # probability assigned to class 1 (survived)


# As per the result, the model gave this passenger roughly a 3% chance of survival (the exact value varies between training runs, since shuffling is unseeded).