In [16]:
!pip install -q sklearn

In [17]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf

In [18]:
# Example: predict how likely a passenger was to survive the Titanic, given this dataset.

# LOAD DATA
# Training split: read the features, then split the 'survived' label column off into its own Series.
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
y_train = dftrain.pop('survived')

# Evaluation (testing) split, handled the same way.
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
y_eval = dfeval.pop('survived')

In [19]:
# The eval (testing) split has fewer rows than the training split.
# Only the last expression of a cell is displayed, so just dfeval.shape appears in the output.
dftrain.shape  # expected: (627, 9)
dfeval.shape  # (264, 9)

(264, 9)

In [20]:
# Categorical data has to be encoded into integer values before the model can use it.
# The model does not care what a category means; it only needs to know whether
# two values are the same or different. Illustrative encodings:
#   sex:   {"female": 0, "male": 1}
#   class: {"First": 0, "Second": 1, "Third": 2}

CATEGORICAL_COLUMNS = [
    'sex',
    'n_siblings_spouses',
    'parch',
    'class',
    'deck',
    'embark_town',
    'alone',
]
NUMERIC_COLUMNS = [
    'age',
    'fare',
]

In [42]:
# Categorical features: one vocabulary-list column per feature, where the
# vocabulary is the set of distinct values seen in the training data.
categorical_feature_columns = [
    tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, dftrain[column_name].unique()
    )
    for column_name in CATEGORICAL_COLUMNS
]

# Numeric features: no vocabulary is collected here (there can be many
# different numbers); each column is simply declared as float32.
numeric_feature_columns = [
    tf.feature_column.numeric_column(column_name, dtype=tf.float32)
    for column_name in NUMERIC_COLUMNS
]

# Same ordering as before: all categorical columns first, then the numeric ones.
feature_columns = categorical_feature_columns + numeric_feature_columns

In [48]:
# print(feature_columns) would be hard to read, so the vocabularies are inspected column by column.
# Add column names to either tuple to print their unique training values as well.
categorical_to_show = ('embark_town',)  # others: 'sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'alone'
numeric_to_show = ()  # others: 'age', 'fare'

print("Categorical:")
for shown_column in categorical_to_show:
    print(dftrain[shown_column].unique())
print("Numerical:")
for shown_column in numeric_to_show:
    print(dftrain[shown_column].unique())

Categorical:
['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
Numerical:


In [49]:
print(feature_columns[0])
# Displays the first feature column: its key is 'sex' and its vocabulary list holds 'male' and 'female'.

VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)


In [None]:
#TRAINING
#Large datasets cannot always be loaded into RAM all at once, so the data is fed to the model in BATCHES instead.
    #For this specific model, the data is streamed in small BATCHES of 32 rows.
#The BATCHES are fed to the model multiple times, according to the number of EPOCHS.
#An EPOCH is one full pass over the entire dataset, so the number of EPOCHS we define is the number of times the model sees the entire dataset.
    #We use multiple epochs in the hope that, after seeing the same data several times, the model will make better predictions.

#Add epochs incrementally to fine-tune the model: starting with a large number of epochs can result in over-fitting.

In [74]:
#INPUT FUNCTION
#The input function defines how the data is shuffled, split into batches, and repeated for a number of epochs.
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32, shuffle_buffer_size=1000):
    """Build an input function that produces a batched tf.data.Dataset of (features, label) pairs.

    Args:
        data_df: Feature table. ``dict(data_df)`` is taken, so it must map column
            names to equal-length columns (e.g. a pandas DataFrame).
        label_df: Labels, aligned row-for-row with ``data_df``.
        num_epochs: Number of times the batched dataset is repeated.
        shuffle: Whether to shuffle the rows before batching.
        batch_size: Number of rows per batch.
        shuffle_buffer_size: Buffer size passed to ``Dataset.shuffle``. A buffer at
            least as large as the dataset gives a full shuffle; a smaller buffer
            only shuffles within a sliding window of that many rows.

    Returns:
        A zero-argument function that builds and returns the dataset each time it is called.
    """
    def input_function():  # inner function; this is what gets returned
        # Pair every feature row with its label.
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(shuffle_buffer_size)  # randomize the row order
        # Split into batches first, then repeat the batched data once per epoch.
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds  # the whole batched dataset, not a single batch
    return input_function  # a function object, to be called later by whoever consumes it

#CREATING THE MODEL
# A linear classifier built on the feature columns defined earlier.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

# make_input_fn only returns a function object; the estimator calls it later to build the dataset.
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Evaluation uses a single, unshuffled pass over the eval data.
eval_input_fn = make_input_fn(data_df=dfeval, label_df=y_eval, num_epochs=1, shuffle=False)

linear_est.train(input_fn=train_input_fn)  # train
result = linear_est.evaluate(input_fn=eval_input_fn)  # model metrics/stats from testing on the eval data

clear_output()  # clears the console output
print(result['accuracy'])

0.75


In [72]:
print(result)  # evaluation metrics returned by linear_est.evaluate (accuracy, auc, loss, ...), not statistics of the raw dataset

{'accuracy': 0.7651515, 'accuracy_baseline': 0.625, 'auc': 0.8261096, 'auc_precision_recall': 0.76914394, 'average_loss': 0.54977614, 'label/mean': 0.375, 'loss': 0.5416099, 'precision': 0.8032787, 'prediction/mean': 0.24950437, 'recall': 0.4949495, 'global_step': 200}


In [83]:
#TensorFlow is great for making predictions on lots of data at once, but not great for predicting a single piece of data.
#So we make a prediction for every single point in the eval dataset and then look at individual entries.

#PREDICTIONS
#The predictions get their own name so the evaluation metrics kept in `result` are not overwritten.
pred_dicts = list(linear_est.predict(eval_input_fn))
first_pred = pred_dicts[0]
print(first_pred) #this is the dictionary of one prediction
print(first_pred['probabilities'][0]) #probability of not surviving
print(first_pred['probabilities'][1]) #probability of surviving
#The probability of survival for the 1st passenger in the eval data is 6%.

#Did the person survive?
#eval_input_fn does not shuffle, so predictions are in row order: look rows up by position (.iloc), not by index label (.loc).
print(y_eval.iloc[0]) #0 - dead

#look at the rest of that data for 1st passenger
print(dfeval.iloc[0])
#sex                          male
#age                          35.0
#n_siblings_spouses              0
#parch                           0
#fare                         8.05
#class                       Third
#deck                      unknown
#embark_town           Southampton
#alone                           y

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/wsuser/tmp_4gtwqyd/model.ckpt-200
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'logits': array([-2.7469084], dtype=float32), 'logistic': array([0.06026149], dtype=float32), 'probabilities': array([0.9397385 , 0.06026149], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object), 'all_class_ids': array([0, 1], dtype=int32), 'all_classes': array([b'0', b'1'], dtype=object)}
0.9397385
0.060261488
0
sex                          male
age                          35.0
n_siblings_spouses              0
parch                           0
fare                         8.05
class                       Third
deck                      unknown
embark_town           Southampton
alone                           y
Name: 0, dtype: object
