In [66]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
import re

# Visualization
from pandas_profiling import ProfileReport
#import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
try:
    from wurlitzer import sys_pipes
except:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [84]:
KAGGLE_EVAL_METRIC = 'logloss' # string name for loss function in xgboost

TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

label = "rain_tomorrow"

# Load Data

In [85]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [86]:
train.head(1)

Unnamed: 0,id,date,location,min_temp,max_temp,rainfall,evaporation,sunshine,wind_gust_dir,wind_gust_speed,...,humidity9am,humidity3pm,pressure9am,pressure3pm,cloud9am,cloud3pm,temp9am,temp3pm,rain_today,rain_tomorrow
0,6364,2010-11-11,BadgerysCreek,16.2,28.9,11.4,,,SW,37.0,...,78.0,52.0,1017.2,1011.7,,,20.3,27.9,1.0,0


In [87]:
# Inputation
# Replaces numerical NaN (representing missing values in Pandas Dataframe) with 0s.
# ...Neural Nets don't work well with numerical NaNs.
def impute_numbers(dataset_df):
    for col in dataset_df.columns:
        if dataset_df[col].dtype not in [str, object]:
            dataset_df[col] = dataset_df[col].fillna(0)
    return dataset_df

# Split the dataset into a training and testing dataset.
def split_dataset(dataset, test_ratio=0.20):
    """Splits a panda dataframe in two."""
    test_indices = np.random.rand(len(dataset)) < test_ratio
    return dataset[~test_indices], dataset[test_indices]

def prepare_dataset(dataset):
    name = {"Ticket" : tf.strings.split(dataset["Ticket"]).numpy()}
    return name

In [88]:
features = ['rain_tomorrow','humidity9am', 'humidity3pm', 'cloud9am', 
            'cloud3pm', 'wind_gust_speed','temp3pm','rain_today','location']# 'humidity_diff', 'temp_diff']

In [89]:
train = train[features]

In [90]:
train = impute_numbers(train)
train_ds_pd, test_ds_pd = split_dataset(train)
print("{} examples in training, {} examples for testing.".format(len(train_ds_pd), len(test_ds_pd)))

27380 examples in training, 6811 examples for testing.


In [91]:
train_ds_pd

Unnamed: 0,rain_tomorrow,humidity9am,humidity3pm,cloud9am,cloud3pm,wind_gust_speed,temp3pm,rain_today,location
0,0,78.0,52.0,0.0,0.0,37.0,27.9,1.0,BadgerysCreek
1,1,81.0,65.0,7.0,7.0,35.0,24.4,0.0,Sale
2,0,93.0,61.0,0.0,0.0,31.0,14.6,0.0,Nhil
3,0,66.0,37.0,0.0,0.0,35.0,24.7,0.0,Townsville
4,0,17.0,7.0,0.0,0.0,61.0,41.1,0.0,Uluru
...,...,...,...,...,...,...,...,...,...
34186,1,81.0,86.0,8.0,8.0,15.0,23.7,1.0,Brisbane
34187,1,88.0,51.0,7.0,5.0,76.0,12.5,1.0,MountGambier
34188,0,65.0,39.0,0.0,0.0,35.0,26.9,0.0,Richmond
34189,1,85.0,88.0,7.0,7.0,28.0,6.2,1.0,MelbourneAirport


In [92]:
# Convert the datasets into tensorflow datasets
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label)

# Feature Engineer

## Define Features and Target
Define categorical and numeric features manually because sometimes cols of type float/int should be categorical and vice versa

In [96]:
features = ['humidity9am', 'humidity3pm', 'cloud9am', 'cloud3pm', 
            'wind_gust_speed','temp3pm','rain_today','location']# 'humidity_diff', 'temp_diff']

In [97]:
train_ds_pd[features].head(1)

Unnamed: 0,humidity9am,humidity3pm,cloud9am,cloud3pm,wind_gust_speed,temp3pm,rain_today,location
0,78.0,52.0,0.0,0.0,37.0,27.9,1.0,BadgerysCreek


In [98]:
input_1 = tf.keras.Input(shape=(1,), name="humidity9am", dtype="float")
input_2 = tf.keras.Input(shape=(1,), name="humidity3pm", dtype="float")
input_3 = tf.keras.Input(shape=(1,), name="cloud9am", dtype="float")
input_4 = tf.keras.Input(shape=(1,), name="cloud3pm", dtype="float")
input_5 = tf.keras.Input(shape=(1,), name="wind_gust_speed", dtype="float")
input_6 = tf.keras.Input(shape=(1,), name="temp3pm", dtype="float")
input_7 = tf.keras.Input(shape=(1,), name="rain_today", dtype="float")
input_8 = tf.keras.Input(shape=(1,), name="location", dtype="string")

nn_raw_inputs = [input_1, input_2,input_3,input_4,input_5,input_6,input_7,input_8]

In [113]:
Normalization = tf.keras.layers.experimental.preprocessing.Normalization
CategoryEncoding = tf.keras.layers.experimental.preprocessing.CategoryEncoding
StringLookup = tf.keras.layers.experimental.preprocessing.StringLookup

# Numerical
values = train_ds_pd["humidity9am"].values
input_1_normalizer = Normalization()
input_1_normalizer.adapt(values)

values = train_ds_pd["humidity3pm"].values
input_2_normalizer = Normalization()
input_2_normalizer.adapt(values)

values = train_ds_pd["cloud9am"].values
input_3_normalizer = Normalization()
input_3_normalizer.adapt(values)

values = train_ds_pd["cloud3pm"].values
input_4_normalizer = Normalization()
input_4_normalizer.adapt(values)

values = train_ds_pd["wind_gust_speed"].values
input_5_normalizer = Normalization()
input_5_normalizer.adapt(values)

values = train_ds_pd["temp3pm"].values
input_6_normalizer = Normalization()
input_6_normalizer.adapt(values)

values = train_ds_pd["rain_today"].values
input_7_normalizer = Normalization()
input_7_normalizer.adapt(values)

values = train_ds_pd["location"].values
input_8_indexer = StringLookup(max_tokens=100000)
input_8_indexer.adapt(values)
input_8_onehot = CategoryEncoding(output_mode="binary", max_tokens=100000)


normalized_input_1 = input_1_normalizer(input_1)
normalized_input_2 = input_2_normalizer(input_2)
normalized_input_3 = input_3_normalizer(input_3)
normalized_input_4 = input_4_normalizer(input_4)
normalized_input_5 = input_5_normalizer(input_5)
normalized_input_6 = input_6_normalizer(input_6)
normalized_input_7 = input_7_normalizer(input_7)
normalized_input_8 = input_8_onehot(input_8_indexer(input_8))


nn_processed_inputs = [normalized_input_1, normalized_input_2,normalized_input_3,
                      normalized_input_4,normalized_input_5,normalized_input_6,
                      normalized_input_7,normalized_input_8]





In [114]:
y = tf.keras.layers.Concatenate()(nn_processed_inputs)
y = tf.keras.layers.Dense(32, activation=tf.nn.relu6)(y)
last_layer = tf.keras.layers.Dense(16, activation=tf.nn.relu, name="last")(y)

# "3" for the three label classes. If it were a binary classification, the
# output dim would be 1.
classification_output = tf.keras.layers.Dense(2)(y)

nn_model = tf.keras.models.Model(nn_raw_inputs, classification_output)

# To reduce the risk of mistakes, group both the decision forest and the
# neural network in a single keras model.
nn_without_head = tf.keras.models.Model(inputs=nn_model.inputs, outputs=last_layer)
df_and_nn_model = tfdf.keras.RandomForestModel(preprocessing=nn_without_head)

In [115]:
nn_model.inputs

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'humidity9am')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'humidity3pm')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'cloud9am')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'cloud3pm')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'wind_gust_speed')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'temp3pm')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'rain_today')>,
 <KerasTensor: shape=(None, 1) dtype=string (created by layer 'location')>]

In [116]:
nn_model.compile(
  optimizer=tf.keras.optimizers.Adam(),
  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
  metrics=["accuracy"])

nn_model.fit(x=train_ds, validation_data=test_ds, epochs=10, verbose=False)
print("Evaluation:", nn_model.evaluate(test_ds))

Evaluation: [0.3685091733932495, 0.8470121622085571]


In [117]:
df_and_nn_model.compile(metrics=["accuracy"])
df_and_nn_model.fit(x=train_ds)
print("Evaluation:", df_and_nn_model.evaluate(test_ds))





Evaluation: [0.0, 0.8465717434883118]


In [None]:
0.8385356068611145