In [156]:
# Standard stack
import datetime
import pandas as pd
import numpy as np
from datetime import date
import re

# Visualization
from pandas_profiling import ProfileReport
#import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import tensorflow as tf
import tensorflow_decision_forests as tfdf
# Prefer wurlitzer for capturing native training logs; fall back to the Colab
# helper only when wurlitzer is not installed. Catching ImportError (instead of
# a bare `except:`) keeps KeyboardInterrupt/SystemExit and unrelated failures visible.
try:
    from wurlitzer import sys_pipes
except ImportError:
    from colabtools.googlelog import CaptureLog as sys_pipes

# Scikit-learn packages
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold

# display
from IPython.core.magic import register_line_magic
from IPython.display import Javascript

In [157]:
# Input CSV locations (relative paths).
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"
# Name of the column used as the prediction target throughout the notebook.
label = "outcome_type"

In [158]:
# Load the raw training and test tables.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
# Second, independent read of the test file, so it is unaffected by the
# in-place transforms applied to `test` below.
# NOTE(review): presumably the base for the submission file - confirm.
submission = pd.read_csv(TEST_PATH)

In [159]:
# Encode the string labels as integers, numbered by order of first appearance
# in the training data; `classes[i]` is the name of class i.
classes = train[label].unique().tolist()
print(f"Label classes: {classes}")

class_to_index = {class_name: position for position, class_name in enumerate(classes)}
train[label] = train[label].map(class_to_index)

Label classes: ['adoption', 'no outcome', 'transfer']


In [160]:
# List the columns of the training table.
train.columns

Index(['id', 'age_upon_outcome', 'animal_type', 'breed', 'color',
       'date_of_birth', 'datetime', 'name', 'outcome_type', 'sex',
       'spay_neuter'],
      dtype='object')

In [161]:
# List the columns of the test table for comparison with the training table.
test.columns

Index(['id', 'age_upon_outcome', 'animal_type', 'breed', 'color',
       'date_of_birth', 'datetime', 'name', 'sex', 'spay_neuter'],
      dtype='object')

In [162]:
# Summarise dtypes and non-null counts per training column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54408 entries, 0 to 54407
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                54408 non-null  int64 
 1   age_upon_outcome  54404 non-null  object
 2   animal_type       54408 non-null  object
 3   breed             54408 non-null  object
 4   color             54408 non-null  object
 5   date_of_birth     54408 non-null  object
 6   datetime          54408 non-null  object
 7   name              37975 non-null  object
 8   outcome_type      54408 non-null  int64 
 9   sex               54408 non-null  object
 10  spay_neuter       54408 non-null  object
dtypes: int64(2), object(9)
memory usage: 4.6+ MB


In [163]:
def inpute_missing(dataset):
    """
    Fill missing values in every numeric column with that column's mean.

    Non-numeric columns (strings, categoricals, datetimes, ...) are left
    untouched, so this is safe to call on frames with mixed dtypes.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `dataset` object, for call chaining.
    """
    for col in dataset.columns:
        # An explicit numeric check avoids calling .mean() on dtypes that do
        # not support it (e.g. category), which the old
        # `dtype not in [str, object]` test let through.
        if pd.api.types.is_numeric_dtype(dataset[col]):
            dataset[col] = dataset[col].fillna(dataset[col].mean())
    return dataset

# Impute both splits; the function edits each frame in place and returns it.
train = inpute_missing(train)
test = inpute_missing(test)

In [164]:
# Preview the first rows after imputation.
train.head()

Unnamed: 0,id,age_upon_outcome,animal_type,breed,color,date_of_birth,datetime,name,outcome_type,sex,spay_neuter
0,1265,2 years,Cat,Russian Blue Mix,Blue,2014-04-21,2016-08-05 14:15:00,Smokey,0,Male,Fixed
1,24053,1 year,Other,Bat Mix,Brown,2016-10-24,2017-10-25 08:02:00,,1,Unknown,Unknown
2,4785,2 months,Dog,Chihuahua Shorthair/Pomeranian,Brown,2014-01-04,2014-03-08 16:37:00,,2,Female,Fixed
3,65439,2 years,Cat,Domestic Shorthair Mix,Brown Tabby/White,2012-11-26,2014-12-04 12:21:00,Momma Kitty,2,Female,Fixed
4,45732,1 year,Dog,Rat Terrier Mix,White/Chocolate,2014-07-10,2016-01-16 16:46:00,Estrella,2,Female,Fixed


In [198]:
def nlp_transforms(dataset):
    """
    Derive features from the free-text `color` and `breed` columns.

    Both columns are lower-cased, then these columns are added:

    * color_type  - coat-pattern keyword found in `color` (one of the words in
                    `banned` below), or None when there is none. When several
                    keywords occur, the one listed last in `banned` wins.
    * num_colors  - number of '/'-separated parts of `color`; forced to 3 when
                    either colour is 'tricolor'.
    * color_one / color_two - first and second '/'-separated colour with the
                    pattern keywords removed; color_two is the string 'None'
                    when there is no second colour.
    * breed_one / breed_two / breed_three - first three '/'-separated breeds,
                    the string 'None' where absent.
    * num_breed   - number of '/'-separated parts of `breed`.
    * mixed       - 1 when `breed` contains 'mix' or lists several breeds, else 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with string columns `color` and `breed`. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `dataset` object.
    """
    # Coat-pattern keywords: reported through color_type and stripped from the
    # colour names so that e.g. 'brown tabby' and 'brown' share one colour.
    banned = ['brindle',
    'tabby',
    'merle',
    'torbie',
    'point',
    'smoke',
    'tiger',
    'tick'
    ]

    def strip_patterns(text):
        # Drop pattern keywords, keeping the remaining words in order.
        return ' '.join(word for word in text.split() if word not in banned)

    def nth_part(parts, position):
        # Element `position` of each row's list, or the string 'None' when the
        # list is too short.
        return parts.apply(lambda items: items[position] if len(items) > position else 'None')

    # Colors
    dataset['color'] = dataset['color'].str.lower()
    # The comparison must be lower-case too: it runs after .str.lower(), so the
    # previous capitalised 'Tortie' literal could never match.
    dataset['color'] = dataset['color'].replace('tortie', 'torbie')

    # Later keywords overwrite earlier ones, matching the order of `banned`.
    color_type = np.full(len(dataset), None, dtype=object)
    for pattern in banned:
        has_pattern = dataset['color'].str.contains(pattern, regex=False, na=False)
        color_type = np.where(has_pattern.to_numpy(dtype=bool), pattern, color_type)
    dataset['color_type'] = color_type

    color_parts = dataset['color'].str.split('/')
    dataset['num_colors'] = color_parts.str.len()
    dataset['color_one'] = nth_part(color_parts, 0).map(strip_patterns)
    dataset['color_two'] = nth_part(color_parts, 1).map(strip_patterns)

    is_tricolor = (dataset['color_one'] == 'tricolor') | (dataset['color_two'] == 'tricolor')
    dataset['num_colors'] = np.where(is_tricolor, 3, dataset['num_colors'])

    # Breeds
    dataset['breed'] = dataset['breed'].str.lower()
    breed_parts = dataset['breed'].str.split('/')

    # Use "at least N parts" rather than "exactly N parts": the old `== 2`
    # check discarded the second breed whenever a third one was listed.
    dataset['breed_one'] = nth_part(breed_parts, 0)
    dataset['breed_two'] = nth_part(breed_parts, 1)
    dataset['breed_three'] = nth_part(breed_parts, 2)

    dataset['num_breed'] = breed_parts.str.len()
    says_mix = dataset['breed'].str.contains('mix', regex=False, na=False)
    dataset['mixed'] = np.where(says_mix | (dataset['num_breed'] > 1), 1, 0)
    return dataset

# Add the colour/breed features to both splits (each frame is edited in place).
train = nlp_transforms(train)
test = nlp_transforms(test)

In [199]:
# Preview the engineered features.
# NOTE(review): the saved output shows year/month/day/hour/weekday, which are
# created by computation_transforms in a cell placed further down (In [183]).
# The cells were run out of order; move that cell above this one so that
# Restart & Run All reproduces this output.
train.head()

Unnamed: 0,id,age_upon_outcome,animal_type,breed,color,date_of_birth,datetime,name,outcome_type,sex,...,num_breed,mixed,year,month,day,hour,weekday,breed_one,breed_two,breed_three
0,1265,2 years,Cat,russian blue mix,blue,2014-04-21,2016-08-05 14:15:00,Smokey,0,Male,...,1,1,2016,8,5,14,Friday,russian blue mix,,
1,24053,1 year,Other,bat mix,brown,2016-10-24,2017-10-25 08:02:00,,1,Unknown,...,1,1,2017,10,25,8,Wednesday,bat mix,,
2,4785,2 months,Dog,chihuahua shorthair/pomeranian,brown,2014-01-04,2014-03-08 16:37:00,,2,Female,...,2,1,2014,3,8,16,Saturday,chihuahua shorthair,pomeranian,
3,65439,2 years,Cat,domestic shorthair mix,brown tabby/white,2012-11-26,2014-12-04 12:21:00,Momma Kitty,2,Female,...,1,1,2014,12,4,12,Thursday,domestic shorthair mix,,
4,45732,1 year,Dog,rat terrier mix,white/chocolate,2014-07-10,2016-01-16 16:46:00,Estrella,2,Female,...,1,1,2016,1,16,16,Saturday,rat terrier mix,,


In [204]:
# Bag-of-words encoding of the breed strings: fit the vocabulary on the
# training breeds and show the shape of the resulting count matrix.
# NOTE(review): this import belongs with the other imports in the first cell.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.breed)
X_train_counts.shape


(54408, 358)

In [205]:
# Show the repr of the count matrix built from `train.breed`.
X_train_counts

<54408x358 sparse matrix of type '<class 'numpy.int64'>'
	with 159715 stored elements in Compressed Sparse Row format>

In [185]:
# Number of distinct breed strings in the test set.
test.breed.nunique()

1251

In [183]:
def computation_transforms(dataset):
    """
    Add calendar features derived from the `datetime` column.

    `datetime` is parsed with pandas.to_datetime and written back, then
    `year`, `month`, `day`, `hour` and `weekday` (the day name) are added.
    The frame is modified in place and also returned.
    """
    timestamps = pd.to_datetime(dataset['datetime'])
    dataset['datetime'] = timestamps

    # Numeric calendar components share their name with the .dt accessor attribute.
    for component in ('year', 'month', 'day', 'hour'):
        dataset[component] = getattr(timestamps.dt, component)

    # Weekday is kept as its name rather than a number.
    dataset['weekday'] = timestamps.dt.day_name()
    return dataset

# Add the calendar features to both splits (each frame is edited in place).
train = computation_transforms(train)
test = computation_transforms(test)

In [153]:
# Display the training frame.
# NOTE(review): this cell's execution count (In [153]) is lower than that of
# the cells above it, so the saved output may not match a fresh
# top-to-bottom run.
train

Unnamed: 0,id,age_upon_outcome,animal_type,breed,color,date_of_birth,datetime,name,outcome_type,sex,spay_neuter,color_type,num_colors,color_one,color_two,year,month,day,hour,weekday
0,1265,2 years,Cat,Russian Blue Mix,blue,2014-04-21,2016-08-05 14:15:00,Smokey,0,Male,Fixed,,1,blue,,2016,8,5,14,Friday
1,24053,1 year,Other,Bat Mix,brown,2016-10-24,2017-10-25 08:02:00,,1,Unknown,Unknown,,1,brown,,2017,10,25,8,Wednesday
2,4785,2 months,Dog,Chihuahua Shorthair/Pomeranian,brown,2014-01-04,2014-03-08 16:37:00,,2,Female,Fixed,,1,brown,,2014,3,8,16,Saturday
3,65439,2 years,Cat,Domestic Shorthair Mix,brown tabby/white,2012-11-26,2014-12-04 12:21:00,Momma Kitty,2,Female,Fixed,tabby,2,brown,white,2014,12,4,12,Thursday
4,45732,1 year,Dog,Rat Terrier Mix,white/chocolate,2014-07-10,2016-01-16 16:46:00,Estrella,2,Female,Fixed,,2,white,chocolate,2016,1,16,16,Saturday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54403,48705,6 months,Dog,Basset Hound Mix,brown brindle/white,2016-05-01,2016-11-06 17:53:00,*Mobey,0,Male,Fixed,brindle,2,brown,white,2016,11,6,17,Sunday
54404,42473,8 years,Dog,Pointer/Labrador Retriever,white/black,2006-09-16,2014-09-27 14:40:00,Daisy,0,Female,Fixed,,2,white,black,2014,9,27,14,Saturday
54405,57001,4 months,Cat,Domestic Shorthair Mix,white/orange tabby,2014-04-07,2014-08-23 16:51:00,French Fry,2,Male,Fixed,tabby,2,white,orange tabby,2014,8,23,16,Saturday
54406,74609,3 years,Cat,Domestic Medium Hair Mix,calico,2013-07-07,2016-08-10 18:43:00,Orangee,0,Female,Fixed,,1,calico,,2016,8,10,18,Wednesday


# Train

In [None]:
# Candidate models, keyed by a short name used in the printed results.
models = {
    #'rf_default': tfdf.keras.RandomForestModel(),
    #'gbt_default': tfdf.keras.GradientBoostedTreesModel(),
    'gbt_tune1': tfdf.keras.GradientBoostedTreesModel(hyperparameter_template="benchmark_rank1"),
    }

# Run a 10-folds cross-validation for every candidate model.
# cv_scores maps model name -> list of per-fold log-loss values.
cv_scores = {}

for key in models:
    print(key)

    # Start a fresh list per model; sharing one list across models would mix
    # earlier models' folds into every later model's mean.
    accuraties_per_fold = []

    folds = KFold(n_splits=10, shuffle=True, random_state=42).split(train)
    for fold_idx, (train_indices, test_indices) in enumerate(folds):

        print(f"Running fold {fold_idx+1}")

        # Extract the training and testing examples.
        sub_train_df = train.iloc[train_indices]
        sub_test_df = train.iloc[test_indices]

        # Convert the examples into tensorflow datasets.
        sub_train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(sub_train_df, label=label)
        sub_test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(sub_test_df, label=label)

        # Train the model.
        # The label has three integer-encoded classes, so the log loss is the
        # sparse categorical cross-entropy; binary cross-entropy is not
        # defined for class ids above 1.
        # NOTE(review): the same model instance is re-fitted on every fold;
        # confirm that fit() retrains from scratch, or build a new model per fold.
        models[key].compile(metrics=[tf.keras.metrics.SparseCategoricalCrossentropy(name="log_loss")])
        models[key].fit(sub_train_ds, verbose=False)

        # Evaluate the model.
        evaluation = models[key].evaluate(sub_test_ds, return_dict=True, verbose=False)
        #print(f"Evaluation {evaluation}")

        accuraties_per_fold.append(evaluation["log_loss"])

    cv_scores[key] = accuraties_per_fold
    print(f"Cross-validated Score: {np.mean(accuraties_per_fold)} for model: " + key)

gbt_tune1
Running fold 1


In [None]:
# Train Best Model

In [None]:
# Hold out part of the labelled data for a final check of each model.
# A fixed seed (the same value the cross-validation uses) makes the split, and
# therefore the reported scores, reproducible across kernel restarts.
train_df, test_df = train_test_split(train, random_state=42)
train_tf = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
test_tf = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, label=label)

In [49]:
# Fit every candidate on the training split and score it on the hold-out split.
# evaluation maps model name -> dict of metric name -> value.
evaluation = {}
for key in models:
    print(key)
    # Evaluate with log loss. The label has three integer-encoded classes, so
    # the matching metric is sparse categorical cross-entropy; binary
    # cross-entropy is not defined for class ids above 1.
    models[key].compile(metrics=[tf.keras.metrics.SparseCategoricalCrossentropy(name="log_loss")])

    #with sys_pipes():
    models[key].fit(x=train_tf)
    evaluation[key] = models[key].evaluate(test_tf, return_dict=True)

gbt_tune1
gbt_tune2
