In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from IPython.display import display
from sklearn.metrics import accuracy_score



In [3]:
PATH = 'data/'

In [4]:
!ls {PATH}

sample_submission.csv  shelter-animal-outcomes.zip  test.csv  train.csv


In [5]:
df = pd.read_csv(f'{PATH}train.csv', low_memory= False, parse_dates= ['DateTime'])
df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [6]:
def display_all(df):
    with pd.option_context('display.max_rows', 1000, 'display.max_columns', 1000):
        display(df)

In [7]:
display_all(df.tail())

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
26724,A702446,,2015-05-14 11:56:00,Transfer,Partner,Cat,Intact Male,1 month,Domestic Shorthair Mix,Brown Tabby/White
26725,A718934,,2016-01-20 18:59:00,Transfer,SCRP,Cat,Spayed Female,3 months,Domestic Shorthair Mix,Brown Tabby
26726,A698128,Zeus,2015-03-09 13:33:00,Adoption,,Dog,Neutered Male,4 years,Old English Bulldog Mix,White/Tan
26727,A677478,,2014-04-27 12:22:00,Transfer,Partner,Cat,Intact Male,4 weeks,Domestic Shorthair Mix,Black
26728,A706629,,2015-07-02 09:00:00,Transfer,SCRP,Cat,Intact Male,1 year,Domestic Shorthair Mix,Brown Tabby/White


In [8]:
df.columns

Index(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
       'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color'],
      dtype='object')

In [9]:
display_all(df.describe(include = 'all').T)

Unnamed: 0,count,unique,top,freq,first,last
AnimalID,26729,26729,A711889,1,NaT,NaT
Name,19038,6374,Max,136,NaT,NaT
DateTime,26729,22918,2015-08-11 00:00:00,19,2013-10-01 09:31:00,2016-02-21 19:17:00
OutcomeType,26729,5,Adoption,10769,NaT,NaT
OutcomeSubtype,13117,16,Partner,7816,NaT,NaT
AnimalType,26729,2,Dog,15595,NaT,NaT
SexuponOutcome,26728,5,Neutered Male,9779,NaT,NaT
AgeuponOutcome,26711,44,1 year,3969,NaT,NaT
Breed,26729,1380,Domestic Shorthair Mix,8810,NaT,NaT
Color,26729,366,Black/White,2824,NaT,NaT


In [10]:
set(df.OutcomeType)

{'Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'}

In [11]:
set(df.OutcomeSubtype)

{'Aggressive',
 'At Vet',
 'Barn',
 'Behavior',
 'Court/Investigation',
 'Enroute',
 'Foster',
 'In Foster',
 'In Kennel',
 'In Surgery',
 'Medical',
 'Offsite',
 'Partner',
 'Rabies Risk',
 'SCRP',
 'Suffering',
 nan}

In [12]:
set(df.AnimalType)

{'Cat', 'Dog'}

In [13]:
set(df.SexuponOutcome)

{'Intact Female',
 'Intact Male',
 'Neutered Male',
 'Spayed Female',
 'Unknown',
 nan}

In [14]:
len(df)

26729

In [15]:
add_datepart(df, 'DateTime')

In [16]:
df.DateTimeDayofyear.head()

0     43
1    286
2     31
3    192
4    319
Name: DateTimeDayofyear, dtype: int64

In [17]:
train_cats(df)

In [18]:
display_all(df.isnull().sum().sort_index()/len(df))

AgeuponOutcome              0.000673
AnimalID                    0.000000
AnimalType                  0.000000
Breed                       0.000000
Color                       0.000000
DateTimeDay                 0.000000
DateTimeDayofweek           0.000000
DateTimeDayofyear           0.000000
DateTimeElapsed             0.000000
DateTimeIs_month_end        0.000000
DateTimeIs_month_start      0.000000
DateTimeIs_quarter_end      0.000000
DateTimeIs_quarter_start    0.000000
DateTimeIs_year_end         0.000000
DateTimeIs_year_start       0.000000
DateTimeMonth               0.000000
DateTimeWeek                0.000000
DateTimeYear                0.000000
Name                        0.287740
OutcomeSubtype              0.509260
OutcomeType                 0.000000
SexuponOutcome              0.000037
dtype: float64

In [19]:
X, y, nas = proc_df(df, 'OutcomeType')

In [20]:
m = RandomForestClassifier(n_jobs = -1)
m.fit(X, y)
m.score(X, y)

1.0

In [21]:
def validation_split(a, n): return a[:n].copy(), a[n:].copy()

n_valid = 5500
n_trn = len(X) - n_valid
raw_train, raw_valid = validation_split(df, n_trn)
X_train, X_valid = validation_split(X, n_trn)
y_train, y_valid = validation_split(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((21229, 21), (21229,), (5500, 21), (5500,))

In [24]:
def display_score(m):
    res = [
        m.score(X_train, y_train),
        m.score(X_valid, y_valid)
    ]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [25]:
m = RandomForestClassifier(n_jobs = -1)
%time m.fit(X_train, y_train)
display_score(m)

CPU times: user 4.51 s, sys: 52.3 ms, total: 4.56 s
Wall time: 550 ms
[1.0, 0.8694545454545455]
