In [1]:
# Importing required libraries 

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# `sklearn.cross_validation` was removed in scikit-learn 0.20; its replacement
# lives in `sklearn.model_selection`. The old path is kept only as a fallback
# for very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score




In [2]:
# Read the training set from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='train.csv')
print(data.head(n=5))

  AnimalID     Name             DateTime      OutcomeType OutcomeSubtype  \
0  A671945  Hambone  2014-02-12 18:22:00  Return_to_owner            NaN   
1  A656520    Emily  2013-10-13 12:44:00       Euthanasia      Suffering   
2  A686464   Pearce  2015-01-31 12:28:00         Adoption         Foster   
3  A683430      NaN  2014-07-11 19:09:00         Transfer        Partner   
4  A667013      NaN  2013-11-15 12:52:00         Transfer        Partner   

  AnimalType SexuponOutcome AgeuponOutcome                        Breed  \
0        Dog  Neutered Male         1 year        Shetland Sheepdog Mix   
1        Cat  Spayed Female         1 year       Domestic Shorthair Mix   
2        Dog  Neutered Male        2 years                 Pit Bull Mix   
3        Cat    Intact Male        3 weeks       Domestic Shorthair Mix   
4        Dog  Neutered Male        2 years  Lhasa Apso/Miniature Poodle   

         Color  
0  Brown/White  
1  Cream Tabby  
2   Blue/White  
3   Blue Cream  
4          Tan  

In [3]:
# Remove the AnimalID and OutcomeSubtype columns, which we decided have no
# bearing on the project. The frame is modified in place.
data.drop(labels=['AnimalID', 'OutcomeSubtype'], axis='columns', inplace=True)

In [4]:
# Preview the frame and list the column names that remain after the drop.
print(data.head(n=5))
print(data.columns.tolist())

      Name             DateTime      OutcomeType AnimalType SexuponOutcome  \
0  Hambone  2014-02-12 18:22:00  Return_to_owner        Dog  Neutered Male   
1    Emily  2013-10-13 12:44:00       Euthanasia        Cat  Spayed Female   
2   Pearce  2015-01-31 12:28:00         Adoption        Dog  Neutered Male   
3      NaN  2014-07-11 19:09:00         Transfer        Cat    Intact Male   
4      NaN  2013-11-15 12:52:00         Transfer        Dog  Neutered Male   

  AgeuponOutcome                        Breed        Color  
0         1 year        Shetland Sheepdog Mix  Brown/White  
1         1 year       Domestic Shorthair Mix  Cream Tabby  
2        2 years                 Pit Bull Mix   Blue/White  
3        3 weeks       Domestic Shorthair Mix   Blue Cream  
4        2 years  Lhasa Apso/Miniature Poodle          Tan  
['Name', 'DateTime', 'OutcomeType', 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color']


In [5]:
# Encode the categorical columns as integers.
# AnimalType only takes two values, so a literal lookup table is enough.
data['AnimalType'] = data['AnimalType'].map({'Cat': 0, 'Dog': 1})

# Columns with more than two values are numbered in order of first appearance:
# the n-th distinct value seen in the column gets code n (starting at 0).
outcome_unique = data['OutcomeType'].unique()
outcome_numeric = {label: code for code, label in enumerate(outcome_unique)}
data['OutcomeType'] = data['OutcomeType'].replace(to_replace=outcome_numeric)

sex_unique = data['SexuponOutcome'].unique()
sex_numeric = {label: code for code, label in enumerate(sex_unique)}
data['SexuponOutcome'] = data['SexuponOutcome'].replace(to_replace=sex_numeric)

data.head()

Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,0,1,0,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,1,0,1,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,2,1,0,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,3,0,2,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,3,1,0,2 years,Lhasa Apso/Miniature Poodle,Tan


In [6]:
# Collapse Breed to a binary flag: 1 when the text contains "mix"
# (in any letter case), 0 otherwise.
breed_mentions_mix = data['Breed'].str.contains('mix', case=False)
data['Breed'] = breed_mentions_mix.astype(int)
data.head()

Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,0,1,0,1 year,1,Brown/White
1,Emily,2013-10-13 12:44:00,1,0,1,1 year,1,Cream Tabby
2,Pearce,2015-01-31 12:28:00,2,1,0,2 years,1,Blue/White
3,,2014-07-11 19:09:00,3,0,2,3 weeks,1,Blue Cream
4,,2013-11-15 12:52:00,3,1,0,2 years,0,Tan


In [7]:
# Split the DateTime column into Year / Month / Day columns, then discard the
# raw text column.
# NOTE: `datetime` here is the pandas `.dt` accessor, not the stdlib module.
datetime = pd.to_datetime(data['DateTime']).dt
for part in ('Year', 'Month', 'Day'):
    # The accessor exposes each component under the lower-cased name.
    data[part] = getattr(datetime, part.lower())
data.drop(labels=['DateTime'], axis='columns', inplace=True)
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day
0,Hambone,0,1,0,1 year,1,Brown/White,2014,2,12
1,Emily,1,0,1,1 year,1,Cream Tabby,2013,10,13
2,Pearce,2,1,0,2 years,1,Blue/White,2015,1,31
3,,3,0,2,3 weeks,1,Blue Cream,2014,7,11
4,,3,1,0,2 years,0,Tan,2013,11,15


In [8]:
# Encode Name as a presence flag: 1 when the animal has a name, 0 when the
# name is missing.
#
# This replaces a per-row loop that called `data['Name'].replace(..., inplace=True)`.
# That chained in-place call rescans the whole column for every row and, under
# pandas Copy-on-Write, never writes back to `data`. Its guard `and not 0` was
# also always true. A single element-wise pass gives the same result.
def _name_to_flag(value):
    """Return 1 for a name string, 0 for a missing value, else ``int(value)``."""
    if isinstance(value, str):
        return 1
    if pd.isna(value):
        return 0
    # Already-numeric entries (e.g. when this cell is re-run) are kept as they are.
    return int(value)


data['Name'] = data['Name'].map(_name_to_flag).astype(int)
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day
0,1,0,1,0,1 year,1,Brown/White,2014,2,12
1,1,1,0,1,1 year,1,Cream Tabby,2013,10,13
2,1,2,1,0,2 years,1,Blue/White,2015,1,31
3,0,3,0,2,3 weeks,1,Blue Cream,2014,7,11
4,0,3,1,0,2 years,0,Tan,2013,11,15


In [16]:
# turn time into days
def map_values(x):
    """Convert an age description such as ``'2 years'`` into a number of days.

    Parameters
    ----------
    x : object
        Expected to be a string of the form ``'<count> <unit>'`` where the
        unit contains ``year``, ``month``, ``week`` or ``day`` (singular or
        plural). Non-string inputs such as NaN are accepted.

    Returns
    -------
    float or None
        The age in days, using 365 days per year, 365/12 days per month and
        7 days per week. ``None`` is returned when ``x`` is not a string, has
        fewer than two whitespace-separated tokens, has a non-numeric count,
        or uses an unrecognised unit.
    """
    # Checked in this order, matching on substrings so plurals are covered.
    days_per_unit = (
        ('year', 365),
        ('month', 365 / 12.0),  # 12.0 keeps true division on Python 2 as well
        ('week', 7),
        ('day', 1),
    )

    try:
        parts = x.split()
    except AttributeError:
        # Not a string (e.g. NaN for a missing age).
        return None

    # Guard against empty or single-token strings instead of raising IndexError.
    if len(parts) < 2:
        return None

    try:
        count = float(parts[0])
    except ValueError:
        return None

    unit = parts[1]
    for keyword, factor in days_per_unit:
        if keyword in unit:
            return count * factor
    return None

# Derive the age in days from the AgeuponOutcome text, fill entries that could
# not be converted with the mean of the converted ones, then drop the text column.
day_age = data['AgeuponOutcome'].map(map_values)
data['DayAge'] = day_age
data['DayAge'] = data['DayAge'].fillna(value=data['DayAge'].mean())
data.drop(labels=['AgeuponOutcome'], axis='columns', inplace=True)
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,Breed,Color,Year,Month,Day,DayAge
0,1,0,1,0,1,Brown/White,2014,2,12,365.0
1,1,1,0,1,1,Cream Tabby,2013,10,13,365.0
2,1,2,1,0,1,Blue/White,2015,1,31,730.0
3,0,3,0,2,1,Blue Cream,2014,7,11,21.0
4,0,3,1,0,0,Tan,2013,11,15,730.0


In [17]:
# handling colors
# To keep things simple, keep only the first colour: the text before the first
# '/' (e.g. 'Brown/White' -> 'Brown'; values without '/' are unchanged).
#
# This is a single vectorized pass. The previous version looped over every row
# and ran a whole-column `replace` each time, which is quadratic in the number
# of rows and raised on a missing colour. Missing values now stay missing.
data['Color'] = data['Color'].str.split('/').str[0]
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,Breed,Color,Year,Month,Day,DayAge
0,1,0,1,0,1,Brown,2014,2,12,365.0
1,1,1,0,1,1,Cream Tabby,2013,10,13,365.0
2,1,2,1,0,1,Blue,2015,1,31,730.0
3,0,3,0,2,1,Blue Cream,2014,7,11,21.0
4,0,3,1,0,0,Tan,2013,11,15,730.0
