In [35]:
# Importing required libraries 

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
import pandas as pd 
import numpy as np 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Training our training set

In [36]:
animal_df = pd.read_csv('train.csv')

# Ignoring Rows with NaN values (to avoid any future issues with implementations)
animal_df.dropna(inplace= True)

# animal_df.fillna(animal_df.mean(), inplace=True)


# Replace NaN values with Zero's
# animal_df = animal_df.fillna(0)

# animal_df.fillna(animal_df.mean())

# print(animal_df.isnull().sum())
animal_df.head()

# (26729, 10) --> Without ignoring rows with NaNs

# (6979, 10) --> Ignoring rows with NaNs

# (26729, 6) --> Ingoring columns with NaNs


# animal_df.shape


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
12,A684601,Rocket,2016-02-03 11:27:00,Adoption,Foster,Dog,Neutered Male,4 years,Pit Bull Mix,Brown


# Removing Columns we do not need:

In [37]:
animal_df.drop(['AnimalID','Name','OutcomeSubtype'], axis=1, inplace=True)
animal_df.head()

Unnamed: 0,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
1,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
5,2014-04-25 13:04:00,Transfer,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,2015-03-28 13:11:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
12,2016-02-03 11:27:00,Adoption,Dog,Neutered Male,4 years,Pit Bull Mix,Brown


# Converting our Label Vector from Categorical to numerical:

In [38]:
# Giving numerical values to our label
def categorical_to_numeric(x):
    if x == 'Return_to_owner':
        return 0
    elif x == 'Euthanasia':
        return 1
    elif x == 'Adoption':
        return 2
    elif x == 'Transfer':
        return 3
    
animal_df['OutcomeType'] = animal_df['OutcomeType'].apply(categorical_to_numeric).astype(np.float32)

animal_df.head()

Unnamed: 0,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
1,2013-10-13 12:44:00,1.0,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,2015-01-31 12:28:00,2.0,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
5,2014-04-25 13:04:00,3.0,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,2015-03-28 13:11:00,3.0,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
12,2016-02-03 11:27:00,2.0,Dog,Neutered Male,4 years,Pit Bull Mix,Brown


# Converting our DateTime Column to Year, Month, Day:

In [39]:
datetime = pd.to_datetime(animal_df['DateTime']).dt
animal_df['Year'] = datetime.year
animal_df['Month'] = datetime.month
animal_df['Day'] = datetime.day
animal_df.drop(['DateTime'], axis=1, inplace=True)

animal_df.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day
1,1.0,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,2013,10,13
2,2.0,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,2015,1,31
5,3.0,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,2014,4,25
6,3.0,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby,2015,3,28
12,2.0,Dog,Neutered Male,4 years,Pit Bull Mix,Brown,2016,2,3


# Converting AgeUponOutcome to total number of weeks:

In [40]:
def change_year_month_to_weeks(x):
    count, parse_age = x.split(" ")
    if parse_age == 'month':
        return int(4)
    elif parse_age == 'months':
        return 4 * int(count)
    elif parse_age == 'year':
        return int(52)
    elif parse_age == 'years':
        return 52 * int(count)
    elif parse_age == 'week':
        return int(count)
    elif parse_age == 'weeks':
        return int(count)
    
animal_df['AgeuponOutcome'] = animal_df['AgeuponOutcome'].apply(change_year_month_to_weeks).astype(np.float32)

animal_df.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day
1,1.0,Cat,Spayed Female,52.0,Domestic Shorthair Mix,Cream Tabby,2013,10,13
2,2.0,Dog,Neutered Male,104.0,Pit Bull Mix,Blue/White,2015,1,31
5,3.0,Dog,Intact Female,4.0,Cairn Terrier/Chihuahua Shorthair,Black/Tan,2014,4,25
6,3.0,Cat,Intact Male,3.0,Domestic Shorthair Mix,Blue Tabby,2015,3,28
12,2.0,Dog,Neutered Male,208.0,Pit Bull Mix,Brown,2016,2,3


# Using One-Hot Encoding on our categorical features:

In [41]:
#Used OneHotEncoding on Non-Numerical Features

list_nonNumerical = ['AnimalType','Breed','Color','SexuponOutcome']

for column in list_nonNumerical:
    dummies = pd.get_dummies(animal_df[column])
    animal_df[dummies.columns] = dummies
    
oneHotEncoding_features = animal_df

# (26729, 31081) --> Without Ignoring Rows with NaNs (After OneHotEncoding)

# (6979, 10211) --> Ignoring Rows with NaNs (After OneHotEncoding)

# (26729, 1755) --> Ignoring Columns with NaNs (After OneHotEncoding)


# #get first 5 rows of new table

oneHotEncoding_features.head()

# getnewFeatures_X.shape


Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day,Cat,...,White/Yellow,Yellow,Yellow Brindle,Yellow Brindle/White,Yellow/White,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown
1,1.0,Cat,Spayed Female,52.0,Domestic Shorthair Mix,Cream Tabby,2013,10,13,1,...,0,0,0,0,0,0,0,0,1,0
2,2.0,Dog,Neutered Male,104.0,Pit Bull Mix,Blue/White,2015,1,31,0,...,0,0,0,0,0,0,0,1,0,0
5,3.0,Dog,Intact Female,4.0,Cairn Terrier/Chihuahua Shorthair,Black/Tan,2014,4,25,0,...,0,0,0,0,0,1,0,0,0,0
6,3.0,Cat,Intact Male,3.0,Domestic Shorthair Mix,Blue Tabby,2015,3,28,1,...,0,0,0,0,0,0,1,0,0,0
12,2.0,Dog,Neutered Male,208.0,Pit Bull Mix,Brown,2016,2,3,0,...,0,0,0,0,0,0,0,1,0,0


In [42]:
# Checking the NaN values after doing OneHotEncoding

print(oneHotEncoding_features.isnull().sum())

OutcomeType                                        70
AnimalType                                          0
SexuponOutcome                                      0
AgeuponOutcome                                      9
Breed                                               0
Color                                               0
Year                                                0
Month                                               0
Day                                                 0
Cat                                                 0
Dog                                                 0
Affenpinscher Mix                                   0
Airedale Terrier                                    0
Airedale Terrier Mix                                0
Airedale Terrier/Labrador Retriever                 0
Akita                                               0
Akita Mix                                           0
Akita/Labrador Retriever                            0
Akita/Pit Bull              

In [43]:
# Remove Remaining NaN values for our label column:

oneHotEncoding_features.dropna(inplace = True)


print(oneHotEncoding_features.isnull().sum())

OutcomeType                                        0
AnimalType                                         0
SexuponOutcome                                     0
AgeuponOutcome                                     0
Breed                                              0
Color                                              0
Year                                               0
Month                                              0
Day                                                0
Cat                                                0
Dog                                                0
Affenpinscher Mix                                  0
Airedale Terrier                                   0
Airedale Terrier Mix                               0
Airedale Terrier/Labrador Retriever                0
Akita                                              0
Akita Mix                                          0
Akita/Labrador Retriever                           0
Akita/Pit Bull                                

# Building our Label Vector:

In [44]:
y = oneHotEncoding_features['OutcomeType']

y.head()

1     1.0
2     2.0
5     3.0
6     3.0
12    2.0
Name: OutcomeType, dtype: float32

# After doing OneHotEncoding, now will drop non-numerical columns and keep Number Columns. This will allow us to build our Feature Matrix:

In [45]:
# Dropping Non-numerical columns and our label column:
oneHotEncoding_features.drop(['SexuponOutcome','OutcomeType','AnimalType','Breed','Color'], axis=1, inplace=True)

X = oneHotEncoding_features

X.head()

# print(getnewFeatures_X.isnull().sum())

# getnewFeatures_X.shape

Unnamed: 0,AgeuponOutcome,Year,Month,Day,Cat,Dog,Affenpinscher Mix,Airedale Terrier,Airedale Terrier Mix,Airedale Terrier/Labrador Retriever,...,White/Yellow,Yellow,Yellow Brindle,Yellow Brindle/White,Yellow/White,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown
1,52.0,2013,10,13,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,104.0,2015,1,31,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,4.0,2014,4,25,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6,3.0,2015,3,28,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
12,208.0,2016,2,3,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Splitting our data into testing and training: 

In [46]:
# Splitting the dataset into testing and training:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

print("X_Train size: " + str(X_train.shape))
print("y_train size: " + str(y_train.shape))
print("X_Test size: " + str(X_test.shape))
print("y_test size: " + str(y_test.shape))

X_Train size: (5522, 903)
y_train size: (5522,)
X_Test size: (1381, 903)
y_test size: (1381,)


# Using Random Forest:

In [47]:
#Instantiated Random Forest Classifier
my_RandomForest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state=2)

In [48]:
# Training on the training set:
my_RandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=19, n_jobs=1,
            oob_score=False, random_state=2, verbose=0, warm_start=False)

In [51]:
# Testing on the testing set:
y_predict_rf = my_RandomForest.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_predict_rf)

print("Random Forest accuracy: " + str(rf_accuracy))

Random Forest accuracy: 0.737871107893
