In [1]:
# Importing required libraries 

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
import pandas as pd 
import numpy as np 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



# Training our training set

In [2]:
animal_df = pd.read_csv('train.csv')

# Ignoring Rows with NaN values (to avoid any future issues with implementations)
animal_df.dropna(inplace= True)

# Replace NaN values with Zero's
# animal_df = animal_df.fillna(0)

animal_df.head()

# (26729, 10) --> Without ignoring rows with NaNs

# (6979, 10) --> Ignoring rows with NaNs
# animal_df.shape


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
12,A684601,Rocket,2016-02-03 11:27:00,Adoption,Foster,Dog,Neutered Male,4 years,Pit Bull Mix,Brown


# Removing Columns we do not need:

In [3]:
animal_df.drop(['AnimalID', 'OutcomeSubtype'], axis=1, inplace=True)

# Building our Feature Matrix (Will be ignoring the AnimalID and OutcomeSubtype)

In [4]:
grabbing_features = ['Name','DateTime','AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color']

X = animal_df[grabbing_features]

X.head()
# X.columns

Unnamed: 0,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
1,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
5,Elsa,2014-04-25 13:04:00,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,Jimmy,2015-03-28 13:11:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
12,Rocket,2016-02-03 11:27:00,Dog,Neutered Male,4 years,Pit Bull Mix,Brown


# Building our Label Vector 

In [5]:
# Giving numerical values to our label
def categorical_to_numeric(x):
    if x == 'Return_to_owner':
        return 0
    elif x == 'Euthanasia':
        return 1
    elif x == 'Adoption':
        return 2
    elif x == 'Transfer':
        return 3
    
animal_df['OutcomeType'] = animal_df['OutcomeType'].apply(categorical_to_numeric)

y = animal_df['OutcomeType']

y.head()
    
# animal_df.head()

1     1.0
2     2.0
5     3.0
6     3.0
12    2.0
Name: OutcomeType, dtype: float64

# Using One-Hot Encoding on our Features:

In [6]:
#Used OneHotEncoding on Non-Numerical Features

list_nonNumerical = ['Name','DateTime','AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color']

for column in list_nonNumerical:
    dummies = pd.get_dummies(animal_df[column])
    animal_df[dummies.columns] = dummies
    
getnewFeatures_X = animal_df

# (26729, 31081) --> Without Ignoring Rows with NaNs (After OneHotEncoding)

# (6979, 10211) --> Ignoring Rows with NaNs (After OneHotEncoding)

# #get first 5 rows of new table

getnewFeatures_X.head()

# getnewFeatures_X.shape


Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Mario,'Stache,...,White/Red,White/Red Tick,White/Tan,White/Tricolor,White/White,White/Yellow,Yellow,Yellow Brindle,Yellow Brindle/White,Yellow/White
1,Emily,2013-10-13 12:44:00,1.0,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Pearce,2015-01-31 12:28:00,2.0,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Elsa,2014-04-25 13:04:00,3.0,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Jimmy,2015-03-28 13:11:00,3.0,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby,0,0,...,0,0,0,0,0,0,0,0,0,0
12,Rocket,2016-02-03 11:27:00,2.0,Dog,Neutered Male,4 years,Pit Bull Mix,Brown,0,0,...,0,0,0,0,0,0,0,0,0,0


# After doing OneHotEncoding, now will drop non-numerical columns and keep Number Columns. This will allow us to build our Feature Matrix:

In [7]:
# Dropping Non-numerical columns
getnewFeatures_X.drop(['Name','DateTime','OutcomeType','AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color'], axis=1, inplace=True)

# getnewFeatures_X.shape

X = getnewFeatures_X

X.head()

Unnamed: 0,Mario,'Stache,Aaron,Abbie,Abbott,Abby,Abigail,Abraham,Ace,Achilles,...,White/Red,White/Red Tick,White/Tan,White/Tricolor,White/White,White/Yellow,Yellow,Yellow Brindle,Yellow Brindle/White,Yellow/White
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Splitting our data into testing and training: 

In [8]:
# Splitting the dataset into testing and training:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

print("X_Train size: " + str(X_train.shape))
print("y_train size: " + str(y_train.shape))
print("X_Test size: " + str(X_test.shape))
print("y_test size: " + str(y_test.shape))

X_Train size: (5583, 10201)
y_train size: (5583,)
X_Test size: (1396, 10201)
y_test size: (1396,)


# Using Random Forest:

In [9]:
#Instantiated Random Forest Classifier
my_RandomForest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state=2)

In [None]:
X.fillna(X_train.mean(), inplace=True)

# Training on the training set:
my_RandomForest.fit(X_train, y_train)

In [None]:
# # Testing on the testing set:
# y_predict_rf = my_RandomForest.predict(X_test)

# print("Predicated Labels: ")
# print(y_predict_rf)