In [25]:
# Importing required libraries 

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
import pandas as pd 
import numpy as np 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Training our training set

In [26]:
animal_df = pd.read_csv('train.csv')

# Ignoring Rows with NaN values (to avoid any future issues with implementations)
# animal_df.dropna(inplace= True)

# animal_df.fillna(animal_df.mean(), inplace=True)


# Replace NaN values with Zero's
# animal_df = animal_df.fillna(0)

# animal_df.fillna(animal_df.mean())

# print(animal_df.isnull().sum())
animal_df.head()

# (26729, 10) --> Without ignoring rows with NaNs

# (6979, 10) --> Ignoring rows with NaNs

# (26729, 6) --> Ingoring columns with NaNs


# animal_df.shape


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


# Removing Columns we do not need:

In [27]:
animal_df.drop(['AnimalID','Name','OutcomeSubtype'], axis=1, inplace=True)


animal_df.dropna(inplace= True)

# animal_df.shape

# (26710, 7) --> Size of Data when ignoring columns with most NaNs but also ignoring rows with NaNs thus 
# we're able to keep most of our data 

# print(animal_df.isnull().sum())

# df = df[~df['your column'].isin(['list of strings'])]


# animal_df.shape
animal_df.head()

Unnamed: 0,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


# Will be setting up our Data and  Predictions based on DOGS only:

In [29]:
# Ignoring every row that contains 'Cat'

animal_df = animal_df[~animal_df['AnimalType'].isin(['Cat'])]

# animal_df.shape

# (15593, 7) --> size of data when ignoring rows that contain 'Cat'

animal_df.head()

Unnamed: 0,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
2,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
4,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,2014-04-25 13:04:00,Transfer,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
8,2014-02-04 17:17:00,Adoption,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White


# Converting our Label Vector from Categorical to numerical:

In [30]:
# Giving numerical values to our label
def categorical_to_numeric(x):
    if x == 'Return_to_owner':
        return 0
    elif x == 'Euthanasia':
        return 1
    elif x == 'Adoption':
        return 2
    elif x == 'Transfer':
        return 3
    
animal_df['OutcomeType'] = animal_df['OutcomeType'].apply(categorical_to_numeric).astype(np.float32)


animal_df.dropna(inplace= True)

# animal_df.shape

# (26513, 7) --> Size of Data when removing rows with NaNs from our OutCometype Column

# print(animal_df.isnull().sum())

animal_df.head()

Unnamed: 0,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,2014-02-12 18:22:00,0.0,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
2,2015-01-31 12:28:00,2.0,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
4,2013-11-15 12:52:00,3.0,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,2014-04-25 13:04:00,3.0,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
8,2014-02-04 17:17:00,2.0,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White


# Converting our DateTime Column to Year, Month, Day:

In [31]:
datetime = pd.to_datetime(animal_df['DateTime']).dt
animal_df['Year'] = datetime.year
animal_df['Month'] = datetime.month
animal_df['Day'] = datetime.day
animal_df.drop(['DateTime'], axis=1, inplace=True)

animal_df.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day
0,0.0,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,2014,2,12
2,2.0,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,2015,1,31
4,3.0,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,2013,11,15
5,3.0,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,2014,4,25
8,2.0,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White,2014,2,4


# Converting AgeUponOutcome to total number of weeks:

In [32]:
def change_year_month_to_weeks(x):
    count, parse_age = x.split(" ")
    if parse_age == 'month':
        return int(4)
    elif parse_age == 'months':
        return 4 * int(count)
    elif parse_age == 'year':
        return int(52)
    elif parse_age == 'years':
        return 52 * int(count)
    elif parse_age == 'week':
        return int(count)
    elif parse_age == 'weeks':
        return int(count)
    
animal_df['AgeuponOutcome'] = animal_df['AgeuponOutcome'].apply(change_year_month_to_weeks).astype(np.float32)

animal_df.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day
0,0.0,Dog,Neutered Male,52.0,Shetland Sheepdog Mix,Brown/White,2014,2,12
2,2.0,Dog,Neutered Male,104.0,Pit Bull Mix,Blue/White,2015,1,31
4,3.0,Dog,Neutered Male,104.0,Lhasa Apso/Miniature Poodle,Tan,2013,11,15
5,3.0,Dog,Intact Female,4.0,Cairn Terrier/Chihuahua Shorthair,Black/Tan,2014,4,25
8,2.0,Dog,Spayed Female,20.0,American Pit Bull Terrier Mix,Red/White,2014,2,4


# Using One-Hot Encoding on our categorical features:

In [33]:
#Used OneHotEncoding on Non-Numerical Features

list_nonNumerical = ['AnimalType','Breed','Color','SexuponOutcome']

for column in list_nonNumerical:
    dummies = pd.get_dummies(animal_df[column])
    animal_df[dummies.columns] = dummies
    
oneHotEncoding_features = animal_df

# (26729, 31081) --> Without Ignoring Rows with NaNs (After OneHotEncoding)

# (6979, 10211) --> Ignoring Rows with NaNs (After OneHotEncoding)

# (26729, 1755) --> Ignoring Columns with NaNs (After OneHotEncoding)


# #get first 5 rows of new table

oneHotEncoding_features.head()

# getnewFeatures_X.shape


Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day,Dog,...,Yellow Brindle/Tan,Yellow Brindle/White,Yellow/Black,Yellow/White,Yellow/Yellow,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown
0,0.0,Dog,Neutered Male,52.0,Shetland Sheepdog Mix,Brown/White,2014,2,12,1,...,0,0,0,0,0,0,0,1,0,0
2,2.0,Dog,Neutered Male,104.0,Pit Bull Mix,Blue/White,2015,1,31,1,...,0,0,0,0,0,0,0,1,0,0
4,3.0,Dog,Neutered Male,104.0,Lhasa Apso/Miniature Poodle,Tan,2013,11,15,1,...,0,0,0,0,0,0,0,1,0,0
5,3.0,Dog,Intact Female,4.0,Cairn Terrier/Chihuahua Shorthair,Black/Tan,2014,4,25,1,...,0,0,0,0,0,1,0,0,0,0
8,2.0,Dog,Spayed Female,20.0,American Pit Bull Terrier Mix,Red/White,2014,2,4,1,...,0,0,0,0,0,0,0,0,1,0


In [34]:
# Checking the NaN values after doing OneHotEncoding

print(oneHotEncoding_features.isnull().sum())

OutcomeType                              0
AnimalType                               0
SexuponOutcome                           0
AgeuponOutcome                          76
Breed                                    0
Color                                    0
Year                                     0
Month                                    0
Day                                      0
Dog                                      0
Affenpinscher Mix                        0
Afghan Hound Mix                         0
Airedale Terrier                         0
Airedale Terrier Mix                     0
Airedale Terrier/Labrador Retriever      0
Airedale Terrier/Miniature Schnauzer     0
Akita                                    0
Akita Mix                                0
Akita/Australian Cattle Dog              0
Akita/Chow Chow                          0
Akita/German Shepherd                    0
Akita/Labrador Retriever                 0
Akita/Pit Bull                           0
Akita/Siber

In [42]:
# Remove Remaining NaN values for our label column:

oneHotEncoding_features.dropna(inplace = True)


print(oneHotEncoding_features.isnull().sum())

# oneHotEncoding_features.shape

# (26137, 1761) --> size of Data after oneHotEncoding and ignoring remaining NaNs in our data
# (15467, 1596) --> size of Data after doing oneHotEncoding only Dog Data (also includes removing NaNs where necessary)

OutcomeType                             0
AnimalType                              0
SexuponOutcome                          0
AgeuponOutcome                          0
Breed                                   0
Color                                   0
Year                                    0
Month                                   0
Day                                     0
Dog                                     0
Affenpinscher Mix                       0
Afghan Hound Mix                        0
Airedale Terrier                        0
Airedale Terrier Mix                    0
Airedale Terrier/Labrador Retriever     0
Airedale Terrier/Miniature Schnauzer    0
Akita                                   0
Akita Mix                               0
Akita/Australian Cattle Dog             0
Akita/Chow Chow                         0
Akita/German Shepherd                   0
Akita/Labrador Retriever                0
Akita/Pit Bull                          0
Akita/Siberian Husky              

# Building our Label Vector:

In [43]:
y = oneHotEncoding_features['OutcomeType']

y.head()

0    0.0
2    2.0
4    3.0
5    3.0
8    2.0
Name: OutcomeType, dtype: float32

# After doing OneHotEncoding, now will drop non-numerical columns and keep Number Columns. This will allow us to build our Feature Matrix:

In [44]:
# Dropping Non-numerical columns and our label column:
oneHotEncoding_features.drop(['SexuponOutcome','OutcomeType','AnimalType','Breed','Color'], axis=1, inplace=True)

X = oneHotEncoding_features

X.head()

# print(getnewFeatures_X.isnull().sum())

# getnewFeatures_X.shape

Unnamed: 0,AgeuponOutcome,Year,Month,Day,Dog,Affenpinscher Mix,Afghan Hound Mix,Airedale Terrier,Airedale Terrier Mix,Airedale Terrier/Labrador Retriever,...,Yellow Brindle/Tan,Yellow Brindle/White,Yellow/Black,Yellow/White,Yellow/Yellow,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown
0,52.0,2014,2,12,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,104.0,2015,1,31,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,104.0,2013,11,15,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,4.0,2014,4,25,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,20.0,2014,2,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Splitting our data into testing and training: 

In [45]:
# Splitting the dataset into testing and training:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

print("X_Train size: " + str(X_train.shape))
print("y_train size: " + str(y_train.shape))
print("X_Test size: " + str(X_test.shape))
print("y_test size: " + str(y_test.shape))

X_Train size: (12373, 1591)
y_train size: (12373,)
X_Test size: (3094, 1591)
y_test size: (3094,)


# Using Random Forest:

In [46]:
#Instantiated Random Forest Classifier
my_RandomForest = RandomForestClassifier(n_estimators = 19, bootstrap = True, random_state=2)

In [47]:
# Training on the training set:
my_RandomForest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=19, n_jobs=1,
            oob_score=False, random_state=2, verbose=0, warm_start=False)

In [48]:
# Testing on the testing set:
y_predict_rf = my_RandomForest.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_predict_rf)

print("Random Forest accuracy: " + str(rf_accuracy))

Random Forest accuracy: 0.52617970265
