### Importing required libraries 

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier




In [2]:
# Read the training set from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='train.csv')
data.head(5)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [3]:
# AnimalID and OutcomeSubtype are columns we decided have no bearing on this
# project, so remove them from the frame.
data.drop(columns=['AnimalID', 'OutcomeSubtype'], inplace=True)

### Convert all columns to numeric values

In [4]:
# Encode the categorical columns as integer codes.
# AnimalType only has the two values listed here, so a literal mapping is enough.
animal_codes = {'Cat': 0, 'Dog': 1}
data['AnimalType'] = data['AnimalType'].map(animal_codes)

# OutcomeType and SexuponOutcome have more than two values: number each
# distinct value in order of first appearance in the column.
outcome_unique = data['OutcomeType'].unique()
outcome_numeric = {label: code for code, label in enumerate(outcome_unique)}
data['OutcomeType'] = data['OutcomeType'].replace(outcome_numeric)

sex_unique = data['SexuponOutcome'].unique()
sex_numeric = {label: code for code, label in enumerate(sex_unique)}
data['SexuponOutcome'] = data['SexuponOutcome'].replace(sex_numeric)

data.head()

Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,0,1,0,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,1,0,1,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,2,1,0,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,3,0,2,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,3,1,0,2 years,Lhasa Apso/Miniature Poodle,Tan


In [5]:
# Reduce Breed to a binary flag: 1 when the breed text mentions "mix"
# (in any letter case), 0 otherwise.
is_mixed_breed = data['Breed'].str.contains('mix', case=False)
data['Breed'] = is_mixed_breed.astype(int)
data.head()

Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,0,1,0,1 year,1,Brown/White
1,Emily,2013-10-13 12:44:00,1,0,1,1 year,1,Cream Tabby
2,Pearce,2015-01-31 12:28:00,2,1,0,2 years,1,Blue/White
3,,2014-07-11 19:09:00,3,0,2,3 weeks,1,Blue Cream
4,,2013-11-15 12:52:00,3,1,0,2 years,0,Tan


In [6]:
# Break the outcome timestamp into separate Year / Month / Day columns.
# Note: `datetime` is bound to the pandas .dt accessor here, not the stdlib module.
datetime = pd.to_datetime(data['DateTime']).dt
for part in ('year', 'month', 'day'):
    data[part.capitalize()] = getattr(datetime, part)
# The raw timestamp column is dropped once its parts have been extracted.
data.drop(columns=['DateTime'], inplace=True)
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day
0,Hambone,0,1,0,1 year,1,Brown/White,2014,2,12
1,Emily,1,0,1,1 year,1,Cream Tabby,2013,10,13
2,Pearce,2,1,0,2 years,1,Blue/White,2015,1,31
3,,3,0,2,3 weeks,1,Blue Cream,2014,7,11
4,,3,1,0,2 years,0,Tan,2013,11,15


In [7]:
# Encode Name as a binary "has a name" flag: 1 when the animal has a name,
# 0 when the name is missing.
#
# This is a single vectorized expression instead of a per-row loop: the loop
# called Series.replace on the whole column once for every row, depended on
# chained in-place mutation of data['Name'], and its `and not 0` test was
# always true.
#
# Checking `!= 0` in addition to "not missing" keeps the cell safe to re-run
# after the column has already been converted to 0/1.
has_name = data['Name'].notna() & data['Name'].ne(0)
data['Name'] = has_name.astype(int)
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Year,Month,Day
0,1,0,1,0,1 year,1,Brown/White,2014,2,12
1,1,1,0,1,1 year,1,Cream Tabby,2013,10,13
2,1,2,1,0,2 years,1,Blue/White,2015,1,31
3,0,3,0,2,3 weeks,1,Blue Cream,2014,7,11
4,0,3,1,0,2 years,0,Tan,2013,11,15


In [9]:
# turn time into days 
def map_values(x):
    """Convert an age description such as '2 years' or '3 weeks' into days.

    Parameters
    ----------
    x : object
        Expected to be a string of the form '<number> <unit>', where the unit
        contains 'year', 'month', 'week' or 'day'. Anything else (for example
        a missing value) is treated as unparseable.

    Returns
    -------
    float or None
        The age in days, or None when `x` is not a string, does not have at
        least two whitespace-separated tokens, has a non-numeric amount, or
        uses an unrecognized unit.
    """
    # Missing values arrive as non-strings (e.g. NaN); report them as unknown
    # instead of relying on a bare `except` around `.split()`.
    if not isinstance(x, str):
        return None

    tokens = x.split()
    if len(tokens) < 2:
        return None

    try:
        amount = float(tokens[0])
    except ValueError:
        return None

    unit = tokens[1]
    # Checked in this order; a year is 365 days and a month is 365/12 days.
    days_per_unit = (
        ('year', 365),
        ('month', 365 / 12),
        ('week', 7),
        ('day', 1),
    )
    for keyword, days in days_per_unit:
        if keyword in unit:
            return amount * days
    return None

# Convert every age description to days; values map_values cannot convert
# come back missing and are filled with the mean of the converted ages.
age_in_days = data['AgeuponOutcome'].map(map_values)
data['DayAge'] = age_in_days.fillna(age_in_days.mean())
# The text column is dropped now that DayAge replaces it.
data.drop(columns=['AgeuponOutcome'], inplace=True)

data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,Breed,Color,Year,Month,Day,DayAge
0,1,0,1,0,1,Brown/White,2014,2,12,365.0
1,1,1,0,1,1,Cream Tabby,2013,10,13,365.0
2,1,2,1,0,1,Blue/White,2015,1,31,730.0
3,0,3,0,2,1,Blue Cream,2014,7,11,21.0
4,0,3,1,0,0,Tan,2013,11,15,730.0


In [10]:
# handling colors
# To make it less complicated, keep only the first color of each entry:
# everything before the first '/'.
# The .str accessor does this in one pass over the column (instead of calling
# Series.replace on the whole column once per row) and leaves missing values
# as missing rather than raising.
data['Color'] = data['Color'].str.split('/').str[0]
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,Breed,Color,Year,Month,Day,DayAge
0,1,0,1,0,1,Brown,2014,2,12,365.0
1,1,1,0,1,1,Cream Tabby,2013,10,13,365.0
2,1,2,1,0,1,Blue,2015,1,31,730.0
3,0,3,0,2,1,Blue Cream,2014,7,11,21.0
4,0,3,1,0,0,Tan,2013,11,15,730.0


In [11]:
# Split the color once more, this time on whitespace, and keep only the
# first word (e.g. 'Cream Tabby' -> 'Cream').
# Done with the .str accessor in one pass over the column instead of calling
# Series.replace on the whole column once per row; missing or empty values
# become missing rather than raising.
data['Color'] = data['Color'].str.split().str[0]
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,Breed,Color,Year,Month,Day,DayAge
0,1,0,1,0,1,Brown,2014,2,12,365.0
1,1,1,0,1,1,Cream,2013,10,13,365.0
2,1,2,1,0,1,Blue,2015,1,31,730.0
3,0,3,0,2,1,Blue,2014,7,11,21.0
4,0,3,1,0,0,Tan,2013,11,15,730.0


In [12]:
# Encode each remaining color name as an integer, numbered in order of
# first appearance in the column.
color_unique = data['Color'].unique()
color_numeric = {name: code for code, name in enumerate(color_unique)}
data['Color'] = data['Color'].replace(color_numeric)
data.head()

Unnamed: 0,Name,OutcomeType,AnimalType,SexuponOutcome,Breed,Color,Year,Month,Day,DayAge
0,1,0,1,0,1,0,2014,2,12,365.0
1,1,1,0,1,1,1,2013,10,13,365.0
2,1,2,1,0,1,2,2015,1,31,730.0
3,0,3,0,2,1,2,2014,7,11,21.0
4,0,3,1,0,0,3,2013,11,15,730.0


### Random Forest

In [13]:
# DayAge can hold fractional values (month-based ages and the mean used to
# fill gaps), so this cast truncates them to whole days.
data['DayAge'] = data['DayAge'].astype(int)

# Every column except the target is a feature. Compare names with `!=`:
# `name not in 'OutcomeType'` is a substring test on the string and would also
# exclude any column whose name happens to be a substring of 'OutcomeType'.
features = [column for column in data.columns if column != 'OutcomeType']

X = data[features]
y = data['OutcomeType']

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

random_forest = RandomForestClassifier(n_estimators=19, bootstrap=True, random_state=2, n_jobs=-1)
random_forest.fit(X_train, y_train)

# Accuracy of the predicted outcome codes on the held-out rows.
y_predict = random_forest.predict(X_test)
score_rf = accuracy_score(y_test, y_predict)

print('OutcomeType: {:.7f}'.format(score_rf))

OutcomeType: 0.6200274


### Outcomes

In [14]:
# Show the label order behind the numeric outcome codes, the predicted codes
# for the test set, and the importance the forest assigned to each feature.
print(outcome_unique)
print(y_predict)
importance_by_feature = list(zip(features, random_forest.feature_importances_))
importance_by_feature

['Return_to_owner' 'Euthanasia' 'Adoption' 'Transfer' 'Died']
[2 2 0 ... 0 0 2]


[('Name', 0.049809725045300866),
 ('AnimalType', 0.031290370235255695),
 ('SexuponOutcome', 0.15192102243954064),
 ('Breed', 0.01818083348040685),
 ('Color', 0.1215103588847541),
 ('Year', 0.052336868103047746),
 ('Month', 0.13313536642269141),
 ('Day', 0.22192351790362627),
 ('DayAge', 0.21989193748537633)]

In [75]:
predict_frame = pd.DataFrame(y_predict)
outcome_frame = {'Return_to_owner': 0, 'Euthanasia': 1, 'Adoption': 2, 'Transfer': 3,  'Died': 4}
# Translate the numeric codes back to their labels: outcome_unique[k] is the
# label that was encoded as k.
outcome_predict = pd.DataFrame(outcome_unique[y_predict])
outcome_test = pd.DataFrame(outcome_unique[y_test])
# `outcome` was not defined anywhere in the notebook, so this cell failed on a
# fresh kernel. Bind it to the predicted labels — presumably what it held
# when the cell was last run, since the recorded output starts with the same
# labels as the first predictions.
outcome = outcome_predict
outcome.count()

0    8019
dtype: int64

In [76]:
# Preview the predicted outcome labels. Uses outcome_predict, which the
# notebook defines, instead of the undefined name `outcome`.
outcome_predict.head()

Unnamed: 0,0
0,Adoption
1,Adoption
2,Return_to_owner
3,Adoption
4,Transfer


In [79]:
# Confusion matrix: actual outcome labels (rows) against predicted ones (columns).
actual_labels = outcome_test[0]
predicted_labels = outcome_predict[0]
pd.crosstab(
    actual_labels,
    predicted_labels,
    rownames=['Actual Outcome'],
    colnames=['Predicted Outcome'],
)

Predicted Outcome,Adoption,Died,Euthanasia,Return_to_owner,Transfer
Actual Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adoption,2426,2,15,434,368
Died,8,2,12,3,37
Euthanasia,76,2,101,94,219
Return_to_owner,534,1,27,626,227
Transfer,608,8,86,286,1817


In [58]:
# Count how many test-set rows fall into each outcome class.
# Codes follow the order of outcome_unique:
#   0 = Return_to_owner, 1 = Euthanasia, 2 = Adoption, 3 = Transfer,
#   and every other code is counted as Died.
# value_counts compares by value; the earlier `i is 0` checks compared object
# identity, which only matches small integers by an implementation detail.
code_counts = y_test.value_counts()

owner_count = int(code_counts.get(0, 0))
euthanasia_count = int(code_counts.get(1, 0))
adoption_count = int(code_counts.get(2, 0))
transfer_count = int(code_counts.get(3, 0))
# Catch-all: whatever is not one of the four codes above.
died_count = len(y_test) - (owner_count + euthanasia_count + adoption_count + transfer_count)

print(owner_count, euthanasia_count, adoption_count, transfer_count, died_count)
owner_count + euthanasia_count + adoption_count + transfer_count + died_count

1415 492 3245 2805 62


8019