### Importing required libraries 

In [1]:
import pandas as pd 
import numpy as np

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier



In [2]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [3]:
# these tables we decided would not have any outcome to our project
data.drop(['AnimalID'], axis=1, inplace=True)

### Turning values into Numerical Values

In [4]:
# cats and dogs to 0 and 1
data['AnimalType'] = data['AnimalType'].map({'Cat': 0, 'Dog': 1})

# different approach since there is more than two values
outcome_unique = data['OutcomeType'].unique()
outcome_numeric = dict(zip(outcome_unique, range(len(outcome_unique))))
data['OutcomeType'] = data['OutcomeType'].replace(outcome_numeric)

subtype_unique = data['OutcomeSubtype'].unique()
subtype_numeric = dict(zip(subtype_unique, range(len(subtype_unique))))
data['OutcomeSubtype'] = data['OutcomeSubtype'].replace(subtype_numeric)

# see the data
data.head()

Unnamed: 0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,0,0,1,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,1,1,0,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,Pearce,2015-01-31 12:28:00,2,2,1,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,,2014-07-11 19:09:00,3,3,0,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,,2013-11-15 12:52:00,3,3,1,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


### OneHotEncoding on SexuponOutcome, Breed, and Color

In [5]:
to_convert = ['SexuponOutcome', 'Breed', 'Color']

for col in to_convert:
    dummies = pd.get_dummies(data[col])
    data[dummies.columns] = dummies
    
data.head()

Unnamed: 0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Intact Female,...,White/Yellow,White/Yellow Brindle,Yellow,Yellow Brindle,Yellow Brindle/Blue,Yellow Brindle/Tan,Yellow Brindle/White,Yellow/Black,Yellow/White,Yellow/Yellow
0,Hambone,2014-02-12 18:22:00,0,0,1,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,...,0,0,0,0,0,0,0,0,0,0
1,Emily,2013-10-13 12:44:00,1,1,0,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,0,...,0,0,0,0,0,0,0,0,0,0
2,Pearce,2015-01-31 12:28:00,2,2,1,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,...,0,0,0,0,0,0,0,0,0,0
3,,2014-07-11 19:09:00,3,3,0,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,0,...,0,0,0,0,0,0,0,0,0,0
4,,2013-11-15 12:52:00,3,3,1,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,0,...,0,0,0,0,0,0,0,0,0,0


### Extracting the Date

In [6]:
datetime = pd.to_datetime(data['DateTime']).dt
data['Year'] = datetime.year
data['Month'] = datetime.month
data['Day'] = datetime.day
data.drop(['DateTime'], axis=1, inplace=True)
data.head()

Unnamed: 0,Name,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Intact Female,Intact Male,...,Yellow Brindle,Yellow Brindle/Blue,Yellow Brindle/Tan,Yellow Brindle/White,Yellow/Black,Yellow/White,Yellow/Yellow,Year,Month,Day
0,Hambone,0,0,1,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,0,...,0,0,0,0,0,0,0,2014,2,12
1,Emily,1,1,0,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,0,0,...,0,0,0,0,0,0,0,2013,10,13
2,Pearce,2,2,1,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0,...,0,0,0,0,0,0,0,2015,1,31
3,,3,3,0,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,0,1,...,0,0,0,0,0,0,0,2014,7,11
4,,3,3,1,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,0,0,...,0,0,0,0,0,0,0,2013,11,15


### Filling the Name Values With Integers

In [7]:
# fill in the NaN with 0s
data['Name'] = data['Name'].fillna(0)
data.head()

# change names to numeric values
for names in data['Name']:
    if isinstance(names, str) and not 0:
        data['Name'].replace(names, '1', inplace=True)
data['Name'] = data['Name'].astype(int)
data.head()

Unnamed: 0,Name,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Intact Female,Intact Male,...,Yellow Brindle,Yellow Brindle/Blue,Yellow Brindle/Tan,Yellow Brindle/White,Yellow/Black,Yellow/White,Yellow/Yellow,Year,Month,Day
0,1,0,0,1,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,0,0,...,0,0,0,0,0,0,0,2014,2,12
1,1,1,1,0,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,0,0,...,0,0,0,0,0,0,0,2013,10,13
2,1,2,2,1,Neutered Male,2 years,Pit Bull Mix,Blue/White,0,0,...,0,0,0,0,0,0,0,2015,1,31
3,0,3,3,0,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,0,1,...,0,0,0,0,0,0,0,2014,7,11
4,0,3,3,1,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,0,0,...,0,0,0,0,0,0,0,2013,11,15


### Turning Time Into Days

In [8]:
def map_values(x):
    try:
        age_split = x.split()
    except:
        return None
    if 'year' in age_split[1]:
        return float(age_split[0]) * 365
    elif 'month' in age_split[1]:
        return float(age_split[0]) * (365/12)
    elif 'week' in age_split[1]:
        return float(age_split[0]) * 7
    elif 'day' in age_split[1]: 
        return float(age_split[0])

data['DayAge'] = data['AgeuponOutcome'].map(map_values)
data['DayAge'] = data['DayAge'].fillna(data['DayAge'].mean())
data.drop(['AgeuponOutcome'], axis=1, inplace=True)

data.head()

Unnamed: 0,Name,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,Breed,Color,Intact Female,Intact Male,Neutered Male,...,Yellow Brindle/Blue,Yellow Brindle/Tan,Yellow Brindle/White,Yellow/Black,Yellow/White,Yellow/Yellow,Year,Month,Day,DayAge
0,1,0,0,1,Neutered Male,Shetland Sheepdog Mix,Brown/White,0,0,1,...,0,0,0,0,0,0,2014,2,12,365.0
1,1,1,1,0,Spayed Female,Domestic Shorthair Mix,Cream Tabby,0,0,0,...,0,0,0,0,0,0,2013,10,13,365.0
2,1,2,2,1,Neutered Male,Pit Bull Mix,Blue/White,0,0,1,...,0,0,0,0,0,0,2015,1,31,730.0
3,0,3,3,0,Intact Male,Domestic Shorthair Mix,Blue Cream,0,1,0,...,0,0,0,0,0,0,2014,7,11,21.0
4,0,3,3,1,Neutered Male,Lhasa Apso/Miniature Poodle,Tan,0,0,1,...,0,0,0,0,0,0,2013,11,15,730.0


### Dropping the Columns used for OHE

In [9]:
data.drop(['SexuponOutcome', 'Breed', 'Color'], axis=1, inplace=True)
data.head()

Unnamed: 0,Name,OutcomeType,OutcomeSubtype,AnimalType,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown,Abyssinian Mix,...,Yellow Brindle/Blue,Yellow Brindle/Tan,Yellow Brindle/White,Yellow/Black,Yellow/White,Yellow/Yellow,Year,Month,Day,DayAge
0,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,2014,2,12,365.0
1,1,1,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,2013,10,13,365.0
2,1,2,2,1,0,0,1,0,0,0,...,0,0,0,0,0,0,2015,1,31,730.0
3,0,3,3,0,0,1,0,0,0,0,...,0,0,0,0,0,0,2014,7,11,21.0
4,0,3,3,1,0,0,1,0,0,0,...,0,0,0,0,0,0,2013,11,15,730.0


### Random Forest

In [10]:
# finally converting them back into integers
data['DayAge'] = data['DayAge'].astype(int)

features = [i for i in list(data) if i not in ['OutcomeType', 'OutcomeSubtype']]

X = data[features]
y = data['OutcomeType']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

random_forest = RandomForestClassifier(n_estimators=19, bootstrap=True, random_state=2, n_jobs=-1)
random_forest.fit(X_train, y_train)

y_predict = random_forest.predict(X_test)
score_rf = accuracy_score(y_test, y_predict)

print('Random Forest Accuracy: {:.7f}'.format(score_rf))

Random Forest Accuracy: 0.6231450


### Quantifying Accuracy of Random Forest
The closer our log loss is to 0, the better the classifier

In [11]:
rf_proba = random_forest.predict_proba(X_test)
log_loss = log_loss(y_test, rf_proba)

log_loss

2.0608413342378586

### Outcomes

In [12]:
print(outcome_unique)
print(y_predict)

['Return_to_owner' 'Euthanasia' 'Adoption' 'Transfer' 'Died']
[2 2 3 ..., 0 0 2]


In [13]:
predict_frame = pd.DataFrame(y_predict)
outcome_frame = {'Return_to_owner': 0, 'Euthanasia': 1, 'Adoption': 2, 'Transfer': 3,  'Died': 4}
outcome_predict = pd.DataFrame(outcome_unique[y_predict])
outcome_test = pd.DataFrame(outcome_unique[y_test])
outcome_predict.count()

0    8019
dtype: int64

In [14]:
outcome_predict.head()

Unnamed: 0,0
0,Adoption
1,Adoption
2,Transfer
3,Adoption
4,Transfer


### Confusion Matrix

In [15]:
pd.crosstab(outcome_test[0], outcome_predict[0] , rownames=['Actual Outcome'], colnames=['Predicted Outcome'])

Predicted Outcome,Adoption,Died,Euthanasia,Return_to_owner,Transfer
Actual Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adoption,2576,0,12,389,268
Died,6,2,1,2,51
Euthanasia,87,1,43,91,270
Return_to_owner,621,0,31,569,194
Transfer,702,4,44,248,1807


### AdaBoost

In [16]:
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train, y_train)
ada_predict = ada.predict(X_test)
ada_score = accuracy_score(y_test, ada_predict)
ada_score

0.6166604314752463

### Decision Tree

In [17]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_predict = dt.predict(X_test)
dt_score = accuracy_score(y_test, dt_predict)
dt_score

0.56727771542586358

### KNN

In [18]:
for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn.fit(X_train, y_train)
    knn_predict = knn.predict(X_test)
    knn_score = accuracy_score(y_test, knn_predict)
    print('KNN Accuracy; K = {}: {:.7f}'.format(k, knn_score))

KNN Accuracy; K = 1: 0.5447063
KNN Accuracy; K = 2: 0.5225090
KNN Accuracy; K = 3: 0.5479486
KNN Accuracy; K = 4: 0.5504427
KNN Accuracy; K = 5: 0.5566779
KNN Accuracy; K = 6: 0.5555556
KNN Accuracy; K = 7: 0.5526874
KNN Accuracy; K = 8: 0.5533109
KNN Accuracy; K = 9: 0.5515650
KNN Accuracy; K = 10: 0.5524380


From the first 10 K values, we can see that 5 and 6 seems to be the best K value.

### PCA

In [19]:
n_components = 50
pca = PCA(n_components)

X_train_new = pca.fit_transform(X_train)
X_test_new = pca.transform(X_test)

### Random Forest after PCA

In [20]:
random_forest_pca = RandomForestClassifier(n_estimators=19, bootstrap=True, random_state=2, n_jobs=-1)
random_forest_pca.fit(X_train_new, y_train)

y_predict_pca = random_forest_pca.predict(X_test_new)
score_rf_pca = accuracy_score(y_test, y_predict_pca)

print('Random Forest Accuracy after PCA: {:.7f}'.format(score_rf_pca))

Random Forest Accuracy after PCA: 0.6181569


### AdaBoost after PCA

In [21]:
ada_pca = AdaBoostClassifier(n_estimators=100)
ada_pca.fit(X_train_new, y_train)
ada_predict_pca = ada_pca.predict(X_test_new)
ada_score_pca = accuracy_score(y_test, ada_predict_pca)
ada_score_pca

0.58473625140291807

### Decision Tree after PCA

In [22]:
dt_pca = DecisionTreeClassifier()
dt_pca.fit(X_train, y_train)
dt_predict_pca = dt_pca.predict(X_test)
dt_score_pca = accuracy_score(y_test, dt_predict_pca)
dt_score_pca

0.56914827285197656

### KNN after PCA

In [28]:
for k in range(1, 11):
    knn_pca = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    knn_pca.fit(X_train_new, y_train)
    knn_predict_pca = knn_pca.predict(X_test_new)
    knn_score_pca = accuracy_score(y_test, knn_predict_pca)
    print('KNN Accuracy with PCA; K = {}: {:.7f}'.format(k, knn_score_pca))

KNN Accuracy with PCA; K = 1: 0.5409652
KNN Accuracy with PCA; K = 2: 0.5188926
KNN Accuracy with PCA; K = 3: 0.5455792
KNN Accuracy with PCA; K = 4: 0.5546826
KNN Accuracy with PCA; K = 5: 0.5528121
KNN Accuracy with PCA; K = 6: 0.5484474
KNN Accuracy with PCA; K = 7: 0.5546826
KNN Accuracy with PCA; K = 8: 0.5505674
KNN Accuracy with PCA; K = 9: 0.5515650
KNN Accuracy with PCA; K = 10: 0.5563038


Comparing the score values between using PCA and not using PCA, we can see that there were not any positive accuracy changes.

### Testing OutcomeSubtype as a feature

In [24]:
features_subtype = [i for i in list(data) if i not in 'OutcomeType']

X_subtype = data[features_subtype]
y_subtype = data['OutcomeType']

X_train_subtype, X_test_subtype, y_train_subtype, y_test_subtype = train_test_split(X_subtype, y_subtype, test_size=0.3, random_state=2)

random_forest_subtype = RandomForestClassifier(n_estimators=19, bootstrap=True, random_state=2, n_jobs=-1)
random_forest_subtype.fit(X_train_subtype, y_train_subtype)

y_predict_subtype = random_forest_subtype.predict(X_test_subtype)
score_rf_subtype = accuracy_score(y_test_subtype, y_predict_subtype)

print('Random Forest Accuracy: {:.7f}'.format(score_rf_subtype))

Random Forest Accuracy: 0.8385085


### PCA with OutcomeSubtype

In [25]:
pca_new = PCA(n_components)
X_train_subtype_new = pca_new.fit_transform(X_train_subtype)
X_test_subtype_new = pca_new.transform(X_test_subtype)

In [27]:
random_forest_subtype_pca = RandomForestClassifier(n_estimators=19, bootstrap=True, random_state=2, n_jobs=-1)
random_forest_subtype_pca.fit(X_train_subtype_new, y_train_subtype)

y_predict_subtype_pca = random_forest_subtype_pca.predict(X_test_subtype_new)
score_rf_subtype_pca = accuracy_score(y_test_subtype, y_predict_subtype_pca)

print('Random Forest Accuracy: {:.7f}'.format(score_rf_subtype_pca))

Random Forest Accuracy: 0.8418755
