In [218]:
import pandas as pd
'''import seaborn as sns
import numpy as np'''
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, Imputer, OneHotEncoder


def get_sex(x):
    x = str(x)
    if x.find('Male') >= 0: return 'male'
    if x.find('Female') >= 0: return 'female'
    return 'unknown'

def get_neutered(x):
    x = str(x)
    if x.find('Spayed') >= 0: return 'neutered'
    if x.find('Neutered') >= 0: return 'neutered'
    if x.find('Intact') >= 0: return 'intact'
    return 'unknown'


def get_mix(x):
    x = str(x)
    if x.find('Mix') >=0 :
        return 'mix'
    return 'not'

def calc_age_in_years(x):
    x = str(x)
    if x == 'nan': return 0
    age = int(x.split()[0]) # split with space, get number
    if x.find('year')> -1: return age
    if x.find('month')> -1: return age/12
    if x.find('week') > -1 : return age/52
    if x.find('day') > -1 : return age/365
    else : return 0

def prepareData(train):
    train['Sex'] = train.SexuponOutcome.apply(get_sex)
    train['Neutered'] = train.SexuponOutcome.apply(get_neutered)
    train['Mix'] = train.Breed.apply(get_mix)
    train['AgeuponOutcome'] = train.AgeuponOutcome.apply(calc_age_in_years)
    return train


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


train = prepareData(train)
test = prepareData(test)

y = train.OutcomeType
train.drop(["Name", "OutcomeType", "OutcomeSubtype","SexuponOutcome","DateTime"], axis=1, inplace=True)
test.drop(["Name", "SexuponOutcome","DateTime"], axis=1, inplace=True)


train.head()


Unnamed: 0,AnimalID,AnimalType,AgeuponOutcome,Breed,Color,Sex,Neutered,Mix
0,A671945,Dog,1.0,Shetland Sheepdog Mix,Brown/White,male,neutered,mix
1,A656520,Cat,1.0,Domestic Shorthair Mix,Cream Tabby,female,neutered,mix
2,A686464,Dog,2.0,Pit Bull Mix,Blue/White,male,neutered,mix
3,A683430,Cat,0.057692,Domestic Shorthair Mix,Blue Cream,male,intact,mix
4,A667013,Dog,2.0,Lhasa Apso/Miniature Poodle,Tan,male,neutered,not


In [219]:
test.head()

Unnamed: 0,ID,AnimalType,AgeuponOutcome,Breed,Color,Sex,Neutered,Mix
0,1,Dog,0.833333,Labrador Retriever Mix,Red/White,female,intact,mix
1,2,Dog,2.0,German Shepherd/Siberian Husky,Black/Tan,female,neutered,not
2,3,Cat,1.0,Domestic Shorthair Mix,Brown Tabby,male,neutered,mix
3,4,Dog,0.333333,Collie Smooth Mix,Tricolor,male,intact,mix
4,5,Dog,2.0,Miniature Poodle Mix,White,male,neutered,mix


In [220]:
def parse_breed(data):
    breed_split = [s.split('/') for s in data.Breed]
    bag_of_breeds = []
    for l in breed_split:
        primary_secondary_breed = []
        for breed in l:
            if "Mix" in breed:
                breed = ' '.join(breed.split()[:-1])
            primary_secondary_breed.append(breed)
        if len(primary_secondary_breed) == 1:
            if "Mix" in l[0]:
                primary_secondary_breed.append("Unknown")
            else:
                primary_secondary_breed.append("None")
        if len(primary_secondary_breed) > 2: # handles weird edge case "Plott Hound/Black/Tan Hound"
            primary_secondary_breed = [primary_secondary_breed[0], "Unknown"]
        bag_of_breeds.append(primary_secondary_breed)
    return bag_of_breeds


train_breeds = parse_breed(train)
test_breeds = parse_breed(test)
train['PrimaryBreed'] = [b[0] for b in train_breeds]
train['SecondaryBreed'] = [b[1] for b in train_breeds]
test['PrimaryBreed'] = [b[0] for b in test_breeds]
test['SecondaryBreed'] = [b[1] for b in test_breeds]

train = train.drop("Breed", axis=1)
test = test.drop("Breed", axis=1)


In [221]:
train.shape

(26729, 9)

In [222]:
test.shape

(11456, 9)

In [223]:
# add train and test after each other. Do the fits, separate
combo = train.append(test)
combo.shape

(38185, 10)

In [224]:
# do the fits
for var in ['AnimalType', 'Sex', 'Neutered','Mix']:
    bin_fit = LabelBinarizer().fit(combo[var])
    train[var] = bin_fit.transform(train[var])
    test[var] = bin_fit.transform(test[var])

train.head()


Unnamed: 0,AnimalID,AnimalType,AgeuponOutcome,Color,Sex,Neutered,Mix,PrimaryBreed,SecondaryBreed
0,A671945,1,1.0,Brown/White,0,0,0,Shetland Sheepdog,Unknown
1,A656520,0,1.0,Cream Tabby,1,0,0,Domestic Shorthair,Unknown
2,A686464,1,2.0,Blue/White,0,0,0,Pit Bull,Unknown
3,A683430,0,0.057692,Blue Cream,0,1,0,Domestic Shorthair,Unknown
4,A667013,1,2.0,Tan,0,0,1,Lhasa Apso,Miniature Poodle


In [225]:
for var in ['PrimaryBreed','SecondaryBreed','Color']:
    label_fit = LabelEncoder().fit(combo[var])
    train[var] = label_fit.transform(train[var])
    test[var] = label_fit.transform(test[var])
    
    
train.head()

Unnamed: 0,AnimalID,AnimalType,AgeuponOutcome,Color,Sex,Neutered,Mix,PrimaryBreed,SecondaryBreed
0,A671945,1,1.0,146,0,0,0,191,150
1,A656520,0,1.0,184,1,0,0,85,150
2,A686464,1,2.0,97,0,0,0,168,150
3,A683430,0,0.057692,47,0,1,0,85,150
4,A667013,1,2.0,311,0,0,1,134,99


In [226]:
train.columns

Index(['AnimalID', 'AnimalType', 'AgeuponOutcome', 'Color', 'Sex', 'Neutered',
       'Mix', 'PrimaryBreed', 'SecondaryBreed'],
      dtype='object')

In [227]:
train.columns[0]

'AnimalID'

In [228]:
train.columns[1:]

Index(['AnimalType', 'AgeuponOutcome', 'Color', 'Sex', 'Neutered', 'Mix',
       'PrimaryBreed', 'SecondaryBreed'],
      dtype='object')

In [231]:
'''clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=25)
clf.fit(train[train.columns[1:]], y)
clf.score(train[train.columns[1:]], y)
'''
# here is where the model training happens
# slow possibly comment it out while trying stuff!#

clf = GridSearchCV(DecisionTreeClassifier(), param_grid={'max_depth':[4,5,6,7,8,9,10,12],
                                                        'min_samples_split':[2, 5, 10, 15, 20, 25],
                                                        'min_samples_leaf':[1, 2, 5, 8, 12, 15, 20]},
                  scoring='log_loss',
                  n_jobs=-1)
clf.fit(train[train.columns[1:]], y)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [4, 5, 6, 7, 8, 9, 10, 12], 'min_samples_leaf': [1, 2, 5, 8, 12, 15, 20], 'min_samples_split': [2, 5, 10, 15, 20, 25]},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [232]:
class_probabilites = clf.predict_proba(test[test.columns[1:]])
''' this gets 0.78401 on the submission leader board'''

In [233]:
submission = 'ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer\n'
for i in range(len(test.ID)):
    submission += str(test.ID[i]) + ',' + ','.join([str(j) for j in class_probabilites[i]]) + '\n'
f = open("submissionGridSearch.csv", "w")
f.write(submission)
f.close()