In [356]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, Imputer, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

def get_sex(x):
    x = str(x)
    if x.find('Male') >= 0: return 'male'
    if x.find('Female') >= 0: return 'female'
    return 'unknown'

def get_neutered(x):
    x = str(x)
    if x.find('Spayed') >= 0: return 'neutered'
    if x.find('Neutered') >= 0: return 'neutered'
    if x.find('Intact') >= 0: return 'intact'
    return 'unknown'


def get_mix(x):
    x = str(x)
    if x.find('Mix') >=0 :
        return 'mix'
    return 'not'

def calc_age_in_years(x):
    x = str(x)
    if x == 'nan': return 0
    age = int(x.split()[0]) # split with space, get number
    if x.find('year')> -1: return age
    if x.find('month')> -1: return age/12
    if x.find('week') > -1 : return age/52
    if x.find('day') > -1 : return age/365
    else : return 0
    
    
#Split up components of Datetime
def format_datetime(dataset):
    times = [pd.Timestamp(t) for t in dataset.DateTime]
    years = [int(t.year) for t in times]
    months = [int(t.month) for t in times]
    days = [int(t.day) for t in times]
    hours = [float(t.hour + t.minute/60.0) for t in times]
    weekdays = [t.weekday() for t in times]
    return years, months, days, hours, weekdays


def prepareData(train):
    train['Sex'] = train.SexuponOutcome.apply(get_sex)
    train['Neutered'] = train.SexuponOutcome.apply(get_neutered)
    train['Mix'] = train.Breed.apply(get_mix)
    train['AgeuponOutcome'] = train.AgeuponOutcome.apply(calc_age_in_years)
    return train


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


years_train, months_train, days_train, hours_train, weekday_train = format_datetime(train)
years_test, months_test, days_test, hours_test, weekday_test = format_datetime(test)
train['Year'], train['Month'], train['Day'], train['Hour'], train['Weekday'] \
    = years_train, months_train, days_train, hours_train, weekday_train
test['Year'], test['Month'], test['Day'], test['Hour'], test['Weekday'] \
    = years_test, months_test, days_test, hours_test, weekday_test


train = prepareData(train)
test = prepareData(test)

y = train.OutcomeType
train.drop(["Name", "OutcomeType", "OutcomeSubtype","SexuponOutcome","DateTime"], axis=1, inplace=True)
test.drop(["Name", "SexuponOutcome","DateTime"], axis=1, inplace=True)


train.head()


Unnamed: 0,AnimalID,AnimalType,AgeuponOutcome,Breed,Color,Year,Month,Day,Hour,Weekday,Sex,Neutered,Mix
0,A671945,Dog,1.0,Shetland Sheepdog Mix,Brown/White,2014,2,12,18.366667,2,male,neutered,mix
1,A656520,Cat,1.0,Domestic Shorthair Mix,Cream Tabby,2013,10,13,12.733333,6,female,neutered,mix
2,A686464,Dog,2.0,Pit Bull Mix,Blue/White,2015,1,31,12.466667,5,male,neutered,mix
3,A683430,Cat,0.057692,Domestic Shorthair Mix,Blue Cream,2014,7,11,19.15,4,male,intact,mix
4,A667013,Dog,2.0,Lhasa Apso/Miniature Poodle,Tan,2013,11,15,12.866667,4,male,neutered,not


In [357]:
test.head()

Unnamed: 0,ID,AnimalType,AgeuponOutcome,Breed,Color,Year,Month,Day,Hour,Weekday,Sex,Neutered,Mix
0,1,Dog,0.833333,Labrador Retriever Mix,Red/White,2015,10,12,12.25,0,female,intact,mix
1,2,Dog,2.0,German Shepherd/Siberian Husky,Black/Tan,2014,7,26,17.983333,5,female,neutered,not
2,3,Cat,1.0,Domestic Shorthair Mix,Brown Tabby,2016,1,13,12.333333,2,male,neutered,mix
3,4,Dog,0.333333,Collie Smooth Mix,Tricolor,2013,12,28,18.2,5,male,intact,mix
4,5,Dog,2.0,Miniature Poodle Mix,White,2015,9,24,17.983333,3,male,neutered,mix


In [358]:
def parse_breed(data):
    breed_split = [s.split('/') for s in data.Breed]
    bag_of_breeds = []
    for l in breed_split:
        primary_secondary_breed = []
        for breed in l:
            if "Mix" in breed:
                breed = ' '.join(breed.split()[:-1])
            primary_secondary_breed.append(breed)
        if len(primary_secondary_breed) == 1:
            if "Mix" in l[0]:
                primary_secondary_breed.append("Unknown")
            else:
                primary_secondary_breed.append("None")
        if len(primary_secondary_breed) > 2: # handles weird edge case "Plott Hound/Black/Tan Hound"
            primary_secondary_breed = [primary_secondary_breed[0], "Unknown"]
        bag_of_breeds.append(primary_secondary_breed)
    return bag_of_breeds


train_breeds = parse_breed(train)
test_breeds = parse_breed(test)
train['PrimaryBreed'] = [b[0] for b in train_breeds]
train['SecondaryBreed'] = [b[1] for b in train_breeds]
test['PrimaryBreed'] = [b[0] for b in test_breeds]
test['SecondaryBreed'] = [b[1] for b in test_breeds]

train = train.drop("Breed", axis=1)
test = test.drop("Breed", axis=1)

train.head()

Unnamed: 0,AnimalID,AnimalType,AgeuponOutcome,Color,Year,Month,Day,Hour,Weekday,Sex,Neutered,Mix,PrimaryBreed,SecondaryBreed
0,A671945,Dog,1.0,Brown/White,2014,2,12,18.366667,2,male,neutered,mix,Shetland Sheepdog,Unknown
1,A656520,Cat,1.0,Cream Tabby,2013,10,13,12.733333,6,female,neutered,mix,Domestic Shorthair,Unknown
2,A686464,Dog,2.0,Blue/White,2015,1,31,12.466667,5,male,neutered,mix,Pit Bull,Unknown
3,A683430,Cat,0.057692,Blue Cream,2014,7,11,19.15,4,male,intact,mix,Domestic Shorthair,Unknown
4,A667013,Dog,2.0,Tan,2013,11,15,12.866667,4,male,neutered,not,Lhasa Apso,Miniature Poodle


In [359]:
from bs4 import BeautifulSoup
import requests

# breeds in the wikipedia pages are parsed and put in a dictionary that
# maps them to their breed group. 

def parseBreedToGroup (url,skip,tableLocation,breedGroupMap,groupName):
    group=[]
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data)
    table = soup.find_all('table')[tableLocation]
    tds = table.find_all('td')

    counter = 0
    for td in tds:
        counter = counter + 1
        if counter%skip == 0 :
            a = td.find_all('a')
            if a!=[]:
                breed = a[0].get_text()
                group.append(breed)
                breedGroupMap[breed]=groupName
        else:
            continue
    return breedGroupMap

breedGroupMap = {}

# herding
url = "https://en.wikipedia.org/wiki/Herding_Group"
tableLocation = 0
groupName = 'herding'
tableLocation = 0
skip = 2
parseBreedToGroup (url,skip,tableLocation,breedGroupMap,groupName)


# sporting group
url = "https://en.wikipedia.org/wiki/Hound"
groupName = 'hound'
tableLocation = 2
skip = 2
parseBreedToGroup (url,skip,tableLocation,breedGroupMap,groupName)


url = "https://en.wikipedia.org/wiki/Toy_Group"
groupName = 'toy'
tableLocation = 0
skip = 1
parseBreedToGroup (url,skip,tableLocation,breedGroupMap,groupName)


url = "https://en.wikipedia.org/wiki/Working_Group_(dogs)"
groupName = 'working'
tableLocation = 0
skip = 1
parseBreedToGroup (url,skip,tableLocation,breedGroupMap,groupName)


url = 'https://en.wikipedia.org/wiki/Terrier_Group'
groupName = 'terrier'
terriergroup=[]
tableLocation = 0
skip = 1
parseBreedToGroup (url,skip,tableLocation,breedGroupMap,groupName)


url = 'https://en.wikipedia.org/wiki/Sporting_Group'
tableLocation = 0
skip = 2
groupName = 'sport'
parseBreedToGroup (url,skip,tableLocation,breedGroupMap,groupName)


url = 'https://en.wikipedia.org/wiki/Non-Sporting_Group'
tableLocation = 0
skip = 2
groupName = 'nonsport'
parseBreedToGroup (url,skip,tableLocation,breedGroupMap,groupName)

breedGroupMap



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


{'Affenpinscher': 'toy',
 'Afghan Hound': 'hound',
 'Airedale Terrier': 'terrier',
 'Akita': 'working',
 'Alaskan Malamute': 'working',
 'American Cocker Spaniel': 'sport',
 'American English Coonhound': 'hound',
 'American Eskimo Dog': 'nonsport',
 'American Foxhound': 'hound',
 'American Hairless Terrier': 'terrier',
 'American Pit Bull Terrier': 'terrier',
 'American Staffordshire Terrier': 'terrier',
 'American Water Spaniel': 'sport',
 'Anatolian Shepherd': 'working',
 'Andalusian Hound': 'hound',
 'Andalusian Hound Maneto': 'hound',
 'Australian Cattle Dog': 'working',
 'Australian Dingo': 'hound',
 'Australian Kelpie': 'working',
 'Australian Shepherd': 'working',
 'Australian Silky Terrier': 'toy',
 'Australian Stumpy Tail Cattle Dog': 'working',
 'Australian Terrier': 'terrier',
 'Austrian Pinscher': 'terrier',
 'Azawakh': 'hound',
 'Barbet': 'sport',
 'Basenji': 'hound',
 'Basset Bleu de Gascogne': 'hound',
 'Basset Fauve de Bretagne': 'hound',
 'Basset Hound': 'hound',
 'Bav

In [360]:
def replace_breed_by_group (x):
    if x in breedGroupMap:
        return breedGroupMap[x]
    else:
        return x

train['PrimaryBreed'] = train.PrimaryBreed.apply(replace_breed_by_group)
train['SecondaryBreed'] = train.SecondaryBreed.apply(replace_breed_by_group)
test['PrimaryBreed'] = test.PrimaryBreed.apply(replace_breed_by_group)
test['SecondaryBreed'] = test.SecondaryBreed.apply(replace_breed_by_group)

train.head()

Unnamed: 0,AnimalID,AnimalType,AgeuponOutcome,Color,Year,Month,Day,Hour,Weekday,Sex,Neutered,Mix,PrimaryBreed,SecondaryBreed
0,A671945,Dog,1.0,Brown/White,2014,2,12,18.366667,2,male,neutered,mix,working,Unknown
1,A656520,Cat,1.0,Cream Tabby,2013,10,13,12.733333,6,female,neutered,mix,Domestic Shorthair,Unknown
2,A686464,Dog,2.0,Blue/White,2015,1,31,12.466667,5,male,neutered,mix,Pit Bull,Unknown
3,A683430,Cat,0.057692,Blue Cream,2014,7,11,19.15,4,male,intact,mix,Domestic Shorthair,Unknown
4,A667013,Dog,2.0,Tan,2013,11,15,12.866667,4,male,neutered,not,nonsport,Miniature Poodle


In [361]:
# combine train and test. Do the fits
combo = train.append(test)
combo.shape

# do the fits
for var in ['AnimalType', 'Sex', 'Neutered','Mix']:
    bin_fit = LabelBinarizer().fit(combo[var])
    train[var] = bin_fit.transform(train[var])
    test[var] = bin_fit.transform(test[var])

train.head()


Unnamed: 0,AnimalID,AnimalType,AgeuponOutcome,Color,Year,Month,Day,Hour,Weekday,Sex,Neutered,Mix,PrimaryBreed,SecondaryBreed
0,A671945,1,1.0,Brown/White,2014,2,12,18.366667,2,0,0,0,working,Unknown
1,A656520,0,1.0,Cream Tabby,2013,10,13,12.733333,6,1,0,0,Domestic Shorthair,Unknown
2,A686464,1,2.0,Blue/White,2015,1,31,12.466667,5,0,0,0,Pit Bull,Unknown
3,A683430,0,0.057692,Blue Cream,2014,7,11,19.15,4,0,1,0,Domestic Shorthair,Unknown
4,A667013,1,2.0,Tan,2013,11,15,12.866667,4,0,0,1,nonsport,Miniature Poodle


In [362]:
for var in ['PrimaryBreed','SecondaryBreed','Color']:
    label_fit = LabelEncoder().fit(combo[var])
    train[var] = label_fit.transform(train[var])
    test[var] = label_fit.transform(test[var])
    
    
train.head()

Unnamed: 0,AnimalID,AnimalType,AgeuponOutcome,Color,Year,Month,Day,Hour,Weekday,Sex,Neutered,Mix,PrimaryBreed,SecondaryBreed
0,A671945,1,1.0,146,2014,2,12,18.366667,2,0,0,0,120,62
1,A656520,0,1.0,184,2013,10,13,12.733333,6,1,0,0,43,62
2,A686464,1,2.0,97,2015,1,31,12.466667,5,0,0,0,80,62
3,A683430,0,0.057692,47,2014,7,11,19.15,4,0,1,0,43,62
4,A667013,1,2.0,311,2013,11,15,12.866667,4,0,0,1,116,38


In [363]:
'''clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5, min_samples_split=25)
clf.fit(train[train.columns[1:]], y)
clf.score(train[train.columns[1:]], y)
'''
# here is where the model training happens
# slow possibly comment it out while trying stuff!#
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.05, n_estimators=400,
                           subsample=0.7)


'''clf = GridSearchCV(DecisionTreeClassifier(), param_grid={'max_depth':[4,5,6,7,8,9,10,12],
                                                        'min_samples_split':[2, 5, 10, 15, 20, 25],
                                                        'min_samples_leaf':[1, 2, 5, 8, 12, 15, 20]},
                  scoring='log_loss',
                  n_jobs=-1)'''
clf.fit(train[train.columns[1:]], y)

GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=400,
              presort='auto', random_state=None, subsample=0.7, verbose=0,
              warm_start=False)

In [364]:
class_probabilites = clf.predict_proba(test[test.columns[1:]])

In [365]:
submission = 'ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer\n'
for i in range(len(test.ID)):
    submission += str(test.ID[i]) + ',' + ','.join([str(j) for j in class_probabilites[i]]) + '\n'
f = open("submissionBoosterClassifier_Breeds.csv", "w")
f.write(submission)
f.close()

# submission score with booster classifier is 0.78165
