In [1]:
# adapted version of https://www.kaggle.com/cbrogan/titanic/xgboost-example-python/code
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np
import sys, os
%matplotlib inline

In [2]:
# Read the train and test CSV files (first row is the header) and preview the training rows
train_df, test_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
train_df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [3]:
def split_sex_upon_outcome(df):
    """Add `Intact` and `Sex` columns derived from `SexuponOutcome`.

    The value is split on a space: the first token goes to `Intact`, the
    second to `Sex`. In `Intact`, 'Unknown' becomes NaN and 'Spayed' is folded
    into 'Neutered'. The frame is modified in place and returned.
    """
    # .str.get(i) picks the i-th token of every split value; values with a
    # single token (e.g. 'Unknown') therefore get NaN for `Sex`.
    tokens = df.SexuponOutcome.str.split(' ')
    intact = tokens.str.get(0)
    intact = intact.replace('Unknown', np.nan)
    intact = intact.replace('Spayed', 'Neutered')
    df['Intact'] = intact
    df['Sex'] = tokens.str.get(1)
    return df

# Apply the identical cleaning to both frames so train and test share the same
# `Intact` categories (the test frame previously kept 'Spayed' as its own value).
train_df = split_sex_upon_outcome(train_df)
test_df = split_sex_upon_outcome(test_df)

def calc_age_in_days(df):
    """Add an `AgeuponOutcomeInDays` column computed from `AgeuponOutcome`.

    Each age is a string such as '2 years' or '3 weeks'; it is converted to a
    number of days using 365 days per year, 31 per month and 7 per week.
    Missing ages stay NaN. The frame is modified in place and returned.
    """
    days_per_unit = {'year': 365, 'month': 31, 'week': 7, 'day': 1}

    def to_days(raw_age):
        # A missing age is a float NaN, whose string form is 'nan'.
        if str(raw_age) == 'nan':
            return np.nan
        count, unit = raw_age.split(' ')
        # Drop the plural 's' so 'years' and 'year' share one lookup key.
        return int(count) * days_per_unit[unit.replace('s', '')]

    df['AgeuponOutcomeInDays'] = [to_days(raw_age) for raw_age in df.AgeuponOutcome]
    return df

def calc_age_in_years(df):
    """Add an `AgeuponOutcomeInYears` column computed from `AgeuponOutcomeInDays`.

    The number of days is divided by 365 and truncated to whole years.
    Missing ages stay NaN. The frame is modified in place and returned.
    """
    whole_years = [
        # A missing age is a float NaN, whose string form is 'nan'.
        np.nan if str(days) == 'nan' else int(days / 365)
        for days in df.AgeuponOutcomeInDays
    ]
    df['AgeuponOutcomeInYears'] = whole_years
    return df

# The age in days has to be derived first: the age in whole years is computed from that column.
train_df = calc_age_in_years(calc_age_in_days(train_df))
test_df = calc_age_in_years(calc_age_in_days(test_df))


train_df[5:10]

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Intact,Sex,AgeuponOutcomeInDays,AgeuponOutcomeInYears
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,Intact,Female,31.0,0.0
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby,Intact,Male,21.0,0.0
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby,,,21.0,0.0
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White,Neutered,Female,155.0,0.0
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White,Neutered,Female,365.0,1.0


Split `SexuponOutcome` into `Intact` and `Sex`

In [22]:
# We'll impute missing values using the median for numeric columns and the most
# common value for string columns.
# This is based on some nice code by 'sveitser' at http://stackoverflow.com/a/25562948
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Impute missing values of a DataFrame column by column.

    Object (string) columns are filled with their most frequent value, all
    other columns with their median.
    """

    def fit(self, X, y=None):
        """Learn one fill value per column of the DataFrame `X`.

        `y` is not used. Returns `self`.
        """
        def fill_value(column):
            if column.dtype == np.dtype('O'):
                counts = column.value_counts()
                # value_counts() drops NaN, so a column with no values at all
                # yields an empty result; leave such a column unfilled instead
                # of raising IndexError on counts.index[0].
                return counts.index[0] if len(counts) else np.nan
            return column.median()

        self.fill = pd.Series([fill_value(X[c]) for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return `X` with missing values replaced by the fill values learned in `fit`."""
        return X.fillna(self.fill)

feature_columns_to_use = ['AnimalType', 'Sex', 'Intact', 'Breed', 'AgeuponOutcomeInYears', 'Color']
# feature_columns_to_use = ['AnimalType', 'SexuponOutcome', 'Breed', 'AgeuponOutcomeInDays']  # <- best features so far

# Stack the train and test features (train rows first) so both are imputed with
# the same fill values. pd.concat replaces DataFrame.append, which was removed
# in pandas 2.0.
# NOTE(review): fitting the imputer on train+test lets test-set statistics leak
# into the training features; consider fitting on the training rows only.
big_X = pd.concat([train_df[feature_columns_to_use], test_df[feature_columns_to_use]])

# Only string-valued (object dtype) columns need label encoding; numeric
# features such as the age are left as they are.
nonnumeric_columns = [c for c in feature_columns_to_use if big_X[c].dtype == np.dtype('O')]

big_X_imputed = DataFrameImputer().fit_transform(big_X)
big_X_imputed[5:10]

Unnamed: 0,AnimalType,Sex,Intact,Breed,AgeuponOutcomeInYears,Color
5,Dog,Female,Intact,Cairn Terrier/Chihuahua Shorthair,0.0,Black/Tan
6,Cat,Male,Intact,Domestic Shorthair Mix,0.0,Blue Tabby
7,Cat,Male,Neutered,Domestic Shorthair Mix,0.0,Brown Tabby
8,Dog,Female,Neutered,American Pit Bull Terrier Mix,0.0,Red/White
9,Dog,Female,Neutered,Cairn Terrier,1.0,White


Fill in missing fields:
* `median` if column is numeric
* `most common value` if not numeric

In [19]:
# See http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing for more
# details and options
# Every categorical feature gets its own encoder, so `le` is only ever fitted on the target.
for feature in nonnumeric_columns:
    big_X_imputed[feature] = LabelEncoder().fit_transform(big_X_imputed[feature])

# Prepare the inputs for the model
# The first len(train_df) rows of big_X_imputed are the training rows, the rest the test rows.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
n_train = train_df.shape[0]
train_X = big_X_imputed.iloc[:n_train].values
test_X = big_X_imputed.iloc[n_train:].values

# `le` keeps the OutcomeType classes so encoded labels can be mapped back to their strings.
le = LabelEncoder()
train_df.OutcomeType = le.fit_transform(train_df.OutcomeType)
train_y = train_df['OutcomeType']

## Do the actual learning

In [20]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split now
# lives in sklearn.model_selection. Fall back to the old module only on releases
# that predate the new one.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 1% of the labelled rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.01, random_state=10)

In [24]:
# sklearn.lda was removed from scikit-learn; the estimator is now
# sklearn.discriminant_analysis.LinearDiscriminantAnalysis. Fall back to the old
# module only on releases that predate the new one.
try:
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
except ImportError:
    from sklearn.lda import LDA
from sklearn.metrics import log_loss

# Fit on the training split and score the predicted class probabilities on the held-out rows.
clf = LDA()
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_valid)
log_loss(y_valid, y_pred)

0.97111136246262775

## Predict and save in Kaggle format

In [None]:
# from sklearn.cross_validation import train_test_split
#     X_train, X_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.01, random_state=10)
# clf.fit(train_X, y=train_y, **fit_param)
# predictions = clf.predict_proba(test_X, ntree_limit=gbm.booster().best_ntree_limit)

# predictions

In [None]:
# # to kaggle format
# submission_df = pd.DataFrame(predictions)
# submission_df = pd.concat([test_df.ID, submission_df], axis=1)
# submission_df.ID = submission_df.ID.astype(int)
# submission_df.set_index('ID', inplace=True)
# submission_df.columns = le.inverse_transform(sorted(train_df.OutcomeType.unique()))  # get the string labels back
# submission_df.head()

In [None]:
# submission_df.to_csv('submission.csv')
# len(submission_df)