In [94]:
import pandas as pd
import numpy as np

train_org = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("sample_submission.csv")
train = train_org.drop(['OutcomeSubtype', "AnimalID"], axis='columns', inplace=False)

In [101]:
def getAge(age):
    age = age.split(" ")
    time = age[1]
    num = age[0]
    if time == "year" or time == "years":
        return float(num) * 12
    if time == "month" or time == "months":
        return float(num)
    if time == "week" or time == "weeks":
        return float(num) / 4
    if time == "day" or time == "days":
        return float(num) / (4 * 7)
    
def nameLength(name):
    return len(name)

def simplifyColor(color):
    return color.split("/")[0].split(" ")[0]
  
def isMix(breed):
    if breed.endswith(' mix'):
        return 1
    if breed.find("/") != -1:
        return 1
    return 0

def isSpayedNeutered(sex):
    if sex.startswith("Spayed") or sex.startswith("Neutered"):
        return 1
    return 0

def simplifyBreed(breed):
    breed = breed.split("/")[0]
    if breed.endswith(' mix'):
        breed = breed[:-4]
    return breed
    
def encodeFeatures(df_in, hotEncode):
    df = df_in.copy()
    # Encode has name
    hasname = df.Name.isnull().astype(int)
    # Encode name length
    nameslength = df.Name
    nameslength[df.Name.notna()] = df.Name[df.Name.notna()].apply(nameLength)
    nameslength[df.Name.isnull()] = -1
    nameslength.name = "NamesLength"

    # Encode date
    dates = pd.to_datetime(df.DateTime)
    year = dates.dt.year
    year = year - year.min()
    year.name = "Year"
    month = dates.dt.month
    month.name = "Month"
    day = dates.dt.day
    day.name = "Day"

    weekday = dates.dt.dayofweek
    weekday.name = "Weekday"
    if hotEncode:
        weekday = pd.get_dummies(weekday, prefix="Weekday")

    # Encode age
    ages = df.AgeuponOutcome
    ages[df.AgeuponOutcome.notna()] = df.AgeuponOutcome[df.AgeuponOutcome.notna()].apply(getAge)
    mean_age = ages[df.AgeuponOutcome.notna()].mean()
    ages[df.AgeuponOutcome.isnull()] = mean_age
    
    # One-Hot Encode type, sex, breed and color
    df['SexuponOutcome'][df['SexuponOutcome'].isnull()] = 'Unknown'
    df['Color'] = df['Color'].apply(simplifyColor)
    df['Breed'] = df['Breed'].apply(simplifyBreed)
    if hotEncode:
        animalType = pd.get_dummies(df['AnimalType'])
        breed = pd.get_dummies(df['Breed'])
        color = pd.get_dummies(df['Color'])
    else:
        animalType = df['AnimalType']
        breed = df['Breed']
        color = df['Color']
        
    mix = df['Breed'].copy().apply(isMix)
    spayed = df['SexuponOutcome'].apply(isSpayedNeutered)
    


    return pd.concat([hasname, nameslength, year, month, day, weekday, ages, animalType, spayed,
                      breed, mix, color], axis=1)

def encodeLabels(df, hotEncode):
    # One-Hot Encode outcome
    if False:
        return pd.get_dummies(df['OutcomeType'])
    return df['OutcomeType']

encode = True
X = encodeFeatures(train, encode)
y = encodeLabels(train, encode)
X.head()

Unnamed: 0,Name,NamesLength,Year,Month,Day,Weekday_0,Weekday_1,Weekday_2,Weekday_3,Weekday_4,...,Ruddy,Sable,Seal,Silver,Tan,Torbie,Tortie,Tricolor,White,Yellow
0,0,7,1,2,12,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,5,0,10,13,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,6,2,1,31,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,-1,1,7,11,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,-1,0,11,15,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [102]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

def getModel(X, y):
    X = np.array(X)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)
    # # Feature Scaling
    # if encode:
    #     sc = StandardScaler()
    #     X_train = sc.fit_transform(X_train)
    #     X_test = sc.transform(X_test)

    clf = RandomForestClassifier(n_estimators=50)

    clf.fit(X_train,y_train)

    y_pred = np.array(clf.predict_proba(X_test))
    # Model Accuracy, how often is the classifier correct?
    print("Log-loss:",metrics.log_loss(y_test, y_pred))
    return clf

cat_clf = getModel(X[train['AnimalType'] == "Cat"], y[train['AnimalType'] == "Cat"])
dog_clf = getModel(X[train['AnimalType'] == "Dog"], y[train['AnimalType'] == "Dog"])

(1671,)
(1671, 5)
Log-loss: 1.149866937696131
(2340,)
(2340, 5)
Log-loss: 1.203416917662926


In [34]:
test_pred = clf.predict_proba(test)
test_pred
test_features = encodeFeatures(test.drop(["ID"], axis='columns', inplace=False), encode)
prediction_cats = pd.DataFrame(clf.predict_proba(test_features))
prediction.columns = ['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer']
prediction = pd.concat([test['ID'], prediction], axis=1)
prediction.to_csv('resuts.csv', index=False)

ValueError: could not convert string to float: 'Summer'