In [1]:
import os
import gc
import cv2
from datetime import datetime, timedelta, date
import numpy as np
import warnings
import re
import json
import pandas as pd
import pprint
import string
from glob import glob
from numpy import nanmean
from functools import partial
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold, RandomizedSearchCV, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import scipy as sp
from collections import Counter
from textblob import TextBlob
from keras.applications.densenet import preprocess_input, DenseNet121
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K
from tqdm import tqdm, tqdm_notebook

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, cohen_kappa_score, log_loss, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier

# For text processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing import sequence, text
from keras.layers import Input, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk import word_tokenize
from nltk.corpus import stopwords

from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from imblearn.over_sampling import SMOTE

# stop_words = []
stop_words = list(set(stopwords.words('english')))
warnings.filterwarnings('ignore')
punctuation = string.punctuation

pd.set_option('max_columns', 50)
pd.options.mode.chained_assignment = None
pp = pprint.PrettyPrinter(indent = 4)

Using TensorFlow backend.


In [2]:
train_orig = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test_orig = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

# Feature Engineering

In [3]:
def get_description_sentiment(df, df_type = "train"):
    doc_sent_mag = []
    doc_sent_score = []
    nf_count = 0
    for pet_id in df.PetID.values:
        try:
            with open('../input/petfinder-adoption-prediction/' + df_type + '_sentiment/' + pet_id + '.json', 'r') as f:
                sentiment = json.load(f)
            doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
            doc_sent_score.append(sentiment['documentSentiment']['score'])
        except FileNotFoundError:
            nf_count += 1
            doc_sent_mag.append(-1)
            doc_sent_score.append(-1)
            
    df['doc_sent_mag'] = doc_sent_mag
    df['doc_sent_score'] = doc_sent_score
    
    return df

In [4]:
def get_image_metadata(df, df_type = "train"):
    vertex_xs = []
    vertex_ys = []
    bounding_confidences = []
    bounding_importance_fracs = []
    dominant_blues = []
    dominant_greens = []
    dominant_reds = []
    dominant_pixel_fracs = []
    dominant_scores = []
    label_scores = []
    nf_count = 0
    nl_count = 0
    
    pets_metadata = {}
    for file_name in os.listdir('../input/petfinder-adoption-prediction/' + df_type + '_metadata/'):
        pet_id = file_name.split('-')[0]
        if pet_id not in pets_metadata:
            pets_metadata[pet_id] = []
        pets_metadata[pet_id].append(file_name)
    
    for pet_id in df.PetID.values:
        if pet_id in pets_metadata:
            pet_id_metadata_files = pets_metadata[pet_id]
            
            temp_vertex_xs = []
            temp_vertex_ys = []
            temp_bounding_confidences = []
            temp_bounding_importance_fracs = []
            temp_dominant_blues = []
            temp_dominant_greens = []
            temp_dominant_reds = []
            temp_dominant_pixel_fracs = []
            temp_dominant_scores = []
            temp_label_scores = []
            for file in pet_id_metadata_files:
                with open('../input/petfinder-adoption-prediction/' + df_type + '_metadata/' + file, 'r') as f:
                    data = json.load(f)
                
                try:
                    vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
                    temp_vertex_xs.append(vertex_x)
                except:
                    temp_vertex_xs.append(-1)
                
                try:
                    vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
                    temp_vertex_ys.append(vertex_y)
                except:
                    temp_vertex_ys.append(-1)
                
                try:
                    bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
                    temp_bounding_confidences.append(bounding_confidence)
                except:
                    temp_bounding_confidences.append(-1)
                
                try:
                    bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
                    temp_bounding_importance_fracs.append(bounding_importance_frac)
                except:
                    temp_bounding_importance_fracs.append(-1)
                
                try:
                    dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
                    temp_dominant_blues.append(dominant_blue)
                except:
                    temp_dominant_blues.append(-1)
                
                try:
                    dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
                    temp_dominant_greens.append(dominant_green)
                except:
                    temp_dominant_greens.append(-1)
                
                try:
                    dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
                    temp_dominant_reds.append(dominant_red)
                except:
                    temp_dominant_reds.append(-1)
                
                try:
                    dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
                    temp_dominant_pixel_fracs.append(dominant_pixel_frac)
                except:
                    temp_dominant_pixel_fracs.append(-1)
                
                try:
                    dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
                    temp_dominant_scores.append(dominant_score)
                except:
                    temp_dominant_scores.append(-1)

                if data.get('labelAnnotations'):
                    label_score = data['labelAnnotations'][0]['score']
                    temp_label_scores.append(label_score)
                else:
                    temp_label_scores.append(-1)
            
            vertex_xs.append(np.mean(temp_vertex_xs))
            vertex_ys.append(np.mean(temp_vertex_ys))
            bounding_confidences.append(np.mean(temp_bounding_confidences))
            bounding_importance_fracs.append(np.mean(temp_bounding_importance_fracs))
            dominant_blues.append(np.mean(temp_dominant_blues))
            dominant_greens.append(np.mean(temp_dominant_greens))
            dominant_reds.append(np.mean(temp_dominant_reds))
            dominant_pixel_fracs.append(np.mean(temp_dominant_pixel_fracs))
            dominant_scores.append(np.mean(temp_dominant_scores))
            label_scores.append(np.mean(temp_label_scores))
        else:
            nf_count += 1
            vertex_xs.append(-1)
            vertex_ys.append(-1)
            bounding_confidences.append(-1)
            bounding_importance_fracs.append(-1)
            dominant_blues.append(-1)
            dominant_greens.append(-1)
            dominant_reds.append(-1)
            dominant_pixel_fracs.append(-1)
            dominant_scores.append(-1)
            label_scores.append(-1)
            
    df.loc[:, 'vertex_x'] = vertex_xs
    df.loc[:, 'vertex_y'] = vertex_ys
    df.loc[:, 'bounding_confidence'] = bounding_confidences
    df.loc[:, 'bounding_importance'] = bounding_importance_fracs
    df.loc[:, 'dominant_blue'] = dominant_blues
    df.loc[:, 'dominant_green'] = dominant_greens
    df.loc[:, 'dominant_red'] = dominant_reds
    df.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
    df.loc[:, 'dominant_score'] = dominant_scores
    df.loc[:, 'label_score'] = label_scores
    
    return df

In [5]:
# Reference - https://www.kaggle.com/shivamb/extensive-text-data-feature-engineering

embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 1257 # max number of words in a description to use

pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

mispell_dict = {"aren't" : "are not",
                "can't" : "cannot",
                "couldn't" : "could not",
                "didn't" : "did not",
                "doesn't" : "does not",
                "don't" : "do not",
                "hadn't" : "had not",
                "hasn't" : "has not",
                "haven't" : "have not",
                "he'd" : "he would",
                "he'll" : "he will",
                "he's" : "he is",
                "i'd" : "I would",
                "i'd" : "I had",
                "i'll" : "I will",
                "i'm" : "I am",
                "isn't" : "is not",
                "it's" : "it is",
                "it'll":"it will",
                "i've" : "I have",
                "let's" : "let us",
                "mightn't" : "might not",
                "mustn't" : "must not",
                "shan't" : "shall not",
                "she'd" : "she would",
                "she'll" : "she will",
                "she's" : "she is",
                "shouldn't" : "should not",
                "that's" : "that is",
                "there's" : "there is",
                "they'd" : "they would",
                "they'll" : "they will",
                "they're" : "they are",
                "they've" : "they have",
                "we'd" : "we would",
                "we're" : "we are",
                "weren't" : "were not",
                "we've" : "we have",
                "what'll" : "what will",
                "what're" : "what are",
                "what's" : "what is",
                "what've" : "what have",
                "where's" : "where is",
                "who'd" : "who would",
                "who'll" : "who will",
                "who're" : "who are",
                "who's" : "who is",
                "who've" : "who have",
                "won't" : "will not",
                "wouldn't" : "would not",
                "you'd" : "you would",
                "you'll" : "you will",
                "you're" : "you are",
                "you've" : "you have",
                "'re": " are",
                "wasn't": "was not",
                "we'll":" will",
                "didn't": "did not",
                "tryin'":"trying"
}

def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

def get_polarity(text):
    try:
        textblob = TextBlob(unicode(text, 'utf-8'))
        pol = textblob.sentiment.polarity
    except:
        pol = 0.0
    return pol

def get_subjectivity(text):
    try:
        textblob = TextBlob(unicode(text, 'utf-8'))
        subj = textblob.sentiment.subjectivity
    except:
        subj = 0.0
    return subj

def get_parts_of_speech_count(x):
    cnt = {
        'noun': 0,
        'pron': 0,
        'verb': 0,
        'adj': 0,
        'adv': 0
    }

    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            for pos_tag_type in cnt.keys():
                if ppo in pos_dic[pos_tag_type]:
                    cnt[pos_tag_type] += 1
    except:
        pass
    return [cnt[i] for i in cnt.keys()]

def sent2vec(description, embedding_index_1, embedding_index_2 = None):
    M = []
    for w in word_tokenize(description):
        if not w.isalpha():
            continue
        if w in embedding_index_1 and w in embedding_index_2:
            embedding_vector = np.mean([embedding_index_1[w], embedding_index_2[w]], axis = 0)
            M.append(embedding_vector)
            continue
        if w in embedding_index_1:
            embedding_vector = embedding_index_1[w]
            M.append(embedding_vector)
            continue
        if w in embedding_index_2:
            embedding_vector = embedding_index_2[w]
            M.append(embedding_vector)
    M = np.array(M)
    v = M.sum(axis = 0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

def word2vec_feature(df, groupby, target, size):
    df_bag = pd.DataFrame(df[[groupby, target]])
    df_bag[target] = df_bag[target].astype(str)
    df_bag[target].fillna('NAN', inplace = True)
    df_bag = df_bag.groupby(groupby, as_index = False)[target].agg({'list':(lambda x: list(x))}).reset_index()
    doc_list = list(df_bag['list'].values)
    w2v = Word2Vec(doc_list, size = size, window = 3, min_count = 1)
    vocab_keys = list(w2v.wv.vocab.keys())
    w2v_array = []
    for v in vocab_keys:
        w2v_array.append(list(w2v.wv[v]))
    df_w2v = pd.DataFrame()
    df_w2v['vocab_keys'] = vocab_keys    
    df_w2v = pd.concat([df_w2v, pd.DataFrame(w2v_array)], axis = 1)
    df_w2v.columns = [target] + ['w2v_%s_%s_%d'%(groupby, target, x) for x in range(size)]
    print ('df_w2v:' + str(df_w2v.shape))
    return df_w2v

def get_textual_features_from_description(df):
    
    # Fill NA
    df[["Description"]] = df[["Description"]].fillna("none")
    
    # Get stopwords count
    print("Get stopword count")
    df['Description_stopword_count'] = df['Description'].apply(lambda x: len([wrd for wrd in x.split() if wrd.lower() in stop_words]))
    
    # Convert to lower
    print("Convert to lower")
    df["Description"] = df["Description"].apply(lambda x: x.lower())
    
    # Clean the text
    print("Clean the text")
    df["Description"] = df["Description"].apply(lambda x: clean_text(x))
    
    # Clean spellings
    print("Convert the spellings")
    df["Description"] = df["Description"].apply(lambda x: replace_typical_misspell(x))
    
    # Remove stopwords
    df["Description"] = df["Description"].apply(lambda x: " ".join([item for item in x if item not in stop_words]))
    
    # Get character count
    print("Get character count")
    df['Description_character_count'] = df['Description'].str.len()
    
    # Get word count
    print("Get subjectivity")
    df['Description_word_count'] = df['Description'].apply(lambda x: len(x.split()))
    
    # Get word density
    print("Get word density")
    df['Description_word_density'] = df['Description_character_count']/(df['Description_word_count'] + 1)
    
    # Get number of punctuations in a description
    print("Get punctuation count")
    df['Description_punctuation_count'] = df['Description'].apply(lambda x: len("".join(_ for _ in x if _ in punctuation)))
    
    # Get polarity
    print("Get polarity")
    df['Description_polarity'] = df['Description'].apply(get_polarity)
    
    # Get subjectivity
    print("Get subjectivity")
    df['Description_subjectivity'] = df['Description'].apply(get_subjectivity)
    
    return df

In [6]:
def description(petid, dataclass = "train"):
    try:
        with open("../input/petfinder-adoption-prediction/" + f"{dataclass}_metadata/{petid}-1.json", 'r') as f:
            im_meta = json.load(f)
        descriptions = ""
        for desc in im_meta['labelAnnotations']:
            descriptions += desc['description'] + " "
        return descriptions
    except (FileNotFoundError, KeyError):
        return None
    except Exception as e:
        print(f"Another error: {e}")
        return None

dog_str = re.compile(r"((.*\W)|(^))dog((\W.*)|($))", re.IGNORECASE)
cat_str = re.compile(r"((.*\W)|(^))cat((\W.*)|($))", re.IGNORECASE)
def dog_label(description):
    """dog label from Google vision:
    0 = no label
    1 = 'dog' was in the tag
    """
    return [0, 1][dog_str.search(description) is not None]
    
def cat_label(description):
    """cat label from Google vision:
    0 = no label
    1 = 'cat' was in the tag
    """
    return [0, 1][cat_str.search(description) is not None]

description_fies = [dog_label, cat_label]
def get_dog_cat_label_features(df, dataclass = "train"):
    ftr = np.zeros((len(df), len(description_fies)), dtype = np.int)
    for i, petid in enumerate(df.PetID):
        desc = description(petid, dataclass)
        if desc is not None:
            for j, f in enumerate(description_fies):
                ftr[i,j] = f(desc)
    for i, f in enumerate(description_fies):
        df[f.__name__] = ftr[:,i]
    return df

In [7]:
# Reference - https://www.kaggle.com/christofhenkel/extract-image-features-from-pretrained-nn/notebook
img_size = 256
batch_size = 16

def resize_to_square(im):
    old_size = im.shape[:2] # old_size is in (height, width) format
    ratio = float(img_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    
    # new_size should be in (width, height) format
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
    return new_im

def load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

def get_raw_image_features(df, df_type = "train"):
    pet_ids = df['PetID'].values
    n_batches = len(pet_ids) // batch_size + 1
    
    inp = Input((256,256,3))
    backbone = DenseNet121(input_tensor = inp, include_top = False, weights = '../input/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5')
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
    x = AveragePooling1D(4)(x)
    out = Lambda(lambda x: x[:,:,0])(x)

    m = Model(inp, out)
    
    features = {}
    for b in tqdm_notebook(range(n_batches)):
        start = b*batch_size
        end = (b+1)*batch_size
        batch_pets = pet_ids[start:end]
        batch_images = np.zeros((len(batch_pets), img_size, img_size, 3))
        for i,pet_id in enumerate(batch_pets):
            try:
                batch_images[i] = load_image("../input/petfinder-adoption-prediction/" + df_type + "_images/", pet_id)
            except:
                pass
        batch_preds = m.predict(batch_images)
        for i, pet_id in enumerate(batch_pets):
            features[pet_id] = batch_preds[i]
            
    feats_df = pd.DataFrame.from_dict(features, orient = 'index').reset_index()
    feats_df['PetID'] = feats_df['index']
    feats_df = feats_df.drop('index', 1)
    df = pd.merge(df, feats_df, how = 'left', on = ['PetID'])
    
    return df

In [8]:
def get_aggregate_features(df):
    
#     # Create dummies
#     df = pd.get_dummies(df, columns = ['Vaccinated', 'Dewormed', 'Sterilized', 'Health'])
    
    # Some aggregate features
    aggs = {}
    aggs['Fee'] = ['mean', 'std', 'min', 'max']
    aggs['Quantity'] = ['mean', 'std', 'min', 'max']
    aggs['Age'] = ['mean', 'std', 'min', 'max']
    aggs['PhotoAmt'] = ['mean', 'std', 'min', 'max']
    aggs['VideoAmt'] = ['mean', 'std', 'min', 'max']
    aggs['FurLength'] = ['mean', 'std', 'min', 'max']
    aggs['MaturitySize'] = ['mean', 'std', 'min', 'max']
    aggs['Vaccinated'] = ['sum', 'mean']
    aggs['Dewormed'] = ['sum', 'mean']
    aggs['Sterilized'] = ['sum', 'mean']
    aggs['Health'] = ['sum', 'mean']
    aggs['doc_sent_mag'] = ['mean']
    aggs['doc_sent_score'] = ['mean']
    aggs['vertex_x'] = ['mean']
    aggs['vertex_y'] = ['mean']
    aggs['bounding_confidence'] = ['mean']
    aggs['bounding_importance'] = ['mean']
    aggs['dominant_blue'] = ['mean']
    aggs['dominant_green'] = ['mean']
    aggs['dominant_red'] = ['mean']
    aggs['dominant_pixel_frac'] = ['mean']
    aggs['dominant_score'] = ['mean']
    aggs['label_score'] = ['mean']
    aggs['Description_stopword_count'] = ['mean', 'sum']
    aggs['Description_character_count'] = ['mean', 'sum']
    aggs['Description_word_count'] = ['mean', 'sum']
    aggs['Description_word_density'] = ['mean']
    aggs['Description_punctuation_count'] = ['mean', 'sum']
    aggs['Description_polarity'] = ['mean']
    aggs['Description_subjectivity'] = ['mean']
#     aggs['Gender'] = ['mean', 'std', 'min', 'max']
#     aggs['Color1'] = ['mean', 'std', 'min', 'max']
#     aggs['Color2'] = ['mean', 'std', 'min', 'max']
#     aggs['Color3'] = ['mean', 'std', 'min', 'max']
#     aggs['Vaccinated_1'] = ['mean',  'std','min', 'max']
#     aggs['Vaccinated_2'] = ['mean',  'std','min', 'max']
#     aggs['Vaccinated_3'] = ['mean',  'std','min', 'max']
#     aggs['Dewormed_1'] = ['mean',  'std','min', 'max']
#     aggs['Dewormed_2'] = ['mean',  'std','min', 'max']
#     aggs['Dewormed_3'] = ['mean',  'std','min', 'max']
#     aggs['Sterilized_1'] = ['mean',  'std','min', 'max']
#     aggs['Sterilized_2'] = ['mean',  'std','min', 'max']
#     aggs['Sterilized_3'] = ['mean',  'std','min', 'max']
#     aggs['Health_1'] = ['mean',  'std','min', 'max']
#     aggs['Health_2'] = ['mean',  'std','min', 'max']
#     aggs['Health_3'] = ['mean',  'std','min', 'max']
    
    for groupby_col in ['RescuerID', 'State', 'Breed1', 'Gender']:
        df_ = df.reset_index().groupby(groupby_col).agg(aggs)
        df_.columns = [groupby_col + '_' + '_'.join(col).strip() for col in df_.columns.values]
        df_.reset_index(inplace = True)
        df = pd.merge(df, df_, on = [groupby_col], how = 'left')
        
    df_ = df.reset_index().groupby(['RescuerID', 'State']).agg(aggs)
    df_.columns = ['Rescuer_State' + '_' + '_'.join(col).strip() for col in df_.columns.values]
    df_.reset_index(inplace = True)
    df = pd.merge(df, df_, on = ['RescuerID', 'State'], how = 'left')
    
    df_ = df.reset_index().groupby(['RescuerID', 'Gender']).agg(aggs)
    df_.columns = ['Rescuer_Gender' + '_' + '_'.join(col).strip() for col in df_.columns.values]
    df_.reset_index(inplace = True)
    df = pd.merge(df, df_, on = ['RescuerID', 'Gender'], how = 'left')
        
    return df

In [9]:
class image_annotation_features():
    """Frame work for image annotation processing.
    
    Objective:  We want to go over JSON files only once and get and
                process all that we want to get from it. Such to limit
                disc-IO and thus time.
    Limitation: You have to choose only profile images, or all images 
                till you find what you're lokoing for.
               
    Use:
        ia = image_annotation_features()
        ia.annotation_fies = [my_fie, my_2nd_fie]
            my_fie should be a function taking a JSON structure and produce
            a feature. If the function has not found what it needed and wants
            to see the next profile picture JSON file, it should return None.
        ia.dataclass = "train"
        ia.get_ftr(train)
        ia.dataclass = "test"
        ia.get_ftr(test)
    """

    def __init__(self):
        self.annotation_fies = []
        """List of functions taking a JSON image annotation and 
           producing a single feature"""
        self.dataclass = ["train","test"][0]
        """The type of data"""
        
    def get_ftr(self, data):
        ftr = [[] for i in range(len(self.annotation_fies))]
        for petid in tqdm_notebook(data.PetID):
            fn = '../input/petfinder-adoption-prediction/' + f"{self.dataclass}_metadata/{petid}-1.json"
            self.ftrs = [None] * len(self.annotation_fies)
            self._get_features(fn)
            for i, f in enumerate(self.ftrs):
                ftr[i].append(f if f is not None else 0)

        for i, f in enumerate(self.annotation_fies):
            data[f.__name__] = ftr[i]
            
    def _get_features(self, fn):
        try:
            with open(fn, 'r') as f:
                im_meta = json.load(f)
        except:
            return
        for j, f in enumerate(self.annotation_fies):
            if self.ftrs[j] is not None:
                continue
            try:
                self.ftrs[j] = f(im_meta)
            except Exception as e:
                pass
        return

    def get_ftr_rcsv(self, data):
        ftr = [[] for i in range(len(self.annotation_fies))]
        for petid in tqdm_notebook(data.PetID):
            fn = '../input/petfinder-adoption-prediction/' + f"{self.dataclass}_metadata/{petid}-1.json"
            self.ftrs = [None] * len(self.annotation_fies)
            self._get_features(fn)
            if None in self.ftrs:
                for fn_r in glob('../input/petfinder-adoption-prediction/' + f"{self.dataclass}_metadata/{petid}-*.json"):
                    if fn_r == fn:
                        continue
                    self._get_features(fn_r)
                    if not (None in self.ftrs):
                        break
                        
            for i, f in enumerate(self.ftrs):
                ftr[i].append(f if f is not None else 0)
                
        for i, f in enumerate(self.annotation_fies):
            data[f.__name__] = ftr[i]

In [10]:
def gv_rel_text_area_feature(im_meta):
    text_area = 0
    try:
        for textbox in im_meta['textAnnotations']:
            try:
                rect = np.array([[c['x'], c['y']] for c in textbox['boundingPoly']['vertices']])
                bl = np.min(rect, axis=0)
                size = np.max(rect, axis=0) - bl
                text_area += np.prod(size)
            except KeyError:
                pass
    except KeyError:
        pass
        
    try:
        imsize = im_meta['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]
        im_area = np.prod([int(imsize['y']+1), int(imsize['x']+1)])
    except (KeyError, TypeError):
        return 0
    return text_area/im_area

In [11]:
from itertools import product

def get_rescuer_experience(d_train, d_test):
    """ Creates rescuer experience feature vector.
    Vector of crossproduct of Type and maturity size (8 elements).
    Each element has the experience in that category of the rescuer,
    normalized to the total experience in the total (train+test) dataset.
    Returns:
        Nothing, columns added to the Dataframes
    """
    
    d = d_train.append(d_test, sort=True)
    feature = np.zeros((len(d), 8), np.float32)
    types = d.Type.map(str)
    maturities = d.MaturitySize.map(str)
    h = Counter(d.RescuerID.values + "_" + types + "_" + maturities)
    keys = [f"_{i}_{j}" for i,j in product(set(types), set(maturities))]
    for i, rid in enumerate(d['RescuerID']):
        feature[i] = [h[rid+k] for k in keys]
    for i, k in enumerate(keys):
        d_train["rescuer" + k] = feature[0:len(d_train),i]
        d_test["rescuer" + k] = feature[len(d_train):,i]
    return

In [12]:
class GVBreedDesc():
    def __init__(self):
        # load breeds
        breeds = pd.read_csv('../input/petfinder-adoption-prediction/' + "breed_labels.csv")
        # compile regex
        self.breed_regexs = []
        for ind, breedname in zip(breeds.BreedID, breeds.BreedName):
            self.breed_regexs.append((ind, re.compile(f".*{breedname}.*", re.IGNORECASE)))
        
    def _description(self, im_meta):
        try:
            descriptions = ""
            for desc in im_meta['labelAnnotations']:
                descriptions += desc['description'] + " "
        except KeyError:
            pass
        except Exception as e:
            print(f"Another error: {e}")
            pass
        return descriptions

    def gv_breed_feature(self, im_meta):
        desc = self._description(im_meta)
        for i, regex in self.breed_regexs:
            if regex.match(desc):
                return i
        return None

In [13]:
def get_svd_features(train, test):
    tfv = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')

    train_desc = train["Description"].fillna("none").values
    test_desc = test["Description"].fillna("none").values

    tfv.fit(list(train_desc) + list(test_desc))
    X_train =  tfv.transform(train_desc)
    X_test = tfv.transform(test_desc)

    svd = TruncatedSVD(n_components = 120, random_state = 420)
    svd.fit(X_train)
    X_train = svd.transform(X_train)
    X_test = svd.transform(X_test)

    X_train = pd.DataFrame(X_train, columns=['svd_{}'.format(i) for i in range(120)])
    X_test = pd.DataFrame(X_test, columns=['svd_{}'.format(i) for i in range(120)])
    train = pd.concat((train, X_train), axis = 1)
    test = pd.concat((test, X_test), axis = 1)
    
    return train, test

In [14]:
cols_to_remove = ['Name', 'RescuerID', 'Description', 'PetID']

train, test = train_orig.copy(), test_orig.copy()

# # Get text area in image feature
# ia = image_annotation_features()
# ia.annotation_fies = [gv_rel_text_area_feature]
# ia.dataclass = "test"
# ia.get_ftr(test)
# ia.dataclass = "train"
# feats = ia.get_ftr(train)

# # # Get GV Breed features
# gv_breed_desc = GVBreedDesc()
# ia = image_annotation_features()
# ia.annotation_fies = [gv_breed_desc.gv_breed_feature]
# ia.dataclass = "test"
# ia.get_ftr_rcsv(test)
# ia.dataclass = "train"
# ia.get_ftr_rcsv(train)

# Get rescuer experience feature
get_rescuer_experience(train, test)

# Get textual features
train = get_textual_features_from_description(train)
test = get_textual_features_from_description(test)

# Get TFIDF and SVD features
train, test = get_svd_features(train, test)

# Get description sentiment from JSON data
train = get_description_sentiment(train, "train")
test = get_description_sentiment(test, "test")

# Get image metadata
train = get_image_metadata(train, "train")
test = get_image_metadata(test, "test")

# Get label-annotation features
# train = get_dog_cat_label_features(train, "train")
# test = get_dog_cat_label_features(test, "test")

# Get raw image features
train = get_raw_image_features(train, "train")
test = get_raw_image_features(test, "test")

# Create some aggregate features
train = get_aggregate_features(train)
test = get_aggregate_features(test)

# Save RescuerID for fold grouping
rescuerid = train.RescuerID

target = train['AdoptionSpeed']
train = train.drop('AdoptionSpeed', 1)

# Remove unnecessary columns
train.drop(cols_to_remove, 1, inplace = True)
test.drop(cols_to_remove, 1, inplace = True)

Get stopword count
Convert to lower
Clean the text
Convert the spellings
Get character count
Get subjectivity
Get word density
Get punctuation count
Get polarity
Get subjectivity
Get stopword count
Convert to lower
Clean the text
Convert the spellings
Get character count
Get subjectivity
Get word density
Get punctuation count
Get polarity
Get subjectivity
Instructions for updating:
Colocations handled automatically by placer.


HBox(children=(IntProgress(value=0, max=938), HTML(value='')))




HBox(children=(IntProgress(value=0, max=249), HTML(value='')))




In [15]:
train.shape, test.shape

((14993, 776), (3972, 776))

In [16]:
train.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,rescuer_2_4,rescuer_2_3,rescuer_2_2,rescuer_2_1,rescuer_1_4,rescuer_1_3,...,Rescuer_Gender_Health_sum,Rescuer_Gender_Health_mean,Rescuer_Gender_doc_sent_mag_mean,Rescuer_Gender_doc_sent_score_mean,Rescuer_Gender_vertex_x_mean,Rescuer_Gender_vertex_y_mean,Rescuer_Gender_bounding_confidence_mean,Rescuer_Gender_bounding_importance_mean,Rescuer_Gender_dominant_blue_mean,Rescuer_Gender_dominant_green_mean,Rescuer_Gender_dominant_red_mean,Rescuer_Gender_dominant_pixel_frac_mean,Rescuer_Gender_dominant_score_mean,Rescuer_Gender_label_score_mean,Rescuer_Gender_Description_stopword_count_mean,Rescuer_Gender_Description_stopword_count_sum,Rescuer_Gender_Description_character_count_mean,Rescuer_Gender_Description_character_count_sum,Rescuer_Gender_Description_word_count_mean,Rescuer_Gender_Description_word_count_sum,Rescuer_Gender_Description_word_density_mean,Rescuer_Gender_Description_punctuation_count_mean,Rescuer_Gender_Description_punctuation_count_sum,Rescuer_Gender_Description_polarity_mean,Rescuer_Gender_Description_subjectivity_mean
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,0,1.0,0.0,0.0,6.0,2.0,0.0,0.0,...,1,1.0,2.4,0.3,359.0,479.0,0.8,1.0,21.0,20.0,25.0,0.39391,0.302789,0.990786,34.0,34,501.0,501,167.0,167,2.982143,8.0,8,0.0,0.0
1,2,1,265,0,1,1,2,0,2,2,3,3,3,1,1,0,41401,0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1,1.0,0.7,-0.2,348.5,387.5,0.8,1.0,17.5,23.5,24.5,0.185342,0.353332,0.98359,12.0,12,137.0,137,43.0,43,3.113636,2.0,2,0.0,0.0
2,1,1,307,0,1,2,7,0,2,2,1,1,2,1,1,0,41326,0,7.0,0.0,0.0,4.0,0.0,6.0,44.0,...,148,1.0,1.473649,0.45,369.372174,443.066779,0.802522,0.993987,91.396673,104.469023,126.513345,0.099522,0.250549,0.967341,8.885135,1315,181.554054,26870,62.648649,9272,2.82471,2.358108,349,0.0,0.0
3,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,0,8.0,0.0,0.0,8.0,1.0,0.0,1.0,...,29,1.035714,1.117857,0.357143,371.645542,380.128394,0.738244,0.927375,87.666424,102.261672,118.28515,0.070447,0.230839,0.881987,34.214286,958,574.714286,16092,189.75,5313,3.008534,10.821429,303,0.0,0.0
4,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,0,3.0,0.0,0.0,3.0,1.0,0.0,4.0,...,66,1.03125,1.94375,0.290625,383.083333,452.079688,0.805833,0.988359,100.985677,119.224219,136.935417,0.118809,0.264074,0.964085,22.515625,1441,333.59375,21350,107.109375,6855,3.090985,6.265625,401,0.0,0.0


# Model Building

In [17]:
# Metric used for this competition (Quadratic Weigthed Kappa aka Quadratic Cohen Kappa Score)
def metric_function(y1, y2):
    return cohen_kappa_score(y1, y2, weights = 'quadratic')

In [18]:
def get_class_bounds(y, y_pred, N = 5, class0_fraction = -1):
    ysort = np.sort(y)
    predsort = np.sort(y_pred)
    bounds = []
    for ibound in range(N-1):
        iy = len(ysort[ysort <= ibound])
        # adjust the number of class 0 predictions?
        if (ibound == 0) and (class0_fraction >= 0.0):
            iy = int(class0_fraction * iy)
        bounds.append(predsort[iy])
    return bounds

def assign_class(y_pred, boundaries):
    """
    Given class boundaries in y_pred units, output integer class values
    """
    y_classes = np.zeros(len(y_pred))
    for iclass, bound in enumerate(boundaries):
        y_classes[y_pred >= bound] = iclass + 1
    return y_classes.astype(int)

In [19]:
# put some numerical values to bins
def to_bins(x, borders):
    for i in range(len(borders)):
        if x <= borders[i]:
            return i
    return len(borders)

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        X_p = np.array([to_bins(pred, coef) for pred in X])
        ll = -metric_function(y, X_p)
        return ll

    def fit(self, X, y):
        coef = [1.5, 2.0, 2.5, 3.0]
#         cl0fracs = np.array(np.arange(0.01, 1.001, 0.01))
#         boundaries = []
#         kappas = []
#         for cl0frac in cl0fracs:
#             boundary = get_class_bounds(y, X, class0_fraction = cl0frac)
#             train_meta_ints = assign_class(X, boundary)
#             kappa = metric_function(y, train_meta_ints)
#             kappas.append(kappa)
#             boundaries.append(boundary)
#         max_kappa_index = np.array(kappas).argmax()
#         coef = boundaries[max_kappa_index]

        golden1 = 0.618
        golden2 = 1 - golden1
        ab_start = [(1, 2), (1.5, 2.5), (2, 3), (2.5, 3.5)]
        for it1 in range(10):
            for idx in range(4):
                # golden section search
                a, b = ab_start[idx]
                # calc losses
                coef[idx] = a
                la = self._loss(coef, X, y, idx)
                coef[idx] = b
                lb = self._loss(coef, X, y, idx)
                for it in range(20):
                    # choose value
                    if la > lb:
                        a = b - (b - a) * golden1
                        coef[idx] = a
                        la = self._loss(coef, X, y, idx)
                    else:
                        b = b - (b - a) * golden2
                        coef[idx] = b
                        lb = self._loss(coef, X, y, idx)
        self.coef_ = {'x': coef}

    def predict(self, X, coef):
        X_p = np.array([to_bins(pred, coef) for pred in X])
        return X_p

    def coefficients(self):
        return self.coef_['x']

In [20]:
# class OptimizedRounder(object):
#     def __init__(self):
#         self.coef_ = 0

#     def _kappa_loss(self, coef, X, y):
#         X_p = np.copy(X)
#         for i, pred in enumerate(X_p):
#             if pred < coef[0]:
#                 X_p[i] = 0
#             elif pred >= coef[0] and pred < coef[1]:
#                 X_p[i] = 1
#             elif pred >= coef[1] and pred < coef[2]:
#                 X_p[i] = 2
#             elif pred >= coef[2] and pred < coef[3]:
#                 X_p[i] = 3
#             else:
#                 X_p[i] = 4

#         ll = metric_function(y, X_p)
#         return -ll

#     def fit(self, X, y):
#         loss_partial = partial(self._kappa_loss, X = X, y = y)
        
#         cl0fracs = np.array(np.arange(0.01, 1.001, 0.01))
#         boundaries = []
#         kappas = []
#         for cl0frac in cl0fracs:
#             boundary = get_class_bounds(y, X, class0_fraction = cl0frac)
#             train_meta_ints = assign_class(X, boundary)
#             kappa = metric_function(y, train_meta_ints)
#             kappas.append(kappa)
#             boundaries.append(boundary)
#         max_kappa_index = np.array(kappas).argmax()
#         initial_coef = boundaries[max_kappa_index]
        
#         self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')

#     def predict(self, X, coef):
#         X_p = np.copy(X)
#         for i, pred in enumerate(X_p):
#             if pred < coef[0]:
#                 X_p[i] = 0
#             elif pred >= coef[0] and pred < coef[1]:
#                 X_p[i] = 1
#             elif pred >= coef[1] and pred < coef[2]:
#                 X_p[i] = 2
#             elif pred >= coef[2] and pred < coef[3]:
#                 X_p[i] = 3
#             else:
#                 X_p[i] = 4
#         return X_p

#     def coefficients(self):
#         return self.coef_['x']

In [21]:
FOLDS = 8
train_predictions = np.zeros((train.shape[0], 1))
test_predictions = np.zeros((test.shape[0], 1))
zero_test_predictions = np.zeros((test.shape[0], 1))

# print("stratified k-folds")
# cv = StratifiedKFold(n_splits = FOLDS, random_state = 42, shuffle = False)
# cv.get_n_splits(train, target)

# print("stratified grouped k-folds")
cv = GroupKFold(n_splits = FOLDS)
cv.get_n_splits(train, target, rescuerid)

cv_scores = []
fold = 1
coefficients = np.zeros((FOLDS, 4))
for train_idx, valid_idx in cv.split(train, target, rescuerid):
    xtrain, xvalid = train.iloc[train_idx], train.iloc[valid_idx]
    ytrain, yvalid = target.iloc[train_idx], target.iloc[valid_idx]
    
    lgb_params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'learning_rate': 0.006,
            'subsample': .8,
            'colsample_bytree': 0.8,
            'n_estimators': 10000,
            'silent': -1,
            'verbose': -1,
            'random_state': 420,
            'bagging_fraction': 0.9212945843023237,
            'bagging_freq': int(2.1100859370529492),
            'feature_fraction': 0.6334740217238963,
            'lambda_l2': 1.543309192604612,
            'max_bin': int(32.46977068537903),
            'max_depth': int(11.982021953762485),
            'min_child_samples': int(44.96596724925662),
            'min_child_weight': 0.5878240657385082,
            'min_split_gain': 0.004619759404679957,
            'num_leaves': int(146.73598418222304)
    }
    
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(
        xtrain, ytrain,
        eval_set = [(xvalid, yvalid)],
        eval_metric = 'rmse',
        verbose = 100,
        early_stopping_rounds = 500
    )
    valid_preds = model.predict(xvalid, num_iteration = model.best_iteration_)

    optR = OptimizedRounder()
    optR.fit(valid_preds, yvalid.values)
    coefficients[fold - 1, :] = optR.coefficients()
    valid_p = optR.predict(valid_preds, coefficients[fold-1,:])
    
    test_preds = model.predict(test, num_iteration = model.best_iteration_)

    scr = metric_function(yvalid.values, valid_p)
    cv_scores.append(scr)
    print("Fold = {}. QWK = {}.".format(fold, scr))
    print("\n")
    train_predictions[valid_idx] = valid_preds.reshape(-1, 1)
    test_predictions += test_preds.reshape(-1, 1)
    fold += 1
test_predictions = test_predictions * 1./FOLDS
print("Mean Score: {}. Std Dev: {}. Mean Coeff: {}".format(np.mean(cv_scores), np.std(cv_scores), np.mean(coefficients, axis = 0)))

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's l2: 1.18075	valid_0's rmse: 1.08662
[200]	valid_0's l2: 1.13093	valid_0's rmse: 1.06345
[300]	valid_0's l2: 1.10496	valid_0's rmse: 1.05117
[400]	valid_0's l2: 1.09394	valid_0's rmse: 1.04592
[500]	valid_0's l2: 1.08684	valid_0's rmse: 1.04251
[600]	valid_0's l2: 1.08289	valid_0's rmse: 1.04062
[700]	valid_0's l2: 1.07977	valid_0's rmse: 1.03912
[800]	valid_0's l2: 1.07771	valid_0's rmse: 1.03813
[900]	valid_0's l2: 1.07642	valid_0's rmse: 1.03751
[1000]	valid_0's l2: 1.0756	valid_0's rmse: 1.03711
[1100]	valid_0's l2: 1.07543	valid_0's rmse: 1.03703
[1200]	valid_0's l2: 1.07482	valid_0's rmse: 1.03674
[1300]	valid_0's l2: 1.07373	valid_0's rmse: 1.03621
[1400]	valid_0's l2: 1.07339	valid_0's rmse: 1.03605
[1500]	valid_0's l2: 1.07297	valid_0's rmse: 1.03584
[1600]	valid_0's l2: 1.07288	valid_0's rmse: 1.0358
[1700]	valid_0's l2: 1.072	valid_0's rmse: 1.03537
[1800]	valid_0's l2: 1.07116	valid_0's rmse: 1.03

In [22]:
optR = OptimizedRounder()
optR.fit(train_predictions.reshape(1,-1)[0], target.values)
coefficients = optR.coefficients()

# Manually adjust coefficients
# coefficients_ = coefficients.copy()
# coefficients_[0] = 1.645
# coefficients_[1] = 2.115
# coefficients_[3] = 2.84

predictions = optR.predict(test_predictions, coefficients).astype(int)
# predictions = [item for sublist in predictions for item in sublist]

In [23]:
sample = pd.read_csv('../input/petfinder-adoption-prediction/test/sample_submission.csv')
sample.AdoptionSpeed = predictions
sample.to_csv('submission.csv', index = False)

## Random Forest

In [24]:
# random_grid = {
#     'bootstrap': [True, False],
#     'max_depth': [50, 85],
#     'max_features': ['auto'],
#     'min_samples_leaf': [10, 15],
#     'min_samples_split': [10, 15],
#     'n_estimators': [150, 200, 215],
#     'random_state' : [seed]
# }

# rf = RandomForestClassifier()
# # rf_random = RandomizedSearchCV(estimator = rf, 
# #                                param_distributions = random_grid, 
# #                                n_iter = 100,
# #                                cv = 3,
# #                                scoring = kappa_scorer,
# #                                verbose = 2, 
# #                                random_state = 42, 
# #                                n_jobs = -1)
# # rf_random.fit(train, target)
# # print(rf_random.best_score_)
# # print(rf_random.best_params_)

# best_params = {'random_state': 42, 
#                'n_estimators': 215, 
#                'min_samples_split': 10, 
#                'min_samples_leaf': 10, 
#                'max_features': 'auto', 
#                'max_depth': 50, 
#                'bootstrap': False}
# rf.set_params(**best_params)
# rf.fit(train, target)

In [25]:
# rf_val_score = np.mean(cross_val_score(rf, train, target, scoring = kappa_scorer, cv=5))
# rf_val_score

## Gradient Boosting

In [26]:
# Create the random grid
# random_grid = {'loss' : ['deviance'],
#                 'learning_rate' : [.025, 0.5],
#                 'max_depth': [5, 8],
#                 'max_features': ['auto'],
#                 'min_samples_leaf': [100],
#                 'min_samples_split': [100],
#                 'n_estimators': [100],
#                 'subsample' : [.8],
#                 'random_state' : [seed]
#               }

# gbm = GradientBoostingClassifier()
# gbm_random = RandomizedSearchCV(estimator = gbm, 
#                    param_distributions = random_grid, 
#                    n_iter = 100,
#                    cv = 3,
#                    scoring = kappa_scorer,
#                    verbose = 2, 
#                    random_state = 42, 
#                    n_jobs = -1)
# gbm_random.fit(train, target)

# best_params = {'subsample': 0.8, 
#                'random_state': 42, 
#                'n_estimators': 100, 
#                'min_samples_split': 100, 
#                'min_samples_leaf': 100, 
#                'max_features': 'auto', 
#                'max_depth': 8, 
#                'loss': 'deviance', 
#                'learning_rate': 0.025}
# gbm.set_params(**best_params)
# gbm.fit(train, target)

In [27]:
# gbm_val_score = np.mean(cross_val_score(gbm, train, target, scoring = kappa_scorer, cv=5))
# gbm_val_score

## XGBoost

In [28]:
# # Parameters
# xgb_params = {'objective' : 'multi:softmax',
#               'eval_metric' : 'mlogloss',
#               'eta' : 0.05,
#               'max_depth' : 4,
#               'num_class' : 5,
#               'lambda' : 0.8
# }

# d_train = xgb.DMatrix(train, label = target)
# d_val = xgb.DMatrix(x_val, label = y_val)

# watchlist = [(d_train, 'train'), (d_val, 'valid')]

# xgb_model = xgb.train(xgb_params, 
#                 d_train, 
#                 400)

In [29]:
# metric_function(xgb_model.predict(xgb.DMatrix(x_val)).astype(int), y_val)
# dtest = xgb.DMatrix(test)

# Submission

In [30]:
# submission_df = pd.DataFrame(data={'PetID' : test_orig['PetID'], 
#                                    'AdoptionSpeed' : xgb_model.predict(dtest).astype(int)})
# submission_df.to_csv('submission.csv', index = False)

In [31]:
# submission_df.head()