In [1]:
import os
import gc
import cv2
from datetime import datetime, timedelta, date
import numpy as np
import warnings
import re
import json
import pandas as pd
import pprint
import string
from glob import glob
from numpy import nanmean
from functools import partial
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold, RandomizedSearchCV, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb
import scipy as sp
from collections import Counter
from textblob import TextBlob
from keras.applications.densenet import preprocess_input, DenseNet121
from keras.models import Model
from keras.layers import GlobalAveragePooling2D, Input, Lambda, AveragePooling1D
import keras.backend as K
from tqdm import tqdm, tqdm_notebook

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, cohen_kappa_score, log_loss, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier

# For text processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing import sequence, text
from keras.layers import Input, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk import word_tokenize
from nltk.corpus import stopwords

from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from imblearn.over_sampling import SMOTE

# stop_words = []
stop_words = list(set(stopwords.words('english')))
warnings.filterwarnings('ignore')
punctuation = string.punctuation

pd.set_option('max_columns', 50)
pd.options.mode.chained_assignment = None
pp = pprint.PrettyPrinter(indent = 4)

import random
random.seed(420)

Using TensorFlow backend.


In [2]:
train_orig = pd.read_csv('../input/petfinder-adoption-prediction/train/train.csv')
test_orig = pd.read_csv('../input/petfinder-adoption-prediction/test/test.csv')

# Feature Engineering

## Description Sentiment

In [3]:
def get_description_sentiment(df, df_type = "train"):
    doc_sent_mag = []
    doc_sent_score = []
    nf_count = 0
    for pet_id in df.PetID.values:
        try:
            with open('../input/petfinder-adoption-prediction/' + df_type + '_sentiment/' + pet_id + '.json', 'r') as f:
                sentiment = json.load(f)
            doc_sent_mag.append(sentiment['documentSentiment']['magnitude'])
            doc_sent_score.append(sentiment['documentSentiment']['score'])
        except FileNotFoundError:
            nf_count += 1
            doc_sent_mag.append(-1)
            doc_sent_score.append(-1)
            
    df['doc_sent_mag'] = doc_sent_mag
    df['doc_sent_score'] = doc_sent_score
    return df

## Image Metadata

In [4]:
def get_image_metadata(df, df_type = "train"):
    vertex_xs = []
    vertex_ys = []
    bounding_confidences = []
    bounding_importance_fracs = []
    dominant_blues = []
    dominant_greens = []
    dominant_reds = []
    dominant_pixel_fracs = []
    dominant_scores = []
    label_scores = []
    nf_count = 0
    nl_count = 0
    
    pets_metadata = {}
    for file_name in os.listdir('../input/petfinder-adoption-prediction/' + df_type + '_metadata/'):
        pet_id = file_name.split('-')[0]
        if pet_id not in pets_metadata:
            pets_metadata[pet_id] = []
        pets_metadata[pet_id].append(file_name)
    
    for pet_id in df.PetID.values:
        if pet_id in pets_metadata:
            pet_id_metadata_files = pets_metadata[pet_id]
            
            temp_vertex_xs = []
            temp_vertex_ys = []
            temp_bounding_confidences = []
            temp_bounding_importance_fracs = []
            temp_dominant_blues = []
            temp_dominant_greens = []
            temp_dominant_reds = []
            temp_dominant_pixel_fracs = []
            temp_dominant_scores = []
            temp_label_scores = []
            for file in pet_id_metadata_files:
                with open('../input/petfinder-adoption-prediction/' + df_type + '_metadata/' + file, 'r') as f:
                    data = json.load(f)
                
                try:
                    vertex_x = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['x']
                    temp_vertex_xs.append(vertex_x)
                except:
                    temp_vertex_xs.append(-1)
                
                try:
                    vertex_y = data['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]['y']
                    temp_vertex_ys.append(vertex_y)
                except:
                    temp_vertex_ys.append(-1)
                
                try:
                    bounding_confidence = data['cropHintsAnnotation']['cropHints'][0]['confidence']
                    temp_bounding_confidences.append(bounding_confidence)
                except:
                    temp_bounding_confidences.append(-1)
                
                try:
                    bounding_importance_frac = data['cropHintsAnnotation']['cropHints'][0].get('importanceFraction', -1)
                    temp_bounding_importance_fracs.append(bounding_importance_frac)
                except:
                    temp_bounding_importance_fracs.append(-1)
                
                try:
                    dominant_blue = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['blue']
                    temp_dominant_blues.append(dominant_blue)
                except:
                    temp_dominant_blues.append(-1)
                
                try:
                    dominant_green = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['green']
                    temp_dominant_greens.append(dominant_green)
                except:
                    temp_dominant_greens.append(-1)
                
                try:
                    dominant_red = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['color']['red']
                    temp_dominant_reds.append(dominant_red)
                except:
                    temp_dominant_reds.append(-1)
                
                try:
                    dominant_pixel_frac = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['pixelFraction']
                    temp_dominant_pixel_fracs.append(dominant_pixel_frac)
                except:
                    temp_dominant_pixel_fracs.append(-1)
                
                try:
                    dominant_score = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]['score']
                    temp_dominant_scores.append(dominant_score)
                except:
                    temp_dominant_scores.append(-1)

                if data.get('labelAnnotations'):
                    label_score = data['labelAnnotations'][0]['score']
                    temp_label_scores.append(label_score)
                else:
                    temp_label_scores.append(-1)
            
            vertex_xs.append(np.mean(temp_vertex_xs))
            vertex_ys.append(np.mean(temp_vertex_ys))
            bounding_confidences.append(np.mean(temp_bounding_confidences))
            bounding_importance_fracs.append(np.mean(temp_bounding_importance_fracs))
            dominant_blues.append(np.mean(temp_dominant_blues))
            dominant_greens.append(np.mean(temp_dominant_greens))
            dominant_reds.append(np.mean(temp_dominant_reds))
            dominant_pixel_fracs.append(np.mean(temp_dominant_pixel_fracs))
            dominant_scores.append(np.mean(temp_dominant_scores))
            label_scores.append(np.mean(temp_label_scores))
        else:
            nf_count += 1
            vertex_xs.append(-1)
            vertex_ys.append(-1)
            bounding_confidences.append(-1)
            bounding_importance_fracs.append(-1)
            dominant_blues.append(-1)
            dominant_greens.append(-1)
            dominant_reds.append(-1)
            dominant_pixel_fracs.append(-1)
            dominant_scores.append(-1)
            label_scores.append(-1)
            
    df.loc[:, 'vertex_x'] = vertex_xs
    df.loc[:, 'vertex_y'] = vertex_ys
    df.loc[:, 'bounding_confidence'] = bounding_confidences
    df.loc[:, 'bounding_importance'] = bounding_importance_fracs
    df.loc[:, 'dominant_blue'] = dominant_blues
    df.loc[:, 'dominant_green'] = dominant_greens
    df.loc[:, 'dominant_red'] = dominant_reds
    df.loc[:, 'dominant_pixel_frac'] = dominant_pixel_fracs
    df.loc[:, 'dominant_score'] = dominant_scores
    df.loc[:, 'label_score'] = label_scores
    
    return df

## Textual Features

In [5]:
# Reference - https://www.kaggle.com/shivamb/extensive-text-data-feature-engineering

embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 1257 # max number of words in a description to use

pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

mispell_dict = {"aren't" : "are not",
                "can't" : "cannot",
                "couldn't" : "could not",
                "didn't" : "did not",
                "doesn't" : "does not",
                "don't" : "do not",
                "hadn't" : "had not",
                "hasn't" : "has not",
                "haven't" : "have not",
                "he'd" : "he would",
                "he'll" : "he will",
                "he's" : "he is",
                "i'd" : "I would",
                "i'd" : "I had",
                "i'll" : "I will",
                "i'm" : "I am",
                "isn't" : "is not",
                "it's" : "it is",
                "it'll":"it will",
                "i've" : "I have",
                "let's" : "let us",
                "mightn't" : "might not",
                "mustn't" : "must not",
                "shan't" : "shall not",
                "she'd" : "she would",
                "she'll" : "she will",
                "she's" : "she is",
                "shouldn't" : "should not",
                "that's" : "that is",
                "there's" : "there is",
                "they'd" : "they would",
                "they'll" : "they will",
                "they're" : "they are",
                "they've" : "they have",
                "we'd" : "we would",
                "we're" : "we are",
                "weren't" : "were not",
                "we've" : "we have",
                "what'll" : "what will",
                "what're" : "what are",
                "what's" : "what is",
                "what've" : "what have",
                "where's" : "where is",
                "who'd" : "who would",
                "who'll" : "who will",
                "who're" : "who are",
                "who's" : "who is",
                "who've" : "who have",
                "won't" : "will not",
                "wouldn't" : "would not",
                "you'd" : "you would",
                "you'll" : "you will",
                "you're" : "you are",
                "you've" : "you have",
                "'re": " are",
                "wasn't": "was not",
                "we'll":" will",
                "didn't": "did not",
                "tryin'":"trying"
}

def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

def get_polarity(text):
    try:
        textblob = TextBlob(unicode(text, 'utf-8'))
        pol = textblob.sentiment.polarity
    except:
        pol = 0.0
    return pol

def get_subjectivity(text):
    try:
        textblob = TextBlob(unicode(text, 'utf-8'))
        subj = textblob.sentiment.subjectivity
    except:
        subj = 0.0
    return subj

def get_parts_of_speech_count(x):
    cnt = {
        'noun': 0,
        'pron': 0,
        'verb': 0,
        'adj': 0,
        'adv': 0
    }

    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            for pos_tag_type in cnt.keys():
                if ppo in pos_dic[pos_tag_type]:
                    cnt[pos_tag_type] += 1
    except:
        pass
    return [cnt[i] for i in cnt.keys()]

def sent2vec(description, embedding_index_1, embedding_index_2 = None):
    M = []
    for w in word_tokenize(description):
        if not w.isalpha():
            continue
        if w in embedding_index_1 and w in embedding_index_2:
            embedding_vector = np.mean([embedding_index_1[w], embedding_index_2[w]], axis = 0)
            M.append(embedding_vector)
            continue
        if w in embedding_index_1:
            embedding_vector = embedding_index_1[w]
            M.append(embedding_vector)
            continue
        if w in embedding_index_2:
            embedding_vector = embedding_index_2[w]
            M.append(embedding_vector)
    M = np.array(M)
    v = M.sum(axis = 0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

def get_textual_features_from_description(df):
    
    # Fill NA
    df[["Description"]] = df[["Description"]].fillna("none")
    
    # Get stopwords count
    print("Get stopword count")
    df['Description_stopword_count'] = df['Description'].apply(lambda x: len([wrd for wrd in x.split() if wrd.lower() in stop_words]))
    
    # Convert to lower
    print("Convert to lower")
    df["Description"] = df["Description"].apply(lambda x: x.lower())
    
    # Clean the text
    print("Clean the text")
    df["Description"] = df["Description"].apply(lambda x: clean_text(x))
    
    # Clean spellings
    print("Convert the spellings")
    df["Description"] = df["Description"].apply(lambda x: replace_typical_misspell(x))
    
    # Remove stopwords
    df["Description"] = df["Description"].apply(lambda x: " ".join([item for item in x if item not in stop_words]))
    
    # Get character count
    print("Get character count")
    df['Description_character_count'] = df['Description'].str.len()
    
    # Get word count
    print("Get subjectivity")
    df['Description_word_count'] = df['Description'].apply(lambda x: len(x.split()))
    
    # Get word density
    print("Get word density")
    df['Description_word_density'] = df['Description_character_count']/(df['Description_word_count'] + 1)
    
    # Get number of punctuations in a description
    print("Get punctuation count")
    df['Description_punctuation_count'] = df['Description'].apply(lambda x: len("".join(_ for _ in x if _ in punctuation)))
    
    # Get polarity
    print("Get polarity")
    df['Description_polarity'] = df['Description'].apply(get_polarity)
    
    # Get subjectivity
    print("Get subjectivity")
    df['Description_subjectivity'] = df['Description'].apply(get_subjectivity)

    return df

## Raw Image Features

In [6]:
# Reference - https://www.kaggle.com/christofhenkel/extract-image-features-from-pretrained-nn/notebook
img_size = 256
batch_size = 16

def resize_to_square(im):
    old_size = im.shape[:2] # old_size is in (height, width) format
    ratio = float(img_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    
    # new_size should be in (width, height) format
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
    return new_im

def load_image(path, pet_id):
    image = cv2.imread(f'{path}{pet_id}-1.jpg')
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

def get_raw_image_features(df, df_type = "train"):
    pet_ids = df['PetID'].values
    n_batches = len(pet_ids) // batch_size + 1
    
    inp = Input((256,256,3))
    backbone = DenseNet121(input_tensor = inp, include_top = False, weights = '../input/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5')
    x = backbone.output
    x = GlobalAveragePooling2D()(x)
    x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
    x = AveragePooling1D(4)(x)
    out = Lambda(lambda x: x[:,:,0])(x)

    m = Model(inp, out)
    
    features = {}
    for b in tqdm_notebook(range(n_batches)):
        start = b*batch_size
        end = (b+1)*batch_size
        batch_pets = pet_ids[start:end]
        batch_images = np.zeros((len(batch_pets), img_size, img_size, 3))
        for i,pet_id in enumerate(batch_pets):
            try:
                batch_images[i] = load_image("../input/petfinder-adoption-prediction/" + df_type + "_images/", pet_id)
            except:
                pass
        batch_preds = m.predict(batch_images)
        for i, pet_id in enumerate(batch_pets):
            features[pet_id] = batch_preds[i]
            
    feats_df = pd.DataFrame.from_dict(features, orient = 'index').reset_index()
    feats_df['PetID'] = feats_df['index']
    feats_df = feats_df.drop('index', 1)
    df = pd.merge(df, feats_df, how = 'left', on = ['PetID'])
    
    return df

## Aggregate Features

In [7]:
def get_aggregate_features(df):
    
#     # Create dummies
#     df = pd.get_dummies(df, columns = ['Vaccinated', 'Dewormed', 'Sterilized', 'Health'])
    
    # Some RescuerID aggregate features
    aggs = {}
    aggs['Fee'] = ['mean', 'std', 'min', 'max']
    aggs['Quantity'] = ['mean', 'std', 'min', 'max']
    aggs['Age'] = ['mean', 'std', 'min', 'max']
    aggs['PhotoAmt'] = ['mean', 'std', 'min', 'max']
    aggs['VideoAmt'] = ['mean', 'std', 'min', 'max']
    aggs['FurLength'] = ['mean', 'std', 'min', 'max']
    aggs['MaturitySize'] = ['mean', 'std', 'min', 'max']
    
    for groupby_col in ['RescuerID', 'State', 'Gender']:
        df_ = df.reset_index().groupby(groupby_col).agg(aggs)
        df_.columns = [groupby_col + '_' + '_'.join(col).strip() for col in df_.columns.values]
        df_.reset_index(inplace = True)
        df = pd.merge(df, df_, on = [groupby_col], how = 'left')
    
    df_ = df.reset_index().groupby(['RescuerID', 'State']).agg(aggs)
    df_.columns = ['Rescuer_State' + '_' + '_'.join(col).strip() for col in df_.columns.values]
    df_.reset_index(inplace = True)
    df = pd.merge(df, df_, on = ['RescuerID', 'State'], how = 'left')
    
    df_ = df.reset_index().groupby(['RescuerID', 'Gender']).agg(aggs)
    df_.columns = ['Rescuer_Gender' + '_' + '_'.join(col).strip() for col in df_.columns.values]
    df_.reset_index(inplace = True)
    df = pd.merge(df, df_, on = ['RescuerID', 'Gender'], how = 'left')
        
        
    return df

## Text Area

In [8]:
def gv_rel_text_area_feature(im_meta):
    text_area = 0
    try:
        for textbox in im_meta['textAnnotations']:
            try:
                rect = np.array([[c['x'], c['y']] for c in textbox['boundingPoly']['vertices']])
                bl = np.min(rect, axis=0)
                size = np.max(rect, axis=0) - bl
                text_area += np.prod(size)
            except KeyError:
                pass
    except KeyError:
        pass
        
    try:
        imsize = im_meta['cropHintsAnnotation']['cropHints'][0]['boundingPoly']['vertices'][2]
        im_area = np.prod([int(imsize['y']+1), int(imsize['x']+1)])
    except (KeyError, TypeError):
        return 0
    return text_area/im_area

## Rescuer Experience

In [9]:
from itertools import product

def get_rescuer_experience(d_train, d_test):
    """ Creates rescuer experience feature vector.
    Vector of crossproduct of Type and maturity size (8 elements).
    Each element has the experience in that category of the rescuer,
    normalized to the total experience in the total (train+test) dataset.
    Returns:
        Nothing, columns added to the Dataframes
    """
    
    d = d_train.append(d_test, sort=True)
    feature = np.zeros((len(d), 8), np.float32)
    types = d.Type.map(str)
    maturities = d.MaturitySize.map(str)
    h = Counter(d.RescuerID.values + "_" + types + "_" + maturities)
    keys = [f"_{i}_{j}" for i,j in product(set(types), set(maturities))]
    for i, rid in enumerate(d['RescuerID']):
        feature[i] = [h[rid+k] for k in keys]
    for i, k in enumerate(keys):
        d_train["rescuer" + k] = feature[0:len(d_train),i]
        d_test["rescuer" + k] = feature[len(d_train):,i]
    return

## SVD Features

In [10]:
def get_svd_features(train, test):
    tfv = TfidfVectorizer(min_df=3,  max_features=10000,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 3), use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words = 'english')

    train_desc = train["Description"].fillna("none").values
    test_desc = test["Description"].fillna("none").values

    tfv.fit(list(train_desc) + list(test_desc))
    X_train =  tfv.transform(train_desc)
    X_test = tfv.transform(test_desc)

    svd = TruncatedSVD(n_components = 120, random_state = 420)
    svd.fit(X_train)
    X_train = svd.transform(X_train)
    X_test = svd.transform(X_test)

    X_train = pd.DataFrame(X_train, columns=['svd_{}'.format(i) for i in range(120)])
    X_test = pd.DataFrame(X_test, columns=['svd_{}'.format(i) for i in range(120)])
    train = pd.concat((train, X_train), axis = 1)
    test = pd.concat((test, X_test), axis = 1)
    
    return train, test

## State Features

In [11]:
# state GDP: https://en.wikipedia.org/wiki/List_of_Malaysian_states_by_GDP
state_gdp = {
    41336: 116.679,
    41325: 40.596,
    41367: 23.02,
    41401: 190.075,
    41415: 5.984,
    41324: 37.274,
    41332: 42.389,
    41335: 52.452,
    41330: 67.629,
    41380: 5.642,
    41327: 81.284,
    41345: 80.167,
    41342: 121.414,
    41326: 280.698,
    41361: 32.270
}

# state population: https://en.wikipedia.org/wiki/Malaysia
state_population = {
    41336: 33.48283,
    41325: 19.47651,
    41367: 15.39601,
    41401: 16.74621,
    41415: 0.86908,
    41324: 8.21110,
    41332: 10.21064,
    41335: 15.00817,
    41330: 23.52743,
    41380: 2.31541,
    41327: 15.61383,
    41345: 32.06742,
    41342: 24.71140,
    41326: 54.62141,
    41361: 10.35977
}

state_area = {
    41336:19102,
    41325:9500,
    41367:15099,
    41401:243,
    41415:91,
    41324:1664,
    41332:6686,
    41335:36137,
    41330:21035,
    41380:821,
    41327:1048,
    41345:73631,
    41342:124450,
    41326:8104,
    41361:13035
}

def get_state_features(df):
    df["state_gdp"] = df.State.map(state_gdp)
    df["state_population"] = df.State.map(state_population)
    df["state_area"] = df.State.map(state_area)
    
    return df

In [12]:
cols_to_remove = ['Name', 'RescuerID', 'Description', 'PetID']

train, test = train_orig.copy(), test_orig.copy()

# # Get text area in image feature
# ia = image_annotation_features()
# ia.annotation_fies = [gv_rel_text_area_feature]
# ia.dataclass = "test"
# ia.get_ftr(test)
# ia.dataclass = "train"
# feats = ia.get_ftr(train)

# Get rescuer experience feature
get_rescuer_experience(train, test)

# Get TFIDF and SVD features
train, test = get_svd_features(train, test)

# Get description sentiment from JSON data
train = get_description_sentiment(train, "train")
test = get_description_sentiment(test, "test")

# Get image metadata
train = get_image_metadata(train, "train")
test = get_image_metadata(test, "test")

# Create some custom features
train = get_aggregate_features(train)
test = get_aggregate_features(test)

# Get raw image features
train = get_raw_image_features(train, "train")
test = get_raw_image_features(test, "test")

# Get textual features
train = get_textual_features_from_description(train)
test = get_textual_features_from_description(test)

# Get state features
train = get_state_features(train)
test = get_state_features(test)

# Get target
target = train['AdoptionSpeed']
train = train.drop('AdoptionSpeed', 1)

# Save RescuerID for fold grouping
rescuerid = train.RescuerID

# Remove unnecessary columns
train = train.drop(cols_to_remove, 1)
test = test.drop(cols_to_remove, 1)

Instructions for updating:
Colocations handled automatically by placer.


HBox(children=(IntProgress(value=0, max=938), HTML(value='')))




HBox(children=(IntProgress(value=0, max=247), HTML(value='')))


Get stopword count
Convert to lower
Clean the text
Convert the spellings
Get character count
Get subjectivity
Get word density
Get punctuation count
Get polarity
Get subjectivity
Get stopword count
Convert to lower
Clean the text
Convert the spellings
Get character count
Get subjectivity
Get word density
Get punctuation count
Get polarity
Get subjectivity


In [13]:
train.shape

(14993, 565)

In [14]:
train.columns

Index([                         'Type',                           'Age',
                              'Breed1',                        'Breed2',
                              'Gender',                        'Color1',
                              'Color2',                        'Color3',
                        'MaturitySize',                     'FurLength',
       ...
          'Description_stopword_count',   'Description_character_count',
              'Description_word_count',      'Description_word_density',
       'Description_punctuation_count',          'Description_polarity',
            'Description_subjectivity',                     'state_gdp',
                    'state_population',                    'state_area'],
      dtype='object', length=565)

In [15]:
train.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,rescuer_1_1,rescuer_1_3,rescuer_1_2,rescuer_1_4,rescuer_2_1,rescuer_2_3,...,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,Description_stopword_count,Description_character_count,Description_word_count,Description_word_density,Description_punctuation_count,Description_polarity,Description_subjectivity,state_gdp,state_population,state_area
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,...,1.337702,0.150102,1.081104,1.282496,0.643566,0.787699,0.176625,0.575706,1.088628,0.439556,0.52046,1.54707,0.832572,0.599093,0.763348,34,501,167,2.982143,8,0.0,0.0,280.698,54.62141,8104
1,2,1,265,0,1,1,2,0,2,2,3,3,3,1,1,0,41401,0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.944075,0.481491,0.977006,1.29576,0.751895,0.628259,0.686865,0.563999,0.968191,1.070276,1.545742,0.894409,0.838596,0.468237,0.916672,12,137,43,3.113636,2,0.0,0.0,190.075,16.74621,243
2,1,1,307,0,1,2,7,0,2,2,1,1,2,1,1,0,41326,0,7.0,6.0,44.0,399.0,6.0,0.0,0.0,...,0.705539,0.79394,0.469396,0.278331,1.043843,0.579116,0.557624,1.131406,0.720514,1.49667,0.870955,1.289683,1.184462,0.465114,0.892826,30,513,171,2.982558,9,0.0,0.0,280.698,54.62141,8104
3,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,0,8.0,0.0,1.0,40.0,0.0,1.0,0.0,...,1.690941,1.246175,0.743014,1.178628,0.865275,1.295853,0.326143,0.291669,1.608087,1.119176,1.470889,0.591444,0.832755,0.483021,1.134127,8,209,67,3.073529,7,0.0,0.0,190.075,16.74621,243
4,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,0,3.0,8.0,4.0,118.0,0.0,1.0,0.0,...,1.418224,0.187639,1.176707,0.638147,0.425842,1.092663,0.669894,0.395784,0.886075,1.219729,1.033965,1.065686,0.304054,0.438069,0.676818,42,559,182,3.054645,9,0.0,0.0,280.698,54.62141,8104


# Model Building

In [16]:
# Metric used for this competition (Quadratic Weigthed Kappa aka Quadratic Cohen Kappa Score)
def metric_function(y1, y2):
    return cohen_kappa_score(y1, y2, weights = 'quadratic')

In [17]:
def get_class_bounds(y, y_pred, N = 5, class0_fraction = -1):
    ysort = np.sort(y)
    predsort = np.sort(y_pred)
    bounds = []
    for ibound in range(N-1):
        iy = len(ysort[ysort <= ibound])
        # adjust the number of class 0 predictions?
        if (ibound == 0) and (class0_fraction >= 0.0) :
            iy = int(class0_fraction * iy)
        bounds.append(predsort[iy])
    return bounds

def assign_class(y_pred, boundaries):
    """
    Given class boundaries in y_pred units, output integer class values
    """
    y_classes = np.zeros(len(y_pred))
    for iclass, bound in enumerate(boundaries):
        y_classes[y_pred >= bound] = iclass + 1
    return y_classes.astype(int)

In [18]:
# put some numerical values to bins
def to_bins(x, borders):
    for i in range(len(borders)):
        if x <= borders[i]:
            return i
    return len(borders)

class OptimizedRounder(object):
    def __init__(self):
        self.coef_ = 0

    def _loss(self, coef, X, y, idx):
        X_p = np.array([to_bins(pred, coef) for pred in X])
        ll = -metric_function(y, X_p)
        return ll

    def fit(self, X, y):
        coef = [1.5, 2.0, 2.5, 3.0]
        golden1 = 0.618
        golden2 = 1 - golden1
        ab_start = [(1, 2), (1.5, 2.5), (2, 3), (2.5, 3.5)]
        for it1 in range(10):
            for idx in range(4):
                # golden section search
                a, b = ab_start[idx]
                # calc losses
                coef[idx] = a
                la = self._loss(coef, X, y, idx)
                coef[idx] = b
                lb = self._loss(coef, X, y, idx)
                for it in range(20):
                    # choose value
                    if la > lb:
                        a = b - (b - a) * golden1
                        coef[idx] = a
                        la = self._loss(coef, X, y, idx)
                    else:
                        b = b - (b - a) * golden2
                        coef[idx] = b
                        lb = self._loss(coef, X, y, idx)
        self.coef_ = {'x': coef}

    def predict(self, X, coef):
        X_p = np.array([to_bins(pred, coef) for pred in X])
        return X_p

    def coefficients(self):
        return self.coef_['x']

In [19]:
# class OptimizedRounder(object):
#     def __init__(self):
#         self.coef_ = 0

#     def _kappa_loss(self, coef, X, y):
#         X_p = np.copy(X)
#         for i, pred in enumerate(X_p):
#             if pred < coef[0]:
#                 X_p[i] = 0
#             elif pred >= coef[0] and pred < coef[1]:
#                 X_p[i] = 1
#             elif pred >= coef[1] and pred < coef[2]:
#                 X_p[i] = 2
#             elif pred >= coef[2] and pred < coef[3]:
#                 X_p[i] = 3
#             else:
#                 X_p[i] = 4

#         ll = metric_function(y, X_p)
#         return -ll

#     def fit(self, X, y):
#         loss_partial = partial(self._kappa_loss, X = X, y = y)
        
#         cl0fracs = np.array(np.arange(0.01, 1.001, 0.01))
#         boundaries = []
#         kappas = []
#         for cl0frac in cl0fracs:
#             boundary = get_class_bounds(y, X, class0_fraction = cl0frac)
#             train_meta_ints = assign_class(X, boundary)
#             kappa = metric_function(y, train_meta_ints)
#             kappas.append(kappa)
#             boundaries.append(boundary)
#         max_kappa_index = np.array(kappas).argmax()
#         initial_coef = boundaries[max_kappa_index]
        
#         self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')

#     def predict(self, X, coef):
#         X_p = np.copy(X)
#         for i, pred in enumerate(X_p):
#             if pred < coef[0]:
#                 X_p[i] = 0
#             elif pred >= coef[0] and pred < coef[1]:
#                 X_p[i] = 1
#             elif pred >= coef[1] and pred < coef[2]:
#                 X_p[i] = 2
#             elif pred >= coef[2] and pred < coef[3]:
#                 X_p[i] = 3
#             else:
#                 X_p[i] = 4
#         return X_p

#     def coefficients(self):
#         return self.coef_['x']

In [20]:
FOLDS = 10
train_predictions = np.zeros((train.shape[0], 1))
test_predictions = np.zeros((test.shape[0], 1))
zero_test_predictions = np.zeros((test.shape[0], 1))

# print("stratified k-folds")
# cv = StratifiedKFold(n_splits = FOLDS, random_state = 42, shuffle = False)
# cv.get_n_splits(train, target)

# print("stratified grouped k-folds")
cv = GroupKFold(n_splits = FOLDS)
cv.get_n_splits(train, target, rescuerid)

cv_scores = []
fold = 1
coefficients = np.zeros((FOLDS, 4))
for train_idx, valid_idx in cv.split(train, target, rescuerid):
    xtrain, xvalid = train.iloc[train_idx], train.iloc[valid_idx]
    ytrain, yvalid = target.iloc[train_idx], target.iloc[valid_idx]
    
    lgb_params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'learning_rate': 0.006,
            'subsample': .8,
            'colsample_bytree': 0.8,
            'n_estimators': 10000,
            'silent': -1,
            'verbose': -1,
            'random_state': 420,
            'bagging_fraction': 0.9212945843023237,
            'bagging_freq': int(2.1100859370529492),
            'feature_fraction': 0.6334740217238963,
            'lambda_l2': 1.543309192604612,
            'max_bin': int(32.46977068537903),
            'max_depth': int(11.982021953762485),
            'min_child_samples': int(44.96596724925662),
            'min_child_weight': 0.5878240657385082,
            'min_split_gain': 0.004619759404679957,
            'num_leaves': int(146.73598418222304)
    }
    
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(
        xtrain, ytrain,
        eval_set = [(xvalid, yvalid)],
        eval_metric = 'rmse',
        verbose = 100,
        early_stopping_rounds = 100
    )
    
    valid_preds = model.predict(xvalid, num_iteration = model.best_iteration_)
    optR = OptimizedRounder()
    optR.fit(valid_preds, yvalid.values)
    coefficients[fold - 1, :] = optR.coefficients()
    valid_p = optR.predict(valid_preds, coefficients[fold-1,:])
    scr = metric_function(yvalid.values, valid_p)
    cv_scores.append(scr)
    print("Fold = {}. QWK = {}.".format(fold, scr))
    print("\n")
    
    test_preds = model.predict(test, num_iteration = model.best_iteration_)
    train_predictions[valid_idx] = valid_preds.reshape(-1, 1)
    test_predictions += test_preds.reshape(-1, 1)
    fold += 1
test_predictions = test_predictions * 1./FOLDS
print("Mean Score: {}. Std Dev: {}. Mean Coeff: {}".format(np.mean(cv_scores), np.std(cv_scores), np.mean(coefficients, axis = 0)))

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 1.14484	valid_0's rmse: 1.06997
[200]	valid_0's l2: 1.08875	valid_0's rmse: 1.04343
[300]	valid_0's l2: 1.06215	valid_0's rmse: 1.03061
[400]	valid_0's l2: 1.05039	valid_0's rmse: 1.02488
[500]	valid_0's l2: 1.04462	valid_0's rmse: 1.02207
[600]	valid_0's l2: 1.04157	valid_0's rmse: 1.02057
[700]	valid_0's l2: 1.04059	valid_0's rmse: 1.02009
[800]	valid_0's l2: 1.03841	valid_0's rmse: 1.01903
[900]	valid_0's l2: 1.0377	valid_0's rmse: 1.01868
Early stopping, best iteration is:
[857]	valid_0's l2: 1.03703	valid_0's rmse: 1.01834
Fold = 1. QWK = 0.4300947097947181.


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 1.21137	valid_0's rmse: 1.10062
[200]	valid_0's l2: 1.14175	valid_0's rmse: 1.06853
[300]	valid_0's l2: 1.10642	valid_0's rmse: 1.05186
[400]	valid_0's l2: 1.08683	valid_0's rmse: 1.04251
[500]	valid_0's l2: 1.0765	valid_0's rmse: 1.03755
[600]	valid_0's l2: 1.0

In [21]:
optR = OptimizedRounder()
optR.fit(train_predictions.reshape(1, -1)[0], target.values)
coefficients = optR.coefficients()
predictions = optR.predict(test_predictions, coefficients).astype(int)

In [22]:
# optR = OptimizedRounder()
# optR.fit(train_predictions.reshape(1,-1)[0], target.values)
# coefficients = optR.coefficients()

# # Manually adjust coefficients
# coefficients_ = coefficients.copy()
# coefficients_[0] = 1.645
# coefficients_[1] = 2.115
# coefficients_[3] = 2.84

# predictions = optR.predict(test_predictions, coefficients_).astype(int)

In [23]:
sample = pd.read_csv('../input/petfinder-adoption-prediction/test/sample_submission.csv')
sample.AdoptionSpeed = predictions
sample.to_csv('submission.csv', index = False)