In [None]:
# https://www.kaggle.com/reppy4620/xgboost

In [1]:
import gc
import glob
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import warnings

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seed NumPy's global RNG and silence every warning for the rest of the session.
np.random.seed(seed=1337)
warnings.filterwarnings('ignore')

# Separator used below to split the '/'-normalised glob paths into components.
split_char = '/'

In [2]:
os.listdir('./input')

['breed_labels.csv',
 'color_labels.csv',
 'state_labels.csv',
 'test',
 'test_images',
 'test_metadata',
 'test_sentiment',
 'train',
 'train_images',
 'train_metadata',
 'train_sentiment']

In [3]:
# Load the competition tables: train rows, test rows and the submission template.
train = pd.read_csv('./input/train/train.csv')
test = pd.read_csv('./input/test/test.csv')
sample_submission = pd.read_csv('./input/test/sample_submission.csv')

In [4]:
# Label lookup tables for breeds, states and colours
# (presumably ID -> name mappings; only the breed table is used further down).
labels_breed = pd.read_csv('./input/breed_labels.csv')
labels_state = pd.read_csv('./input/state_labels.csv')
labels_color = pd.read_csv('./input/color_labels.csv')

In [5]:
# Sorted file lists per input kind; backslashes are normalised to '/' so the
# paths can be split the same way on every platform.
train_image_files, train_metadata_files, train_sentiment_files = (
    [path.replace('\\', '/') for path in sorted(glob.glob(pattern))]
    for pattern in (
        './input/train_images/*.jpg',
        './input/train_metadata/*.json',
        './input/train_sentiment/*.json',
    )
)

In [6]:
# Report how many train files of each kind the globs found.
print(f'num of train images files: {len(train_image_files)}')
print(f'num of train metadata files: {len(train_metadata_files)}')
print(f'num of train sentiment files: {len(train_sentiment_files)}')

num of train images files: 58311
num of train metadata files: 58311
num of train sentiment files: 14442


In [7]:
# Sorted file lists per input kind; backslashes are normalised to '/' so the
# paths can be split the same way on every platform.
test_image_files, test_metadata_files, test_sentiment_files = (
    [path.replace('\\', '/') for path in sorted(glob.glob(pattern))]
    for pattern in (
        './input/test_images/*.jpg',
        './input/test_metadata/*.json',
        './input/test_sentiment/*.json',
    )
)

In [8]:
# Report how many test files of each kind the globs found.
print(f'num of test images files: {len(test_image_files)}')
print(f'num of test metadata files: {len(test_metadata_files)}')
print(f'num of test setiment files: {len(test_sentiment_files)}')

num of test images files: 15040
num of test metadata files: 15040
num of test setiment files: 3815


#### Train

In [9]:
# PetID column of the train table; the printed shape gives the number of train rows.
train_df_ids = train[['PetID']]
print(train_df_ids.shape)

(14993, 1)


In [10]:
# Metadata: one row per metadata JSON path collected by the glob above.
train_df_ids = train[['PetID']]
train_df_metadata = pd.DataFrame(train_metadata_files)
train_df_metadata.columns = ['metadata_filename']

In [11]:
# The PetID is the part of the file name (last path component) before the first '-'.
train_metadata_pets = train_df_metadata['metadata_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])
train_df_metadata = train_df_metadata.assign(PetID=train_metadata_pets)
# Number of distinct PetIDs that have at least one metadata file.
print(len(train_metadata_pets.unique()))

14652


In [12]:
# PetIDs found both in the metadata file names and in the train table,
# expressed as a fraction of the number of train rows.
pets_with_metadatas = len(np.intersect1d(train_metadata_pets.unique(), train_df_ids['PetID'].unique()))
print(f'fraction of pets with metadata: {pets_with_metadatas / train_df_ids.shape[0]:.3f}')

fraction of pets with metadata: 0.977


In [13]:
# Sentiment: one row per sentiment JSON path collected by the glob above.
train_df_ids = train[['PetID']]
train_df_sentiment = pd.DataFrame(train_sentiment_files)
train_df_sentiment.columns = ['sentiment_filename']
# The PetID is the file name (last path component) up to the first '.'.
train_sentiment_pets = train_df_sentiment['sentiment_filename'].apply(lambda x: x.split(split_char)[-1].split('.')[0])
train_df_sentiment = train_df_sentiment.assign(PetID=train_sentiment_pets)
# Number of distinct PetIDs that have a sentiment file.
print(len(train_sentiment_pets.unique()))

14442


In [14]:
# PetIDs found both in the sentiment file names and in the train table,
# expressed as a fraction of the number of train rows.
pets_with_sentiments = len(np.intersect1d(train_sentiment_pets.unique(), train_df_ids['PetID'].unique()))
print(f'fraction of pets with sentiment: {pets_with_sentiments / train_df_ids.shape[0]:.3f}')

fraction of pets with sentiment: 0.963


#### Test

In [15]:
# PetID column of the test table; the printed shape gives the number of test rows.
test_df_ids = test[['PetID']]
print(test_df_ids.shape)

(3948, 1)


In [16]:
# Metadata: one row per metadata JSON path collected by the glob above.
test_df_metadata = pd.DataFrame(test_metadata_files)
test_df_metadata.columns = ['metadata_filename']
# The PetID is the part of the file name (last path component) before the first '-'.
test_metadata_pets = test_df_metadata['metadata_filename'].apply(lambda x: x.split(split_char)[-1].split('-')[0])
test_df_metadata = test_df_metadata.assign(PetID = test_metadata_pets)
# Number of distinct PetIDs that have at least one metadata file.
print(len(test_metadata_pets.unique()))

3821


In [17]:
# PetIDs found both in the metadata file names and in the test table,
# expressed as a fraction of the number of test rows.
# Note: this reuses (overwrites) the name already assigned for the train split.
pets_with_metadatas = len(np.intersect1d(test_metadata_pets.unique(), test_df_ids['PetID'].unique()))
print(f'fraction of pets with metadata: {pets_with_metadatas / test_df_ids.shape[0]:.3f}')

fraction of pets with metadata: 0.968


In [18]:
# Sentiment: one row per sentiment JSON path collected by the glob above.
test_df_sentiment = pd.DataFrame(test_sentiment_files)
test_df_sentiment.columns = ['sentiment_filename']
# The PetID is the file name (last path component) up to the first '.'.
test_sentiment_pets = test_df_sentiment['sentiment_filename'].apply(lambda x: x.split(split_char)[-1].split('.')[0])
test_df_sentiment = test_df_sentiment.assign(PetID = test_sentiment_pets)
# Number of distinct PetIDs that have a sentiment file.
print(len(test_sentiment_pets.unique()))

3815


In [19]:
# PetIDs found both in the sentiment file names and in the test table,
# expressed as a fraction of the number of test rows.
# Note: this reuses (overwrites) the name already assigned for the train split.
pets_with_sentiments = len(np.intersect1d(test_sentiment_pets.unique(), test_df_ids['PetID'].unique()))
print(f'fraction of pets with sentiment: {pets_with_sentiments / test_df_ids.shape[0]:.3f}')

fraction of pets with sentiment: 0.966


#### Extract features from json

In [20]:
"""
extract features from json
"""

class PetFinderParser(object):
    """Parser for the per-pet JSON files (text sentiment and image metadata).

    Each ``parse_*`` method takes an already decoded JSON object (a dict) and
    returns a one-row DataFrame whose column names carry a ``sentiment_`` or
    ``metadata_`` prefix.
    """

    def __init__(self, debug=False):
        # Neither of these two flags is read by the methods of this class.
        self.debug = debug
        self.extract_sentiment_text = False
        # Separator used when several strings are collapsed into one text field.
        self.sentence_sep = ' '

    def open_json_file(self, filename):
        """Read ``filename`` as UTF-8 encoded JSON and return the decoded object."""
        with open(filename, 'r', encoding='utf-8') as f:
            return json.load(f)

    def parse_sentiment_file(self, file):
        """
        Parse sentiment file. Output DF with sentiment features.

        Columns of the returned one-row frame: the fields of
        ``file['documentSentiment']``, the sums of ``magnitude`` and ``score``
        over ``file['sentences']`` (``magnitude_sum`` / ``score_sum``) and the
        entity names joined into one string (``entities``), all prefixed with
        ``sentiment_``.
        """
        file_sentiment = file['documentSentiment']
        file_entities = self.sentence_sep.join(x['name'] for x in file['entities'])

        # Fixing the columns up front keeps both sums defined (0) when the
        # document contains no sentences; an empty frame without these
        # columns would raise KeyError on the lookups below.
        sentences_sentiment = pd.DataFrame(
            [x['sentiment'] for x in file['sentences']],
            columns=['magnitude', 'score'])
        sentence_sums = pd.DataFrame(
            {
                'magnitude_sum': sentences_sentiment['magnitude'].sum(axis=0),
                'score_sum': sentences_sentiment['score'].sum(axis=0),
            }, index=[0]
        )

        df_sentiment = pd.DataFrame.from_dict(file_sentiment, orient='index').T
        df_sentiment = pd.concat([df_sentiment, sentence_sums], axis=1)

        df_sentiment['entities'] = file_entities
        df_sentiment = df_sentiment.add_prefix('sentiment_')

        return df_sentiment

    def parse_metadata_file(self, file):
        """
        Parse metadata file. Output DF with metadata features.

        Columns of the returned one-row frame (all prefixed with ``metadata_``):

        * ``annots_score``    - mean ``score`` over ``labelAnnotations``
          (NaN when there are none);
        * ``crop_importance`` - mean ``importanceFraction`` over the crop hints
          (NaN when there are no hints or the first hint has no such field);
        * ``annots_top_desc`` - the label descriptions joined into one string.

        Missing ``labelAnnotations`` / ``cropHintsAnnotation`` sections and an
        empty crop-hint list are tolerated and yield NaN / '' instead of raising.
        """
        annotations = file.get('labelAnnotations')
        if annotations:
            annots_score = np.asarray([x['score'] for x in annotations]).mean()
            annots_desc = [x['description'] for x in annotations]
        else:
            annots_score = np.nan
            annots_desc = ['']

        crops = file.get('cropHintsAnnotation', {}).get('cropHints', [])
        # As before, the first hint decides whether the field is available at all;
        # hints that lack it are skipped rather than raising KeyError.
        if crops and 'importanceFraction' in crops[0]:
            crop_importance = np.asarray(
                [x['importanceFraction'] for x in crops if 'importanceFraction' in x]
            ).mean()
        else:
            crop_importance = np.nan

        df_metadata = {
            'annots_score': annots_score,
            'crop_importance': crop_importance,
            'annots_top_desc': self.sentence_sep.join(annots_desc)
        }

        df_metadata = pd.DataFrame.from_dict(df_metadata, orient='index').T
        df_metadata = df_metadata.add_prefix('metadata_')

        return df_metadata
    
def extract_additional_features(pet_id, mode='train'):
    """Collect the sentiment and image-metadata features of one pet.

    Reads ``./input/{mode}_sentiment/{pet_id}.json`` and every
    ``./input/{mode}_metadata/{pet_id}-*.json`` and parses them with the
    module-level ``pet_parser``.

    Args:
        pet_id: PetID whose files should be parsed.
        mode: input split, used as the directory prefix (``'train'`` by default).

    Returns:
        A two-element list ``[sentiment, metadata]``. ``sentiment`` is a one-row
        DataFrame, ``metadata`` a DataFrame with one row per metadata file; both
        carry a ``PetID`` column. Either element is an empty list when the pet
        has no file of that kind.
    """
    sentiment_filename = f'./input/{mode}_sentiment/{pet_id}.json'
    try:
        sentiment_file = pet_parser.open_json_file(sentiment_filename)
        df_sentiment = pet_parser.parse_sentiment_file(sentiment_file)
        df_sentiment['PetID'] = pet_id
    except FileNotFoundError:
        # A missing sentiment file is expected for some pets; signal it with [].
        df_sentiment = []

    # Metadata files are named '<PetID>-<n>.json'. Anchoring the pattern on the
    # '-' keeps an ID that is a prefix of another ID from picking up its files.
    dfs_metadata = []
    metadata_filenames = sorted(glob.glob(f'./input/{mode}_metadata/{pet_id}-*.json'))
    if metadata_filenames:
        for f in metadata_filenames:
            metadata_file = pet_parser.open_json_file(f)
            df_metadata = pet_parser.parse_metadata_file(metadata_file)
            df_metadata['PetID'] = pet_id
            dfs_metadata.append(df_metadata)
        dfs_metadata = pd.concat(dfs_metadata, ignore_index=True, sort=False)

    return [df_sentiment, dfs_metadata]
    
pet_parser = PetFinderParser()

In [21]:
debug = False
train_pet_ids = train.PetID.unique()
test_pet_ids = test.PetID.unique()

# In debug mode only a leading slice of each split is processed.
if debug:
    train_pet_ids = train_pet_ids[:1000]
    test_pet_ids = test_pet_ids[:500]


def _stack_frames(results, position):
    """Concatenate the DataFrames found at ``position`` of every per-pet result.

    Entries that are not DataFrames (the empty-list placeholders returned for
    pets without files) are skipped.
    """
    frames = [res[position] for res in results if isinstance(res[position], pd.DataFrame)]
    return pd.concat(frames, ignore_index=True, sort=False)


dfs_train = Parallel(n_jobs=6, verbose=1)(
    delayed(extract_additional_features)(pet_id, mode='train') for pet_id in train_pet_ids)

train_dfs_sentiment = _stack_frames(dfs_train, 0)
train_dfs_metadata = _stack_frames(dfs_train, 1)

print(train_dfs_sentiment.shape, train_dfs_metadata.shape)

dfs_test = Parallel(n_jobs=6, verbose=1)(
    delayed(extract_additional_features)(pet_id, mode='test') for pet_id in test_pet_ids)

test_dfs_sentiment = _stack_frames(dfs_test, 0)
test_dfs_metadata = _stack_frames(dfs_test, 1)

print(test_dfs_sentiment.shape, test_dfs_metadata.shape)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    1.9s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:    6.4s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:   13.5s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:   23.6s
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:   36.4s
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:   51.9s
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed:  1.5min
[Parallel(n_jobs=6)]: Done 4038 tasks      | elapsed:  1.9min
[Parallel(n_jobs=6)]: Done 4988 tasks      | elapsed:  2.3min
[Parallel(n_jobs=6)]: Done 6038 tasks      | elapsed:  2.8min
[Parallel(n_jobs=6)]: Done 7188 tasks      | elapsed:  3.3min
[Parallel(n_jobs=6)]: Done 8438 tasks      | elapsed:  3.9min
[Parallel(n_jobs=6)]: Done 9788 tasks      | elapsed:  4.5min
[Parallel(n_jobs=6)]: Done 11238 tasks      | elapsed:  5.2mi

(14442, 6) (58311, 4)


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:    1.1s
[Parallel(n_jobs=6)]: Done 716 tasks      | elapsed:    7.5s
[Parallel(n_jobs=6)]: Done 1716 tasks      | elapsed:   18.4s
[Parallel(n_jobs=6)]: Done 3116 tasks      | elapsed:   32.4s
[Parallel(n_jobs=6)]: Done 3948 out of 3948 | elapsed:   40.6s finished


(3815, 6) (15040, 4)


#### Group extracted features by PetID

In [42]:
"""
group extracted features by PetID
"""

aggregates = ['sum', 'mean']
sent_agg = ['sum']


def _join_unique_text(features, text_col):
    """Return one row per PetID with its distinct ``text_col`` values joined by spaces."""
    joined = features.groupby(['PetID'])[text_col].unique().reset_index()
    joined[text_col] = joined[text_col].apply(lambda x: ' '.join(x))
    return joined


def _aggregate_numeric(features, text_col, agg_funcs, suffix_agg_name):
    """Aggregate every column except ``text_col`` per PetID after casting to float.

    With ``suffix_agg_name`` the output columns are named ``<column>_<AGG>``.
    Otherwise the aggregate name is dropped and the plain column name is kept,
    which only gives unique names when ``agg_funcs`` holds a single function.
    """
    numeric = features.drop([text_col], axis=1)
    for col in numeric.columns:
        if 'PetID' not in col:
            numeric[col] = numeric[col].astype(float)
    grouped = numeric.groupby(['PetID']).agg(agg_funcs)
    if suffix_agg_name:
        names = [f'{c[0]}_{c[1].upper()}' for c in grouped.columns.tolist()]
    else:
        names = [f'{c[0]}' for c in grouped.columns.tolist()]
    grouped.columns = pd.Index(names)
    return grouped.reset_index()


# Train
train_metadata_desc = _join_unique_text(train_dfs_metadata, 'metadata_annots_top_desc')
train_metadata_gr = _aggregate_numeric(
    train_dfs_metadata, 'metadata_annots_top_desc', aggregates, True)

train_sentiment_desc = _join_unique_text(train_dfs_sentiment, 'sentiment_entities')
train_sentiment_gr = _aggregate_numeric(
    train_dfs_sentiment, 'sentiment_entities', sent_agg, False)


# Test
test_metadata_desc = _join_unique_text(test_dfs_metadata, 'metadata_annots_top_desc')
test_metadata_gr = _aggregate_numeric(
    test_dfs_metadata, 'metadata_annots_top_desc', aggregates, True)

test_sentiment_desc = _join_unique_text(test_dfs_sentiment, 'sentiment_entities')
test_sentiment_gr = _aggregate_numeric(
    test_dfs_sentiment, 'sentiment_entities', sent_agg, False)

In [43]:
"""
merge processed DFs with base train/test DF:
"""

# Train merges: left-join every per-pet feature table onto a copy of the base table.
train_proc = train.copy()
for extra in (train_sentiment_gr, train_metadata_gr, train_sentiment_desc, train_metadata_desc):
    train_proc = train_proc.merge(extra, how='left', on='PetID')

# Test merges: same tables, same order.
test_proc = test.copy()
for extra in (test_sentiment_gr, test_metadata_gr, test_sentiment_desc, test_metadata_desc):
    test_proc = test_proc.merge(extra, how='left', on='PetID')

print(train_proc.shape, test_proc.shape)
# The left joins must not have duplicated or dropped any base row.
assert train_proc.shape[0] == train.shape[0]
assert test_proc.shape[0] == test.shape[0]

(14993, 34) (3948, 33)


In [44]:
def _breed_label_columns(frame, breed_col, name):
    """Look up ``breed_col`` of ``frame`` in ``labels_breed``.

    Returns the looked-up columns prefixed with ``<name>_``, row-aligned with
    ``frame``'s row order.
    """
    looked_up = frame[[breed_col]].merge(
        labels_breed, how='left', left_on=breed_col,
        right_on='BreedID', suffixes=('', f'_{name}'))
    # Drop the first two columns of the merge result (presumably the two join
    # keys, breed_col and BreedID) and keep the remaining label columns.
    return looked_up.iloc[:, 2:].add_prefix(f'{name}_')


train_breed_main = _breed_label_columns(train_proc, 'Breed1', 'main_breed')
train_breed_second = _breed_label_columns(train_proc, 'Breed2', 'second_breed')
train_proc = pd.concat([train_proc, train_breed_main, train_breed_second], axis=1)

test_breed_main = _breed_label_columns(test_proc, 'Breed1', 'main_breed')
test_breed_second = _breed_label_columns(test_proc, 'Breed2', 'second_breed')
test_proc = pd.concat([test_proc, test_breed_main, test_breed_second], axis=1)

print(train_proc.shape, test_proc.shape)

(14993, 38) (3948, 37)


In [46]:
# Stack train on top of test with a fresh index so both get identical feature processing.
X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)

In [50]:
# Working copy; X itself stays untouched by the feature engineering below.
X_temp = X.copy()

# Free-text columns (reduced to TF-IDF/SVD components further down).
text_columns = ['Description', 'metadata_annots_top_desc', 'sentiment_entities']
# String columns that get integer-encoded.
categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']

# Identifier-like columns removed before modelling.
to_drop_columns = ['PetID', 'Name', 'RescuerID']

In [51]:
# Number of (non-null) PetIDs listed per rescuer, counted over train and test together.
rescuer_count = (
    X.groupby(['RescuerID'])['PetID']
    .count()
    .reset_index()
    .rename(columns={'PetID': 'RescuerID_COUNT'})
)

X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

In [52]:
# Integer-encode the string columns (pd.factorize maps missing values to -1).
# Plain column assignment replaces the column, so it really ends up with an
# integer dtype; `X_temp.loc[:, col] = ...` writes into the existing object
# column on recent pandas and would leave the dtype as object.
for col in categorical_columns:
    X_temp[col] = pd.factorize(X_temp[col])[0]

In [53]:
# Independent frame holding only the text columns, with missing texts replaced
# by the token 'none'. Done in one step instead of writing column by column
# into a frame derived from X_temp.
X_text = X_temp[text_columns].fillna('none')

#### TF-IDF

In [65]:
"""
TF-IDF
"""

n_components = 5
text_features = []

# Generate text features: per text column, a word 1-3-gram TF-IDF matrix
# reduced to `n_components` dense columns with truncated SVD.
for col in X_text.columns:

    # Initialize decomposition methods:
    print(f'generating features from: {col}')
    # The three flags are passed as real booleans; scikit-learn releases that
    # validate parameter types reject the integer 1 here.
    tfv = TfidfVectorizer(
        min_df=2, max_features=None, strip_accents='unicode',
        analyzer='word', token_pattern=r'(?u)\b\w+\b',
        ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True
    )

    svd_ = TruncatedSVD(n_components=n_components, random_state=1337)
    tfidf_col = tfv.fit_transform(X_text.loc[:, col].values)

    svd_col = svd_.fit_transform(tfidf_col)
    svd_col = pd.DataFrame(svd_col)
    svd_col = svd_col.add_prefix('TFIDF_{}_'.format(col))

    text_features.append(svd_col)

# Column-wise concat aligns on the index of X_temp and of the SVD frames.
text_features = pd.concat(text_features, axis=1)
X_temp = pd.concat([X_temp, text_features], axis=1)

# The raw text columns are replaced by their SVD components.
X_temp = X_temp.drop(list(X_text.columns), axis=1)

generating features from: Description
generating features from: metadata_annots_top_desc
generating features from: sentiment_entities


In [68]:
# Remove the identifier-like columns listed in to_drop_columns.
X_temp = X_temp.drop(to_drop_columns, axis=1)

In [70]:
# Rows with a finite AdoptionSpeed go to the train split, all others to the test split.
has_target = np.isfinite(X_temp.AdoptionSpeed)
X_train = X_temp.loc[has_target, :]
X_test = X_temp.loc[~has_target, :].drop(['AdoptionSpeed'], axis=1)

assert X_train.shape[0] == train.shape[0]
assert X_test.shape[0] == test.shape[0]

# Apart from the target, both splits must expose the same columns in the same order.
train_cols = X_train.columns.tolist()
train_cols.remove('AdoptionSpeed')

test_cols = X_test.columns.tolist()

assert train_cols == test_cols

In [71]:
# Replace every remaining missing value with the sentinel -1 (new frames; X_train / X_test are kept).
X_train_non_null = X_train.fillna(-1)
X_test_non_null = X_test.fillna(-1)

In [72]:
len(train_cols), len(test_cols)

(47, 47)

In [73]:
X_train_non_null.shape, X_test_non_null.shape

((14993, 48), (3948, 47))

In [74]:
import scipy as sp

from collections import Counter
from functools import partial
from math import sqrt

from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix

In [85]:
# FROM: https://www.kaggle.com/myltykritik/simple-lgbm-image-features

# The following 3 functions have been taken from Ben Hamner's github repository
# https://github.com/benhamner/Metrics

def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    ``rater_a`` and ``rater_b`` are equally long sequences of integer ratings
    (lists or NumPy arrays). The result is a square list of lists in which
    ``result[i][j]`` counts the items rated ``min_rating + i`` by ``rater_a``
    and ``min_rating + j`` by ``rater_b``.

    When ``min_rating`` / ``max_rating`` are omitted they are taken from the
    ratings of both raters.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the extremes per rater: `rater_a + rater_b` only concatenates for
    # lists, for NumPy arrays it adds element-wise and yields a wrong range.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how often each rating occurs.

    Returns a list in which position ``k`` holds the number of occurrences of
    the rating ``min_rating + k``. Bounds that are not given are taken from
    ``ratings``.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts


def quadratic_weighted_kappa(y, y_pred):
    """
    Calculates the quadratic weighted kappa between two sets of ratings.

    The quadratic weighted kappa measures the agreement between two raters
    that provide discrete numeric ratings. Potential values range from -1
    (complete disagreement) to 1 (complete agreement); a value of 0 is
    expected if all agreement is due to chance.

    ``y`` and ``y_pred`` are equally long sequences of ratings; both are
    converted to integer arrays. The rating range is taken from the data
    (smallest and largest rating found in either sequence).

    Note: if both sequences contain one and the same single rating, the range
    has length one and the weight computation raises ZeroDivisionError.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert(len(rater_a) == len(rater_b))

    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))

    conf_mat = confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    # numerator: observed weighted disagreement, denominator: the disagreement
    # expected if the two raters were independent.
    numerator = 0.0
    denominator = 0.0

    for row, count_a in enumerate(hist_rater_a):
        for col, count_b in enumerate(hist_rater_b):
            expected_count = (count_a * count_b
                              / num_scored_items)
            weight = pow(row - col, 2.0) / pow(num_ratings - 1, 2.0)
            numerator += weight * conf_mat[row][col] / num_scored_items
            denominator += weight * expected_count / num_scored_items

    return (1.0 - numerator / denominator)

#### OptimizedRounder from [OptimizedRounder() - Improved](https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved)

In [86]:
"""
OptimizedRounder from OptimizedRounder() - Improved
(https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved)
"""

class OptimizedRounder(object):
    """Learns four cut points that turn continuous predictions into the classes 0-4.

    ``fit`` searches (Nelder-Mead, starting from 0.5 / 1.5 / 2.5 / 3.5) for the
    cut points that maximise the quadratic weighted kappa between the bucketed
    predictions and the true labels.
    """

    def __init__(self):
        # Placeholder until fit() stores the optimizer result here.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """Negative quadratic weighted kappa of ``X`` bucketed with ``coef`` against ``y``."""
        preds = self.predict(X, coef)
        return -cohen_kappa_score(y, preds, weights='quadratic')

    def fit(self, X, y):
        """Optimise the cut points on predictions ``X`` and labels ``y``; result is kept in ``coef_``."""
        # Imported explicitly so the call does not depend on the submodule
        # already having been loaded as an attribute of the `scipy` package.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X = X, y = y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef=None):
        """Bucket ``X`` into the labels 0-4 using the (sorted) cut points ``coef``.

        When ``coef`` is omitted, the coefficients found by ``fit`` are used.
        Intervals are closed on the right (``pd.cut`` default), so a value
        equal to a cut point falls into the lower class.
        """
        if coef is None:
            coef = self.coefficients()
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])

    def coefficients(self):
        """Return the fitted cut points (the ``'x'`` entry of the optimizer result).

        Only valid after ``fit``; before that ``coef_`` is still the placeholder 0.
        """
        return self.coef_['x']

#### Train model

In [87]:
"""
Train model
"""

import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# Parameters handed to xgb.train through run_xgb; only the evaluation metric,
# the seed and the 'silent' flag are set here.
# NOTE(review): 'silent' may not be accepted by newer xgboost releases - confirm against the installed version.
xgb_params = {
    'eval_metric': 'rmse',
    'seed': 1337,
    'silent': 1,
}

def run_xgb(params, X_train, X_test):
    """Train XGBoost with 5-fold stratified cross-validation.

    Args:
        params: parameter dict passed to ``xgb.train``.
        X_train: training frame; must contain the target column ``AdoptionSpeed``.
        X_test: test frame with the same feature columns and no target column.

    Returns:
        ``(model, oof_train, oof_test)`` where ``model`` is the booster of the
        LAST fold only, ``oof_train`` holds one out-of-fold prediction per
        training row and ``oof_test`` has one column of test predictions per fold.
    """
    n_splits = 5
    verbose_eval = 1000
    num_rounds = 30000
    early_stop = 500
    target = 'AdoptionSpeed'

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1337)

    oof_train = np.zeros((X_train.shape[0]))
    oof_test = np.zeros((X_test.shape[0], n_splits))

    # Folds are stratified on the target.
    for fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, X_train[target].values)):
        fold_train = X_train.iloc[train_idx, :]
        fold_valid = X_train.iloc[valid_idx, :]

        y_tr = fold_train[target].values
        X_tr = fold_train.drop([target], axis=1)

        y_val = fold_valid[target].values
        X_val = fold_valid.drop([target], axis=1)

        d_train = xgb.DMatrix(data=X_tr, label=y_tr, feature_names=X_tr.columns)
        d_valid = xgb.DMatrix(data=X_val, label=y_val, feature_names=X_val.columns)

        # The last watchlist entry ('valid') is the one the recorded log reports
        # as being used for early stopping.
        model = xgb.train(
            params=params,
            dtrain=d_train,
            num_boost_round=num_rounds,
            evals=[(d_train, 'train'), (d_valid, 'valid')],
            early_stopping_rounds=early_stop,
            verbose_eval=verbose_eval,
        )

        # Predict with the trees up to the best iteration only.
        best_limit = model.best_ntree_limit
        valid_pred = model.predict(xgb.DMatrix(X_val, feature_names=X_val.columns),
                                   ntree_limit=best_limit)
        test_pred = model.predict(xgb.DMatrix(X_test, feature_names=X_test.columns),
                                  ntree_limit=best_limit)

        oof_train[valid_idx] = valid_pred
        oof_test[:, fold] = test_pred

    return model, oof_train, oof_test

In [88]:
# Cross-validated training on the -1-filled frames; `model` is the last fold's booster.
model, oof_train, oof_test = run_xgb(xgb_params, X_train_non_null, X_test_non_null)

[0]	train-rmse:1.8028	valid-rmse:1.81097
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[33]	train-rmse:0.831329	valid-rmse:1.06437

[0]	train-rmse:1.80474	valid-rmse:1.80561
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[23]	train-rmse:0.886203	valid-rmse:1.05005

[0]	train-rmse:1.80355	valid-rmse:1.81105
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[26]	train-rmse:0.866392	valid-rmse:1.06203

[0]	train-rmse:1.80232	valid-rmse:1.81532
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 500 rounds.
Stopping. Best iteration:
[21]	train

In [89]:
# Fit the class thresholds on the out-of-fold predictions, then score the
# thresholded out-of-fold predictions against the true labels.
optR = OptimizedRounder()
optR.fit(oof_train, X_train['AdoptionSpeed'].values)
coefficients = optR.coefficients()
valid_pred = optR.predict(oof_train, coefficients)
qwk = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, valid_pred)
print("QWK = ", qwk)

QWK =  0.4165190896147707


In [90]:
# Work on a copy so the fitted thresholds stay available unchanged.
coefficients_ = coefficients.copy()
# Overrides the first fitted threshold with a fixed value.
# NOTE(review): the rationale for 1.65 is not recorded in this notebook - confirm.
coefficients_[0] = 1.65
train_predicitons = optR.predict(oof_train, coefficients_).astype(np.int8)
# Test predictions: average the per-fold predictions first, then apply the thresholds.
test_predictions = optR.predict(oof_test.mean(axis=1), coefficients_).astype(np.int8)

In [92]:
# Submission frame: one predicted AdoptionSpeed per test PetID (not written to disk in this cell).
submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': test_predictions})