In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from ndcg_scoring import make_ndcg_scorer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from IPython.display import display
import gc


# Load the feature checkpoint and index it by user id.
# NOTE(review): read_pickle executes arbitrary code on load; only use trusted files.
users = pd.read_pickle('checkpoint_39').set_index('id')

# Drop every booking-request derived column in one pass.
booking_request_cols = [
    col for col in users.columns
    if 'pct_booking_request' in col or 'booking_request_count' in col
]
users = users.drop(columns=booking_request_cols)

# Feature columns: everything except the target.
colx = users.columns.tolist()
colx.remove('country_destination')

train_users_path = 'data/train_users_2.csv'
train_users = pd.read_csv(train_users_path)
target = train_users['country_destination']

# The encoder is fitted only to expose the class names (le.classes_);
# y below keeps the raw string labels taken from `users`.
labels = target.values
le = LabelEncoder()
le.fit(labels)

# Rows that carry a label; compute the mask once and reuse it.
has_label = users['country_destination'].notnull()
X1 = users.loc[has_label, colx].fillna(0)
y = users.loc[has_label, 'country_destination']

X = X1[:len(target)]
# X is truncated to the training-file length while y is not; fail early with a
# clear message instead of an opaque error inside train_test_split.
assert len(X) == len(y), (
    'feature/label length mismatch: %d rows in X vs %d labels in y' % (len(X), len(y))
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# NDCG Scorer function
ndcg_scorer = make_ndcg_scorer()

# For displaying confusion matrices later
le_classes = le.classes_

In [2]:
def dcg_at_k(r, k, method=1):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Parameters
    ----------
    r : sequence of numbers
        Relevance scores in rank order (first element = top-ranked item).
    k : int
        Number of leading scores to take into account.
    method : int
        0 -> discount weights 1, 1, 1/log2(3), 1/log2(4), ...
        1 -> discount weights 1, 1/log2(3), 1/log2(4), ...

    Returns
    -------
    float
        The DCG value, or 0. when no scores remain after truncation.

    Raises
    ------
    ValueError
        If ``method`` is neither 0 nor 1 (only checked for non-empty input).
    """
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype is the
    # documented replacement.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k=5, method=1):
    """Normalized DCG of the first ``k`` relevance scores in ``r``.

    The DCG of ``r`` is divided by the DCG of the same scores sorted in
    descending order. Returns 0. when that ideal DCG is zero.
    """
    ideal_dcg = dcg_at_k(sorted(r, reverse=True), k, method)
    if ideal_dcg:
        return dcg_at_k(r, k, method) / ideal_dcg
    return 0.


def score_predictions(preds, truth, n_modes=5):
    """
    Mean NDCG over all observations for ranked label predictions.

    preds: pd.DataFrame
      one row for each observation, one column for each prediction.
      Columns are sorted from left to right descending in order of likelihood.
    truth: pd.Series
      one row for each observation.
    n_modes: int
      Currently unused; kept so existing callers keep working. Every row is
      scored with ndcg_at_k's default cutoff (k=5).

    Returns the mean of the per-row NDCG scores.
    """
    assert(len(preds)==len(truth))

    # Relevance matrix: 1.0 where a ranked prediction equals the true label.
    hits = pd.DataFrame(0, index=preds.index, columns=preds.columns, dtype=np.float64)
    for col in preds.columns:
        hits[col] = (preds[col] == truth) * 1.0

    # The old `reduce=True` argument no longer exists on DataFrame.apply; newer
    # pandas forwards unknown keywords to the applied function, which makes
    # ndcg_at_k raise TypeError. A scalar-returning function already yields
    # one value per row, so the argument is simply dropped.
    score = pd.Series(hits.apply(ndcg_at_k, axis=1), name='score')
    return score.mean()


In [3]:
"""Metrics to compute the model performance."""
# ref: https://www.kaggle.com/davidgasquez/airbnb-recruiting-new-user-bookings/ndcg-scorer
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K for one sample.

    Parameters
    ----------
    y_true : array
        Relevance of each candidate (ndcg_score passes one binarized
        label row at a time).
    y_score : array
        Predicted score of each candidate, aligned with ``y_true``.
    k : int
        Rank cutoff.

    Returns
    -------
    score : float
    """
    # Positions of the k highest-scoring candidates, best first.
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)

    # Exponential gain with a log2 positional discount (rank 1 -> log2(2) = 1).
    positions = np.arange(2, len(relevance) + 2)
    return np.sum((2 ** relevance - 1) / np.log2(positions))

def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float
        Mean of the per-sample NDCG values.

    Notes
    -----
    The labels are binarized from ``ground_truth`` alone, so the columns of
    the binarized matrix follow the sorted distinct labels that occur in
    ``ground_truth``. NOTE(review): this assumes every class appears in
    ``ground_truth`` and that this order matches the column order of
    ``predictions`` -- confirm against the caller.

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> float(ndcg_score(ground_truth, predictions, k=2))
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> round(float(ndcg_score(ground_truth, predictions, k=2)), 4)
    0.6667
    """
    lb = LabelBinarizer()
    T = lb.fit_transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        # An ideal DCG of 0 (e.g. k == 0 or an all-zero relevance row) used to
        # raise ZeroDivisionError; such a sample contributes a score of 0.
        scores.append(float(actual) / float(best) if best else 0.0)

    return np.mean(scores)

In [4]:
def model_predict(model_idx, clf):
    """Predict labels for X_test with ``clf`` and time the call.

    The elapsed time is recorded by start()/stop() under the key
    ``model_idx + "_pred"``. Returns whatever ``clf.predict(X_test)`` returns.
    """
    timer_key = model_idx + "_pred"
    start(timer_key)
    predictions = clf.predict(X_test)
    stop(timer_key)
    return predictions

In [5]:
from sklearn.metrics import accuracy_score,confusion_matrix

def model_accuracy_ndcg(model_idx, clf, y_pred):
    """Print and record accuracy and nDCG of ``clf`` on the held-out split.

    Parameters
    ----------
    model_idx : str
        Short model name; used as part of the keys in ``accuracy_ndcg`` and,
        with a ``_prob`` suffix, as the timer key for predict_proba.
    clf : fitted classifier exposing ``predict_proba``.
    y_pred : labels previously predicted for X_test.

    Results are stored in the module-level ``accuracy_ndcg`` dict under the
    tuple keys ``(model_idx, 'Accuracy')`` and ``(model_idx, 'nDCG')``.
    """
    # Each metric is computed once and reused; ndcg_score loops over every
    # sample in Python, so repeating it is expensive.
    accuracy = accuracy_score(y_test, y_pred)
    print ('Accuracy:' + str(accuracy))
    accuracy_ndcg[model_idx,'Accuracy'] = accuracy

    prob_idx = model_idx + '_prob'
    start(prob_idx)
    y_prob=clf.predict_proba(X_test)
    stop(prob_idx)

    # NOTE(review): y_conv is not defined in the visible cells; presumably it
    # holds the y_test labels in the form ndcg_score expects -- confirm it is
    # created before this function is called on a fresh kernel.
    ndcg = ndcg_score(y_conv,y_prob)
    print('nDCG:' + str(ndcg))
    accuracy_ndcg[model_idx,'nDCG'] = ndcg

In [8]:
from time import time
from math import sqrt
import logging
import os
import sys
import csv

# Module-level registries shared by the timing and evaluation helpers.
runtime = dict()        # timer key -> list of elapsed seconds (appended by stop)
started = dict()        # timer key -> time() value recorded by start
accuracy_ndcg = dict()  # (model_idx, metric name) -> score

        
def start(key):
    """Record the current time() as the start of timer ``key``.

    Starting a key that is already running overwrites its start time.
    """
    started.update({key: time()})


def stop(key):
    """Stop timer ``key`` and append its elapsed seconds to ``runtime[key]``.

    The start time is removed from ``started``. If the timer was never
    started (or was already stopped) an error is logged and nothing is
    recorded.
    """
    # Local names deliberately differ from the module-level start()/stop()
    # functions so they are not shadowed inside this body.
    finished_at = time()
    started_at = started.pop(key, None)
    if started_at is None:
        logging.error("stopping non started timer: %s", key)
        return
    runtime.setdefault(key, []).append(finished_at - started_at)

In [9]:
# Load the pickled model stored in 'logr_model.sav' (presumably logistic
# regression, going by the name) and time its predictions on X_test.
logr = pd.read_pickle('logr_model.sav')
logr_pred = model_predict(model_idx='logr', clf=logr)

# Confusion table: actual labels as rows, predicted labels as columns.
pd.crosstab(
    index=y_test,
    columns=logr_pred,
    rownames=['Actual Destination'],
    colnames=['Predicted Destination'],
)

Predicted Destination,IT,NDF,US
Actual Destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AU,0,92,16
CA,0,240,46
DE,1,178,33
ES,0,390,60
FR,0,856,149
GB,0,387,78
IT,0,474,93
NDF,15,22831,2063
NL,0,137,15
PT,0,41,2


In [17]:
def get_top_5(clf, file_path):
    """Write a submission CSV with the five most probable classes per row of X1.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``.
    file_path : str
        Destination of the CSV. It gets the columns ``id`` and ``country``,
        one line per (row, ranked class) pair, most probable class first.

    NOTE(review): X1 is built from the rows whose country_destination is
    non-null, so the file covers those rows -- confirm this is the intended
    submission population.
    """
    # The original indexed an undefined `y_pred_prob_1`; use the probabilities
    # computed right here.
    y_pred_prob = clf.predict_proba(X1)
    id_test = X1.reset_index()['id'].to_numpy()
    classes = np.asarray(clf.classes_)

    # Guard against models with fewer than five classes so that ids and
    # countries always have matching lengths.
    n_top = min(5, y_pred_prob.shape[1])

    # Per row: class indices ordered by descending probability, cut to n_top.
    top_idx = np.argsort(y_pred_prob, axis=1)[:, ::-1][:, :n_top]

    #Generate submission
    sub = pd.DataFrame({
        'id': np.repeat(id_test, n_top),
        'country': classes[top_idx].ravel(),
    })
    sub.to_csv(file_path,index=False)

In [18]:
# Write the top-5 predictions of the `logr` model to its submission file.
get_top_5(clf=logr, file_path='./kaggle_submit/sub_logr.csv')

In [19]:
# Load the pickled model stored in 'dt_model.sav' (presumably a decision
# tree, going by the name) and time its predictions on X_test.
dt = pd.read_pickle('dt_model.sav')
dt_pred = model_predict(model_idx='dt', clf=dt)

# Confusion table: actual labels as rows, predicted labels as columns.
pd.crosstab(
    index=y_test,
    columns=dt_pred,
    rownames=['Actual Destination'],
    colnames=['Predicted Destination'],
)

Predicted Destination,AU,CA,DE,ES,FR,GB,IT,NDF,NL,PT,US,other
Actual Destination,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AU,0,1,1,3,2,0,2,43,0,0,49,7
CA,2,0,3,6,10,1,3,137,1,0,100,23
DE,2,3,1,3,12,3,7,95,0,0,77,9
ES,3,6,3,11,15,5,6,208,5,0,158,30
FR,1,9,7,24,33,21,22,471,4,0,355,58
GB,1,3,6,6,9,6,6,217,3,0,178,30
IT,2,5,3,9,19,5,9,246,4,0,226,39
NDF,59,157,108,250,574,247,299,16154,73,31,5902,1055
NL,0,4,3,2,8,2,2,62,0,0,57,12
PT,0,0,0,0,2,2,0,20,0,0,15,4


In [20]:
# Write the top-5 predictions of the `dt` model to its submission file.
get_top_5(clf=dt, file_path='./kaggle_submit/sub_dt.csv')

In [21]:
# Load the pickled model stored in 'rf_best_model.sav' (presumably a random
# forest, going by the name) and time its predictions on X_test.
rf = pd.read_pickle('rf_best_model.sav')
rf_pred = model_predict(model_idx='rf', clf=rf)

# Confusion table: actual labels as rows, predicted labels as columns.
pd.crosstab(
    index=y_test,
    columns=rf_pred,
    rownames=['Actual Destination'],
    colnames=['Predicted Destination'],
)

Predicted Destination,NDF,US
Actual Destination,Unnamed: 1_level_1,Unnamed: 2_level_1
AU,82,26
CA,192,94
DE,164,48
ES,353,97
FR,746,259
GB,355,110
IT,426,141
NDF,23101,1808
NL,113,39
PT,28,15


In [22]:
# Write the top-5 predictions of the `rf` model to its submission file.
get_top_5(clf=rf, file_path='./kaggle_submit/sub_rf.csv')