In [1]:
%matplotlib inline

In [2]:
from scipy import stats as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings('ignore')

sns.set_context("poster")

Model prediction
---

Focus: Get data in and build random forests + testing on test/train split

#### Pull in data and finalise before classifications

In [77]:
train_x = pd.read_csv('data/complete_train_x_sample.csv')
train_x.columns

train_y = pd.read_csv('data/train_sample_y.csv')
print train_y.head()

print len(train_x), len(train_y)

   event_id  hotel_cluster
0         0              7
1         1             59
2         2             36
3         3             55
4         4             62
150000 150000


## Read back in the dat and set-up the classifiers

In [78]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=5):
    """
    https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [79]:
##Lets start with a simple event level classification problem
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    train_x.fillna(-1000).values,
    train_y['hotel_cluster'].tolist(),
    test_size=0.08,
    train_size=0.2,
    random_state=42)

### Simple classifier with 1 output

In [80]:
clf = RandomForestClassifier(n_estimators=50)
clf = clf.fit(X_tr, y_tr)
clfp = clf.predict(X_te)
mapk([[int(x)] for x in y_te],[[int(x)] for x in clfp.tolist()],k=5)

0.12633333333333333

### Create multiple outputs and re-test

In [81]:
## Now lets try and get multiple labels out
from sklearn.preprocessing import MultiLabelBinarizer

b_y_tr = MultiLabelBinarizer().fit_transform([[int(x)] for x in y_tr])
mclf = clf.fit(X_tr, b_y_tr)
probs = mclf.predict_proba(X_te)

In [89]:
print probs[1]

[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]


#### Test different outputs, depending on cut-off or by ordered top-x

In [82]:
## rank probabilities in each line and select top X depending on cutoff (max 5)
def get_outputs(probs,cutoff=0.1):
    X_te_op = []
    for p in xrange(len(probs[1])):
        tt = [(x[p].tolist()[1],i) for i, x in enumerate(probs)]
        X_te_op.append(list(set([sorted(tt,reverse=True)[0][1]] + [x[1] for x in sorted(tt,reverse=True) if x[0] > cutoff]))[:5])
    return X_te_op

## just select top x given sorted list
def get_top_x(probs,n=5):
    X_te_op = []
    for p in xrange(len(probs[1])):
        tt = [(x[p].tolist()[1],i) for i, x in enumerate(probs)]
        X_te_op.append([x[1] for x in sorted(tt,reverse=True)][:n])
    return X_te_op

In [83]:
for x in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]:
    X_test_outputs = get_outputs(probs,cutoff=x)
    print x, mapk([[int(x)] for x in y_te],X_test_outputs,k=5)

for x in [1,2,3,4,5]:
    X_test_outputs = get_top_x(probs,n=x)
    print x, mapk([[int(x)] for x in y_te],X_test_outputs,k=5)
    
## Sending 5 is actually the best way! lets do that!

0.01 0.0819458333333
0.02 0.132813888889
0.03 0.132813888889
0.04 0.167015277778
0.05 0.167015277778
0.06 0.171670833333
0.07 0.171670833333
0.08 0.159077777778
0.09 0.159077777778
0.1 0.146098611111
1 0.123166666667
2 0.163708333333
3 0.185402777778
4 0.198027777778
5 0.207444444444


So the final algorithm is a Random Forest classifier sending all of the top 5 clusters

## Save the final model

In [84]:
from sklearn.externals import joblib

joblib.dump(mclf, 'models/model.pkl')

['models/model.pkl',
 'models/model.pkl_01.npy',
 'models/model.pkl_02.npy',
 'models/model.pkl_03.npy',
 'models/model.pkl_04.npy',
 'models/model.pkl_05.npy',
 'models/model.pkl_06.npy',
 'models/model.pkl_07.npy',
 'models/model.pkl_08.npy',
 'models/model.pkl_09.npy',
 'models/model.pkl_10.npy',
 'models/model.pkl_11.npy',
 'models/model.pkl_12.npy',
 'models/model.pkl_13.npy',
 'models/model.pkl_14.npy',
 'models/model.pkl_15.npy',
 'models/model.pkl_16.npy',
 'models/model.pkl_17.npy',
 'models/model.pkl_18.npy',
 'models/model.pkl_19.npy',
 'models/model.pkl_20.npy',
 'models/model.pkl_21.npy',
 'models/model.pkl_22.npy',
 'models/model.pkl_23.npy',
 'models/model.pkl_24.npy',
 'models/model.pkl_25.npy',
 'models/model.pkl_26.npy',
 'models/model.pkl_27.npy',
 'models/model.pkl_28.npy',
 'models/model.pkl_29.npy',
 'models/model.pkl_30.npy',
 'models/model.pkl_31.npy',
 'models/model.pkl_32.npy',
 'models/model.pkl_33.npy',
 'models/model.pkl_34.npy',
 'models/model.pkl_35.npy',

## FInal scoring of the data happens in scoring.py and scoring_final_leg.py

These were then concatenetated using cat on command line