In [1]:
%matplotlib inline

In [2]:
from scipy import stats as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
warnings.filterwarnings('ignore')

sns.set_context("poster")

#### Pull in data and finalise before classifications

In [77]:
train_x = pd.read_csv('data/complete_train_x_sample.csv')
train_x.columns

train_y = pd.read_csv('data/train_sample_y.csv')
print train_y.head()

print len(train_x), len(train_y)

   event_id  hotel_cluster
0         0              7
1         1             59
2         2             36
3         3             55
4         4             62
150000 150000


## Read back in the dat and set-up the classifiers

In [78]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=5):
    """
    https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [79]:
##Lets start with a simple event level classification problem
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    train_x.fillna(-1000).values,
    train_y['hotel_cluster'].tolist(),
    test_size=0.08,
    train_size=0.2,
    random_state=42)

### Simple classifier with 1 output

In [80]:
clf = RandomForestClassifier(n_estimators=50)
clf = clf.fit(X_tr, y_tr)
clfp = clf.predict(X_te)
mapk([[int(x)] for x in y_te],[[int(x)] for x in clfp.tolist()],k=5)

0.12633333333333333

### Create multiple outputs and re-test

In [81]:
## Now lets try and get multiple labels out
from sklearn.preprocessing import MultiLabelBinarizer

b_y_tr = MultiLabelBinarizer().fit_transform([[int(x)] for x in y_tr])
mclf = clf.fit(X_tr, b_y_tr)
probs = mclf.predict_proba(X_te)

In [89]:
print probs[1]

[[ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 ..., 
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]]


#### Test different outputs, depending on cut-off or by ordered top-x

In [82]:
## rank probabilities in each line and select top X depending on cutoff (max 5)
def get_outputs(probs,cutoff=0.1):
    X_te_op = []
    for p in xrange(len(probs[1])):
        tt = [(x[p].tolist()[1],i) for i, x in enumerate(probs)]
        X_te_op.append(list(set([sorted(tt,reverse=True)[0][1]] + [x[1] for x in sorted(tt,reverse=True) if x[0] > cutoff]))[:5])
    return X_te_op

## just select top x given sorted list
def get_top_x(probs,n=5):
    X_te_op = []
    for p in xrange(len(probs[1])):
        tt = [(x[p].tolist()[1],i) for i, x in enumerate(probs)]
        X_te_op.append([x[1] for x in sorted(tt,reverse=True)][:n])
    return X_te_op

In [83]:
for x in [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]:
    X_test_outputs = get_outputs(probs,cutoff=x)
    print x, mapk([[int(x)] for x in y_te],X_test_outputs,k=5)

for x in [1,2,3,4,5]:
    X_test_outputs = get_top_x(probs,n=x)
    print x, mapk([[int(x)] for x in y_te],X_test_outputs,k=5)
    
## Sending 5 is actually the best way! lets do that!

0.01 0.0819458333333
0.02 0.132813888889
0.03 0.132813888889
0.04 0.167015277778
0.05 0.167015277778
0.06 0.171670833333
0.07 0.171670833333
0.08 0.159077777778
0.09 0.159077777778
0.1 0.146098611111
1 0.123166666667
2 0.163708333333
3 0.185402777778
4 0.198027777778
5 0.207444444444


So the final algorithm is a Random Forest classifier sending all of the top 5 clusters

## Save the final model

In [84]:
from sklearn.externals import joblib

joblib.dump(mclf, 'models/model.pkl')

['models/model.pkl',
 'models/model.pkl_01.npy',
 'models/model.pkl_02.npy',
 'models/model.pkl_03.npy',
 'models/model.pkl_04.npy',
 'models/model.pkl_05.npy',
 'models/model.pkl_06.npy',
 'models/model.pkl_07.npy',
 'models/model.pkl_08.npy',
 'models/model.pkl_09.npy',
 'models/model.pkl_10.npy',
 'models/model.pkl_11.npy',
 'models/model.pkl_12.npy',
 'models/model.pkl_13.npy',
 'models/model.pkl_14.npy',
 'models/model.pkl_15.npy',
 'models/model.pkl_16.npy',
 'models/model.pkl_17.npy',
 'models/model.pkl_18.npy',
 'models/model.pkl_19.npy',
 'models/model.pkl_20.npy',
 'models/model.pkl_21.npy',
 'models/model.pkl_22.npy',
 'models/model.pkl_23.npy',
 'models/model.pkl_24.npy',
 'models/model.pkl_25.npy',
 'models/model.pkl_26.npy',
 'models/model.pkl_27.npy',
 'models/model.pkl_28.npy',
 'models/model.pkl_29.npy',
 'models/model.pkl_30.npy',
 'models/model.pkl_31.npy',
 'models/model.pkl_32.npy',
 'models/model.pkl_33.npy',
 'models/model.pkl_34.npy',
 'models/model.pkl_35.npy',

## Score the testing data and output to submit to kaggle

In [3]:
from sklearn.externals import joblib

mclf = joblib.load('models/model.pkl')

test_x = pd.read_csv('data/complete_test_x.csv')

test_id = test_x['id']
test_var = test_x[[x for x in test_x.columns][1:]]
test_var.to_csv('data/complete_test_x_vars.csv',index=False)

In [10]:
def get_top_x(probs,n=5):
    X_te_op = []
    for p in xrange(len(probs[1])):
        tt = [(x[p].tolist()[1],i) for i, x in enumerate(probs)]
        X_te_op.append([x[1] for x in sorted(tt,reverse=True)][:n])
    return X_te_op

def create_kaggle_output(test_id,preds):
    str_output = ''
    for i, tid in enumerate(test_id.tolist()):
        str_output += str(tid)+','+' '.join([str(x) for x in preds[i]])+'\n'
    return str_output

In [11]:
chunksize = 10000

data = pd.read_csv('data/complete_test_x_vars.csv',iterator=True, chunksize=chunksize)

for i, chunk in enumerate(data):
    print 'chunk ', i
    
    tpred = mclf.predict_proba(chunk.fillna(-1000))
    top_x = get_top_x(tpred,5)
    str_output = create_kaggle_output(test_id[((i+1)*chunksize-chunksize):((i+1)*chunksize)],top_x)
    with open('data/kaggle_expedia_submission_'+str(i)+'.csv','w') as f:
        f.write(str_output)

chunk  0
chunk  1
chunk  2
chunk  3
chunk  4
chunk  5
chunk  6


KeyboardInterrupt: 

In [8]:
data = pd.read_csv('data/complete_test_x_vars_test.csv')

for dat in data.iterrows():
    print dat


(0, site_name                        2.000000
posa_continent                   3.000000
user_location_country           66.000000
user_location_region           174.000000
user_location_city           37449.000000
orig_destination_distance     5539.056700
is_mobile                        1.000000
is_package                       0.000000
channel                          3.000000
srch_adults_cnt                  2.000000
srch_children_cnt                0.000000
srch_rm_cnt                      1.000000
srch_destination_type_id         6.000000
hotel_continent                  6.000000
hotel_country                  204.000000
hotel_market                    27.000000
srch_ci_year                  2016.000000
srch_ci_day                     19.000000
srch_ci_month                    5.000000
srch_ci_dayofweek                3.000000
srch_ci_yearquarter              1.000000
srch_co_year                  2016.000000
srch_co_day                     23.000000
srch_co_month                 

In [None]:
data = pd.read_csv('data/complete_test_x_vars.csv')

def create_kaggle_output(test_id,preds):
    return str(test_id)+','+' '.join([str(x) for x in preds])+'\n'

with open('data/kaggle_expedia_submission.csv','w') as f:
    for i, dat in data.iterrows():
        tpred = mclf.predict_proba(dat.fillna(-1000))
        #print tpred
        top_x = get_top_x(tpred,5)
        #print top_x
        str_output = create_kaggle_output(test_id[i],top_x[0])
        f.write(str_output)



In [None]:
### Has it finished?

In [None]:
print len(pd.read_csv('data/kaggle_expedia_submission.csv'))

In [16]:
with open('data/kaggle_expedia_submission.csv','r') as f:
    print f.read()

0,78 5 21 82 43
1,2 64 67 20 98
2,0 42 6 31 59
3,1 24 54 45 19
4,6 47 40 48 95



In [None]:
with open('data/kaggle_expedia_submission.csv','w') as f:
    f.write(str_output)

In [16]:
pd.read_csv('data/complete_test_x_vars_test.csv').head()

Unnamed: 0,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,is_mobile,is_package,channel,srch_adults_cnt,...,date_time_month,date_time_dayofweek,date_time_yearquarter,date_time_hour,pca0,pca1,pca2,pca3,pca4,pca5
0,2,3,66,174,37449,5539.0567,1,0,3,2,...,9,3,2,17,-0.377409,0.304712,0.338998,0.085367,0.095488,0.148569
1,2,3,66,174,37449,5873.2923,1,0,10,2,...,9,3,2,17,,,,,,
2,2,3,66,142,17440,3975.9776,0,0,1,4,...,6,6,1,15,-0.998025,-0.71957,0.045389,-0.297122,0.069929,0.121615
3,2,3,66,258,34156,1508.5975,0,1,10,2,...,9,0,2,14,-2.168369,0.061117,-0.723259,-0.254975,-0.25858,-0.862235
4,2,3,66,467,36345,66.7913,0,0,0,2,...,7,4,2,9,-0.58839,0.302535,0.16699,0.132365,-0.059723,-0.208519


In [74]:
tpred = mclf.predict(pd.read_csv('data/complete_test_x_vars_test.csv').fillna(-1000))

def get_top_x(probs,n=5):
    X_te_op = []
    for p in xrange(len(probs[1])):
        print probs[1][p]
        tt = [(x[p].tolist()[1],i) for i, x in enumerate(probs)]
        X_te_op.append([x[1] for x in sorted(tt,reverse=True)][:n])
    return X_te_op

top_x = get_top_x(tpred,5)

def create_kaggle_output(test_id1,preds):
    str_output = ''
    for i, tid in enumerate(test_id1.tolist()):
        str_output += str(tid)+','+' '.join([str(x) for x in preds[i]])+'\n'
    return str_output

create_kaggle_output(test_id[0:5],top_x)

0.0


TypeError: 'float' object has no attribute '__getitem__'

In [38]:
data = pd.read_csv('data/complete_test_x_vars.csv',iterator=True, chunksize=10)

In [68]:
chunksize = 1000

data = pd.read_csv('data/complete_test_x_vars.csv',iterator=True, chunksize=chunksize)

str_output = ''
for i, chunk in enumerate(data):
    print 'chunk ', i
    
    tpred = mclf.predict(chunk.fillna(-1000))
    print 'tpred'
    top_x = get_top_x(tpred,5)
    print len(top_x)
    print 'top_x'
    str_output += create_kaggle_output(test_id[((i+1)*chunksize-chunksize):((i+1)*chunksize)],top_x)

chunk  0
tpred
100
top_x


IndexError: list index out of range

In [64]:
chunksize=1000
for i in [0,1,2,3]:
    print test_id[((i+1)*chunksize-chunksize):((i+1)*chunksize)]

0        0
1        1
2        2
3        3
4        4
5        5
6        6
7        7
8        8
9        9
10      10
11      11
12      12
13      13
14      14
15      15
16      16
17      17
18      18
19      19
20      20
21      21
22      22
23      23
24      24
25      25
26      26
27      27
28      28
29      29
      ... 
970    970
971    971
972    972
973    973
974    974
975    975
976    976
977    977
978    978
979    979
980    980
981    981
982    982
983    983
984    984
985    985
986    986
987    987
988    988
989    989
990    990
991    991
992    992
993    993
994    994
995    995
996    996
997    997
998    998
999    999
Name: id, dtype: int64
1000    1000
1001    1001
1002    1002
1003    1003
1004    1004
1005    1005
1006    1006
1007    1007
1008    1008
1009    1009
1010    1010
1011    1011
1012    1012
1013    1013
1014    1014
1015    1015
1016    1016
1017    1017
1018    1018
1019    1019
1020    1020
1021    1021
1022    1022
1023   

In [56]:
len(test_id)

2528243

In [None]:
## check string
print str_output[0:1000]