In [11]:
import pandas as pd
import numpy as np
import random

# Load the data

In original dataset we have about 37 million training set rows, and 2 million testing set rows. That's too much to read in memory, we'll sample n rows from it. 

In [12]:
# Load random sample from train data
def load_sample(filename, sample_size, nrows=False):
    
    # if number of rows in file is unknown
    if not nrows: 
        nrows = sum(1 for line in open(filename)) - 1 # number of records in file (excludes header)

    skip = sorted(random.sample(xrange(1, nrows + 1), nrows-sample_size)) # the 0-indexed header will not be included in the skip list
    df = pd.read_csv(filename, skiprows=skip)
    
    return df


In [13]:
# Load 10000 test rows
test = load_sample("test.csv", sample_size=10000, nrows=2528243)

In [14]:
test_ids = set(test.user_id.unique())
print "Number of unique users in test set: ", len(test_ids)

Number of unique users in test set:  9932


In [16]:
# Load train data into chunks and filter it by user ids
iter_csv = pd.read_csv('train.csv', iterator=True, chunksize=10000)
train = pd.concat([chunk[chunk['user_id'].isin(test_ids)] for chunk in iter_csv])

In [20]:
# Load destinations data
destinations = pd.read_csv('destinations.csv')

In [18]:
# Save sample datasets
train.to_csv("train_sample.csv")
test.to_csv("test_sample.csv")

Let’s first look at how much data there is:

In [21]:
print train.shape
print test.shape
print destinations.shape

(501201, 24)
(10000, 22)
(62106, 150)


In original dataset we have about 37 million training set rows, and 2 million testing set rows, which will make this problem a bit challenging to work with.

In [22]:
train.head(5)

Unnamed: 0,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,...,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
2725,2013-04-17 21:00:35,8,4,77,462,3492,1608.2975,14117,0,1,...,0,1,11815,1,0,1,3,5,1701,81
2726,2013-04-17 21:05:54,8,4,77,462,3492,1608.9681,14117,0,1,...,0,1,11815,1,0,1,3,5,1701,12
2727,2013-04-17 21:12:59,8,4,77,462,3492,1608.4897,14117,0,1,...,0,1,11815,1,0,1,3,5,1701,30
2728,2013-04-17 21:16:48,8,4,77,462,3492,1608.2975,14117,0,1,...,0,1,11815,1,0,2,3,5,1701,81
2729,2013-04-17 21:20:08,8,4,77,462,3492,1607.968,14117,0,1,...,0,1,11815,1,0,1,3,5,1701,78


There are a few things that immediately stick out:

* date_time could be useful in our predictions, so we’ll need to convert it.
* Most of the columns are integers or floats, so we can’t do a lot of feature engineering. For example, user_location_country isn’t the name of a country, it’s an integer. This makes it harder to create new features, because we don’t know exactly which each value means.

In [23]:
test.head(5)

Unnamed: 0,id,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,...,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,hotel_continent,hotel_country,hotel_market
0,211,2015-03-19 13:53:11,2,3,66,174,244,2032.0594,801,0,...,2015-03-31,2015-04-01,1,0,1,12572,5,2,50,680
1,330,2015-04-13 13:39:47,2,3,66,363,25671,80.9348,1153,0,...,2015-05-15,2015-05-16,2,0,1,8219,1,2,50,688
2,429,2015-06-13 14:48:09,30,4,195,991,43633,,1417,1,...,2015-10-03,2015-10-04,2,0,1,41544,1,0,34,97
3,929,2015-11-04 14:31:23,2,3,66,311,4979,77.9177,3033,0,...,2015-11-13,2015-11-16,2,0,1,27819,6,2,50,690
4,1025,2015-04-29 13:14:41,24,2,3,63,1210,,3289,0,...,2015-05-16,2015-05-18,2,0,1,8220,1,3,182,46


There are a few things we can take away from looking at test.csv:

* It looks like all the dates in test.csv are later than the dates in train.csv, and the data page confirms this. The testing set contains dates from 2015, and the training set contains dates from 2013 and 2014.
* It looks like the user ids in test.csv are a subset of the user ids in train.csv, given the overlapping integer ranges. We can confirm this later on.
* The is_booking column always looks to be 1 in test.csv. The data page confirms this.

In [24]:
destinations.head(5)

Unnamed: 0,srch_destination_id,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d140,d141,d142,d143,d144,d145,d146,d147,d148,d149
0,0,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-1.897627,-2.198657,-2.198657,-1.897627,...,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657
1,1,-2.18169,-2.18169,-2.18169,-2.082564,-2.18169,-2.165028,-2.18169,-2.18169,-2.031597,...,-2.165028,-2.18169,-2.165028,-2.18169,-2.18169,-2.165028,-2.18169,-2.18169,-2.18169,-2.18169
2,2,-2.18349,-2.224164,-2.224164,-2.189562,-2.105819,-2.075407,-2.224164,-2.118483,-2.140393,...,-2.224164,-2.224164,-2.196379,-2.224164,-2.192009,-2.224164,-2.224164,-2.224164,-2.224164,-2.057548
3,3,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.115485,-2.177409,-2.177409,-2.177409,...,-2.161081,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409
4,4,-2.189562,-2.187783,-2.194008,-2.171153,-2.152303,-2.056618,-2.194008,-2.194008,-2.145911,...,-2.187356,-2.194008,-2.191779,-2.194008,-2.194008,-2.185161,-2.194008,-2.194008,-2.194008,-2.188037


# Figuring out what to predict

We’ll be predicting which hotel_cluster a user will book after a given search. According to the description, there are 100 clusters in total.

In [27]:
print train["hotel_cluster"].value_counts()[:6]
print  "..."
print train["hotel_cluster"].value_counts()[-5:]
print "Number of clusters in train data (originally 100): ", len(train.hotel_cluster.unique())

91    14094
64    10987
48    10399
41    10132
5      8853
82     8393
Name: hotel_cluster, dtype: int64
...
35    2040
24    1939
27    1327
88    1141
74     565
Name: hotel_cluster, dtype: int64
Number of clusters in train data (originally 100):  100


Check which columns are present in train data, and absent in test data

In [28]:
print "In test set we don't have columns: ", set(train.columns) - set(test.columns)

In test set we don't have columns:  set(['hotel_cluster', 'cnt', 'is_booking'])


In [29]:
# Check if all ids from test data present in train data
test_ids = set(test.user_id.unique())
train_ids = set(train.user_id.unique())
intersection_count = len(test_ids & train_ids)
intersection_count == len(test_ids)

True

* Convert the date_time column in train from an object to a datetime value. This makes it easier to work with as a date.
* Extract the year and month from from date_time, and assign them to their own columns.

In [30]:
train["date_time"] = pd.to_datetime(train["date_time"])
train["year"] = train["date_time"].dt.year
train["month"] = train["date_time"].dt.month

test["date_time"] = pd.to_datetime(test["date_time"])
test["year"] = test["date_time"].dt.year
test["month"] = test["date_time"].dt.month

* In the original train and test DataFrames, test contained data from 2015, and train contained data from 2013 and 2014. We split this data so that anything after July 2014 is in t2, and anything before is in t1. This gives us smaller training and testing sets with similar characteristics to train and test.
* If is_booking is 0, it represents a click, and a 1 represents a booking.  test contains only booking events, so we’ll need to sample t2 to only contain bookings as well.

In [34]:
t1 = train[((train.year == 2013) | ((train.year == 2014) & (train.month < 8)))]
t2 = train[((train.year == 2014) & (train.month >= 8))]
t2 = t2[t2.is_booking == True]

## Baseline 1 

The most simple technique we could try on this data is to find the most common clusters across the data, then use them as predictions.

We can again use the value_counts method to help us here:

In [35]:
most_common_clusters = list(train.hotel_cluster.value_counts().head().index)
print most_common_clusters

[91, 64, 48, 41, 5]


The above code will give us a list of the 5 most common clusters in train. This is because the head method returns the first 5 rows by default, and the index property will return the index of the DataFrame, which is the hotel cluster after running the value_counts method.

### Generating predictions
We can turn most_common_clusters into a list of predictions by making the same prediction for each row. This will create a list with as many elements as there are rows in t2. Each element will be equal to most_common_clusters.

In [40]:
predictions = [most_common_clusters for i in range(t2.shape[0])]
predictions[:5]

[[91, 64, 48, 41, 5],
 [91, 64, 48, 41, 5],
 [91, 64, 48, 41, 5],
 [91, 64, 48, 41, 5],
 [91, 64, 48, 41, 5]]

## Evaluating error
In order to evaluate error, we’ll first need to figure out how to compute Mean Average Precision. Luckily, Ben Hamner has written an implementation that can be found here. It can be installed as part of the ml_metrics package, and you can find installation instructions for how to install it here.

We can compute our error metric with the mapk with this functions:

In [46]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

Our target needs to be in list of lists format for mapk to work, so we convert the hotel_cluster column of t2 into a list of lists. Then, we call the mapk method with our target, our predictions, and the number of predictions we want to evaluate (5).

In [48]:
target = [[l] for l in t2["hotel_cluster"]]
print target[:5]
print predictions[:5]
print mapk(target, predictions, k=5)

[[1], [32], [96], [72], [99]]
[[91, 64, 48, 41, 5], [91, 64, 48, 41, 5], [91, 64, 48, 41, 5], [91, 64, 48, 41, 5], [91, 64, 48, 41, 5]]
0.0664765456473


Our result here isn’t great, but we’ve just generated our first set of predictions, and evaluated our error! The framework we’ve built will allow us to quickly test out a variety of techniques and see how they score. We’re well on our way to building a good-performing solution for the leaderboard.

In [57]:
samp_sumb = pd.read_csv("sample_submission.csv")
print samp_sumb.shape
print samp_sumb.head()

x = '1 32 96 72 99'
samp_sumb.hotel_cluster = ['1 32 96 72 99' for i in range(len(samp_sumb.hotel_cluster))]
print samp_sumb.head()
samp_sumb.to_csv("submission.csv", index=False)

(2528243, 2)
   id hotel_cluster
0   0          99 1
1   1          99 1
2   2          99 1
3   3          99 1
4   4          99 1
   id  hotel_cluster
0   0  1 32 96 72 99
1   1  1 32 96 72 99
2   2  1 32 96 72 99
3   3  1 32 96 72 99
4   4  1 32 96 72 99
