In [None]:
# Disabled scratch cell: everything below is a single string literal, so none of
# it executes. The quoted code reads 'sub_grid0.csv', builds a table with the
# number of train rows (A), test rows (B) and distinct place_ids (C) for each of
# 800 grid cells, scatter-plots A against C and stores the table in 'gridinfo.h5'.
# NOTE(review): if this is ever revived, `DataFrame.append` no longer exists in
# pandas >= 2.0 and growing a frame row by row is quadratic; collect the records
# in a list (or use a groupby) and build the frame once.
'''
subbbb = pd.read_csv('sub_grid0.csv')

ii = df_test[df_test.grid_cell==0].index

type(ii)

subbbb.iloc[ii]

a = pd.DataFrame(columns=list('ABC'))

for i in range(800):
    a = a.append({'A': df_train[df_train.grid_cell==i].shape[0], 
                  'B': df_test[df_test.grid_cell==i].shape[0],
                  'C': df_train[df_train.grid_cell==i]['place_id'].unique().shape[0]},
                  ignore_index=True)

a.head()

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(a['A'],a['C'])

29118021.0/8607230

a.to_hdf('gridinfo.h5','table')
'''

In [1]:
# coding: utf-8
__author__ = 'Sandro Vega Pons : https://www.kaggle.com/svpons'

'''Partially based on grid_plus_classifier script:
https://www.kaggle.com/svpons/facebook-v-predicting-check-ins/grid-plus-classifier
'''

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier


def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``x``, ``y`` and ``time``. ``time`` is
        interpreted as a whole number of minutes elapsed since
        2014-01-01T01:01. Any other columns are passed through untouched.
        The input frame is NOT modified, so the cell calling this function
        can safely be re-run.
    n_cell_x, n_cell_y : int
        Number of grid cells along x and along y. The cell size is
        ``10 / n_cell``, i.e. coordinates are presumably expected to lie in
        [0, 10] -- values outside that range yield cell ids outside
        ``range(n_cell_x * n_cell_y)``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``time``, with ``x``/``y`` rescaled by the
        feature weights and with the added columns ``grid_cell``, ``hour``,
        ``weekday``, ``day``, ``month`` and ``year``.
    """
    #Creating the grid
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    eps = 0.00001
    x_raw = df.x.values
    y_raw = df.y.values
    # Shift by eps so that a point sitting exactly on the upper border
    # (e.g. x == 10) falls into the last cell instead of one past it.
    xs = np.where(x_raw < eps, 0, x_raw - eps)
    ys = np.where(y_raw < eps, 0, y_raw - eps)
    # np.int was removed from NumPy (1.24); use an explicit fixed-width type.
    pos_x = (xs / size_x).astype(np.int64)
    pos_y = (ys / size_y).astype(np.int64)
    # int16 is kept for the usual small grids; widen only when the largest
    # cell id would not fit, instead of silently wrapping around.
    largest_cell_id = n_cell_x * n_cell_y - 1
    cell_dtype = np.int16 if largest_cell_id <= np.iinfo(np.int16).max else np.int32

    # Work on a copy (drop returns a new frame) so the caller's data survives.
    out = df.drop(['time'], axis=1)
    out['grid_cell'] = (pos_y * n_cell_x + pos_x).astype(cell_dtype)

    #Feature engineering
    fw = [500, 1000, 4, 3, 1./22., 2, 10] #feature weights (black magic here)
    out['x'] = x_raw * fw[0]
    out['y'] = y_raw * fw[1]

    # Vectorised timestamp construction: one array addition instead of one
    # Python-level np.timedelta64 object per row.
    initial_date = np.datetime64('2014-01-01T01:01', 'm')
    minutes = df.time.values.astype(np.int64).astype('timedelta64[m]')
    d_times = pd.DatetimeIndex((initial_date + minutes).astype('datetime64[ns]'))

    out['hour'] = (np.asarray(d_times.hour) * fw[2]).astype(np.int8)
    out['weekday'] = (np.asarray(d_times.weekday) * fw[3]).astype(np.int8)
    out['day'] = (np.asarray(d_times.dayofyear) * fw[4]).astype(np.int8)
    out['month'] = (np.asarray(d_times.month) * fw[5]).astype(np.int8)
    out['year'] = ((np.asarray(d_times.year) - 2013) * fw[6]).astype(np.int8)

    return out
    

def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows with a ``grid_cell`` and a ``place_id`` column; every
        other column is used as a feature. place_ids are assumed to be
        integers (they are stored in an integer array).
    df_test : pandas.DataFrame
        Test rows with a ``grid_cell`` column; every other column is used as
        a feature and must line up with the training features.
    grid_id : int
        Id of the grid cell to process.
    th : int
        Only place_ids with at least ``th`` training rows in this cell are
        kept as candidate classes.

    Returns
    -------
    pred_labels : numpy.ndarray, shape (n_test_rows_in_cell, 3)
        The three most probable place_ids per test row, best first.
    row_ids : pandas.Index
        Index labels of the test rows that fall into this cell.
    pred_probas : numpy.ndarray, shape (n_test_rows_in_cell, 3)
        Probabilities matching ``pred_labels``.

    When fewer than three candidate classes exist (or nothing can be
    trained at all) the missing slots hold label 0 with probability 0.0, so
    the result always has exactly three columns.
    """
    n_top = 3

    #Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    frequent = df_cell_train.place_id.map(place_counts).values >= th
    df_cell_train = df_cell_train.loc[frequent]

    #Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    n_test = len(df_cell_test)
    pred_labels = np.zeros((n_test, n_top), dtype=np.int64)
    pred_probas = np.zeros((n_test, n_top), dtype=float)
    if n_test == 0 or len(df_cell_train) == 0:
        # Nothing to predict, or nothing to learn from: the classifier would
        # raise on empty input, so return the zero-filled placeholders.
        return pred_labels, row_ids, pred_probas

    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis = 1).values.astype(int)

    #Applying the classifier
    # A cell with fewer than 25 training rows cannot supply 25 neighbours.
    clf = KNeighborsClassifier(n_neighbors=min(25, len(X)), weights='distance',
                               metric='manhattan',n_jobs=-1)
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # Rank the classes once and reuse the ranking for labels and probabilities.
    n_found = min(n_top, y_pred.shape[1])
    ranked = np.argsort(y_pred, axis=1)[:, ::-1][:, :n_found]
    rows = np.arange(y_pred.shape[0])[:, None]
    pred_probas[:, :n_found] = y_pred[rows, ranked]
    # Index classes_ directly: LabelEncoder.inverse_transform only accepts
    # 1-D input in current scikit-learn, while `ranked` is 2-D.
    pred_labels[:, :n_found] = le.classes_[ranked]

    return pred_labels, row_ids, pred_probas
    
    
def process_grid(df_train, df_test, th, n_cells):
    """
    Iterates over all grid cells, aggregates the results and makes the
    submission.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames handed unchanged to ``process_one_cell``; ``df_test`` must
        have a ``grid_cell`` column.
    th : int
        Minimum number of training samples per place_id, forwarded to
        ``process_one_cell``.
    n_cells : int
        Cell ids ``0 .. n_cells - 1`` are processed.

    Returns
    -------
    df_preds, df_probas : pandas.DataFrame
        One row per row of ``df_test`` (same order, default 0..N-1 index)
        and the columns ``l1``, ``l2``, ``l3`` holding the three best
        place_ids and their probabilities. Rows or slots for which no
        prediction was produced stay 0.
    """
    n_rows = df_test.shape[0]
    # Explicit 64-bit integers: plain `int` is 32-bit on some platforms,
    # which is too narrow for large place ids.
    preds = np.zeros((n_rows, 3), dtype=np.int64)
    pred_probability = np.zeros((n_rows, 3), dtype=float)
    test_cells = df_test.grid_cell.values

    for g_id in range(n_cells):
        if g_id % 25 == 0:
            print('iter: %s' %(g_id))

        # Positions (not index labels) of this cell's test rows. Writing
        # through positions stays correct when df_test's index is not
        # exactly 0..N-1, e.g. after the frame has been filtered.
        positions = np.flatnonzero(test_cells == g_id)
        if positions.size == 0:
            continue  # no test rows here, nothing to classify

        #Applying classifier to one grid cell
        pred_labels, row_ids, pred_probas = process_one_cell(df_train, df_test, g_id, th)
        pred_labels = np.asarray(pred_labels)
        pred_probas = np.asarray(pred_probas)
        if len(row_ids) != positions.size:
            raise ValueError('cell %s: got %s predictions for %s test rows'
                             % (g_id, len(row_ids), positions.size))

        #Updating predictions
        # A cell with fewer than three candidate classes may return fewer
        # than three columns; fill what is available, leave the rest at 0.
        preds[positions, :pred_labels.shape[1]] = pred_labels
        pred_probability[positions, :pred_probas.shape[1]] = pred_probas

    print('Generating submission file ...')
    #Auxiliary dataframe with the 3 best predictions for each sample
    df_preds = pd.DataFrame(preds, columns=['l1', 'l2', 'l3'])
    df_probas = pd.DataFrame(pred_probability, columns=['l1', 'l2', 'l3'])

    return df_preds, df_probas

In [2]:
"""
"""
print('Loading data ...')
df_train = pd.read_hdf('train.h5', key='table')
df_test = pd.read_hdf('test.h5', key='table')

# Grid resolution: number of cells along x and along y.
n_cell_x, n_cell_y = 20, 40

# NOTE(review): both prepare_data calls below are commented out, so this cell
# only prints the status lines. Presumably the HDF files already hold the
# prepared frames (grid_cell and time features) -- confirm before relying on it.
print('Preparing train data')
#df_train = prepare_data(df_train, n_cell_x, n_cell_y)

print('Preparing test data')
#df_test = prepare_data(df_test, n_cell_x, n_cell_y)

Loading data ...
Preparing train data
Preparing test data


In [3]:
# Solve an independent classification problem inside every grid cell.
th = 5  # place_ids with fewer than th training samples in a cell are dropped
prediction, probability = process_grid(
    df_train, df_test, th=th, n_cells=n_cell_x * n_cell_y)

iter: 0
iter: 25
iter: 50
iter: 75
iter: 100
iter: 125
iter: 150
iter: 175
iter: 200
iter: 225
iter: 250
iter: 275
iter: 300
iter: 325
iter: 350
iter: 375
iter: 400
iter: 425
iter: 450
iter: 475
iter: 500
iter: 525
iter: 550
iter: 575
iter: 600
iter: 625
iter: 650
iter: 675
iter: 700
iter: 725
iter: 750
iter: 775
Generating submission file ...


In [4]:
# Sanity check: one row per test sample, three candidate place_ids each.
prediction.shape, type(prediction)

((8607230, 3), pandas.core.frame.DataFrame)

In [5]:
# Sanity check: the probability table must have the same shape as `prediction`.
probability.shape, type(probability)

((8607230, 3), pandas.core.frame.DataFrame)

In [9]:
# Peek at the first rows: probabilities are ordered best-first (l1 >= l2 >= l3).
probability.head(n=5)

Unnamed: 0,l1,l2,l3
0,0.308713,0.154834,0.137801
1,0.242668,0.151991,0.116748
2,0.394223,0.278302,0.11024
3,0.665965,0.166739,0.048985
4,0.876901,0.087552,0.035546


In [10]:
# Persist the top-3 place_ids per test row. `key` is passed by keyword because
# positional arguments after the path are deprecated in recent pandas.
prediction.to_hdf('1234predictedplaces.h5', key='table')

In [11]:
# Persist the probabilities of the top-3 predictions. `key` is passed by keyword
# because positional arguments after the path are deprecated in recent pandas.
probability.to_hdf('1234predictedprobability.h5', key='table')