In [None]:
# Disabled scratch cell: everything below is wrapped in a string literal, so
# none of it runs. It held ad-hoc checks: reading an earlier submission file,
# building a per-cell table (A = train rows, B = test rows, C = distinct
# place_ids), a scatter plot of A against C, and saving the table to
# 'gridinfo.h5'.
# NOTE(review): if this is ever revived, DataFrame.append inside the loop grows
# the frame row by row; build a list of dicts and construct the frame once.
'''
subbbb = pd.read_csv('sub_grid0.csv')

ii = df_test[df_test.grid_cell==0].index

type(ii)

subbbb.iloc[ii]

a = pd.DataFrame(columns=list('ABC'))

for i in range(800):
    a = a.append({'A': df_train[df_train.grid_cell==i].shape[0], 
                  'B': df_test[df_test.grid_cell==i].shape[0],
                  'C': df_train[df_train.grid_cell==i]['place_id'].unique().shape[0]},
                  ignore_index=True)

a.head()

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(a['A'],a['C'])

29118021.0/8607230

a.to_hdf('gridinfo.h5','table')
'''

In [57]:
# coding: utf-8
__author__ = 'Sandro Vega Pons : https://www.kaggle.com/svpons'

'''Partially based on grid_plus_classifier script:
https://www.kaggle.com/svpons/facebook-v-predicting-check-ins/grid-plus-classifier
'''

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier


def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric columns ``x``, ``y`` and ``time``. ``time`` is
        interpreted as a number of minutes after 2014-01-01T01:01.
        The frame is modified in place (``grid_cell`` and the time features
        are added, ``x`` and ``y`` are rescaled), so calling this twice on the
        same frame rescales the coordinates twice.
    n_cell_x, n_cell_y : int
        Number of grid cells along x and y. The cell size is ``10 / n_cell``,
        i.e. coordinates are assumed to lie in [0, 10].

    Returns
    -------
    pandas.DataFrame
        A new frame equal to the modified ``df`` without the ``time`` column.
    """
    # Creating the grid
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    # Shift coordinates down by eps (clamped at 0) so that a point lying
    # exactly on the upper border falls into the last cell, not one past it.
    eps = 0.00001
    xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    # The builtin int replaces the np.int alias, which NumPy no longer ships.
    pos_x = (xs / size_x).astype(int)
    pos_y = (ys / size_y).astype(int)
    # Row-major cell id: y position selects the row, x position the column.
    df['grid_cell'] = (pos_y * n_cell_x + pos_x).astype(np.int16)

    # Feature engineering
    # Feature weights (black magic here): x, y, hour, weekday, day, month, year.
    fw = [500, 1000, 4, 3, 1./22., 2, 10]
    df['x'] = df.x.values * fw[0]
    df['y'] = df.y.values * fw[1]

    # Vectorised timestamp construction: truncate time to whole minutes and
    # add it to the reference date in one array operation.
    initial_date = np.datetime64('2014-01-01T01:01', 'm')
    minutes = df.time.values.astype(np.int64).astype('timedelta64[m]')
    d_times = pd.DatetimeIndex(initial_date + minutes)

    # np.asarray yields plain arrays, so the int8 cast is honoured regardless
    # of how the installed pandas handles Index dtypes.
    df['hour'] = (np.asarray(d_times.hour) * fw[2]).astype(np.int8)
    df['weekday'] = (np.asarray(d_times.weekday) * fw[3]).astype(np.int8)
    df['day'] = (np.asarray(d_times.dayofyear) * fw[4]).astype(np.int8)
    df['month'] = (np.asarray(d_times.month) * fw[5]).astype(np.int8)
    df['year'] = ((np.asarray(d_times.year) - 2013) * fw[6]).astype(np.int8)

    return df.drop(['time'], axis=1)
    

def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame with ``place_id``, ``grid_cell`` and feature columns.
    df_test : pandas.DataFrame
        Test frame with ``grid_cell`` and the same feature columns.
    grid_id : int
        Id of the grid cell to process.
    th : int
        Only place_ids with at least ``th`` training samples in the cell are
        kept as classes.

    Returns
    -------
    pred_labels : numpy.ndarray, shape (n_test_in_cell, 3)
        The three most probable place_ids per test sample, best first.
        Positions that cannot be filled (fewer than three classes, or no
        usable training data) hold 0.
    row_ids : pandas.Index
        Index labels of the test samples in this cell.
    y_pred : numpy.ndarray, shape (n_test_in_cell, n_classes)
        Class probabilities from the classifier; has zero columns when no
        classifier could be trained.
    """
    n_top = 3

    # Working on df_train: keep only sufficiently frequent places.
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    frequent = df_cell_train.place_id.map(place_counts).values >= th
    df_cell_train = df_cell_train.loc[frequent]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index
    n_test = len(df_cell_test)

    # Nothing to predict, or nothing to learn from: return "no prediction"
    # placeholders instead of letting the classifier raise.
    if n_test == 0 or df_cell_train.empty:
        return (np.zeros((n_test, n_top), dtype=np.int64),
                row_ids,
                np.zeros((n_test, 0)))

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    # Select the features by name so train and test columns are guaranteed to
    # line up even if the two frames order (or extend) their columns differently.
    feature_cols = [c for c in df_cell_train.columns
                    if c not in ('place_id', 'grid_cell')]
    X = df_cell_train[feature_cols].values.astype(int)
    X_test = df_cell_test[feature_cols].values.astype(int)

    # Applying the classifier. n_neighbors may not exceed the number of
    # training samples, so cap it for sparsely populated cells.
    clf = KNeighborsClassifier(n_neighbors=min(25, len(y)), weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # Columns of y_pred follow the encoded labels 0..n_classes-1, so the
    # encoder's classes_ array maps column indices straight back to place_ids.
    # Direct indexing is used because inverse_transform only accepts 1-D input
    # in newer scikit-learn releases.
    top_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :n_top]
    top_labels = le.classes_[top_idx]

    # Pad to exactly n_top columns when the cell has fewer classes.
    pred_labels = np.zeros((n_test, n_top), dtype=top_labels.dtype)
    pred_labels[:, :top_labels.shape[1]] = top_labels

    return pred_labels, row_ids, y_pred
    
    
def process_grid(df_train, df_test, th, n_cells, max_cells=1):
    """
    Iterates over grid cells, aggregates the results and makes the
    submission.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Prepared train and test frames (must contain ``grid_cell``).
        The index labels of ``df_test`` are used as row positions in the
        prediction array, so ``df_test`` is expected to carry a 0..n-1 index.
    th : int
        Minimum number of samples a place_id needs inside a cell to be kept.
    n_cells : int
        Total number of grid cells.
    max_cells : int or None, optional
        Upper bound on how many cells (starting at cell 0) are processed.
        The default of 1 keeps the single-cell debugging behaviour this
        function had when the loop bound was hard-coded; pass ``None`` to
        process all ``n_cells`` cells.

    Returns
    -------
    y_pred : numpy.ndarray or None
        Class probabilities of the last processed cell (None if no cell ran).
    pred_labels : numpy.ndarray or None
        Top-3 place_ids of the last processed cell (None if no cell ran).
    preds : numpy.ndarray, shape (len(df_test), 3)
        Top-3 place_ids for every test row; rows of unprocessed cells are 0.

    Side effects
    ------------
    Writes the submission to 'sub_grid.csv' in the working directory.
    """
    preds = np.zeros((df_test.shape[0], 3), dtype=int)
    cells_to_run = n_cells if max_cells is None else min(n_cells, max_cells)

    # Defined up front so the return statement is valid even if no cell runs.
    y_pred = None
    pred_labels = None

    for g_id in range(cells_to_run):
        if g_id % 100 == 0:
            print('iter: %s' %(g_id))

        # Applying classifier to one grid cell
        pred_labels, row_ids, y_pred = process_one_cell(df_train, df_test, g_id, th)

        # Updating predictions
        preds[row_ids] = pred_labels

    print('Generating submission file ...')
    # Auxiliary dataframe with the 3 best predictions for each sample
    df_aux = pd.DataFrame(preds, dtype=str, columns=['l1', 'l2', 'l3'])

    # Concatenating the 3 predictions for each sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')

    # Writing to csv
    ds_sub.name = 'place_id'
    ds_sub.to_csv('sub_grid.csv', index=True, header=True, index_label='row_id')

    return y_pred, pred_labels, preds
      

In [58]:
"""
"""
# Load the train and test frames from their HDF5 stores (key 'table').
print('Loading data ...')
df_train = pd.read_hdf('train.h5','table')
df_test = pd.read_hdf('test.h5','table')

# Defining the size of the grid: number of cells along x and along y.
n_cell_x = 20
n_cell_y = 40 

# NOTE(review): both prepare_data calls below are commented out, so the
# 'Preparing ...' messages are printed without any work being done. The code
# that follows reads a grid_cell column, so the HDF5 files presumably already
# hold prepared frames -- confirm before re-enabling these calls.
print('Preparing train data')
#df_train = prepare_data(df_train, n_cell_x, n_cell_y)

print('Preparing test data')
#df_test = prepare_data(df_test, n_cell_x, n_cell_y)

# Solving classification problems inside each grid cell.
# process_grid returns, in order: the class probabilities of the last processed
# cell (predsss), that cell's top-3 place_ids (lab), and the full-size top-3
# array covering every test row (pred).
th = 5 # Keeping place_ids with at least th samples (the filter uses >= th).
predsss,lab,pred = process_grid(df_train, df_test, th, n_cell_x*n_cell_y)

Loading data ...
Preparing train data
Preparing test data
iter: 0
Generating submission file ...


In [59]:
# predsss is the predict_proba output of the last processed cell:
# one row per test sample in that cell, one column per retained place_id.
predsss.shape

(11290, 322)

In [62]:
# Full-size top-3 prediction array (third return value of process_grid);
# rows belonging to cells that were not processed remain all-zero.
pred

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ..., 
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [54]:
# Top-3 place_ids per test sample of the last processed cell
# (second return value of process_grid).
lab

array([[8370753254, 9727638738, 4120068991],
       [1478305117, 5003921802, 9727638738],
       [7065354365, 3642864292, 8815983898],
       ..., 
       [7942904509, 3363064200, 7123189219],
       [2784881316, 4751915737, 3642864292],
       [8556108428, 1739846485, 1478305117]])

In [30]:
# Rebuild by hand the training subset used for grid cell 0:
# rows of the cell whose place_id occurs at least 5 times within the cell.
df_cell_train = df_train[df_train['grid_cell'].eq(0)]
place_counts = df_cell_train['place_id'].value_counts()
mask = df_cell_train['place_id'].map(place_counts).ge(5).values
df_cell_train = df_cell_train[mask]

In [31]:
# Number of distinct place_ids left in cell 0 after the frequency filter.
df_cell_train.place_id.unique().shape

(322,)

In [17]:
# Number of distinct place_ids in cell 0 before any frequency filtering.
df_train[df_train.grid_cell==0].place_id.unique().shape

(912,)

In [34]:
# Per row, the column indices of predsss ordered from highest to lowest
# probability (ascending argsort, then columns reversed).
np.argsort(predsss, axis=1)[:,::-1]

array([[265, 311, 108, ..., 216, 217,   0],
       [ 20, 145, 311, ..., 215, 216,   0],
       [212,  97, 284, ..., 215, 216,   0],
       ..., 
       [244,  92, 217, ..., 216, 218,   0],
       [ 71, 138,  97, ..., 217, 218,   0],
       [274,  35,  20, ..., 216, 217,   0]])

In [38]:
# Small toy array used below to check how np.argsort / np.sort act along axis 1.
x = np.array([[3, 0,2], [1, 2,0], [9,2,4]])

In [39]:
# Display the toy array.
x

array([[3, 0, 2],
       [1, 2, 0],
       [9, 2, 4]])

In [45]:
# Per row, the column indices that would sort the values in ascending order.
np.argsort(x, axis=1)

array([[1, 2, 0],
       [2, 0, 1],
       [1, 2, 0]])

In [47]:
# Sort each row ascending, then reverse the columns to get descending order.
np.sort(x,axis=1)[:,::-1]

array([[3, 2, 0],
       [2, 1, 0],
       [9, 4, 2]])

In [56]:
# The three largest class probabilities per test sample, largest first.
np.sort(predsss, axis=1)[:,::-1][:,:3]

array([[ 0.93357815,  0.06642185,  0.        ],
       [ 0.64942011,  0.13585602,  0.03774861],
       [ 0.29010672,  0.13280617,  0.1142694 ],
       ..., 
       [ 0.61415386,  0.17563176,  0.13132684],
       [ 0.49418324,  0.27696835,  0.11778962],
       [ 0.84890935,  0.08555736,  0.03276665]])

In [64]:
# Wrap the full-size top-3 place_id array in a frame, one column per rank.
df_aux = pd.DataFrame(pred, columns=['l1', 'l2', 'l3']) 

In [76]:
# Index labels of the test rows that fall into grid cell 0.
iii = df_test.index[(df_test['grid_cell'] == 0).values]

In [77]:
# Top-3 place_ids of the test rows in grid cell 0.
# NOTE(review): iii holds index labels but iloc is positional; this assumes
# df_test carries a default 0..n-1 index -- confirm.
df_aux.iloc[iii]

Unnamed: 0,l1,l2,l3
5,8370753254,9727638738,4120068991
442,1478305117,5003921802,9727638738
2133,7065354365,3642864292,8815983898
2794,8958485237,1376741893,6505057624
3200,5003921802,4012969260,5079685869
4345,1006316884,3027578816,4180826137
4532,1254758593,4895053921,7942904509
4758,6138829831,6505057624,8958485237
5443,6349154168,5218049605,8739476634
5991,9034650629,2477372399,8944067726


In [78]:
# Full-size buffer for the top-3 probabilities, one row per test sample.
probabi = np.zeros((len(df_test), 3), dtype=np.float64)

In [79]:
# Fill the rows of the cell-0 test samples with their three largest class
# probabilities, largest first (sort ascending, flip columns, keep three).
probabi[iii] = np.fliplr(np.sort(predsss, axis=1))[:, :3]

In [81]:
# Frame of top-3 probabilities, column-aligned with the place_id frame df_aux.
df_aux1 = pd.DataFrame(probabi, columns=['l1', 'l2', 'l3']) 

In [82]:
# Display the probability frame; rows outside grid cell 0 are still all-zero.
df_aux1

Unnamed: 0,l1,l2,l3
0,0.000000,0.000000,0.0
1,0.000000,0.000000,0.0
2,0.000000,0.000000,0.0
3,0.000000,0.000000,0.0
4,0.000000,0.000000,0.0
5,0.933578,0.066422,0.0
6,0.000000,0.000000,0.0
7,0.000000,0.000000,0.0
8,0.000000,0.000000,0.0
9,0.000000,0.000000,0.0


In [83]:
# Top-3 probabilities of the test rows in grid cell 0.
# NOTE(review): same positional-vs-label assumption as for df_aux.iloc[iii].
df_aux1.iloc[iii]

Unnamed: 0,l1,l2,l3
5,0.933578,0.066422,0.000000
442,0.649420,0.135856,0.037749
2133,0.290107,0.132806,0.114269
2794,0.303156,0.196654,0.116576
3200,0.826281,0.103805,0.039152
4345,0.962505,0.037495,0.000000
4532,0.930406,0.035570,0.034024
4758,0.424146,0.224884,0.153922
5443,0.887052,0.042985,0.035574
5991,0.251025,0.224865,0.126815


In [84]:
# Persist the top-3 place_id frame. The HDF5 key is passed by keyword because
# newer pandas versions deprecate positional arguments after the path.
df_aux.to_hdf('grid0predictedplaces.h5', key='table')

In [85]:
# Persist the top-3 probability frame. The HDF5 key is passed by keyword
# because newer pandas versions deprecate positional arguments after the path.
df_aux1.to_hdf('grid0predictedprobability.h5', key='table')