In [1]:
# coding: utf-8
__author__ = 'Sandro Vega Pons : https://www.kaggle.com/svpons'

'''Partially based on grid_plus_classifier script:
https://www.kaggle.com/svpons/facebook-v-predicting-check-ins/grid-plus-classifier
'''

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier


def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``x``, ``y`` and ``time``. Any other column
        is carried over unchanged. ``time`` is read as a whole number of
        minutes counted from 2014-01-01 01:01.
    n_cell_x, n_cell_y : int
        Number of grid cells along x and y. Cell sizes are ``10 / n_cell_x``
        and ``10 / n_cell_y``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``time``, with ``x``/``y`` rescaled by their
        feature weights and with the added columns ``grid_cell``, ``hour``,
        ``weekday``, ``day``, ``month`` and ``year``. The input frame is not
        modified, so the function can safely be called again on the same data.
    """
    # Creating the grid
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    eps = 0.00001
    x_raw = df['x'].values
    y_raw = df['y'].values
    # Shift by eps so that a coordinate sitting exactly on the upper border
    # falls into the last cell instead of one past it.
    xs = np.where(x_raw < eps, 0, x_raw - eps)
    ys = np.where(y_raw < eps, 0, y_raw - eps)
    # np.int was removed from NumPy; use an explicit fixed-width integer.
    pos_x = (xs / size_x).astype(np.int64)
    pos_y = (ys / size_y).astype(np.int64)
    cell_ids = pos_y * n_cell_x + pos_x
    # int16 is kept for the usual small grids, but it would silently wrap
    # around once the grid has more cells than int16 can address.
    if n_cell_x * n_cell_y - 1 <= np.iinfo(np.int16).max:
        cell_dtype = np.int16
    else:
        cell_dtype = np.int32

    # Timestamps, built in one vectorised step instead of per-row objects.
    initial_date = np.datetime64('2014-01-01T01:01', 'm')
    minutes = df['time'].values.astype(np.int64).astype('timedelta64[m]')
    d_times = pd.DatetimeIndex(initial_date + minutes)

    # Dropping 'time' first yields a fresh frame, so every assignment below
    # lands on the copy and the caller's frame stays intact.
    out = df.drop(['time'], axis=1)
    out['grid_cell'] = cell_ids.astype(cell_dtype)

    # Feature engineering
    fw = [500, 1000, 4, 3, 1./22., 2, 10]  # feature weights (black magic here)
    out['x'] = x_raw * fw[0]
    out['y'] = y_raw * fw[1]
    out['hour'] = (np.asarray(d_times.hour) * fw[2]).astype(np.int8)
    out['weekday'] = (np.asarray(d_times.weekday) * fw[3]).astype(np.int8)
    out['day'] = (np.asarray(d_times.dayofyear) * fw[4]).astype(np.int8)
    out['month'] = (np.asarray(d_times.month) * fw[5]).astype(np.int8)
    out['year'] = ((np.asarray(d_times.year) - 2013) * fw[6]).astype(np.int8)

    return out
    

def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows with at least ``grid_cell`` and ``place_id``; every
        other column is used as a feature.
    df_test : pandas.DataFrame
        Test rows with ``grid_cell``; every other column is used as a
        feature, so the feature columns must match those of ``df_train``.
    grid_id : int
        Cell to process.
    th : int
        Places with fewer than ``th`` training rows in this cell are dropped.

    Returns
    -------
    pred_labels : numpy.ndarray, shape (n_test_rows, 3)
        Place ids of the three most probable classes, best first. Columns
        that cannot be filled (fewer than three known places, or no training
        rows at all) are set to 0.
    row_ids : pandas.Index
        Index labels of the test rows that fall in this cell.
    y_pred : numpy.ndarray
        Class probabilities returned by the classifier (zero columns when
        nothing could be predicted).
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    # Keep only places seen at least `th` times in this cell.
    mask = (df_cell_train.place_id.map(place_counts) >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    n_train = df_cell_train.shape[0]
    n_test = df_cell_test.shape[0]
    if n_train == 0 or n_test == 0:
        # Nothing to learn from or nothing to predict: the classifier would
        # raise on empty input, so return an all-zero result instead.
        return (np.zeros((n_test, 3), dtype=np.int64),
                row_ids,
                np.zeros((n_test, 0)))

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    # n_neighbors may not exceed the number of training rows.
    clf = KNeighborsClassifier(n_neighbors=min(25, n_train),
                               weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # Column j of y_pred belongs to encoded label j, so the sorted class
    # array can be indexed directly. LabelEncoder.inverse_transform only
    # accepts 1-D input and cannot be applied to the (n, 3) index matrix.
    top_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    pred_labels = le.classes_[top_idx]

    n_found = pred_labels.shape[1]
    if n_found < 3:
        # Fewer than three known places in this cell: pad with 0 so callers
        # always receive three columns.
        padded = np.zeros((n_test, 3), dtype=pred_labels.dtype)
        padded[:, :n_found] = pred_labels
        pred_labels = padded

    return pred_labels, row_ids, y_pred
    
    
def process_grid(df_train, df_test, th, n_cells):
    """
    Iterates over all grid cells, aggregates the results and makes the
    submission.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames carrying a ``grid_cell`` column; they are handed to
        ``process_one_cell`` unchanged. The index of ``df_test`` must be
        unique; it is written out as ``row_id``.
    th : int
        Minimum number of occurrences per place, forwarded to
        ``process_one_cell``.
    n_cells : int
        Number of grid cells; cells ``0 .. n_cells - 1`` are processed.
        Pass 1 to process only cell 0.

    Returns
    -------
    y_pred, pred_labels
        Outputs of ``process_one_cell`` for the last cell that was processed
        (``None`` if no cell was processed).
    preds : numpy.ndarray, shape (len(df_test), 3)
        Top-3 place ids per test row, in the row order of ``df_test``; rows
        of cells that were not processed stay 0.

    Side effects: prints progress and writes ``sub_grid.csv``.
    """
    # Place ids do not fit in 32 bits, so do not rely on the platform int.
    preds = np.zeros((df_test.shape[0], 3), dtype=np.int64)
    y_pred = pred_labels = None
    test_cells = df_test.grid_cell.values

    for g_id in range(n_cells):
        if g_id % 100 == 0:
            print('iter: %s' %(g_id))

        # Cells without test rows have nothing to predict.
        if not (test_cells == g_id).any():
            continue

        #Applying classifier to one grid cell
        pred_labels, row_ids, y_pred = process_one_cell(df_train, df_test, g_id, th)

        #Updating predictions
        # row_ids are index labels; translate them to row positions so this
        # also works when the index of df_test is not 0..n-1.
        preds[df_test.index.get_indexer(row_ids)] = pred_labels

    print('Generating submission file ...')
    #Auxiliary dataframe with the 3 best predictions for each sample
    df_aux = pd.DataFrame(preds.astype(str), index=df_test.index,
                          columns=['l1', 'l2', 'l3'])

    #Concatenating the 3 predictions for each sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')

    #Writing to csv
    ds_sub.name = 'place_id'
    ds_sub.to_csv('sub_grid.csv', index=True, header=True, index_label='row_id')

    return y_pred, pred_labels, preds
      

In [2]:
"""
"""
# Load the three HDF5 tables (key 'table') that the rest of the notebook works on.
#   df1: predicted place ids per row, columns l1/l2/l3 (see the df1.head() output below).
#   df2: presumably the matching prediction probabilities -- it is never displayed
#        or referenced again in the visible cells; TODO confirm it is still needed.
#   df3: analysis table (probabilities, differences, frequencies) shown by df3.head() below.
print('Loading data ...')
df1 = pd.read_hdf('grid0predictedplaces_.h5','table')
df2 = pd.read_hdf('grid0predictedprobability_.h5','table')
df3 = pd.read_hdf('todoAnalysis.h5','table')

Loading data ...


In [3]:
# Cast every column of df3 to float (rebinds df3 to the converted copy).
df3 = df3.astype(float)

In [4]:
# Ratios between the frequency columns of consecutive candidates (1 vs 2, 2 vs 3),
# once for the grid* frequencies and once for the All* frequencies.
df3['gridRatio12'] = df3['gridFreq1'].div(df3['gridFreq2'])
df3['gridRatio23'] = df3['gridFreq2'].div(df3['gridFreq3'])
df3['AllRatio12'] = df3['AllFreq1'].div(df3['AllFreq2'])
df3['AllRatio23'] = df3['AllFreq2'].div(df3['AllFreq3'])

In [5]:
# Preview the first rows of df3, including the ratio columns added above.
df3.head()

Unnamed: 0,l1,l2,l3,diff12,diff23,gridFreq1,gridFreq2,gridFreq3,AllFreq1,AllFreq2,AllFreq3,gridRatio12,gridRatio23,AllRatio12,AllRatio23
5,0.933578,0.066422,0.0,0.867156,0.066422,693.0,344.0,74.0,712.0,351.0,92.0,2.014535,4.648649,2.02849,3.815217
442,0.64942,0.135856,0.037749,0.513564,0.098107,121.0,78.0,344.0,121.0,81.0,351.0,1.551282,0.226744,1.493827,0.230769
2133,0.290107,0.132806,0.114269,0.157301,0.018537,330.0,456.0,124.0,346.0,469.0,133.0,0.723684,3.677419,0.73774,3.526316
2794,0.303156,0.196654,0.116576,0.106503,0.080078,312.0,309.0,250.0,322.0,318.0,297.0,1.009709,1.236,1.012579,1.070707
3200,0.826281,0.103805,0.039152,0.722476,0.064653,78.0,45.0,8.0,81.0,50.0,246.0,1.733333,5.625,1.62,0.203252


In [19]:
# Select the index labels of rows where diff12 is below 0.15 and gridRatio12 is
# above 2; in the df3.head() output diff12 equals l1 - l2. These rows get their
# l1/l2 predictions swapped in df1 further down.
# NOTE(review): the recorded output (1044,) equals the count noted below for the
# 0.165 threshold -- confirm this cell was re-run after the threshold became 0.15.
idxgrid = df3.loc[(df3["diff12"]<0.15) & (df3["gridRatio12"]>2)].index
idxgrid.shape
# Earlier experiments, kept for reference (trailing number = rows selected):
#idxall = df3.loc[(df3["diff12"]<0.06) & (df3["AllRatio12"]>5)].index
#df3.loc[(df3["diff12"]<0.165) & (df3["gridRatio12"]>2)].index 1044
#df3.loc[(df3["diff12"]<0.2) & (df3["gridRatio12"]>4)].index 554

(1044,)

In [20]:
# Show again how many rows were selected.
idxgrid.shape
#idxall.shape

(1044,)

In [21]:
# Swap the first and second prediction for the selected rows, in place on df1.
# .values strips the column labels: without it pandas would align 'l1'/'l2' by
# name on assignment and the swap would be a no-op.
# NOTE(review): this cell is not idempotent -- running it twice swaps back.
#df1.loc[idxall,['l1','l2']] = df1.loc[idxall,['l2','l1']].values
df1.loc[idxgrid,['l1','l2']] = df1.loc[idxgrid,['l2','l1']].values

In [22]:
# Reload the unmodified predictions into dff, used below to compare against df1.
dff = pd.read_hdf('grid0predictedplaces_.h5','table')

In [23]:
# Preview df1 after the swap.
df1.head()

Unnamed: 0,l1,l2,l3
5,8370753254,9727638738,4120068991
442,1478305117,5003921802,9727638738
2133,7065354365,3642864292,8815983898
2794,8958485237,1376741893,6505057624
3200,5003921802,4012969260,5079685869


In [24]:
# Display the index labels of the rows that were swapped.
idxgrid

Int64Index([   6449,   12495,   14386,   14749,   23091,   28375,   33488,
              34566,   73130,  111133,
            ...
            8551691, 8552146, 8556673, 8559107, 8564420, 8565280, 8568104,
            8576651, 8596957, 8602382],
           dtype='int64', length=1044)

In [27]:
# Row 3200 in the untouched reload.
dff.loc[3200]

l1    5003921802
l2    4012969260
l3    5079685869
Name: 3200, dtype: int64

In [28]:
# Same row in the modified frame. Row 3200 has diff12 of about 0.72 in the
# df3.head() output, so it is not selected and is expected to match dff.
df1.loc[3200]

l1    5003921802
l2    4012969260
l3    5079685869
Name: 3200, dtype: int64

In [29]:
# Persist the modified predictions to a new HDF5 file under key 'table'.
df1.to_hdf('grid0predictedplacesreplaced__grid1.h5','table')