In [1]:
# coding: utf-8
__author__ = 'Sandro Vega Pons : https://www.kaggle.com/svpons'

'''Partially based on grid_plus_classifier script:
https://www.kaggle.com/svpons/facebook-v-predicting-check-ins/grid-plus-classifier
'''

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier


def prepare_data(df, n_cell_x, n_cell_y):
    """
    Feature engineering and computation of the grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``x``, ``y`` and ``time``; every other
        column is passed through untouched. The frame itself is NOT modified.
    n_cell_x, n_cell_y : int
        Number of grid cells along x and y. The cell size is ``10. / n_cell``,
        i.e. the coordinates are treated as lying in [0, 10].

    Returns
    -------
    pandas.DataFrame
        A new frame without ``time``, with ``x`` / ``y`` rescaled by the
        feature weights and the extra columns ``grid_cell``, ``hour``,
        ``weekday``, ``day``, ``month`` and ``year``.
    """
    fw = [500, 1000, 4, 3, 1./22., 2, 10] #feature weights (black magic here)

    x = df.x.values
    y = df.y.values

    #Creating the grid
    size_x = 10. / n_cell_x
    size_y = 10. / n_cell_y
    eps = 0.00001
    # Shift by eps so that a point lying exactly on the upper border (10.0)
    # falls into the last cell instead of one past it.
    xs = np.where(x < eps, 0, x - eps)
    ys = np.where(y < eps, 0, y - eps)
    # np.int was removed in NumPy 1.24; use an explicit 64-bit integer.
    pos_x = (xs / size_x).astype(np.int64)
    pos_y = (ys / size_y).astype(np.int64)
    # int16 keeps the column small, but it would wrap around on grids with
    # more than 32768 cells, so fall back to int32 for those.
    n_cells = n_cell_x * n_cell_y
    cell_dtype = np.int16 if n_cells <= np.iinfo(np.int16).max + 1 else np.int32

    #Timestamps: ``time`` is a number of minutes added to the initial date
    initial_date = np.datetime64('2014-01-01T01:01')
    minutes = df.time.values.astype(np.int64).astype('timedelta64[m]')
    d_times = pd.DatetimeIndex((initial_date + minutes).astype('datetime64[ns]'))

    #Feature engineering (on a new frame, so the caller's df stays as it was
    #and calling this function twice does not rescale x / y twice)
    out = df.drop(['time'], axis=1)
    out['x'] = x * fw[0]
    out['y'] = y * fw[1]
    out['grid_cell'] = (pos_y * n_cell_x + pos_x).astype(cell_dtype)
    out['hour'] = (np.asarray(d_times.hour) * fw[2]).astype(np.int8)
    out['weekday'] = (np.asarray(d_times.weekday) * fw[3]).astype(np.int8)
    out['day'] = (np.asarray(d_times.dayofyear) * fw[4]).astype(np.int8)
    out['month'] = (np.asarray(d_times.month) * fw[5]).astype(np.int8)
    out['year'] = ((np.asarray(d_times.year) - 2013) * fw[6]).astype(np.int8)
    return out
    

def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows with the columns ``grid_cell`` and ``place_id`` plus
        the feature columns.
    df_test : pandas.DataFrame
        Test rows with ``grid_cell`` plus the same feature columns
        (everything except ``grid_cell`` is fed to the classifier).
    grid_id : int
        Id of the grid cell to process.
    th : int
        Places with fewer than ``th`` training rows inside the cell are
        discarded before fitting.

    Returns
    -------
    pred_labels : numpy.ndarray, shape (n_test, 3)
        Place ids of the three most probable places, best first. Columns
        that cannot be filled (fewer than three candidate places, or no
        training data at all) are set to 0.
    row_ids : pandas.Index
        Index labels of the test rows that belong to this cell.
    y_pred : numpy.ndarray, shape (n_test, n_places)
        Class probabilities returned by the classifier; it has zero columns
        when nothing could be fitted or predicted.
    """
    #Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index
    n_test = df_cell_test.shape[0]

    # Nothing to fit or nothing to predict: return an all-zero block instead
    # of letting the classifier raise on empty input.
    if df_cell_train.shape[0] == 0 or n_test == 0:
        return np.zeros((n_test, 3), dtype=np.int64), row_ids, np.zeros((n_test, 0))

    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    #Applying the classifier
    # n_neighbors must not exceed the number of training samples.
    clf = KNeighborsClassifier(n_neighbors=min(25, X.shape[0]),
                               weights='distance', metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # Column indices of the (up to) three largest probabilities, best first.
    top = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    # Index classes_ directly: LabelEncoder.inverse_transform only accepts
    # 1-D input in current scikit-learn releases.
    pred_labels = le.classes_[top]

    # Pad to three columns so the caller can always store the result.
    if pred_labels.shape[1] < 3:
        pad = np.zeros((n_test, 3 - pred_labels.shape[1]), dtype=pred_labels.dtype)
        pred_labels = np.hstack([pred_labels, pad])

    return pred_labels, row_ids, y_pred
    
    
def process_grid(df_train, df_test, th, n_cells, cell_ids=None):
    """
    Iterates over all grid cells, aggregates the results and makes the
    submission.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames handed to ``process_one_cell``. ``df_test`` index labels are
        used as row positions into the prediction matrix.
    th : int
        Minimum number of training rows a place needs inside a cell.
    n_cells : int
        Total number of grid cells; cells ``0 .. n_cells - 1`` are processed.
    cell_ids : iterable of int, optional
        Restrict the run to these cells (e.g. ``[0]`` for a quick experiment
        on the first cell). Defaults to every cell.

    Returns
    -------
    y_pred, pred_labels :
        Outputs of ``process_one_cell`` for the LAST processed cell
        (``None`` when no cell was processed).
    preds : numpy.ndarray, shape (n_test, 3)
        Top-3 place ids for every test row; rows of unprocessed cells stay 0.

    Side effects
    ------------
    Writes the submission to ``sub_grid.csv`` in the working directory.
    """
    # Place ids are 10-digit numbers, so force 64-bit storage
    # (plain ``int`` can be 32-bit depending on platform / NumPy version).
    preds = np.zeros((df_test.shape[0], 3), dtype=np.int64)
    y_pred, pred_labels = None, None

    if cell_ids is None:
        cell_ids = range(n_cells)

    for g_id in cell_ids:
        if g_id % 100 == 0:
            print('iter: %s' %(g_id))

        #Applying classifier to one grid cell
        pred_labels, row_ids, y_pred = process_one_cell(df_train, df_test, g_id, th)

        #Updating predictions
        preds[row_ids] = pred_labels

    print('Generating submission file ...')
    #Auxiliary dataframe with the 3 best predictions for each sample
    df_aux = pd.DataFrame(preds, dtype=str, columns=['l1', 'l2', 'l3'])

    #Concatenating the 3 predictions for each sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')

    #Writing to csv
    ds_sub.name = 'place_id'
    ds_sub.to_csv('sub_grid.csv', index=True, header=True, index_label='row_id')

    return y_pred, pred_labels, preds
      

In [2]:
"""
"""
# Load two tables, each stored under the key 'table'.
# NOTE(review): judging by the file names and the previews shown below, df1
# presumably holds the top-3 predicted place ids and df2 the matching
# probabilities for grid cell 0 -- confirm how these files were produced.
print('Loading data ...')
df1 = pd.read_hdf('grid0predictedplaces_.h5','table')
df2 = pd.read_hdf('grid0predictedprobability_.h5','table')

Loading data ...


In [3]:
# Preview the first rows of df1 (columns l1, l2, l3).
df1.head()

Unnamed: 0,l1,l2,l3
5,8370753254,9727638738,4120068991
442,1478305117,5003921802,9727638738
2133,7065354365,3642864292,8815983898
2794,8958485237,1376741893,6505057624
3200,5003921802,4012969260,5079685869


In [4]:
# Preview the first rows of df2 (columns l1, l2, l3).
df2.head()

Unnamed: 0,l1,l2,l3
5,0.933578,0.066422,0.0
442,0.64942,0.135856,0.037749
2133,0.290107,0.132806,0.114269
2794,0.303156,0.196654,0.116576
3200,0.826281,0.103805,0.039152


In [5]:
# Margins between consecutively ranked values: l1 minus l2 and l2 minus l3.
# Both columns are added to df2 in place.
df2['diff12'] = df2['l1'].sub(df2['l2'])
df2['diff23'] = df2['l2'].sub(df2['l3'])

In [6]:
# Preview df2 again, now including the diff12 / diff23 columns.
df2.head()

Unnamed: 0,l1,l2,l3,diff12,diff23
5,0.933578,0.066422,0.0,0.867156,0.066422
442,0.64942,0.135856,0.037749,0.513564,0.098107
2133,0.290107,0.132806,0.114269,0.157301,0.018537
2794,0.303156,0.196654,0.116576,0.106503,0.080078
3200,0.826281,0.103805,0.039152,0.722476,0.064653


In [12]:
# Load the training table stored under the key 'table'.
# NOTE(review): the cells below read its `grid_cell` and `place_id` columns,
# so this file presumably holds already-prepared data -- confirm.
train_orig = pd.read_hdf('train.h5','table')

In [13]:
# Training rows of grid cell 0, restricted to places that occur at least
# 5 times inside that cell.
df_cell_train = train_orig[train_orig.grid_cell == 0]
place_counts = df_cell_train['place_id'].value_counts()
# Per-row count of the row's own place, turned into a boolean keep-mask.
mask = df_cell_train['place_id'].map(place_counts).ge(5).values
df_cell_train = df_cell_train[mask]

In [18]:
# Exploratory check: container type returned by unique() on the place ids.
type(df_cell_train.place_id.unique())

numpy.ndarray

In [30]:
# One-column frame, indexed by place_id, with the number of rows each place
# has in the filtered grid-cell training set.
df3 = pd.DataFrame(df_cell_train.place_id.value_counts())

In [41]:
# Exploratory check: look up the count of one place id in df3 and inspect
# the type that results from the int16 cast.
type(df3.loc[7123189219].values[0].astype(np.int16))

numpy.int16

In [42]:
def mapper(x):
    """Return the count stored in ``df3`` for place id ``x``.

    ``df3`` is the module-level one-column frame of per-place counts inside
    the grid cell, indexed by place id. A ``KeyError`` is raised when ``x``
    is not in that index.
    """
    # int32 rather than int16: an int16 cast silently wraps around for
    # counts above 32767.
    return df3.loc[x].values[0].astype(np.int32)

In [44]:
# Replace every place id in df1 by the count returned from `mapper`.
# Done column by column with Series.map because DataFrame.applymap is
# deprecated since pandas 2.1.
df4 = df1.apply(lambda col: col.map(mapper))

In [63]:
# Rename the three columns (originally l1, l2, l3) to mark them as
# grid-cell frequencies of the 1st / 2nd / 3rd ranked place.
df4.columns = ['gridFreq1','gridFreq2','gridFreq3']

In [64]:
# Preview the grid-cell frequency table.
df4.head()

Unnamed: 0,gridFreq1,gridFreq2,gridFreq3
5,693,344,74
442,121,78,344
2133,330,456,124
2794,312,309,250
3200,78,45,8


In [57]:
# One-column frame, indexed by place_id, with the number of rows each place
# has in the whole training table (not restricted to one grid cell).
df5 = pd.DataFrame(train_orig.place_id.value_counts())

In [60]:
def mapper1(x):
    """Return the count stored in ``df5`` for place id ``x``.

    ``df5`` is the module-level one-column frame of per-place counts over
    the whole training table, indexed by place id. A ``KeyError`` is raised
    when ``x`` is not in that index.
    """
    # int32 rather than int16: an int16 cast silently wraps around for
    # counts above 32767, which whole-dataset counts can plausibly reach.
    return df5.loc[x].values[0].astype(np.int32)

In [61]:
# Replace every place id in df1 by the count returned from `mapper1`.
# Done column by column with Series.map because DataFrame.applymap is
# deprecated since pandas 2.1.
df6 = df1.apply(lambda col: col.map(mapper1))

In [65]:
# Rename the three columns (originally l1, l2, l3) to mark them as
# whole-training-set frequencies of the 1st / 2nd / 3rd ranked place.
df6.columns = ['AllFreq1','AllFreq2','AllFreq3']

In [66]:
# Preview the whole-training-set frequency table.
df6.head()

Unnamed: 0,AllFreq1,AllFreq2,AllFreq3
5,712,351,92
442,121,81,351
2133,346,469,133
2794,322,318,297
3200,81,50,246


In [67]:
# Column-wise concatenation (axis=1) of df2 (l1-l3 plus the diff columns)
# with both frequency tables; rows are aligned on the index.
df7 = pd.concat([df2, df4, df6], axis=1)

In [68]:
# Preview the combined table.
df7.head()

Unnamed: 0,l1,l2,l3,diff12,diff23,gridFreq1,gridFreq2,gridFreq3,AllFreq1,AllFreq2,AllFreq3
5,0.933578,0.066422,0.0,0.867156,0.066422,693,344,74,712,351,92
442,0.64942,0.135856,0.037749,0.513564,0.098107,121,78,344,121,81,351
2133,0.290107,0.132806,0.114269,0.157301,0.018537,330,456,124,346,469,133
2794,0.303156,0.196654,0.116576,0.106503,0.080078,312,309,250,322,318,297
3200,0.826281,0.103805,0.039152,0.722476,0.064653,78,45,8,81,50,246


In [69]:
# Persist the combined table to 'todoAnalysis.h5' under the key 'table'.
df7.to_hdf('todoAnalysis.h5','table')