In [1]:
import warnings
import os
import numpy as np
warnings.filterwarnings('ignore')
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from random import sample,seed
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
DATA_PATH="./cmake-build-debug/"

In [3]:
%%time
X_train=np.genfromtxt(f'{DATA_PATH}train.csv',delimiter=',',skip_header=1,dtype={'names': ('x', 'y', 'accuracy', 'timestamp','place_id'),
                            'formats': ('f8','f8','i8','i8','i8')})
print(X_train.shape)

(5448512,)
CPU times: user 32.9 s, sys: 2.99 s, total: 35.9 s
Wall time: 36.2 s


In [4]:
X_train

array([(1.5024, 2.315 ,  65, 322233, 2479523047),
       (3.1889, 2.7848,  17, 490777, 9183383116),
       (0.0893, 3.7845,  62, 254915, 6838194081), ...,
       (6.8051, 1.997 , 172, 118685, 9476803556),
       (6.7918, 8.415 ,   8, 337603, 8362160962),
       (3.3328, 0.7075,  20,  27079, 2514647860)],
      dtype=[('x', '<f8'), ('y', '<f8'), ('accuracy', '<i8'), ('timestamp', '<i8'), ('place_id', '<i8')])

In [5]:
train_y = X_train['place_id']

In [6]:
train_y

array([2479523047, 9183383116, 6838194081, ..., 9476803556, 8362160962,
       2514647860])

In [7]:
features = ['x','y','accuracy','timestamp','place_id']

train_data = X_train[features].copy()
train_data = train_data.view(('<f8', len(train_data.dtype.names)))
train_data = np.delete(train_data, 4, 1)  # delete 5th column 
train_data

array([[1.502400e+000, 2.315000e+000, 3.211427e-322, 1.592043e-318],
       [3.188900e+000, 2.784800e+000, 8.399116e-323, 2.424761e-318],
       [8.930000e-002, 3.784500e+000, 3.063207e-322, 1.259447e-318],
       ...,
       [6.805100e+000, 1.997000e+000, 8.497929e-322, 5.863818e-319],
       [6.791800e+000, 8.415000e+000, 3.952525e-323, 1.667980e-318],
       [3.332800e+000, 7.075000e-001, 9.881313e-323, 1.337880e-319]])

In [8]:
del X_train

In [15]:
np.save('train.npy', train_data)
np.save('y_train.npy', train_y)

# Preprocessing

In [None]:
size = 40000
ixs=sample(range(0, len(train_y)), size)

In [None]:
X_train=train_data[ixs]
# df_train = pd.DataFrame(train_data[ixs],columns=['X','Y','acc','time'])
y_train = train_y[ixs]
# df_train.head()

In [None]:
target = np.log(df_train.acc)
name='log skew = %s'%target.skew()
print(name)
# A value closer to 0 means that we have improved the skewness of the data.

In [None]:
plt.subplot(height, width, 2)
name='Acc log skew = %s'%target.skew()
plt.hist(target)
plt.title(name)

plt.show()

In [None]:
target = np.log(df_train.time)
name='log skew = %s'%target.skew()
print(name)
# A value closer to 0 means that we have improved the skewness of the data.

In [None]:
name='Time log skew = %s'%target.skew()
plt.hist(target)
plt.title(name)

plt.show()

In [None]:
# df_train['target']=target
# df_train.drop(['time'],axis=1,inplace=True)
# cross_val_score(model,df_train,y_train,cv=5).mean()

In [None]:
radial_r = np.sqrt( np.power(X_train[:,0],2) + np.power(X_train[:,1],2) )
X_train = np.delete(X_train, 0, 1)  # delete 1th column 
X_train = np.delete(X_train, 0, 1)  # delete 1th column 
X_train = np.insert(X_train, 0, radial_r, axis=1)

In [None]:
cross_val_score(model,X_train,y_train,cv=5).mean()

In [None]:
cross_val_score(model,X_train,y_train,cv=5).mean()

# Model selection

In [9]:
model = GaussianNB()

In [12]:
train_data.shape, train_y.shape

((5448512, 4), (5448512,))

# Training

In [13]:
%%time

for i in range(10):
    print(f'epoch {i}')
    model.partial_fit(train_data, train_y,np.unique(train_y))

epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9


In [16]:
import pickle
pickle.dump(model, open("gaus", 'wb'))

# Prediction

In [17]:
%%time
X_test=np.genfromtxt(f'{DATA_PATH}test.csv',delimiter=',',skip_header=1,dtype={'names': ('x', 'y', 'accuracy', 'timestamp'),
                            'formats': ('f8','f8','i8','i8')})
print(X_test.shape)

(5454629,)
CPU times: user 29.3 s, sys: 2.62 s, total: 31.9 s
Wall time: 32.2 s


In [18]:
X_test

array([(3.7816, 3.0745, 25, 419892), (4.7889, 1.3346, 74, 649655),
       (0.3188, 4.5195, 61, 645038), ..., (9.2777, 8.0165, 77, 492692),
       (1.4072, 5.8837, 66, 567104), (8.7906, 3.0338, 65, 644275)],
      dtype=[('x', '<f8'), ('y', '<f8'), ('accuracy', '<i8'), ('timestamp', '<i8')])

In [19]:
features = ['x','y','accuracy','timestamp']

test_data = X_test[features].copy()
test_data = test_data.view(('<f8', len(test_data.dtype.names)))

In [20]:
del X_test

In [21]:
np.save('test.npy', test_data)

In [None]:
n=len(test_data)
k = 17539
j=int(n/k)
print(f'split {n} into {j} chunks of {k} size')

In [None]:
samples=[test_data[k*i:k*(i+1)]for i in range(j)]

In [None]:
samples[0]

In [None]:
ans=[]
i=1
for epoch in samples:
    print(f'epoch {i}')
    i+=1
    print(np.argsort(model.predict_proba(epoch), axis=1)[-5:])
    break

In [None]:
with open('prediction.csv', 'w') as out:
    print('id,place_id', file=out)
    for pair in enumerate(ans, 1):
        print('%i,%i' % pair, file=out)

In [None]:
ans