# TP GEOLOC SIGFOX

## SETUP

In [1]:
%matplotlib inline

import math
import numpy as np
import pandas as pd


from time import time
import matplotlib.pyplot as plt
from geopy.distance import vincenty

from scipy.stats import randint

from sklearn import linear_model, ensemble, svm
from sklearn.model_selection import cross_val_predict, cross_val_score, GridSearchCV, LeaveOneOut, \
    LeaveOneGroupOut, StratifiedShuffleSplit, train_test_split

import warnings
warnings.filterwarnings('ignore')

## PATH

In [2]:
# Path data repository
path_data = 'data/'

## LOAD TRAIN & TEST DATA SET

In [3]:
# X train
df_mess_train = pd.read_csv(path_data + 'mess_train_list.csv')
print("X_train : df_mess_train.shape : ", df_mess_train.shape)

# Y train
pos_train = pd.read_csv(path_data + 'pos_train_list.csv')
print("Y_train : pos_train.shape : ", pos_train.shape)

# X test
df_mess_test = pd.read_csv(path_data + 'mess_test_list.csv')
print("X_test : df_mess_test.shape : ", df_mess_test.shape)

X_train : df_mess_train.shape :  (39250, 8)
Y_train : pos_train.shape :  (39250, 2)
X_test : df_mess_test.shape :  (29286, 8)


In [4]:
did_train = pd.DataFrame(df_mess_train['did'].sort_values(axis=0).unique())
did_test = pd.DataFrame(df_mess_test['did'].sort_values(axis=0).unique())

print("Nombre de did_test dans did_train : ", np.sum(did_test.isin(did_train)))

Nombre de did_test dans did_train :  0    0
dtype: int64


## Split Train / Dev

### Join X_train & y_train (pour garder la correspondance)

In [5]:
train_total = df_mess_train.join(pos_train)
print("train_total.shape : ", len(train_total))

if len(train_total) == len(df_mess_train):
    print("Join ok, train_total : ", len(train_total), " equal df_mess_train : ", len(df_mess_train))
else:
    print("Error join, train_total : ", len(train_total), " not equal df_mess_train : ", len(df_mess_train))

    
train_total.head()

train_total.shape :  39250
Join ok, train_total :  39250  equal df_mess_train :  39250


Unnamed: 0,objid,bsid,did,nseq,rssi,time_ux,bs_lat,bs_lng,lat,lng
0,573bf1d9864fce1a9af8c5c9,2841,473335.0,0.5,-121.5,1463546000000.0,39.617794,-104.954917,39.60669,-104.95849
1,573bf1d9864fce1a9af8c5c9,3526,473335.0,2.0,-125.0,1463546000000.0,39.677251,-104.952721,39.60669,-104.95849
2,573bf3533e952e19126b256a,2605,473335.0,1.0,-134.0,1463547000000.0,39.612745,-105.008827,39.637741,-104.958554
3,573c0cd0f0fe6e735a699b93,2610,473953.0,2.0,-132.0,1463553000000.0,39.797969,-105.07346,39.730417,-104.96894
4,573c0cd0f0fe6e735a699b93,3574,473953.0,1.0,-120.0,1463553000000.0,39.723151,-104.956216,39.730417,-104.96894


### Sort by deviceID

In [6]:
# Sort by deviceID pour s'entrainer sur des devices id different de ceux utilisé pour predire  
train_total = train_total.sort_values(by='did', ascending=True)
train_total.reset_index(drop=True, inplace=True)
train_total.head()

Unnamed: 0,objid,bsid,did,nseq,rssi,time_ux,bs_lat,bs_lng,lat,lng
0,582ae1d712f1434b9ccad421,2189,473288.0,1.0,-127.0,1479205000000.0,39.817654,-105.050984,39.774984,-105.03944
1,582bde7bcf554f7e3a1a7720,3848,473288.0,1.0,-128.666667,1479270000000.0,43.614138,-103.723449,39.774541,-105.039339
2,582bde7bcf554f7e3a1a7720,3501,473288.0,1.5,-136.0,1479270000000.0,43.639226,-103.761023,39.774541,-105.039339
3,582bde7bcf554f7e3a1a7720,2803,473288.0,1.5,-139.5,1479270000000.0,43.657077,-103.75689,39.774541,-105.039339
4,582bde7bcf554f7e3a1a7720,2800,473288.0,1.5,-138.0,1479270000000.0,43.631515,-103.765896,39.774541,-105.039339


### Split 70 / 30

In [7]:
# determine all did
all_did = df_mess_train['did']
unique_did = np.unique(df_mess_train['did'])
nb_did = len(unique_did)

for my_did in unique_did:
    index_start_test = all_did[all_did == my_did].index[0]
    percentage_dev_test = 1 - (index_start_test / len(df_mess_train))
    if percentage_dev_test < 0.31:
        break

print("index_start_test : ", index_start_test)
print("did separateur : ", my_did)
print("percentage_dev_test : ", percentage_dev_test)

index_start_test :  29805
did separateur :  473288.0
percentage_dev_test :  0.240636942675


In [8]:
# Split dev_train & dev_test
X_dev_train, X_dev_test, y_dev_train, y_dev_test = train_test_split(df_mess_train, pos_train, test_size=percentage_dev_test, shuffle=False)

print("Shape X_dev_train : " + str(X_dev_train.shape) + " | Shape X_dev_test : " + str(X_dev_test.shape)) 
print("Shape y_dev_train : " + str(y_dev_train.shape) + " | Shape y_dev_test : " + str(y_dev_test.shape)) 

Shape X_dev_train : (29805, 8) | Shape X_dev_test : (9445, 8)
Shape y_dev_train : (29805, 2) | Shape y_dev_test : (9445, 2)


In [9]:
# Return the last rows of X_dev_train
X_dev_train.tail()

Unnamed: 0,objid,bsid,did,nseq,rssi,time_ux,bs_lat,bs_lng
29800,582aca5b12f1434b9cb450b2,7382,476315.0,1.5,-124.0,1479199000000.0,39.743521,-105.000917
29801,582aca5b12f1434b9cb450b2,8355,476315.0,0.5,-116.5,1479199000000.0,64.3,-68.5
29802,582aca5b12f1434b9cb450b2,8356,476315.0,1.0,-112.0,1479199000000.0,39.757034,-104.976127
29803,582aca5b12f1434b9cb450b2,8397,476315.0,0.5,-130.5,1479199000000.0,39.759396,-105.001415
29804,582aca5b12f1434b9cb450b2,8474,476315.0,1.0,-124.0,1479199000000.0,39.732045,-104.973651


In [10]:
# Return the first rows of y_dev_train
y_dev_train.head()

Unnamed: 0,lat,lng
0,39.60669,-104.95849
1,39.60669,-104.95849
2,39.637741,-104.958554
3,39.730417,-104.96894
4,39.730417,-104.96894


In [11]:
X_dev_test.reset_index(drop=True, inplace=True)
X_dev_test.head()

Unnamed: 0,objid,bsid,did,nseq,rssi,time_ux,bs_lat,bs_lng
0,582ae03712f1434b9cc93a71,1594,473288.0,2.0,-127.0,1479205000000.0,64.3,-68.5
1,582ae03712f1434b9cc93a71,2731,473288.0,1.5,-104.5,1479205000000.0,39.781464,-105.040763
2,582ae03712f1434b9cc93a71,3579,473288.0,2.0,-118.0,1479205000000.0,39.755019,-105.043315
3,582ae03712f1434b9cc93a71,4058,473288.0,0.0,-124.0,1479205000000.0,39.783211,-105.088747
4,582ae03712f1434b9cc93a71,4993,473288.0,1.0,-127.0,1479205000000.0,64.3,-68.5


In [12]:
# Return the first rows of y_dev_test
y_dev_test.reset_index(drop=True, inplace=True)
y_dev_test.head()

Unnamed: 0,lat,lng
0,39.775428,-105.039537
1,39.775428,-105.039537
2,39.775428,-105.039537
3,39.775428,-105.039537
4,39.775428,-105.039537


## Save csv

In [13]:
# Create csv
X_dev_train.to_csv(path_data + 'train_X.csv', index=False)

In [14]:
# Create csv
y_dev_train.to_csv(path_data + 'train_y.csv', index=False)

In [15]:
# Create csv
X_dev_test.to_csv(path_data + 'val_X.csv', index=False)

In [16]:
# Create csv
y_dev_test.to_csv(path_data + 'val_y.csv', index=False)

In [17]:
# Create csv
df_mess_test.to_csv(path_data + 'test_X.csv', index=False)