# Overview
- [GSDC: Position shift](https://www.kaggle.com/wrrosa/gsdc-position-shift)
- Shift the predicted values (予測値をシフトさせる)

In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
from pathlib import Path
import plotly
from plotly import express as px
import optuna
import pyproj
from pyproj import Proj, transform

## utils

In [2]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two sets of points.

    All four inputs are array-like and given in decimal degrees. The
    result is expressed in the same unit as the radius constant used
    below (6,367,000), element-wise for array inputs.
    """
    earth_radius = 6_367_000

    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(half_dphi)**2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2

    return 2 * earth_radius * np.arcsin(hav**0.5)


def compute_dist(oof, gt):
    """Score predictions against ground truth, per phone and overall.

    ``oof`` and ``gt`` are joined on ``phone`` and ``millisSinceGpsEpoch``;
    both must carry ``latDeg`` / ``lngDeg`` columns (they become the
    ``_x`` / ``_y`` suffixed columns after the merge).

    Returns a pair ``(score, per_phone)``:
      * ``score``     - average of the mean per-phone 50th percentile error
                        and the mean per-phone 95th percentile error, as a
                        one-element Series indexed by ``'dst'``.
      * ``per_phone`` - DataFrame with columns ``phone``, ``q50``, ``q95``.
    """
    paired = oof.merge(gt, on=['phone', 'millisSinceGpsEpoch'])
    errors = pd.DataFrame({
        'phone': paired.phone,
        'dst': calc_haversine(paired.latDeg_x, paired.lngDeg_x,
                              paired.latDeg_y, paired.lngDeg_y),
    })

    by_phone = errors.groupby('phone')
    q50 = by_phone.quantile(.50)
    q95 = by_phone.quantile(.95)

    # Per-phone table: one row per phone with both percentiles side by side.
    q50_table = q50.reset_index().rename(columns={'dst': 'q50'})
    q95_table = q95.reset_index().rename(columns={'dst': 'q95'})
    per_phone = q50_table.merge(q95_table)

    overall = (q50.mean() + q95.mean()) / 2
    return overall, per_phone


def WGS84_to_ECEF(lat, lon, alt):
    """Convert geodetic WGS84 coordinates to Earth-centred Cartesian (ECEF).

    ``lat`` and ``lon`` are in decimal degrees, ``alt`` is the height above
    the ellipsoid in the same unit as the semi-major axis below. Scalars
    and NumPy-compatible arrays are both accepted.

    Returns the tuple ``(x, y, z)``.
    """
    deg_to_rad = np.pi / 180.0
    phi = lat * deg_to_rad
    lam = lon * deg_to_rad

    # WGS84 ellipsoid: semi-major axis and inverse flattening.
    semi_major = 6378137.0
    inverse_flattening = 298.257223563
    flattening = 1 / inverse_flattening
    # Squared first eccentricity, e^2 = 1 - (1 - f)^2.
    ecc_sq = 1 - (1 - flattening) * (1 - flattening)

    sin_phi = np.sin(phi)
    cos_phi = np.cos(phi)

    # Radius of curvature in the prime vertical.
    prime_vertical = semi_major / np.sqrt(1 - ecc_sq * sin_phi * sin_phi)

    # Distance from the rotation axis, shared by the x and y components.
    axis_dist = (prime_vertical + alt) * cos_phi
    x = axis_dist * np.cos(lam)
    y = axis_dist * np.sin(lam)
    z = (prime_vertical * (1 - ecc_sq) + alt) * sin_phi
    return x, y, z


# Geocentric (ECEF) -> longitude/latitude transformer on the WGS84 ellipsoid.
# Built once at module level so ECEF_to_WGS84 can reuse it on every call.
_WGS84_ELLIPSOID = {"ellps":'WGS84', "datum":'WGS84'}
transformer = pyproj.Transformer.from_crs(
    {"proj":'geocent', **_WGS84_ELLIPSOID},
    {"proj":'latlong', **_WGS84_ELLIPSOID},
)


def ECEF_to_WGS84(x,y,z):
    """Convert ECEF Cartesian coordinates back to WGS84 via ``transformer``.

    Returns ``(lon, lat, alt)`` - note that longitude comes first. Angles
    are requested in degrees (``radians=False``).
    """
    converted = transformer.transform(x, y, z, radians=False)
    longitude, latitude, height = converted
    return longitude, latitude, height

In [3]:
# Competition input layout: <datadir>/train and <datadir>/test.
datadir = Path('../input/google-smartphone-decimeter-challenge/')
testdir, traindir = datadir / 'test', datadir / 'train'

# The sample submission defines the column set a submission file must have.
sample_sub = pd.read_csv(datadir.joinpath('sample_submission.csv'))
sub_columns = sample_sub.columns

# Baseline position estimates shipped with the competition data.
baseline_train = pd.read_csv(datadir.joinpath('baseline_locations_train.csv'))
baseline_test = pd.read_csv(datadir.joinpath('baseline_locations_test.csv'))

In [6]:
# Name of the timestamp column; used as a sort key elsewhere in the notebook.
msge = 'millisSinceGpsEpoch'

# Gather every train/<collection>/<phone>/ground_truth.csv and stack them in
# a single pd.concat call. DataFrame.append inside a loop copies the frame on
# every iteration and was removed in pandas 2.0; globbing also skips stray
# non-directory entries that os.listdir would trip over. Sorting makes the
# row order reproducible across file systems.
gt = pd.concat(
    [pd.read_csv(path)
     for path in sorted(traindir.glob('*/*/ground_truth.csv'))]
)

# Same "<collection>_<phone>" key that the baseline/submission files use.
gt['phone'] = gt['collectionName'] + '_' + gt['phoneName']
# Bare expression so the notebook shows the height distribution.
gt['heightAboveWgs84EllipsoidM'].describe()

count    131342.000000
mean         87.028847
std          56.999876
min          31.160000
25%          37.160000
50%          63.520000
75%         122.330000
max         247.850000
Name: heightAboveWgs84EllipsoidM, dtype: float64

In [8]:
# Score the unmodified training baseline against ground truth.
# `score` is the overall metric, `scores` the per-phone q50/q95 table.
score, scores = compute_dist(baseline_train, gt)
print(score)
# Bare expression so the notebook renders the per-phone table.
scores

dst    5.287971
dtype: float64


Unnamed: 0,phone,q50,q95
0,2020-05-14-US-MTV-1_Pixel4,1.313621,2.887582
1,2020-05-14-US-MTV-1_Pixel4XLModded,1.987538,4.352359
2,2020-05-14-US-MTV-2_Pixel4,1.367776,2.799897
3,2020-05-14-US-MTV-2_Pixel4XLModded,3.357252,11.367965
4,2020-05-21-US-MTV-1_Pixel4,1.810603,4.670839
...,...,...,...
68,2021-04-29-US-MTV-1_Pixel4,6.027567,16.040323
69,2021-04-29-US-MTV-1_Pixel5,6.595323,15.591238
70,2021-04-29-US-MTV-1_SamsungS20Ultra,6.346009,13.736971
71,2021-04-29-US-SJC-2_Pixel4,4.837470,35.527722


In [None]:
def position_shift(d, a, height=63.5):
    """Slide each predicted position by ``a`` along its direction of travel.

    For every row, the step from the previous row of the same phone is
    computed in ECEF coordinates and the point is replaced by

        prev + (cur - prev) * (1 - a / |cur - prev|)

    i.e. it is pulled ``a`` coordinate units back toward the previous
    point (pushed forward for negative ``a``). Rows with no usable step -
    the first row of each phone, or a zero-length step - keep their
    original ``latDeg`` / ``lngDeg``.

    Parameters
    ----------
    d : DataFrame with ``phone``, ``millisSinceGpsEpoch``, ``latDeg`` and
        ``lngDeg`` columns. It is not modified; a copy is returned.
    a : shift length, in the units of the ECEF coordinates.
    height : ellipsoidal height assigned to every row before conversion.
        The default 63.5 is close to the median ground-truth height shown
        in this notebook's ``describe()`` output.

    Returns
    -------
    A copy of ``d`` sorted by ``phone`` and timestamp, with shifted
    ``latDeg`` / ``lngDeg`` plus the intermediate columns (``x``, ``xp``,
    ``xdiff``, ``xnew``, ... and ``dist``).
    """
    d = d.copy()
    d['heightAboveWgs84EllipsoidM'] = height

    # WGS84_to_ECEF is plain NumPy arithmetic, so whole columns can be
    # converted at once instead of going row by row through apply().
    d['x'], d['y'], d['z'] = WGS84_to_ECEF(
        d['latDeg'], d['lngDeg'], d['heightAboveWgs84EllipsoidM'])

    d.sort_values(['phone', msge], inplace=True)

    # True where the preceding row belongs to the same phone, so that a
    # step is never taken across two different traces.
    same_phone = d['phone'].eq(d['phone'].shift())
    for fi in ['x', 'y', 'z']:
        # Plain d[col] assignment: d[[col]] = <Series> is rejected by recent
        # pandas versions ("Columns must be same length as key").
        d[fi + 'p'] = d[fi].shift().where(same_phone)
        d[fi + 'diff'] = d[fi] - d[fi + 'p']
    d['dist'] = np.sqrt(d['xdiff']**2 + d['ydiff']**2 + d['zdiff']**2)
    for fi in ['x', 'y', 'z']:
        # dist == 0 yields inf/NaN here; those rows are restored below.
        d[fi + 'new'] = d[fi + 'p'] + d[fi + 'diff'] * (1 - a / d['dist'])

    lng, lat, _ = ECEF_to_WGS84(
        d['xnew'].values, d['ynew'].values, d['znew'].values)
    lng = np.asarray(lng, dtype=float)
    lat = np.asarray(lat, dtype=float)

    # Fall back to the original coordinates wherever the shifted point is
    # undefined. Both coordinates are treated together so a row never mixes
    # a shifted latitude with an unshifted longitude, and inf is caught as
    # well as NaN.
    unusable = ~(np.isfinite(lng) & np.isfinite(lat))
    d['latDeg'] = np.where(unusable, d['latDeg'].to_numpy(), lat)
    d['lngDeg'] = np.where(unusable, d['lngDeg'].to_numpy(), lng)

    # d is still in (phone, timestamp) order from the sort above.
    return d


def objective(trial):
    """Optuna objective: score of the training baseline after shifting by ``a``.

    Samples ``a`` from [-1, 1], applies ``position_shift`` to
    ``baseline_train`` and scores the result against ``gt`` with
    ``compute_dist``. Returns the overall score as a plain float.
    """
    # suggest_float supersedes the deprecated suggest_uniform.
    a = trial.suggest_float('a', -1, 1)
    score, _ = compute_dist(position_shift(baseline_train, a), gt)
    # compute_dist yields a one-element Series; hand Optuna an explicit
    # scalar instead of relying on implicit float() coercion. item() also
    # guards against an unexpected multi-value result.
    return float(np.asarray(score).item())


# Search for the shift `a` that gives the lowest score on the training set.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

In [None]:
# Bare expression so the notebook shows the best parameters found by the study.
study.best_params

In [None]:
# Apply the tuned shift. The result is only displayed by the notebook; it is
# not assigned or written to disk here.
# NOTE(review): this shifts `sample_sub`, while `baseline_test` is loaded
# earlier but never used in the visible cells - confirm which frame is meant.
position_shift(sample_sub, a = study.best_params['a'])