# Overview
- [Baseline post-processing by outlier correction](https://www.kaggle.com/dehokanta/baseline-post-processing-by-outlier-correction)
- 外れ値による後処理 (post-processing by outlier correction)

In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import seaborn as sns
from tqdm.notebook import tqdm
from pathlib import Path
import plotly
from plotly import express as px
import simdkalman

# Root folder of the competition data, relative to the notebook's working directory.
INPUT_DIR = Path("..") / "input" / "google-smartphone-decimeter-challenge"

## utils

In [2]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance between two sets of points.

    All four arguments are given in decimal degrees and may be scalars or
    array-likes; they are combined element-wise.

    Returns the distance scaled by the sphere radius used below
    (6,367,000 -- presumably metres, i.e. an Earth radius).
    """
    sphere_radius = 6_367_000

    # Work in radians from here on.
    phi1, lam1, phi2, lam2 = (np.radians(deg) for deg in (lat1, lon1, lat2, lon2))

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dphi)**2 + \
        np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2

    return 2 * sphere_radius * np.arcsin(hav**0.5)

In [3]:
# Baseline predictions (train / test) and the submission template are single CSV files.
base_train, base_test, sample_sub = (
    pd.read_csv(INPUT_DIR / csv_name)
    for csv_name in (
        'baseline_locations_train.csv',
        'baseline_locations_test.csv',
        'sample_submission.csv',
    )
)

# Ground truth is stored as one ground_truth.csv two directory levels below train/.
gt_files = [*INPUT_DIR.glob('train/*/*/ground_truth.csv')]
print('ground_truth.csv count : ', len(gt_files))

# Read every file, then stack them into a single frame (original per-file
# row labels are kept, as pd.concat does by default).
gts = [pd.read_csv(gt_file) for gt_file in tqdm(gt_files)]
ground_truth = pd.concat(gts)

ground_truth.csv count :  73


  0%|          | 0/73 [00:00<?, ?it/s]

In [4]:
# Left-join the baseline predictions onto the ground truth: one row per ground-truth
# sample; overlapping column names get the "_truth" / "_basepred" suffixes.
df_all = pd.merge(
    ground_truth,
    base_train,
    how="left",
    on=["collectionName", "phoneName", "millisSinceGpsEpoch"],
    suffixes=("_truth", "_basepred"),
)

## check outliers in training data

In [5]:
# Haversine distance between the ground-truth position and the baseline prediction, per row.
df_all["dist_base_gt"] = calc_haversine(
    lat1=df_all["latDeg_truth"],
    lon1=df_all["lngDeg_truth"],
    lat2=df_all["latDeg_basepred"],
    lon2=df_all["lngDeg_basepred"],
)
df_all["dist_base_gt"].describe()

count    131342.000000
mean          3.846848
std          30.739767
min           0.001338
25%           1.210976
50%           2.065769
75%           3.560001
max        8340.257976
Name: dist_base_gt, dtype: float64

In [7]:
# The ten rows with the largest baseline error, with the collection they belong to.
(
    df_all[['collectionName', 'dist_base_gt']]
    .sort_values(by='dist_base_gt', ascending=False)
    .head(10)
)

Unnamed: 0,collectionName,dist_base_gt
64035,2020-09-04-US-SF-1,8340.257976
114354,2020-07-17-US-MTV-2,5050.995543
52894,2021-04-26-US-SVL-1,2254.344928
113362,2020-07-17-US-MTV-2,2026.294654
113360,2020-07-17-US-MTV-2,1934.676643
108223,2021-04-29-US-SJC-2,1599.570433
83930,2020-05-29-US-MTV-1,1128.348831
113361,2020-07-17-US-MTV-2,1044.316856
54443,2021-01-05-US-SVL-1,653.703379
74448,2021-04-15-US-MTV-1,549.061548


## correct outliers in test data

In [10]:
# Distance from every baseline fix to the previous ("pre") and next ("pro") fix
# of the same phone.
#
# The shift is done inside each phone group, so the first/last row of one track
# is never paired with a row of a neighbouring track (or with a (0, 0) filler),
# and no per-phone patch-up loop is needed afterwards. This also works when a
# phone's rows are not stored contiguously.
coords_by_phone = base_test.groupby("phone")[["latDeg", "lngDeg"]]

for suffix, periods in (("pre", 1), ("pro", -1)):
    neighbour = coords_by_phone.shift(periods=periods)

    # Neighbour coordinates; NaN where the track has no neighbour on that side.
    base_test[f"latDeg_{suffix}"] = neighbour["latDeg"]
    base_test[f"lngDeg_{suffix}"] = neighbour["lngDeg"]

    # A missing neighbour yields NaN from calc_haversine; the distance at a
    # track boundary is defined as 0, so boundary rows can never exceed the
    # outlier thresholds computed later.
    base_test[f"dist_{suffix}"] = calc_haversine(
        base_test[f"latDeg_{suffix}"], base_test[f"lngDeg_{suffix}"],
        base_test["latDeg"], base_test["lngDeg"],
    ).fillna(0)

In [11]:
# Summary statistics of the distance to the previous fix.
base_test.loc[:, "dist_pre"].describe()

count    91486.000000
mean        16.937410
std         12.526582
min          0.000000
25%          5.200745
50%         14.842604
75%         28.551707
max        391.394578
Name: dist_pre, dtype: float64

In [12]:
# Thresholds: mean + 2 standard deviations of the distance to the next / previous fix.
dist_pro, dist_pre = base_test["dist_pro"], base_test["dist_pre"]
pro_95 = dist_pro.mean() + (dist_pro.std() * 2)
pre_95 = dist_pre.mean() + (dist_pre.std() * 2)

# A row is treated as an outlier only when it is far from BOTH of its neighbours.
ind = base_test[(dist_pro > pro_95) & (dist_pre > pre_95)].index

# Replace each outlier by the midpoint of the rows labelled i-1 and i+1.
# Rows are processed in order, so an already-corrected row i-1 feeds into row i.
# NOTE(review): i-1 / i+1 are index labels -- this relies on base_test having a
# default consecutive integer index; confirm if the frame is ever re-indexed.
for i in ind:
    for coord in ("latDeg", "lngDeg"):
        before = base_test.loc[i-1, coord]
        after = base_test.loc[i+1, coord]
        base_test.loc[i, coord] = (before + after)/2

## kalman filter

In [13]:
# Step used in the transition matrix.
# NOTE(review): presumably seconds between consecutive fixes (1 Hz) -- confirm.
T = 1.0

# Constant-acceleration model on a 6-component state:
# [pos_0, pos_1, vel_0, vel_1, acc_0, acc_1]
state_transition = np.array([
    [1, 0, T, 0, 0.5 * T ** 2, 0],
    [0, 1, 0, T, 0, 0.5 * T ** 2],
    [0, 0, 1, 0, T, 0],
    [0, 0, 0, 1, 0, T],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 0, 0, 1],
])

# Diagonal noise terms plus a small constant (1e-9) added to every entry.
process_noise = np.diag([1e-5, 1e-5, 5e-6, 5e-6, 1e-6, 1e-6]) + np.full((6, 6), 1e-9)

# Only the two position components of the state are observed.
observation_model = np.array([
    [1, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 0, 0],
])
observation_noise = np.diag([5e-5, 5e-5]) + np.full((2, 2), 1e-9)

kf = simdkalman.KalmanFilter(
    state_transition=state_transition,
    process_noise=process_noise,
    observation_model=observation_model,
    observation_noise=observation_noise,
)

In [14]:
def apply_kf_smoothing(df, kf_=kf):
    """Smooth the latDeg / lngDeg columns of every (collectionName, phoneName) track.

    For each distinct collection/phone pair the two coordinate columns are run
    through ``kf_.smooth`` as one series, and the smoothed state means for the
    first two state components are written back into ``df``.

    Parameters:
        df: DataFrame with 'collectionName', 'phoneName', 'latDeg' and 'lngDeg' columns.
        kf_: object exposing ``smooth(data)``; defaults to the module-level ``kf``.

    Returns:
        The same ``df`` object -- it is modified in place, not copied.
    """
    track_keys = df[['collectionName', 'phoneName']].drop_duplicates().to_numpy()
    for collection, phone in tqdm(track_keys):
        in_track = (df['collectionName'] == collection) & (df['phoneName'] == phone)

        # Observations are passed with a leading axis of length 1:
        # (1, number of rows in the track, 2 coordinates).
        observations = df.loc[in_track, ['latDeg', 'lngDeg']].to_numpy()[np.newaxis, :, :]

        state_means = kf_.smooth(observations).states.mean[0]
        df.loc[in_track, 'latDeg'] = state_means[:, 0]
        df.loc[in_track, 'lngDeg'] = state_means[:, 1]
    return df


In [15]:
# Smooth every test track (apply_kf_smoothing writes into base_test and returns it).
kf_smoothed_baseline = apply_kf_smoothing(base_test)

# Copy the smoothed coordinates into the submission template.
# pandas aligns the assigned Series on the row index.
sample_sub = sample_sub.assign(**{
    coord: kf_smoothed_baseline[coord] for coord in ("latDeg", "lngDeg")
})
# Writing the file is left disabled, as in the original cell:
# sample_sub.to_csv('submission.csv', index=False)

  0%|          | 0/48 [00:00<?, ?it/s]