In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import *

In [2]:
gel_encoded_vc = load_data('../../data/vehicle_claims/gel_encoded.csv')
gel_encoded_vc_10 = load_data('../../data/vehicle_claims/gel_encoded_10.csv')

In [3]:
gel_encoded_vi = load_data('../../data/vehicle_insurance/gel_encoded.csv')
gel_encoded_vi_10 = load_data('../../data/vehicle_insurance/gel_encoded_10.csv')

In [4]:
gel_encoded_ci = load_data('../../data/car_insurance/gel_encoded.csv')
gel_encoded_ci_10 = load_data('../../data/car_insurance/gel_encoded_10.csv')

**Reduce Dimension**

In [5]:
from sklearn import decomposition

In [6]:
pca = decomposition.PCA(n_components=1)
pca.fit(gel_encoded_ci)
low_dim_ci = pca.transform(gel_encoded_ci)

In [7]:
pca.fit(gel_encoded_vi)
low_dim_vi = pca.transform(gel_encoded_vi)

In [8]:
pca.fit(gel_encoded_vc)
low_dim_vc = pca.transform(gel_encoded_vc)

In [9]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()


In [10]:
ci_scaled = min_max_scaler.fit_transform(low_dim_ci)
vi_scaled = min_max_scaler.fit_transform(low_dim_vi)
vc_scaled = min_max_scaler.fit_transform(low_dim_vc)

**GMM**

In [11]:
from sklearn.mixture import GaussianMixture

In [38]:
gm_ci = GaussianMixture(n_components=1, random_state=0).fit(ci_scaled)
gm_vi = GaussianMixture(n_components=1, random_state=0).fit(vi_scaled)
gm_vc = GaussianMixture(n_components=1, random_state=0).fit(vc_scaled)



In [39]:
gm_ci.means_, gm_ci.covariances_

(array([[0.47742685]]), array([[[0.05618719]]]))

In [40]:
gm_vi.means_, gm_vi.covariances_

(array([[0.41652024]]), array([[[0.02994736]]]))

In [41]:
gm_vc.means_, gm_vc.covariances_

(array([[0.34922397]]), array([[[0.145115]]]))

**KS Test**

In [16]:
from scipy import stats

In [17]:
avg_ks = 0
for i in range(int(len(vi_scaled)/1000)-1):
    vi_data = vi_scaled[1000*(i):1000*(i+1),0]
    ks_test = stats.kstest(ci_scaled[:,0], vi_data)
    avg_ks += ks_test.statistic
avg_ks/i

0.23992307692307693

In [18]:
avg_ks = 0
for i in range(int(len(vc_scaled)/1000)-1):
    vc_data = vc_scaled[1000*(i):1000*(i+1),0]
    ks_test = stats.kstest(ci_scaled[:,0], vc_data)
    avg_ks += ks_test.statistic
avg_ks/i

0.46749624060150474

In [19]:
avg_ks = 0
for i in range(int(len(vc_scaled)/10000)-1):
    vc_data = vc_scaled[10000*(i):10000*(i+1),0]
    ks_test = stats.kstest(vi_scaled[:10000,0], vc_data)
    avg_ks += ks_test.statistic
avg_ks/i

0.46031249999999996

**Minkowski Distance**

In [20]:
from scipy.spatial import distance

In [21]:
avg_ms = 0
for i in range(int(len(vi_scaled)/1000)-1):
    vi_data = vi_scaled[1000*(i):1000*(i+1),0]
    ms = distance.minkowski(ci_scaled[:,0], vi_data)
    avg_ms += ms
avg_ms/i

10.21748694832737

In [22]:
avg_ms = 0
for i in range(int(len(vc_scaled)/1000)-1):
    vc_data = vc_scaled[1000*(i):1000*(i+1),0]
    ms = distance.minkowski(ci_scaled[:,0], vc_data)
    avg_ms += ms
avg_ms/i

14.750836083074313

In [80]:
avg_ms = 0
ms1 = 0
for i in range(int(len(vc_scaled)/1000)-1):
    for j in range(int(len(vi_scaled)/1000)-1):
        vc_data = vc_scaled[1000*(i):1000*(i+1),0]
        ms = distance.minkowski(vi_scaled[1000*(j):1000*(j+1),0], vc_data)
        ms1 += (ms)
    avg_ms += ms1
avg_ms / (i*1000)

24.787516144931924

**KL Divergence**

In [44]:
def kld_gauss(u1, v1, u2, v2):
  s1 = np.sqrt(v1)
  s2 = np.sqrt(v2)
  a = np.log(s2/s1) 
  num = v1 + (u1 - u2)**2
  den = 2 * v2
  b = num / den
  return a + b - 0.5

In [47]:
ci_vi = kld_gauss(0.47742685,0.05618719, 0.41652024, 0.02994736)
ci_vi

0.18541100284229706

In [48]:
ci_vc = kld_gauss(0.47742685,0.05618719, 0.34922397,0.145115)
ci_vc

0.224645131723878

In [49]:
vi_vc = kld_gauss(0.41652024, 0.02994736, 0.34922397,0.145115)
vi_vc

0.40783173568479025