In [1]:
%load_ext autoreload 

import numpy as np
import pandas as pd
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
from torch.utils.data import Dataset, DataLoader

from collections import defaultdict
from joblib import Parallel, delayed, pool
from multiprocessing import cpu_count
from IPython.display import clear_output
import matplotlib.pyplot as plt
from itertools import combinations
from tqdm import tqdm_notebook

from sklearn.multioutput import RegressorChain, MultiOutputRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.manifold import Isomap, LocallyLinearEmbedding, TSNE
from sklearn.metrics import make_scorer, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from lightgbm import LGBMRegressor
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
from umap import UMAP

from utils import load_dataset
# Number of CPUs reported by multiprocessing.cpu_count().
N_CPU = cpu_count()
import warnings
# Install a global filter that ignores every warning raised from here on.
warnings.filterwarnings("ignore")
%autoreload 2

# Calculate distances

In [None]:
# Output directory handed to compute_pdist_l1 / compute_pdist_l2 /
# compute_pdist_js, which save their distance matrices under it.
distance_path = 'distances'

In [None]:
def compute_pdist_l1(data, name, distance_path):
    """Compute the pairwise L1 (Minkowski, p=1) distance matrix and save it.

    Parameters
    ----------
    data : array-like, 2-D
        Input accepted by ``scipy.spatial.distance.pdist``; distances are
        taken between rows.
    name : str
        Label used in the output file name ``orig_L1_<name>``.
    distance_path : str
        Directory the matrix is written to. It is created when missing.

    Returns
    -------
    None
        The square matrix is only written to disk via ``np.save``.
    """
    # Local import keeps the function self-contained: the notebook's import
    # cell does not bring these names into scope.
    from scipy.spatial.distance import pdist, squareform

    # pdist does not modify its input, so no defensive copy is taken.
    S = squareform(pdist(data, metric='minkowski', p=1))

    # np.save does not create missing directories.
    if distance_path:
        os.makedirs(distance_path, exist_ok=True)
    path = os.path.join(distance_path, f'orig_L1_{name}')
    np.save(path, S)  # np.save appends the ".npy" extension
    
# One task per dataset. The worker count is capped at N_CPU and floored at 1:
# joblib rejects n_jobs=0, which len() of an empty data_orig would produce.
_ = Parallel(n_jobs=max(1, min(N_CPU, len(data_orig))))(
    delayed(compute_pdist_l1)(data, name, distance_path)
    for name, data in data_orig.items()
)

In [None]:

def compute_pdist_l2(data, name, distance_path):
    """Compute the pairwise L2 (Minkowski, p=2) distance matrix and save it.

    Parameters
    ----------
    data : array-like, 2-D
        Input accepted by ``scipy.spatial.distance.pdist``; distances are
        taken between rows.
    name : str
        Label used in the output file name ``orig_L2_<name>``.
    distance_path : str
        Directory the matrix is written to. It is created when missing.

    Returns
    -------
    None
        The square matrix is only written to disk via ``np.save``.
    """
    # Local import keeps the function self-contained: the notebook's import
    # cell does not bring these names into scope.
    from scipy.spatial.distance import pdist, squareform

    # pdist does not modify its input, so no defensive copy is taken.
    S = squareform(pdist(data, metric='minkowski', p=2))

    # np.save does not create missing directories.
    if distance_path:
        os.makedirs(distance_path, exist_ok=True)
    path = os.path.join(distance_path, f'orig_L2_{name}')
    np.save(path, S)  # np.save appends the ".npy" extension
    
# One task per dataset. The worker count is capped at N_CPU and floored at 1:
# joblib rejects n_jobs=0, which len() of an empty data_orig would produce.
_ = Parallel(n_jobs=max(1, min(N_CPU, len(data_orig))))(
    delayed(compute_pdist_l2)(data, name, distance_path)
    for name, data in data_orig.items()
)

In [None]:
def js(p, q):
    """Jensen-Shannon divergence between two discrete distributions (in nats).

    JS(p, q) = (KL(p || m) + KL(q || m)) / 2,  where  m = (p + q) / 2.

    The previous body averaged KL(p || q) and KL(q || p), which is the
    symmetrised KL divergence, not Jensen-Shannon: it is unbounded and, for
    vectors with disjoint support, its value is dictated by EPS.

    Parameters
    ----------
    p, q : array-like
        Vectors of the same shape. They are used as given; no normalisation
        is applied here.

    Returns
    -------
    float
        The divergence; 0 for identical inputs and at most log(2) when both
        inputs sum to 1.
    """
    # EPS keeps log() finite where an entry is zero; such terms are multiplied
    # by the zero entry and therefore contribute nothing.
    EPS = 1e-10
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    m = (p + q) / 2
    dkl_pm = np.sum(p * np.log((p + EPS) / (m + EPS)))
    dkl_qm = np.sum(q * np.log((q + EPS) / (m + EPS)))
    return (dkl_pm + dkl_qm) / 2

def compute_pdist_js(data, name, distance_path):
    """Build the pairwise matrix of ``js(row_i, row_j)`` values and save it.

    Parameters
    ----------
    data : array-like, 2-D
        Converted with ``np.asarray`` so that ``rows[i]`` always selects
        row ``i`` (on a DataFrame, ``data[i]`` would select a column).
    name : str
        Label used in the output file name ``orig_JS_<name>``.
    distance_path : str
        Directory the matrix is written to. It is created when missing.

    Returns
    -------
    None
        The square matrix is only written to disk via ``np.save``.
    """
    rows = np.asarray(data)
    N = rows.shape[0]

    # Only the upper triangle is evaluated and then mirrored; the diagonal
    # keeps its initial value of 0.
    S = np.zeros((N, N))
    for i in range(N):
        for j in range(i + 1, N):
            d = js(rows[i], rows[j])
            S[i, j] = d
            S[j, i] = d

    # np.save does not create missing directories.
    if distance_path:
        os.makedirs(distance_path, exist_ok=True)
    path = os.path.join(distance_path, f'orig_JS_{name}')
    np.save(path, S)  # np.save appends the ".npy" extension
    
# Only build the JS matrices when that distance was requested.
if 'JS' in distances_names:
    # One task per dataset. The worker count is capped at N_CPU and floored
    # at 1: joblib rejects n_jobs=0, which an empty data_orig would produce.
    _ = Parallel(n_jobs=max(1, min(N_CPU, len(data_orig))))(
        delayed(compute_pdist_js)(data, name, distance_path)
        for name, data in data_orig.items()
    )

In [None]:
def compute_pdist_bc(data, name, distance_path='./distances_processed'):
    """Compute the pairwise Bray-Curtis distance matrix and save it.

    Parameters
    ----------
    data : array-like, 2-D
        Input accepted by ``scipy.spatial.distance.pdist``; distances are
        taken between rows.
    name : str
        Label used in the output file name ``orig_BC_<name>``.
    distance_path : str, optional
        Directory the matrix is written to. It is created when missing.
        Defaults to ``'./distances_processed'``, the location that was
        previously hard-coded, so existing two-argument calls are unaffected.

    Returns
    -------
    None
        The square matrix is only written to disk via ``np.save``.
    """
    # Local import keeps the function self-contained: the notebook's import
    # cell does not bring these names into scope.
    from scipy.spatial.distance import pdist, squareform

    # Same approach as the L1/L2 helpers: pdist evaluates every row pair once
    # and squareform expands the result to a symmetric matrix with a zero
    # diagonal, replacing the explicit Python double loop.
    S = squareform(pdist(np.asarray(data), metric='braycurtis'))

    # np.save does not create missing directories.
    if distance_path:
        os.makedirs(distance_path, exist_ok=True)
    path = os.path.join(distance_path, f'orig_BC_{name}')
    np.save(path, S)  # np.save appends the ".npy" extension
    
# Only build the Bray-Curtis matrices when that distance was requested.
if 'BC' in distances_names:
    # One task per dataset. The worker count is capped at N_CPU and floored
    # at 1: joblib rejects n_jobs=0, which an empty data_orig would produce.
    _ = Parallel(n_jobs=max(1, min(N_CPU, len(data_orig))))(
        delayed(compute_pdist_bc)(data, name)
        for name, data in data_orig.items()
    )