### Import libraries and data

In [1]:
import os
os.chdir("..")

import numpy as np
import numpy.random as npr
import numpy.linalg as npl
import torch
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("data/regression/airfoil.csv")
df.head()

Unnamed: 0,Frequency,AngleAttack,ChordLength,FreeStreamVelocity,SuctionSide,Sound
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


$$ x^2 + y^2 + z^2 = 1 $$

$ \mathbf{x} = x_i, i \in \{1, \dots, n\}$, s.t.
$$ \sum_i x_i ^2 = 1 $$

### We start with a hyperplane centered at 0 and we slowly move it towards the direction of the normal until we  have a 80-20 separation.

The movement of the hyperplane at each step depends on how far away we are from the desired split. The hyperplane equation is given by:

$$ n^T x + c = 0, $$

where $n$ is the normal vector (which we have randomly chosen).

We start from $c=0$ and increase $c$ based on how far away we are from the desired split ratio, in a while loop. At each step we calculate the in-domain:OOD ratio by counting how many points lie below and above the hyperplane respectively:

$$
\mathrm{x} \in \begin{cases}
    \text{in domain,} & \text{if } \ n^T x + c > -1 \\ 
    \text{OOD,} & \text{if } \ n^T x + c <= -1,
\end{cases}
$$

and then update $c$ via the following update rule:

$$ \Delta r = (\frac{\text{OOD}}{N} - r), \\
c = c +  \Delta r \cdot c, $$

where $N$ is the total number of data and $r$ is the desired split ratio.

We use 'dom' and 'OOD' abbreviations for in domain data and Out Of Domain data respectively.

In [2]:
def hyperplane_split(X, OOD_size=0.2, eta=1e-0, eps=1e-3, verbose=0):
    """
    Splits data into train-test datasets according to
    a shifting hyperplane. First, a random direction is chosen
    and then the corresponding hyperplane is used to separate data into
    two datasets: Domain (dom) and Out Of Domain (OOD).
    The hyperplane is moved towards the direction of the random normal until 
    the desired train-test split is achieved.
    
    Arguments
    ---------
    X:  (N, k) np.array
        unlabeled data
    OOD_size: float
        desired in domain / OOD split ratio
    r:  float
        current OOD to total data ratio
    eta: float
        "learning rate", affects how far the hyperplane moves at each step
    eps: float
        threshold for stopping
    
    Returns
    -------
    dom_idx, OOD_idx: (N,) np.arrays, indeces of domain / OOD data
    """
    N = X.shape[0]
    
    # Normalize data
    X = StandardScaler().fit_transform(X)
    
    # Pick random direction and normalize
    # https://en.wikipedia.org/wiki/N-sphere#Uniformly_at_random_on_the_(n_%E2%88%92_1)-sphere
    n = npr.randn(X.shape[1])
    n /= npl.norm(n)
    if verbose:
        print("Found random direction n =", n)
    
    # Loop to calculate the best c
    c = 0       # initial c
    r = 1       # initial ratio r
    while np.abs(r-OOD_size) > eps:
        # the quantity n^T * X + c tells us where the data belongs based on positive or negative sign (similar to SVM)
        signs = np.sign(np.dot(X, n) + c) 

        # Find the ID and OOD data
        OOD = np.sum(signs <= -1)   # points that lie on the hyperplane are counted as ID data
        r = OOD / N                 # new ratio
            
        # Update hyperplane
        dr = r - OOD_size
        c += eta * dr
    
    if verbose:
        print("Found split with ratio r =", r)
    
    # Separate data according to the hyperplane found  
    dom_idx = signs > -1
    OOD_idx = ~(signs > -1)
    
    return dom_idx, OOD_idx



### Apply the split

In [7]:
dom_idx, OOD_idx = hyperplane_split(df.iloc[:, :-1].values, OOD_size=0.2)

D = df[dom_idx].values
OOD = df[OOD_idx].values

# Split 
X, y = D[:,:-1], D[:,-1]
X_OOD, y_OOD = OOD[:,:-1], OOD[:,-1]

# Compare some statistics
print("In-domain means:", np.round(X.mean(axis=0), 5))
print("OOD means:", np.round(X_OOD.mean(axis=0), 5))

print("In-domain sd's:", np.round(X.std(axis=0), 5))
print("OOD sd's:", np.round(X_OOD.std(axis=0), 5))

In-domain means: [3.203068e+03 4.484000e+00 1.560000e-01 5.153200e+01 7.000000e-03]
OOD means: [1.6269702e+03 1.5923510e+01 5.7780000e-02 4.8192380e+01 2.9040000e-02]
In-domain sd's: [3.3571301e+03 3.8568400e+00 9.3080000e-02 1.5664400e+01 8.4500000e-03]
OOD sd's: [1.62027709e+03 3.23473000e+00 3.60500000e-02 1.48801100e+01
 1.32400000e-02]
