#### **Libraries**

In [1]:
import os
import warnings

import numpy as np
import ot
from scipy.io import loadmat
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

#### **Function**

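`regularized_OT_classification` standardizes the source and target features, projects each domain onto its own PCA basis (keeping enough components to explain 90% of the variance, and using the smaller of the two component counts for both domains), computes an entropy-regularized optimal transport plan between the projected samples with the Sinkhorn algorithm, and returns the accuracy of a 1-NN classifier that is fitted on the transported points and evaluated on the projected target samples.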

In [2]:
def _components_for_variance(cumulative_variance, threshold):
    """Return how many leading PCA components reach ``threshold`` cumulative variance."""
    # Index of the first cumulative ratio that is >= threshold, turned into a count.
    count = int(np.searchsorted(cumulative_variance, threshold, side="left")) + 1
    # Floating-point rounding can leave the last cumulative value slightly
    # below the threshold; never request more components than exist.
    return min(count, len(cumulative_variance))


def regularized_OT_classification(S, T, rege, y_target, y_source=None,
                                  variance_threshold=0.90):
    """Score a 1-NN classifier after entropic optimal-transport alignment.

    Pipeline:
      1. z-score ``S`` and ``T`` independently;
      2. fit a PCA per domain and keep ``d`` components, where ``d`` is the
         smaller of the two component counts needed to reach
         ``variance_threshold`` cumulative explained variance;
      3. build the Euclidean cost matrix between projected samples and divide
         it by its maximum;
      4. compute the Sinkhorn coupling ``gamma`` between uniform sample weights;
      5. fit a 1-NN classifier on the transported points, predict the projected
         target samples and compare against ``y_target``.

    Parameters
    ----------
    S, T : array-like, one row per sample
        Source and target feature matrices. They are passed straight to
        ``StandardScaler`` / ``PCA``.
    rege : float
        Entropic regularization strength forwarded to ``ot.sinkhorn``.
    y_target : array-like
        Target labels, one per row of ``T``; used to score the predictions.
    y_source : array-like, optional
        Source labels, one per row of ``S``. When given, the source samples
        are mapped onto the target domain with the barycentric projection
        ``(gamma @ Xt) / a`` and the classifier is trained on
        ``(mapped source, y_source)``, so no target label is used for training.
        When omitted, the legacy behaviour is kept for existing callers: the
        classifier is trained on ``gamma.T @ Xs`` with ``y_target``. That
        variant trains and evaluates on the same target labels, so its
        accuracy is not a domain-adaptation result; a ``UserWarning`` is
        emitted.
    variance_threshold : float, default 0.90
        Fraction of cumulative explained variance each PCA must reach, in
        ``(0, 1]``.

    Returns
    -------
    float
        Accuracy of the 1-NN predictions on the target samples.

    Raises
    ------
    ValueError
        If ``variance_threshold`` is outside ``(0, 1]`` or a label vector does
        not have one entry per sample of its domain.
    """
    if not 0.0 < variance_threshold <= 1.0:
        raise ValueError(
            f"variance_threshold must be in (0, 1], got {variance_threshold!r}"
        )

    # Z-score normalization; each domain gets its own scaler so neither fit
    # overwrites the statistics of the other.
    S_normalized = StandardScaler().fit_transform(S)
    T_normalized = StandardScaler().fit_transform(T)

    if len(y_target) != T_normalized.shape[0]:
        raise ValueError(
            f"y_target has {len(y_target)} labels but T has "
            f"{T_normalized.shape[0]} samples"
        )
    if y_source is not None and len(y_source) != S_normalized.shape[0]:
        raise ValueError(
            f"y_source has {len(y_source)} labels but S has "
            f"{S_normalized.shape[0]} samples"
        )

    # Full PCA per domain, only to read the explained-variance spectrum.
    cumulative_variance_S = PCA().fit(S_normalized).explained_variance_ratio_.cumsum()
    cumulative_variance_T = PCA().fit(T_normalized).explained_variance_ratio_.cumsum()

    ds = _components_for_variance(cumulative_variance_S, variance_threshold)
    dt = _components_for_variance(cumulative_variance_T, variance_threshold)

    # Both projections must share a dimensionality so distances are defined.
    d = min(ds, dt)

    Xs = PCA(n_components=d).fit_transform(S_normalized)
    Xt = PCA(n_components=d).fit_transform(T_normalized)

    # Uniform sample weights (each entry is 1/ns, respectively 1/nt).
    a = np.ones(Xs.shape[0]) / Xs.shape[0]
    b = np.ones(Xt.shape[0]) / Xt.shape[0]

    # Pairwise Euclidean cost, rescaled to [0, 1] so that `rege` is comparable
    # across datasets. Skip the rescaling when every distance is zero to avoid
    # a division by zero.
    M = cdist(Xs, Xt, metric='euclidean')
    max_cost = M.max()
    M_normalized = M / max_cost if max_cost > 0 else M

    # Entropic-regularized transport plan, shape (ns, nt).
    gamma = ot.sinkhorn(a, b, M_normalized, rege)

    if y_source is None:
        warnings.warn(
            "regularized_OT_classification was called without y_source: the "
            "1-NN classifier is trained and evaluated on the same target "
            "labels, so the returned accuracy is optimistic. Pass y_source "
            "to train on transported source samples instead.",
            UserWarning,
            stacklevel=2,
        )
        # Legacy path, kept unchanged for existing callers: one row per target
        # sample, not renormalized by the column marginals of gamma.
        Sa = np.dot(gamma.T, Xs)
        train_labels = y_target
    else:
        # Barycentric mapping of each source sample onto the target domain.
        # Dividing by the row marginals `a` turns each row of gamma into
        # convex weights, so mapped points live at the scale of Xt.
        Sa = np.dot(gamma, Xt) / a[:, None]
        train_labels = y_source

    # 1-NN on the transported points, evaluated on the projected target samples.
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(Sa, train_labels)
    y_pred = knn.predict(Xt)

    accuracy = accuracy_score(y_target, y_pred)

    return accuracy


#### **Application on the Office/Caltech dataset**

In [3]:
# CaffeNet4096 features (4096 per sample); each .mat file is read for its
# 'fts' feature matrix and its 'labels' array.
caffe_dir = 'CaffeNet4096/'

# Source domain: webcam. Labels are flattened to a 1-D vector.
data_webcam = loadmat(os.path.join(caffe_dir, 'webcam.mat'))
X_webcam, y_webcam = data_webcam['fts'], data_webcam['labels'].flatten()

# Target domain: dslr.
data_dslr = loadmat(os.path.join(caffe_dir, 'dslr.mat'))
X_dslr, y_dslr = data_dslr['fts'], data_dslr['labels'].flatten()

# Entropic regularization strength handed to the Sinkhorn solver
rege = 0.01

# webcam -> dslr adaptation, scored against the dslr labels
accuracy = regularized_OT_classification(X_webcam, X_dslr, rege, y_dslr)
print(f'Accuracy on target observations: {accuracy}')

Accuracy on target observations: 0.8726114649681529
