В этом ноутбуке подбираются веса для комбинации эмбеддингов разных слоев. Веса вычисляются на основе качества каждого слоя на обучающей выборке и значений метрик сходства текстовых репрезентаций между слоями.

В ячейке ниже нужно указать пути к csv-файлам, полученным в ноутбуке №1, а также путь к csv-файлу, в который будут записаны полученные веса.

Кроме того, можно указать номера слоев (нумерация с 0), которые нужно использовать в ансамбле. По умолчанию веса подбираются для всех слоев (с 0 по 11).

In [None]:
# Placeholders (`...`) to be replaced with real paths before running the notebook.
# CSV with per-layer quality on the train set (loaded into `embeds_accuracy`).
path_train_quality = ...
# CSV with pairwise layer-similarity metrics (loaded into `metric_all`).
path_correlation = ...
# Destination CSV for the computed ensemble weights.
path_result = ...
# Layer numbers (0-based) to include in the ensemble; passed to `run_all`.
layers_to_use = list(range(0, 12))

In [1]:
# Install the notebook's dependencies via IPython shell escapes, then wipe the
# pip logs from the cell output with clear_output().
from IPython.display import clear_output
! pip install datasets
! pip install transformers[torch]
! pip install bayesian-optimization==1.4.1
! pip install cvxopt
! pip install qpsolvers==3.4.0
! pip install accelerate -U
! pip install git+https://github.com/simonzhang00/ripser-plusplus.git
clear_output()

## Obtaining weights based on mteb quality & correlation

In [37]:
from cvxopt import matrix, solvers
from qpsolvers import solve_qp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Pairwise layer-similarity metrics. construct_matrices selects rows by the
# `layer_1` / `layer_2` columns and reads the `hard_correlation`, `jaccard`,
# `second_order`, `aligned_cosine`, `rsa` and `concentricity_distance` columns.
metric_all = pd.read_csv(path_correlation, index_col=None)
# Per-layer quality. construct_matrices reads the `layer` and `accuracy` columns.
embeds_accuracy = pd.read_csv(path_train_quality, index_col=None)

In [38]:
def solve_system(mteb_quality, corr_matrix):
    """Solve the quadratic program that yields the ensemble weights.

    With ``C = corr_matrix`` and ``s = mteb_quality`` the problem handed to
    ``solve_qp`` is::

        minimize    0.5 * w^T (2 C) w + (1 - s - diag(C))^T w
        subject to  sum(w) == 1,  0 <= w <= 1

    The solver is started from equal weights ``1 / n`` and runs verbosely.

    Args:
        mteb_quality: sequence of per-layer quality values.
        corr_matrix: square ``numpy`` array of pairwise layer similarities,
            one row/column per entry of ``mteb_quality``.

    Returns:
        The weight vector returned by ``solve_qp``.

    Raises:
        AttributeError: if the solver returns no solution. The exception type
            is kept as ``AttributeError`` because ``run_all`` catches exactly
            that; previously it was raised only as a side effect of calling
            ``.T`` on ``None``.
    """
    corr_matrix = np.asarray(corr_matrix)
    n = len(mteb_quality)
    dim = corr_matrix.shape[0]

    # Quadratic term: solve_qp minimizes 0.5 * w^T P w, hence the factor 2.
    P = 2 * corr_matrix
    # Linear term: penalize low-quality layers and subtract the diagonal of C.
    q = (1 - np.asarray(mteb_quality)) - np.diag(corr_matrix)

    x_sol = solve_qp(
        P,
        q,
        A=np.ones((1, dim)),   # weights sum to one ...
        b=np.ones(1),
        lb=np.zeros(dim),      # ... and each weight lies in [0, 1]
        ub=np.ones(dim),
        initvals=np.ones(n) / n,
        solver='cvxopt',
        verbose=True,
    )
    if x_sol is None:
        # Make the failure explicit instead of relying on `None.T` blowing up.
        raise AttributeError('QP solver did not return a solution')

    print(f'optimal solution: {x_sol}')
    return x_sol

In [39]:
def construct_matrices(layers: list[int]):
    """Collect per-layer quality and pairwise similarity matrices for ``layers``.

    Reads the module-level DataFrames ``embeds_accuracy`` (columns ``layer``,
    ``accuracy``) and ``metric_all`` (columns ``layer_1``, ``layer_2`` plus one
    column per metric). A pair is looked up with the smaller layer number in
    ``layer_1`` and the larger one in ``layer_2``.

    Args:
        layers: layer numbers to include; row/column ``k`` of every returned
            matrix corresponds to ``layers[k]``.

    Returns:
        A dict with ``'mteb_quality'`` (list of accuracies, one per layer), six
        symmetric ``n x n`` matrices (``'hard_correlation'``, ``'jaccard'``,
        ``'second_order'``, ``'aligned_cosine'``, ``'rsa'``,
        ``'concentricity'``) and six element-wise sums of two of them
        (``'second_aligned'``, ``'second_rsa'``, ``'aligned_rsa'``,
        ``'hard_jaccard'``, ``'hard_rsa'``, ``'jaccard_rsa'``).

    Raises:
        IndexError: if a layer has no row in ``embeds_accuracy`` or a layer
            pair has no row in ``metric_all``.
    """
    # result key -> (column in metric_all, value placed on the diagonal)
    metric_specs = {
        'hard_correlation': ('hard_correlation', 1.),
        'jaccard': ('jaccard', 1.),
        'second_order': ('second_order', 1.),
        'aligned_cosine': ('aligned_cosine', 1.),
        'rsa': ('rsa', 1.),
        'concentricity': ('concentricity_distance', 0.),
    }
    # result key -> the two base matrices that are summed
    combined_specs = {
        'second_aligned': ('second_order', 'aligned_cosine'),
        'second_rsa': ('second_order', 'rsa'),
        'aligned_rsa': ('aligned_cosine', 'rsa'),
        'hard_jaccard': ('hard_correlation', 'jaccard'),
        'hard_rsa': ('hard_correlation', 'rsa'),
        'jaccard_rsa': ('jaccard', 'rsa'),
    }

    mteb_quality = []
    for layer in layers:
        accuracy = embeds_accuracy[embeds_accuracy.layer == layer].accuracy.values
        if len(accuracy) == 0:
            raise IndexError(f'no accuracy found for layer {layer}')
        mteb_quality.append(accuracy[0])

    n = len(layers)
    matrices = {
        name: np.eye(n) * diagonal
        for name, (_, diagonal) in metric_specs.items()
    }

    for i in range(n):
        for j in range(i + 1, n):
            layer_min, layer_max = min(layers[i], layers[j]), max(layers[i], layers[j])
            # Filter the metrics table once per pair and reuse it for every metric.
            pair_rows = metric_all[(metric_all.layer_1 == layer_min) & (metric_all.layer_2 == layer_max)]
            if len(pair_rows) == 0:
                raise IndexError(f'no metrics found for layer pair ({layer_min}, {layer_max})')
            for name, (column, _) in metric_specs.items():
                matrices[name][i, j] = matrices[name][j, i] = pair_rows[column].values[0]

    result = {'mteb_quality': mteb_quality, **matrices}
    for name, (first, second) in combined_specs.items():
        result[name] = matrices[first] + matrices[second]
    return result

In [40]:
def run_all(layers, df_weights):
    """Solve for ensemble weights under every similarity metric and record them.

    Builds the matrices for ``layers`` with ``construct_matrices``, calls
    ``solve_system`` once per metric, and appends one row
    ``[layers, metric_name, weights]`` to ``df_weights`` for each metric that
    could be solved. Progress is printed; a metric whose solve raises
    ``AttributeError`` is reported as "can't solve the system" and skipped.

    Args:
        layers: layer numbers to build the ensemble from.
        df_weights: DataFrame with three columns that is extended in place.

    Returns:
        None.
    """
    print(f'for layers: {layers}')
    result = construct_matrices(layers)

    # (label printed in the log, key in `result`, name stored in df_weights)
    runs = [
        ('hard correlation', 'hard_correlation', 'hard_correlation'),
        ('jaccard', 'jaccard', 'jaccard'),
        ('second_order', 'second_order', 'second_order'),
        ('aligned_cosine', 'aligned_cosine', 'aligned_cosine'),
        ('rsa', 'rsa', 'rsa'),
        ('concentricity distance', 'concentricity', 'concentricity_distance'),
    ]
    runs += [
        (method, method, method)
        for method in ['second_aligned', 'second_rsa', 'aligned_rsa',
                       'hard_jaccard', 'hard_rsa', 'jaccard_rsa']
    ]

    for label, key, stored_name in runs:
        print(f'based on {label}')
        try:
            solution = solve_system(result['mteb_quality'], result[key])
        except AttributeError:
            print("can't solve the system")
        else:
            # Recorded outside the `try` so that only solver failures are
            # reported as unsolvable.
            df_weights.loc[len(df_weights.index)] = [layers, stored_name, solution]

In [41]:
# Empty table that run_all fills with one row per similarity metric.
res_weights = pd.DataFrame(columns=['layers', 'correlation', 'weights'])
# Appends rows to res_weights in place and prints the solver log.
run_all(layers_to_use, res_weights)
# Save the weights to the configured output path.
res_weights.to_csv(path_result, index=None)

for layers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
based on hard correlation
     pcost       dcost       gap    pres   dres
 0:  2.1090e-02 -1.1979e+01  2e+01  9e-01  1e+00
 1: -5.2179e-02 -1.0378e+00  1e+00  7e-03  9e-03
 2: -6.5134e-02 -1.9715e-01  1e-01  9e-04  1e-03
 3: -8.3657e-02 -1.0189e-01  2e-02  2e-05  3e-05
 4: -8.5976e-02 -8.8501e-02  3e-03  2e-06  2e-06
 5: -8.6865e-02 -8.7165e-02  3e-04  5e-09  6e-09
 6: -8.6991e-02 -8.7003e-02  1e-05  5e-11  6e-11
 7: -8.6999e-02 -8.6999e-02  2e-07  5e-13  6e-13
 8: -8.6999e-02 -8.6999e-02  2e-09  5e-15  6e-15
Optimal solution found.
optimal solution: [2.65364992e-01 6.25282976e-02 1.13720314e-02 1.20304026e-08
 4.85105156e-02 5.04171065e-09 5.67207904e-02 2.73981246e-09
 3.82320105e-09 1.46910680e-01 3.52680262e-07 4.08592316e-01]
based on jaccard
     pcost       dcost       gap    pres   dres
 0: -1.3285e-01 -1.2133e+01  2e+01  9e-01  6e-01
 1: -1.9367e-01 -1.1815e+00  1e+00  7e-03  5e-03
 2: -2.0265e-01 -3.1422e-01  1e-01  7e-04  5e