In [1]:
import numpy as np
from scipy import linalg
import pandas as pd

In [2]:
# Read it in
path = "../data/surveys.xlsx"
survey_32W = pd.read_excel('Survey_32N and 32W consolidated.xlsx', sheet_name="Survey_32W", converters={'T3':str})
survey_32N = pd.read_excel('Survey_32N and 32W consolidated.xlsx', sheet_name="Survey_32N")

# Drop unnecessary columns
survey_32W.drop(columns=['Unnamed: 0', 'subject_id', 'image_name', 'image_name_2'], inplace=True)
survey_32N.drop(columns=['Unnamed: 0', 'subject_id', 'image_name', 'image_name_2'], inplace=True)

# Drop rows missing response
survey_32W = survey_32W[
    (survey_32W['T3'] != ' ') &
    (survey_32W['T3'] != 'none') &
    (survey_32W['T3'].notnull()) &
    (survey_32W['T3'] != 'No Comments ') &
    (survey_32W['T3'] != 0) &
    (survey_32W['T3'] != 'None') &
    (survey_32W['T3'] != '[NO ANSWER]')]

# Select responses
res_together = survey_32W['T3'][survey_32W['T1'] == "['They should be together in the same outfits']"].tolist()
res_separate = survey_32W['T3'][survey_32W['T1'] == "['They should be in separate outfits']"].tolist()

res_N = survey_32N['T5'].tolist()

In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

In [4]:
embeddings = model.encode(res_together)

In [5]:
testing = embeddings

In [6]:
np.shape(testing)

(76, 768)

## Implementing DEIM

I did this implementation in two different functions to clearly split the tasks.  The DEIM function is the row selection process, and the CUR function is then execution of the actual factorization using the DEIM row selection method

In [7]:
def DEIM(V):
    #Column Selection
    Pcol = []
    p1 = np.argmax(abs(V[:, 0]))
    Pcol.append(p1)
    for j in range(1, np.shape(V)[1]):
        r = V[:, j] - (V[:, j][Pcol[0]] / V[:, j-1][Pcol[0]]) * V[:, j-1]
        pj = np.argmax(abs(r))
        Pcol.append(pj)
        
    #Row selection
    Prow = []
    p1 = np.argmax(abs(V[0]))
    Prow.append(p1)
    for j in range(1, np.shape(V)[0]):
        r = V[j] - (V[j][Prow[0]] / V[j-1][Prow[0]]) * V[j-1]
        pj = np.argmax(abs(r))
        Prow.append(pj)
    return (Prow, Pcol)

In [16]:
def cur(A):
    #computing the SVD
    U, S, Vt = np.linalg.svd(A)
    rowID, colID = DEIM(U)
    C = np.array([A[i] for i in colID])
    R = np.array([A[i] for i in rowID])
    U = linalg.pinv(C) @ A @ linalg.pinv(R)
    return (C, U, np.transpose(R))

In [20]:
test = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

In [21]:
C, U, R = cur(test)

In [22]:
C

array([[10, 11, 12],
       [ 1,  2,  3],
       [ 1,  2,  3],
       [ 4,  5,  6]])

In [23]:
U

array([[-0.08080808, -0.11616162, -0.15151515, -0.15151515],
       [ 0.02525253,  0.00505051, -0.01515152, -0.01515152],
       [ 0.13131313,  0.12626263,  0.12121212,  0.12121212]])

In [24]:
R

array([[ 4,  7, 10, 10],
       [ 5,  8, 11, 11],
       [ 6,  9, 12, 12]])

In [28]:
row, col = DEIM(linalg.svd(testing)[0])

In [30]:
most_important_same = pd.DataFrame({'Responses sorted from most to least important': [res_together[i] for i in row]})

In [35]:
most_important_same.to_excel('most_important.xlsx')

### For separate responses

In [36]:
embeddings2 = model.encode(res_separate)

In [38]:
row, col = DEIM(linalg.svd(embeddings2)[0])

In [40]:
most_important_same = pd.DataFrame({'Responses sorted from most to least important': [res_separate[i] for i in row]})

In [41]:
most_important_same.to_excel('most_important_separate_outfit.xlsx')