In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [28]:
def solve_ls(X, y):
    inv = np.linalg.inv(np.dot(X.T, X))
    a = np.dot(inv, X.T)
    return np.dot(a, y)

def solve_rr(X, y, lam):
    inv = np.linalg.inv(np.dot(X.T, X) + lam * np.identity(np.shape(X)[1]))
    a = np.dot(inv, X.T)
    return np.dot(a, y)

#John's ALS code
def ALS(data_train, data_test, k=2, lam=0.02, max_iter=100):
    '''
    data_train and data_test are lists of tuples (row, column, MSE)
    these need to be integers so we need to map each dataset and lambda value to an integer (MSE need not be integer)
    
    k is rank
    lam is the regularizer on the least squares call in each step
    
    my code also has offsets b_u and b_v
    the prediction for entry (a, i) is (b_u[a] + b_v[i] + np.dot(u[a].T, v[i])[0][0])
    '''
    # size of the problem
    n = max(d[0] for d in data_train)+1 # datasets
    m = max(d[1] for d in data_train)+1 # lambda values
    # which entries are set in each row and column and the MSE
    us_from_v = [[] for i in range(m)]  # II (i-index-set)
    vs_from_u = [[] for a in range(n)]  # AI (a-index set)
    for (a, i, r) in data_train:
        us_from_v[i].append((a, r))
        vs_from_u[a].append((i, r))
    # Initial guesses for u, b_u, v, b_v
    # Note that u and v are lists of column vectors (rows of U, V).
    u, b_u, v, b_v = ([np.random.normal(1/k, size=(k,1)) for a in range(n)],
          np.zeros(n),
          [np.random.normal(1/k, size=(k,1)) for i in range(m)],
          np.zeros(m))
    for itr in range(max_iter):
        if itr%5 == 0:
            print(itr)
        for i in range(len(u)): #run ls on u
            X_mat = np.array([np.append(np.array([1]), v[a[0]].T[0]) for a in vs_from_u[i]])
            y_vec = np.array([[a[1] - b_v[a[0]]] for a in vs_from_u[i]])
            if len(X_mat) > 0:
                sol = solve_rr(X_mat, y_vec, lam)
                b_u[i] = sol[0]
                u[i] = sol[1:]
        for j in range(len(v)): #run ls on v
            X_mat = np.array([np.append(np.array([1]), u[a[0]].T[0]) for a in us_from_v[j]])
            y_vec = np.array([[a[1] - b_u[a[0]]] for a in us_from_v[j]])
            if len(X_mat) > 0:
                sol = solve_rr(X_mat, y_vec, lam)
                b_v[j] = sol[0][0]
                v[j] = sol[:][1:]
            
    # TODO: Evaluate using some error metric measured on test set
    error = 0
    count = 0
    for (a, i, r) in data_test:
        count += 1
        error += (b_u[a] + b_v[i] + np.dot(u[a].T, v[i])[0][0] - r) ** 2
    error = error / count
    return (u, b_u, v, b_v, error)

In [25]:
def load_datasets():
    dataset_dic = {}
    for i in range(1, 101):
        X_df = pd.read_csv('gendata/example_dataset' + str(i) + '.csv')
        dataset_dic[i - 1] = X_df
    return dataset_dic

lambda_dic = {lamb + 60: 10 ** (-.05 * lamb) for lamb in range(-60, 40)}

def get_MSE(X_df, lam, iters = 1, size = 0.5):
    MSE = 0
    for _ in range(iters):
        train, test = train_test_split(X_df, test_size=size)
        X_train, Y_train = np.array(train.drop('Y', axis=1)), np.transpose(np.array([train['Y']]))
        X_1 = np.hstack((np.ones(shape = (len(X_train), 1)), X_train))
        sol = solve_rr(X_1, Y_train, lam)
        X_test, Y_test = np.array(test.drop('Y', axis=1)), np.transpose(np.array([test['Y']]))
        X_2 = np.hstack((np.ones(shape = (len(X_test), 1)), X_test))
        MSE += np.linalg.norm(X_2@sol - Y_test) / len(X_2)
    return MSE / iters
  
def get_points(m, n, p, q):
    train_points = set()

    for i in range(m):
        j = np.random.randint(0, n)
        train_points.add((i, j))
    for j in range(n):
        i = np.random.randint(0, m)
        if (i, j) not in train_points:
            train_points.add((i, j))
    for i in range(m):
        for j in range(n):
            if np.random.random() < p:
                if (i, j) not in train_points:
                    train_points.add((i, j))

    test_points = set()

    for i in range(m):
        for j in range(n):
            if np.random.random() < q:
                if (i, j) not in train_points:
                    test_points.add((i, j))
                    
    return train_points, test_points

def generate_data(dataset_dic, lambda_dic, points):
    data = []
    for (i, j) in points:
        MSE = get_MSE(dataset_dic[i], lambda_dic[j])
        data.append((i, j, MSE))                    
    return data     

In [29]:
dataset_dic = load_datasets()
print('loaded datasets')
train_pts, test_pts = get_points(len(dataset_dic), len(lambda_dic), p = 0.01, q = 0.02)
print('generated points')
data_train = generate_data(dataset_dic, lambda_dic, train_pts)
print('generated train list')
data_test = generate_data(dataset_dic, lambda_dic, test_pts)
print('generated test list')
u, b_u, v, b_v, error = ALS(data_train, data_test, k=2, lam=0.02, max_iter=100)
print('finished ALS')

loaded datasets
generated points
generated train list
generated test list
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
finished ALS


In [30]:
error

0.0027754076322686425

In [31]:
data_test

[(10, 95, 0.011600826280359878),
 (73, 32, 0.09305301756965943),
 (4, 41, 0.12634204687968942),
 (20, 25, 0.1081172817168087),
 (26, 97, 0.0010073850896873194),
 (72, 11, 0.30466693460579586),
 (97, 16, 0.20168114146994046),
 (37, 82, 0.15901251173673908),
 (52, 94, 0.03197963617572955),
 (32, 2, 0.24130219820650517),
 (4, 93, 0.16258596204547757),
 (70, 19, 0.08717893758683658),
 (10, 69, 0.02144003772131511),
 (80, 81, 0.1322487919679641),
 (89, 94, 0.11557139319720808),
 (90, 27, 0.1172102119693704),
 (73, 35, 0.09257703179540666),
 (96, 71, 0.06503473224253728),
 (41, 29, 0.1557443491043857),
 (57, 45, 0.08702084279065471),
 (74, 56, 0.1382932913665159),
 (19, 76, 0.11522379063824649),
 (51, 22, 0.14448418865103535),
 (91, 8, 0.1265299239386899),
 (6, 44, 0.09829802153491456),
 (13, 73, 0.009106476908805165),
 (22, 65, 0.17694525896471783),
 (67, 46, 0.03948789872403857),
 (74, 73, 0.12910652112214044),
 (18, 86, 0.07617381335486088),
 (85, 47, 0.12628920730605248),
 (72, 44, 0.109