In [None]:
"""
So I created 100 random datasets in Data Generator of between 50-100 points and 2-10 dimensional X points
and sigma of between 0-1. You can change those values and make new datasets if you want easily in the last box of 
Data Generator.

The second box here is my solve least squares, ridge regression, and ALS code.
The third box loads the datasets (change the 100 to a different value if you change the number of datasets) with the 
first function. It creates many lambda values (right now 100, so we have a 100x100 matrix overall). get_points 
randomly chooses approximately #rows*#cols*p + #rows + #cols points as training data and 
#rows*#cols*q + #rows + #cols points as test data. get_MSE gives the 
MSE for that dataset and lambda value. generate_data calculates the MSE for all of those and
formats it so ALS can run.

Things to mess around with: rank, as currently it's using 2. p and q are the approximate percentage of the matrix
that is filled. MSE_iters is how precise the MSE is. ALS_lam is how regularized the ALS is.
"""

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [28]:
def solve_ls(X, y):
    inv = np.linalg.inv(np.dot(X.T, X))
    a = np.dot(inv, X.T)
    return np.dot(a, y)

def solve_rr(X, y, lam):
    inv = np.linalg.inv(np.dot(X.T, X) + lam * np.identity(np.shape(X)[1]))
    a = np.dot(inv, X.T)
    return np.dot(a, y)

#John's ALS code
def ALS(data_train, data_test, k=2, lam=0.02, max_iter=100):
    '''
    data_train and data_test are lists of tuples (row, column, MSE)
    these need to be integers so we need to map each dataset and lambda value to an integer (MSE need not be integer)
    
    k is rank
    lam is the regularizer on the least squares call in each step
    
    my code also has offsets b_u and b_v
    the prediction for entry (a, i) is (b_u[a] + b_v[i] + np.dot(u[a].T, v[i])[0][0])
    '''
    # size of the problem
    n = max(d[0] for d in data_train)+1 # datasets
    m = max(d[1] for d in data_train)+1 # lambda values
    # which entries are set in each row and column and the MSE
    us_from_v = [[] for i in range(m)]  # II (i-index-set)
    vs_from_u = [[] for a in range(n)]  # AI (a-index set)
    for (a, i, r) in data_train:
        us_from_v[i].append((a, r))
        vs_from_u[a].append((i, r))
    # Initial guesses for u, b_u, v, b_v
    # Note that u and v are lists of column vectors (rows of U, V).
    u, b_u, v, b_v = ([np.random.normal(1/k, size=(k,1)) for a in range(n)],
          np.zeros(n),
          [np.random.normal(1/k, size=(k,1)) for i in range(m)],
          np.zeros(m))
    for itr in range(max_iter):
        if itr%5 == 0:
            print(itr)
        for i in range(len(u)): #run ls on u
            X_mat = np.array([np.append(np.array([1]), v[a[0]].T[0]) for a in vs_from_u[i]])
            y_vec = np.array([[a[1] - b_v[a[0]]] for a in vs_from_u[i]])
            if len(X_mat) > 0:
                sol = solve_rr(X_mat, y_vec, lam)
                b_u[i] = sol[0]
                u[i] = sol[1:]
        for j in range(len(v)): #run ls on v
            X_mat = np.array([np.append(np.array([1]), u[a[0]].T[0]) for a in us_from_v[j]])
            y_vec = np.array([[a[1] - b_u[a[0]]] for a in us_from_v[j]])
            if len(X_mat) > 0:
                sol = solve_rr(X_mat, y_vec, lam)
                b_v[j] = sol[0][0]
                v[j] = sol[:][1:]
            
    # TODO: Evaluate using some error metric measured on test set
    error = 0
    count = 0
    for (a, i, r) in data_test:
        count += 1
        error += (b_u[a] + b_v[i] + np.dot(u[a].T, v[i])[0][0] - r) ** 2
    error = error / count
    return (u, b_u, v, b_v, error)

In [33]:
def load_datasets():
    dataset_dic = {}
    for i in range(100):
        X_df = pd.read_csv('gendata/example_dataset' + str(i + 1) + '.csv')
        dataset_dic[i] = X_df
    return dataset_dic

lambda_dic = {lamb + 60: 10 ** (-.05 * lamb) for lamb in range(-60, 40)}
  
def get_points(m, n, p, q):
    train_points = set()

    for i in range(m):
        j = np.random.randint(0, n)
        train_points.add((i, j))
    for j in range(n):
        i = np.random.randint(0, m)
        if (i, j) not in train_points:
            train_points.add((i, j))
    for i in range(m):
        for j in range(n):
            if np.random.random() < p:
                if (i, j) not in train_points:
                    train_points.add((i, j))

    test_points = set()

    for i in range(m):
        for j in range(n):
            if np.random.random() < q:
                if (i, j) not in train_points:
                    test_points.add((i, j))
                    
    return train_points, test_points

def get_MSE(X_df, lam, iters = 1, size = 0.5):
    MSE = 0
    for _ in range(iters):
        train, test = train_test_split(X_df, test_size=size)
        X_train, Y_train = np.array(train.drop('Y', axis=1)), np.transpose(np.array([train['Y']]))
        X_1 = np.hstack((np.ones(shape = (len(X_train), 1)), X_train))
        sol = solve_rr(X_1, Y_train, lam)
        X_test, Y_test = np.array(test.drop('Y', axis=1)), np.transpose(np.array([test['Y']]))
        X_2 = np.hstack((np.ones(shape = (len(X_test), 1)), X_test))
        MSE += np.linalg.norm(X_2@sol - Y_test) / len(X_2)
    return MSE / iters

def generate_data(dataset_dic, lambda_dic, points, iters = 1, size = 0.5):
    data = []
    for (i, j) in points:
        MSE = get_MSE(dataset_dic[i], lambda_dic[j], iters, size)
        data.append((i, j, MSE))                    
    return data     

In [34]:
p = 0.05
q = 0.1
MSE_iters = 10
rank = 2
ALS_lam = 0.02
ALS_iters = 100


dataset_dic = load_datasets()
print('loaded datasets')
train_pts, test_pts = get_points(len(dataset_dic), len(lambda_dic), p = p, q = q)
print('generated points')
data_train = generate_data(dataset_dic, lambda_dic, train_pts, iters = MSE_iters)
print('generated train list')
data_test = generate_data(dataset_dic, lambda_dic, test_pts, iters = MSE_iters)
print('generated test list')
u, b_u, v, b_v, error = ALS(data_train, data_test, k=rank, lam=ALS_lam, max_iter=ALS_iters)
print('finished ALS')

loaded datasets
generated points
generated train list
generated test list
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
finished ALS


In [35]:
error

0.0008270462721635648

In [36]:
data_test

[(5, 31, 0.1226417283773221),
 (20, 25, 0.11584954113705241),
 (61, 82, 0.05210126241558151),
 (39, 70, 0.04567973456317971),
 (17, 20, 0.06865960400819968),
 (65, 50, 0.1026690307283475),
 (11, 90, 0.0067888673201218965),
 (61, 76, 0.05227717200083558),
 (4, 66, 0.12755908426939166),
 (60, 39, 0.07604956714719352),
 (34, 46, 0.10101135627326088),
 (58, 19, 0.08539060480919053),
 (91, 35, 0.10076732979627027),
 (65, 84, 0.10933345826219816),
 (99, 68, 0.15212688693429305),
 (79, 76, 0.1232242571078644),
 (44, 34, 0.06191109519844832),
 (76, 66, 0.07883404885163436),
 (38, 40, 0.10156128791228194),
 (99, 42, 0.1483712714576241),
 (13, 58, 0.027226876327508736),
 (63, 48, 0.11030354316148243),
 (70, 53, 0.03676938225447916),
 (94, 54, 0.16057750246161712),
 (51, 22, 0.13220607414788738),
 (66, 66, 0.12547866379862793),
 (33, 95, 0.17763345094124178),
 (70, 68, 0.03694979715057024),
 (25, 49, 0.17086500712220948),
 (88, 17, 0.1907686122274587),
 (94, 98, 0.16023362926682333),
 (50, 27, 0.