### SLEEC

In [78]:
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix

from pyspark.ml.recommendation import ALS
from pyspark.ml.recommendation import ALSModel

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors.graph import kneighbors_graph
from sklearn.neighbors import NearestNeighbors

from scipy.io import loadmat
from collections import namedtuple

from ensemble import Model, Ensemble
from helpers import precision_at_ks, print_hdf5_object, project
from core import learn_V

from tqdm import tqdm

import implicit

In [2]:
# Smoke test: fit the `implicit` ALS model on a small random matrix.
import os
# Restrict MKL to a single thread for this process.
os.environ['MKL_NUM_THREADS'] = '1'
import numpy as np
from scipy.sparse import coo_matrix
import implicit.als
# Random integers in [0, 5) with shape (100, 200); no seed is set, so the
# matrix differs from run to run.
X=np.random.randint(5, size=(100,200))
# Sparse float64 copy of X in COO format.
X0=coo_matrix(X, dtype=np.float64)
# 50 latent factors; every other ALS setting is left at the library default.
model = implicit.als.AlternatingLeastSquares(factors=50)
model.fit(X0)

In [5]:
# !brew install gcc
# !pip install implicit

In [16]:
# Read the train/validation CSVs from the local `data/` directory.
# Presumably *_X holds features and *_Y holds labels — confirm against the data files.
train_X = pd.read_csv('data/train_X.csv')
train_Y = pd.read_csv('data/train_Y.csv')
val_X = pd.read_csv('data/val_X.csv')
val_Y = pd.read_csv('data/val_Y.csv')

In [17]:
# Remove the 'ex_id' column from every frame, rebinding the same names.
# Re-running this cell without reloading the CSVs raises, because the column
# is already gone after the first run.
id_columns = ['ex_id']
train_X = train_X.drop(columns=id_columns)
train_Y = train_Y.drop(columns=id_columns)
val_X = val_X.drop(columns=id_columns)
val_Y = val_Y.drop(columns=id_columns)

In [18]:
# Convert all four frames to SciPy CSC sparse matrices, rebinding the same
# names; after this cell they are no longer DataFrames.
train_X, train_Y = csc_matrix(train_X), csc_matrix(train_Y)
val_X, val_Y = csc_matrix(val_X), csc_matrix(val_Y)

In [19]:
# Display the repr (shape, dtype, stored-element count) of the sparse training matrix.
train_X

<15539x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 3684745 stored elements in Compressed Sparse Column format>

In [6]:
# Hyper-parameters for the SLEEC pipeline, held in a plain mutable namespace.
#
# The previous version created a namedtuple *class* and assigned attributes
# directly onto that class: no instance was ever built, and the declared field
# names ('num_learner', 'out_dim') did not match the attributes actually set
# ('num_learners', 'out_Dim'). A SimpleNamespace expresses the same
# "bag of settings" without that mismatch, and attributes stay assignable.
from types import SimpleNamespace

params = SimpleNamespace(
    num_learners=1,          # number of KMeans clusterings fitted by the clustering cell
    num_clusters=1,          # n_clusters passed to each KMeans
    num_threads=32,          # not read by any cell in this notebook
    SVP_neigh=250,           # neighbours per row in the label kNN graph
    out_Dim=100,             # ALS factor count; spelling kept because the ALS cell reads params.out_Dim
    w_thresh=0.01,           # not read by any cell in this notebook
    sp_thresh=0.01,          # not read by any cell in this notebook
    cost=None,               # was declared as a field but never assigned a value
    NNtest=25,               # neighbours used by the NearestNeighbors index at prediction time
    normalize=1,             # not read by any cell in this notebook
    regressor_lambda1=1e-6,  # not read by any cell in this notebook
    regressor_lambda2=1e-3,  # not read by any cell in this notebook
    embedding_lambda=0.1,    # determined automatically in WAltMin_asymm.m
)

In [16]:
# Fit one KMeans partition of the training features per learner and keep the
# fitted estimators. No random_state is passed, so the partitions are not
# pinned to a seed.
kmeans_kwargs = dict(n_clusters=params.num_clusters, n_jobs=-1, n_init=8, max_iter=100)
clusterings = []
for i in range(params.num_learners):
    model = KMeans(**kmeans_kwargs).fit(train_X)
    clusterings.append(model)

In [17]:
# Work with the first clustering and its cluster 0 only.
clus_model = clusterings[0]
models = []
i = 0

# Row positions of the training examples assigned to cluster `i`.
data_idx = np.flatnonzero(clus_model.labels_ == i)
X, Y = train_X[data_idx, :], train_Y[data_idx, :]

# kNN graph over the label rows, storing cosine distances (self included).
graph = kneighbors_graph(
    Y,
    n_neighbors=params.SVP_neigh,
    mode='distance',
    metric='cosine',
    include_self=True,
    n_jobs=-1,
)
# Turn the stored cosine distances into cosine similarities.
graph.data = 1 - graph.data

In [1]:
import os

# Pin the BLAS backends to one thread each before fitting ALS.
# The OpenBLAS key used to be 'export OPENBLAS_NUM_THREADS' (the shell keyword
# was pasted into the variable name), which set a variable nothing reads.
# NOTE(review): BLAS libraries typically read these when they are first loaded,
# so this probably has to run before numpy/implicit are imported — confirm.
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'

# Factorise the label-similarity graph with implicit ALS; the factor count and
# regularisation strength come from `params`.
als_kwargs = {
    'factors': params.out_Dim,
    'regularization': params.embedding_lambda,
}
als_model = implicit.als.AlternatingLeastSquares(**als_kwargs)
als_model.fit(graph)

In [14]:
# Regress the ALS embeddings (targets) on the cluster's features.
Z = als_model.item_factors
regressor = ElasticNet(alpha=0.1, l1_ratio=0.001).fit(X, Z)

# Coefficient matrix of the fitted regressor, reused as the projection matrix.
V = regressor.coef_

# Embeddings reproduced from the features using the coefficients only.
# NOTE(review): regressor.intercept_ is not added here — confirm that is intended.
fitted_Z = np.matmul(X.toarray(), V.T)

In [None]:
# Cosine-metric nearest-neighbour index over the regressed embeddings;
# queries return params.NNtest neighbours by default.
Z_neighbors = NearestNeighbors(n_neighbors=params.NNtest, metric='cosine').fit(fitted_Z)

In [None]:
# Bundle what prediction needs for cluster `i`: the projected cluster centre,
# the projection matrix, the neighbour index and the rows that belong to the
# cluster. `project` comes from the project's helpers module.
projected_center = project(V, clus_model.cluster_centers_[i])
learned = dict(
    center_z=projected_center,
    V=V,
    Z_neighbors=Z_neighbors,
    data_idx=data_idx,
)

In [84]:
import pickle

# Load the previously trained SLEEC ensemble from disk.
# The file is opened in a `with` block so the handle is closed after loading
# (it used to be left open).
# WARNING: unpickling executes arbitrary code from the file; only load pickles
# that were produced by a trusted source.
with open('sleec_default_2.pickle', 'rb') as ensemble_file:
    ensemble = pickle.load(ensemble_file)

In [85]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from helpers import project


class Model():
    """Nearest-neighbour label scorer over a set of per-cluster models.

    Each entry of ``cluster_models`` is a mapping with the keys ``'V'``,
    ``'center_z'``, ``'Z_neighbors'`` and ``'data_idx'``. ``train_Y`` is the
    training label matrix; it must support ``train_Y[rows, :].todense()``.
    """

    def __init__(self, cluster_models, train_Y):
        """Store the per-cluster models and the training label matrix."""
        self.models = cluster_models
        self.train_Y = train_Y

    def closet_cluster(self, x):
        """Return the cluster model whose projected centre is most similar to ``x``.

        ``x`` is projected with each model's ``'V'`` via the ``project`` helper
        and compared to that model's ``'center_z'`` by cosine similarity; the
        model with the largest similarity wins.
        """
        similarities = [
            cosine_similarity([project(candidate['V'], x)], [candidate['center_z']])
            for candidate in self.models
        ]
        best = np.argmax(similarities)
        return self.models[best]

    def predict(self, x):
        """Return a flat array of per-label scores for the single example ``x``.

        The example is projected with the closest cluster's ``'V'``, its
        neighbours are looked up in that cluster's ``'Z_neighbors'`` index, and
        the neighbours' training label rows are summed with weights equal to
        the inverse of the neighbour distance.
        """
        cluster = self.closet_cluster(x)
        embedded = project(cluster['V'], x)
        distances, positions = cluster['Z_neighbors'].kneighbors(
            [embedded], return_distance=True
        )
        # Map positions inside the cluster back to rows of the training set.
        train_rows = [cluster['data_idx'][pos] for pos in positions[0]]

        # Inverse-distance weights; the small offset avoids division by zero.
        distances += 1e-10
        inverse_distance = (1 / distances).T
        neighbor_labels = np.asarray(self.train_Y[train_rows, :].todense())
        # One weighted label row per neighbour, then summed over neighbours.
        weighted_labels = neighbor_labels * inverse_distance
        label_scores = weighted_labels.sum(axis=0)
        return np.array(label_scores).flatten()

# Re-wrap each learner of the first model in the loaded ensemble in its own
# single-learner Model (the class defined in this cell), then build a new
# Ensemble from them. `train_Y` must still be the sparse matrix here, because
# Model.predict calls `.todense()` on its rows.
learners = ensemble.models[0].models
models = [Model([learner], train_Y) for learner in learners]
ensemble_new = Ensemble(models)

In [86]:
# Score the validation features with the rebuilt ensemble
# (`predict_many` is provided by the project's Ensemble class).
pred_Y_new = ensemble_new.predict_many(val_X)

100%|██████████| 1316/1316 [00:20<00:00, 65.20it/s]


In [87]:
# Reload the validation labels as a DataFrame without the 'ex_id' column;
# the earlier `val_Y` was turned into a sparse matrix, and the metric cell
# below calls `.to_numpy()` on it.
val_Y = pd.read_csv('data/val_Y.csv').drop(columns=['ex_id'])

In [88]:
# Label-ranking average precision of the ensemble's scores against the
# validation labels; the value is shown as the cell output.
from sklearn.metrics import label_ranking_average_precision_score
label_ranking_average_precision_score(val_Y.to_numpy(), pred_Y_new)

0.49119083483312037

### 1-vs-all models

In [None]:
import sys  
# Put the project directory first on the import path so the `sleec` package
# imported below can be found.
# NOTE(review): this is a machine-specific absolute path; it will not exist on
# other machines — consider a path relative to the notebook or a config value.
sys.path.insert(0, '/Users/yuema/Desktop/DSGA1003/Project')

In [None]:
import numpy as np

from pyspark.ml.recommendation import ALS
from pyspark.ml.recommendation import ALSModel

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors.graph import kneighbors_graph
from sklearn.neighbors import NearestNeighbors

from scipy.io import loadmat
from collections import namedtuple

from sleec.ensemble import Model, Ensemble
from sleec.helpers import precision_at_ks, print_hdf5_object, project
from sleec.core import learn_V

from tqdm import tqdm

In [None]:
# Read the train/validation CSVs from the current working directory
# (unlike the SLEEC section, there is no 'data/' prefix here).
# NOTE(review): `pd` is not imported in this section's import cell; this relies
# on pandas having been imported earlier in the notebook.
train_X = pd.read_csv('train_X.csv')
train_Y = pd.read_csv('train_Y.csv')
val_X = pd.read_csv('val_X.csv')
val_Y = pd.read_csv('val_Y.csv')

In [None]:
# Remove the 'ex_id' column from every frame, rebinding the same names.
# Re-running this cell without reloading the CSVs raises, because the column
# is already gone after the first run.
id_columns = ['ex_id']
train_X = train_X.drop(columns=id_columns)
train_Y = train_Y.drop(columns=id_columns)
val_X = val_X.drop(columns=id_columns)
val_Y = val_Y.drop(columns=id_columns)

In [None]:
# Train a one-vs-rest XGBoost classifier, predict on the validation set, and
# persist both the predictions and the fitted model.
#
# `time`, `OneVsRestClassifier` and `XGBClassifier` were used here without
# being imported anywhere in the notebook, so the cell failed on a fresh
# kernel; they are imported locally to make the cell self-contained.
import pickle
import time

from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

start = time.time()
clf = OneVsRestClassifier(XGBClassifier(random_state=123)).fit(train_X, train_Y)
predictions = clf.predict(val_X)
# `end` holds elapsed seconds (not a timestamp) and covers fit + predict.
end = time.time() - start

# NOTE(review): these are Colab Google Drive paths; they only exist when the
# drive is mounted at /content/gdrive.
np.savetxt('/content/gdrive/My Drive/1003 Project/OvA_boosting_default.csv', predictions)
model_file_path='/content/gdrive/My Drive/1003 Project/OvA_boosting_default.pickle'
# Use a context manager so the pickle file is flushed and closed.
with open(model_file_path, 'wb') as model_file:
    pickle.dump(clf, model_file)

print('Time need for model fitting: {} hrs'.format(end/3600))