## Modeling

Python version spiral ... 
- > tf 2.11.0 introduced changes that break recommenders integration, need to step back in time 
- pyenv install 3.9
- pyenv virtualenv 3.9 recommenders
- pyenv uninstall recommenders
- pyenv activate recommenders

In [None]:
%set_env TF_USE_LEGACY_KERAS=True

In [None]:
import pandas as pd
import keras 

In [None]:
reviews = pd.read_parquet("../data/processed/reviews_small.parquet")
items = pd.read_parquet("../data/processed/items_small.parquet")

In [None]:
reviews

In [None]:
items

In [None]:
def extract_users(reviews):
    """
    Build a per-user summary dataframe from the reviews

    The result has one row per distinct ``user_id`` and a ``ratings`` column
    holding the number of (non-null) ratings that user has left.
    """
    rating_counts = reviews.groupby("user_id")["rating"].count()
    return rating_counts.reset_index(name="ratings")

In [None]:
users = extract_users(reviews)

In [None]:
all_items = set(items.item_id)
reviews = reviews[reviews.item_id.isin(all_items)]

❗In the reference notebook, ratings are thresholded. Do we need to follow suit, and what are the ramifications if we don't? In the notebook, a click is an interaction and there's no middle ground: the network operates on 0s and 1s. If we leave low reviews in our matrix, the network will learn to recommend things users have interacted with, but not necessarily positively. Our case is the same: a review is an interaction. We're aiming to recommend, and we should not recommend items that users rated poorly. So filter out the low reviews.

In [None]:
# Discard reviews by users outside our core group 
reviews = reviews[reviews.user_id.isin(set(users.user_id))]

In [None]:
reviews

In [None]:
import numpy as np 
import pandas as pd 

### Naive

In [None]:
# TODO

### Nearest Neighbor

In [None]:
# If we need sparse types, use the scipy COO format since it seems to be incorporated in both pytorch and recommenders
# NOTE: coo_matrix lives in the scipy.sparse subpackage; it is not exported from the top-level scipy namespace
from scipy.sparse import coo_matrix

In [None]:
from recommenders.datasets.sparse import AffinityMatrix

In [None]:
# Drop the timestamp *column*. DataFrame.drop defaults to axis=0 (row labels), so the
# column has to be named explicitly via `columns=`.
reviews = reviews.drop(columns=['timestamp'])

In [None]:
header = {
    "col_user": "user_id",
    "col_item": "item_id",
    "col_rating": "rating",
}
ui_sparse = AffinityMatrix(reviews, **header)

# This isn't implied by the name, but this densifies the matrix, i.e. we have a contiguous u x i
# matrix here (user vector of item ratings) ... though it's actually not clear how the memory is 
# managed underneath in scipy, the 'dense' array might just be a bunch of pointers to the DFs stored 
# in the AM object... 
ui_dense, u_map, i_map = ui_sparse.gen_affinity_matrix()

In [None]:
import sys 
# NOTE: sys.getsizeof is shallow -- it reports the size of the object itself and does not
# follow references into contained objects, so these figures are not full memory footprints
print(sys.getsizeof(ui_sparse))
print(sys.getsizeof(ui_sparse.df))
print(sys.getsizeof(ui_dense))

In [None]:
u_map

In [None]:
len(u_map)

In [None]:
ui_dense[0] 

In [None]:
ui_dense[1][[0,413]]

In [None]:
a = np.nonzero(ui_dense[0])[0]
b = np.nonzero(ui_dense[1])[0]

In [None]:
np.concatenate([a, b], axis=0) 

In [None]:
ui_dense[0][np.nonzero(ui_dense[0])]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
sim_a = cosine_similarity([ui_dense[0]], [ui_dense[33]])
sims = [0] *len(users)

In [None]:
sims[0] = sim_a

In [None]:
top_k = 5

In [None]:
[[0]*2]*top_k 

In [None]:
from scipy.stats import pearsonr 

def pearson_similarity(a, b):
    """
    Pearson correlation between ``a`` and ``b``, shifted and scaled from
    the [-1, 1] range onto [0, 1]
    """
    correlation = pearsonr(a, b).statistic
    return (correlation + 1) / 2

In [None]:
# Populating a full user similarity matrix is inherently limited by (and is a questionable
# strategy because of) the u^2 memory requirement. Unlike our affinity matrices,
# these are not sparse. Since our goal is recommending items, we'll compute the similarity
# iteratively and store only the most similar users to get down to C * u memory.
#
# similarity_matrix[a] holds the row indices (into ui_dense) of the top_k users most similar
# to user a, best match first. Slots that could not be filled stay at -1.
# Rows are bounded by the affinity matrix itself so we never index past its last user.
n_users = len(ui_dense)
similarity_matrix = np.full((n_users, top_k), -1, dtype=int)
for a in range(n_users):

    # Items user A has rated -- invariant across the inner loop, so compute it once
    a_item_ix = np.nonzero(ui_dense[a])[0]

    # Collect our similarities w/ respect to user A
    sim_a = {}
    for b in range(n_users):
        if a == b:
            continue

        # Given the sparsity of our review vectors, cosine similarity is going to be
        # effectively zero if we look across the entire item space... compare only those
        # items at least one of these two users has rated. Use a set union so an item
        # rated by both users is counted once rather than twice.
        b_item_ix = np.nonzero(ui_dense[b])[0]
        all_ix = np.union1d(a_item_ix, b_item_ix)
        if all_ix.size < 2:
            # A correlation needs at least two points
            continue

        # Fancy indexing copies, so the fills below do not modify ui_dense
        a_items = ui_dense[a][all_ix]
        b_items = ui_dense[b][all_ix]

        # Fill non-ratings with the same middling score for both users. Non-interactions
        # appear dissimilar to positive reviews and similar to negative ones otherwise.
        # (3 is the midpoint assuming a 1-5 rating scale -- TODO confirm)
        a_items[a_items == 0] = 3
        b_items[b_items == 0] = 3

        # Cosine similarity risks insensitivity to rating value; while imperfect here,
        # Pearson similarity gets us sensitivity to rating magnitude and trends
        score = pearson_similarity(a_items, b_items)
        if not np.isnan(score):
            # NaN (e.g. a constant vector) has no defined ordering and would corrupt the sort
            sim_a[b] = score

    # Find and store the top k user matches, in order
    # NOTE: dict sorting logic courtesy of gpt-4o (https://chatgpt.com/share/687dc72f-54b4-8013-806e-b1de20d0ef12)
    top = sorted(sim_a.items(), key=lambda x: x[1], reverse=True)[:top_k]
    similarity_matrix[a, :len(top)] = [x[0] for x in top]

    # NOTE: debugging stop -- only the first user is processed. Remove this to populate
    # the whole matrix (the scratch cells below inspect the last a_items / b_items pair).
    break

In [None]:
#a_items[-7] = 2
a_items

In [None]:
b_items

In [None]:
cosine_similarity([a_items],[b_items])

In [None]:
pearson_similarity(a_items, b_items)

### VAE

In [None]:
from recommenders.datasets.sparse import AffinityMatrix

In [None]:
# Only relevant to VAE strategy 
reviews = reviews[reviews.rating >= 3]

In [None]:
reviews.columns

In [None]:
reviews

In [None]:
# NOTE: Strategy adapted from tutorials available in the Recommenders project, see 
# https://github.com/recommenders-team/recommenders/tree/main
from recommenders.datasets.python_splitters import python_random_split

# Split along user boundaries to ensure no leakage of preference between train and test
train_users, test_users, val_users = python_random_split(users, [.9, .05, .05])

In [None]:
print(train_users.shape, test_users.shape, val_users.shape)

In [None]:
train = reviews[reviews.user_id.isin(train_users.user_id)]
val = reviews[reviews.user_id.isin(val_users.user_id)]
test = reviews[reviews.user_id.isin(test_users.user_id)]

In [None]:
print(train.shape, val.shape, test.shape)

In [None]:
# Technique from Recommenders (see https://github.com/recommenders-team/recommenders/blob/45e1b215a35e69b92390e16eb818d4528d0a33a2/examples/02_model_collaborative_filtering/standard_vae_deep_dive.ipynb) 
# to improve utility of validation set during training - only allow items in
# the validation set that are also present in the train set
val = val[val.item_id.isin(train.item_id.unique())]

In [None]:
val.shape

In [None]:
from recommenders.datasets.python_splitters import python_stratified_split 

# Another technique employed in Recommenders (see above link for notebook), for in-flight validation to be 
# meaningful during training, our validation set needs not just ground truth, but unseen validation samples 
# to see if predictions for validation users are relevant (to those users). Anyway, break down our val and test 
# sets again to support this strategy
val_src, val_target = python_stratified_split(
    data=val, 
    ratio=0.8, 
    filter_by="item", 
    col_user="user_id", 
    col_item="item_id"
    )
test_src, test_target = python_stratified_split(
    data=test, 
    ratio=0.8, 
    filter_by="item", 
    col_user="user_id", 
    col_item="item_id"
    )

In [None]:
print(val.shape, " -> ", val_src.shape, val_target.shape)
print(test.shape, " -> ", test_src.shape, test_target.shape)

In [None]:
#to use standard names across the analysis 
header = {
        "col_user": "user_id",
        "col_item": "item_id",
        "col_rating": "rating",
        # Unclear why this doesn't also eat a timestamp, but many of the functions that split temporally use, fortunately 
        # the column 'timestamp' (i.e. DEFAULT_TIMESTAMP_COL='timestamp') so I think we're fine. 
        # "col_timestamp" : "timestamp"
    }

# NOTE(review): each AffinityMatrix below is constructed independently from its own split.
# Presumably each one derives its own user/item index from the rows it is given, in which
# case the item columns of the generated matrices would not line up across splits (and the
# column counts could differ from train) -- TODO confirm, and check whether this version of
# AffinityMatrix accepts a shared item list.
train_matrix = AffinityMatrix(df=train, **header)
val_matrix = AffinityMatrix(df=val, **header)
val_src_matrix = AffinityMatrix(df=val_src, **header)
val_tgt_matrix = AffinityMatrix(df=val_target, **header)
test_src_matrix = AffinityMatrix(df=test_src, **header)
test_tgt_matrix = AffinityMatrix(df=test_target, **header)

In [None]:
# This generates a sparse array of user vectors, aka user-item matrix
# X[0] is the first user in the list, with entries for all items known when the matrix was constructed in that row
# NOTE: from here on train/val/val_src/... are rebound from dataframes to matrices
train, _, _ = train_matrix.gen_affinity_matrix()
val, _, _ = val_matrix.gen_affinity_matrix()
val_src, _, _ = val_src_matrix.gen_affinity_matrix()
val_tgt, _, _ = val_tgt_matrix.gen_affinity_matrix()
test_src, _, _ = test_src_matrix.gen_affinity_matrix()
# The held-out test targets come from their own matrix, not from the test source matrix
test_tgt, _, _ = test_tgt_matrix.gen_affinity_matrix()

In [None]:
from recommenders.utils.python_utils import binarize

# Binarize every split against the same rating threshold. Each split is binarized
# from itself -- in particular val must come from val, not from train.
train = binarize(train, 3)
val = binarize(val, 3)
val_src = binarize(val_src, 3)
val_tgt = binarize(val_tgt, 3)
test_src = binarize(test_src, 3)
test_tgt = binarize(test_tgt, 3)

In [None]:
# TODO: Make sure this is reported during training/configuration
# NOTE: despite the variable name and printed label, this is the percentage of *non-zero*
# cells in the train matrix (non-zero count / total cells * 100)
sparsity = np.count_nonzero(train)/(train.shape[0]*train.shape[1])*100
print(f"sparsity: {sparsity:.2f}%")

In [None]:
keras.__version__ 

In [None]:
from recommenders.models.vae.standard_vae import StandardVAE

In [None]:
model = StandardVAE(
    n_users = train.shape[0], 
    original_dim = train.shape[1],
    intermediate_dim=250, 
    latent_dim=50, 
    n_epochs=1, 
    batch_size=1, 
    k=10, 
    verbose=1, 
    seed=4, 
    save_path="models/svae.hdf5", 
    drop_encoder=0.5, 
    drop_decoder=0.5, 
    annealing=False, 
    beta=1.0) 

In [None]:
type(train)

In [None]:
model.fit(
    x_train=train, 
    x_valid=val, 
    x_val_tr=val_src, 
    x_val_te=val_tgt, 
    mapper=val_matrix,
    )

Managing text-based reviews at this scale could be a challenge, and I'd like to steer clear of LLMs for this effort. We could do an embedding on the review and use that for similarity, but we have pretty rich item data. Perhaps let's ignore the collaborative aspect here and build a shopping interface that: 
- surfaces the most popular items, and encourages you to add items to your shopping cart for a big discount/promo
- based on clicks and cart items, improves the recommendations and surfaces new products

We can use an autoencoder to accept a sparse matrix of users and items, learn to reproduce that matrix, and in so doing support prediction on missing values. However, this matrix is of size users x items, which here is 1.8e7 x 1.6e6 = 2.88e13 cells. Even at one byte per cell (the best case; it is higher if stored as np floats) that is 28,125,000,000 KB ~= 26 TB !! WTF. 
- In the standard VAE example (https://github.com/recommenders-team/recommenders/blob/main/examples/02_model_collaborative_filtering/standard_vae_deep_dive.ipynb) the clicks are turned into a histogram for each user ... so we have n_user vector of length n_items... then I guess each of these is a training sample. The VAE presumably learns, given a sparse user vector, to predict every rating. This takes the complexity down and gives us a training set we can iterate over. 

Let's avoid any distributional pressure (present in VAE, SVAE, disentangled VAE) and go for a basic autoencoder using the strategy laid out above, i.e. one item vector per user, with each user vector serving as a training sample.

### Scratch Autoencoder

That damned Recommenders VAE is a dependency dumpster fire ... walking away after 10h of fighting crusty environments that generate more errors than outcomes. Shift to a basic autoencoder in pytorch and just eat the cost of having to implement our own validation. 

In [None]:
users

In [None]:
reviews

In [None]:
items

In [None]:
import os
import math 
import torch 
import pandas as pd 
from tqdm import tqdm
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
import sys
sys.path.append('..')
from sparse import AffinityMatrix

In [None]:
class Autoencoder(nn.Module):
    """
    Fully-connected autoencoder

    The encoder compresses an input vector of width ``dims`` through a hidden
    layer down to a latent code; the decoder mirrors the encoder back up to
    ``dims``. Every linear layer, including the last one, is followed by a
    ReLU, so reconstructions are never negative.

    NOTE: with cues from https://www.geeksforgeeks.org/deep-learning/implementing-an-autoencoder-in-pytorch/
    """

    def __init__(self, dims=1000, hidden_dim=500, latent_dim=75):
        """
        Initialize a new object given an item count

        dims: width of the input (and of the reconstruction)
        hidden_dim: width of the intermediate layer in both encoder and decoder
        latent_dim: width of the bottleneck code produced by the encoder
        """
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Linear(dims, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, dims),
            nn.ReLU(),
        )

    def forward(self, x):
        """
        Implement our forward pass: encode ``x`` to the latent code, then
        decode it back to a reconstruction with the same trailing width
        """
        h = self.encoder(x)
        r = self.decoder(h)

        return r

In [None]:
class DeepCartDataset(torch.utils.data.Dataset):
    """
    Custom pytorch-compatible dataset. Adapted from
    https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files

    Work in progress: the instance only stores the raw users/reviews it is given;
    building the affinity matrix and item retrieval are not implemented yet.
    """
    def __init__(self, users, reviews):
        """
        Initialize a new instance

        users: the user collection (its length is reported by ``len(dataset)``)
        reviews: the raw reviews backing the dataset

        Design notes
        ------------
        Oof. The ideal pattern here is for the dataset to be blissfully ignorant of our split strategy and just
        make a dataset available to its client based on the raw data passed. However the split strategy is rather
        intricate below ... note the five splits. Can we easily raise that up to a higher level? The refactoring might
        not be trivial and it may result in residue of this split strategy bleeding over to the other models -- what's
        common and what's not?

        common:
        - train, val, test split
        - val test and test test bonus splits - for all validation stages we need to check performance, however non-NN
          techniques really only need a test split, right? if we do train holdout for validation, we essentially have
          three test sets (train-test, val, test)
        - a need to operate on the same validation or at least test data, lest the comparison be biased by the
          selection method each model applies

        unique:
        - logic to prune reviews < 3.5 (``split`` below currently keeps ratings >= 3) -- we don't do this in cfnn, and
          naive doesn't care (predicts highest review in the matrix); if this is done during training, it will also
          need to be done during inference
        - need for a pytorch-style dataset ... the naive method is doing a O(n) search, the cfnn needs dataframes --
          while refactoring is possible, why undertake the risk it will be a disjoint and inelegant fit?
        - the VAE implementation wants all train and val, but doesn't require a test dataset

        we could:
        - pass train and val, hold test out
        - pass test to predict function, which we need for the demo anyway
        - keep the pytorch dataset unique to the pytorch-compatible class... doesn't make sense to try and foist on
          other algos... we are doing this in the wrong order, filtering and then splitting... we need to outsource
          the splitting and then do the filtering inside each model

        right now this is speculation, just get something working! we can figure out how to streamline after -- oh,
        but we need a dataset implementation
        """
        self.users = users
        self.reviews = reviews
        # Placeholder until build_affinity_matrices() is implemented
        self.matrix = None

    def build_affinity_matrices(self):
        """
        Build the affinity matrices backing this dataset (not implemented yet)
        """
        # TODO: implement
        raise NotImplementedError("build_affinity_matrices is not implemented yet")

    @staticmethod
    def split(users, reviews, items):
        """
        Generate splits

        Splits ``users`` 90/5/5, selects each group's reviews (ratings >= 3 only),
        further divides the validation and test reviews into source/target parts,
        and converts every part to a binarized user-item matrix.

        Returns a dict of matrices keyed by "train", "val", "val_src", "val_tgt",
        "test_src" and "test_tgt".

        NOTE: relies on ``AffinityMatrix`` being importable at module level.
        """
        # Imported locally so the method does not depend on earlier notebook cells having run
        import numpy as np
        from recommenders.datasets.python_splitters import (
            python_random_split,
            python_stratified_split,
        )
        from recommenders.utils.python_utils import binarize

        # Column names handed to AffinityMatrix
        header = {
            "col_user": "user_id",
            "col_item": "item_id",
            "col_rating": "rating",
        }

        print(f"Full user-item matrix is {len(users) * len(items)}")

        # We are trying to teach the model what a good interaction is like, and we'll
        # ultimately be interested only in whether to recommend an item or not ...
        # low reviews are not something we want the model suggesting...
        reviews = reviews[reviews.rating >= 3]

        # NOTE: Strategy adapted from tutorials available in the Recommenders project, see
        # https://github.com/recommenders-team/recommenders/tree/main
        # Split along user boundaries to ensure no leakage of preference between train and test
        train_users, test_users, val_users = python_random_split(users, [.9, .05, .05])
        print(train_users.shape, test_users.shape, val_users.shape)

        train = reviews[reviews.user_id.isin(train_users.user_id)]
        val = reviews[reviews.user_id.isin(val_users.user_id)]
        test = reviews[reviews.user_id.isin(test_users.user_id)]
        print(train.shape, val.shape, test.shape)

        # Technique from Recommenders (see https://github.com/recommenders-team/recommenders/blob/45e1b215a35e69b92390e16eb818d4528d0a33a2/examples/02_model_collaborative_filtering/standard_vae_deep_dive.ipynb)
        # to improve utility of validation set during training - only allow items in
        # the validation set that are also present in the train set
        val = val[val.item_id.isin(train.item_id.unique())]
        print(val.shape)

        # Another technique employed in Recommenders (see above link for notebook), for in-flight validation to be
        # meaningful during training, our validation set needs not just ground truth, but unseen validation samples
        # to see if predictions for validation users are relevant (to those users). Anyway, break down our val and test
        # sets again to support this strategy
        val_src, val_target = python_stratified_split(
            data=val,
            ratio=0.8,
            filter_by="item",
            col_user="user_id",
            col_item="item_id"
            )
        test_src, test_target = python_stratified_split(
            data=test,
            ratio=0.8,
            filter_by="item",
            col_user="user_id",
            col_item="item_id"
            )

        print(val.shape, " -> ", val_src.shape, val_target.shape)
        print(test.shape, " -> ", test_src.shape, test_target.shape)

        train_matrix = AffinityMatrix(df=train, **header)
        val_matrix = AffinityMatrix(df=val, **header)
        val_src_matrix = AffinityMatrix(df=val_src, **header)
        val_tgt_matrix = AffinityMatrix(df=val_target, **header)
        test_src_matrix = AffinityMatrix(df=test_src, **header)
        test_tgt_matrix = AffinityMatrix(df=test_target, **header)

        # This generates a sparse array of user vectors, aka user-item matrix
        # X[0] is the first user in the list, with entries for all items known when the matrix was constructed in that row
        train, _, _ = train_matrix.gen_affinity_matrix()
        val, _, _ = val_matrix.gen_affinity_matrix()
        val_src, _, _ = val_src_matrix.gen_affinity_matrix()
        val_tgt, _, _ = val_tgt_matrix.gen_affinity_matrix()
        test_src, _, _ = test_src_matrix.gen_affinity_matrix()
        # The held-out test targets come from their own matrix, not the test source matrix
        test_tgt, _, _ = test_tgt_matrix.gen_affinity_matrix()

        # Each split is binarized from itself (val must not be derived from train)
        train = binarize(train, 3)
        val = binarize(val, 3)
        val_src = binarize(val_src, 3)
        val_tgt = binarize(val_tgt, 3)
        test_src = binarize(test_src, 3)
        test_tgt = binarize(test_tgt, 3)

        # NOTE: this is the percentage of non-zero cells, reported under the label "sparsity"
        sparsity = np.count_nonzero(train)/(train.shape[0]*train.shape[1])*100
        print(f"sparsity: {sparsity:.2f}%")

        return {
            "train": train,
            "val": val,
            "val_src": val_src,
            "val_tgt": val_tgt,
            "test_src": test_src,
            "test_tgt": test_tgt,
        }

    def __len__(self):
        """
        Retrieve length of the dataset
        """
        # Presumably one sample per user -- confirm once __getitem__ is implemented
        return len(self.users)

    def __getitem__(self, idx):
        """
        Retrieve an item at the provided index
        """
        #TODO: implement
        raise NotImplementedError("__getitem__ is not implemented yet")

In [None]:
def get_data_loader(batch_size=5, shuffle=True):
    """
    Placeholder for building a pytorch-style dataloader

    Not implemented yet: both arguments are currently ignored and ``None``
    is returned.
    """
    # TODO: implement -- sketch of the intended shape:
    # transform = transforms.Compose([
    #     transforms.ConvertImageDtype(torch.float),
    #     transforms.Normalize(mean=[0.5], std=[0.5])
    # ])
    # data = DeepCartDataset(transform=transform)
    # loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=shuffle)
    # return loader
    return None

In [None]:
def train(loader, model, loss_interval=20, epochs=2, lr=0.01, momentum=0.9, loss_fn=None):
    """
    Train the model with the provided dataset

    loader: iterable yielding ``(inputs, labels)`` batches
    model: the ``nn.Module`` to optimize; it is moved to CUDA when available
    loss_interval: number of batches averaged into each reported loss value
    epochs: number of passes over ``loader``
    lr, momentum: SGD hyperparameters
    loss_fn: loss callable taking ``(outputs, labels)``; defaults to
        ``nn.CrossEntropyLoss()`` when omitted

    Returns the list of average losses, one per completed ``loss_interval``
    batches (a trailing partial interval is not reported).

    NOTE: this is a similar training loop to the one we used for our vision model
    in the vision project.
    NOTE(review): the default cross-entropy loss and the (inputs, labels) batch layout
    come from that classification loop; for autoencoder reconstruction a loss such as
    nn.MSELoss() with the inputs as labels is presumably what is wanted -- confirm once
    the data loader exists.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    train_loss = []

    tqdm.write("Starting training run...")
    # TODO: configure WandB
    # see https://docs.wandb.ai/guides/integrations/pytorch/
    config = {
        "epochs": epochs,
        "lr": lr,
        "momentum": momentum,
        "loss_interval": loss_interval,
    }
    run = wandb.init(config=config)

    # Make sure the run is closed even if training raises part-way through
    try:
        model.train()
        model = model.to(device)

        if loss_fn is None:
            loss_fn = nn.CrossEntropyLoss()

        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

        for epoch in range(epochs):

            running_loss = 0.0
            for i, data in enumerate(loader):

                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = model(inputs)

                loss = loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()

                # collect metrics
                running_loss += loss.item()

                # report the average every loss_interval batches, then reset the accumulator
                if (i % loss_interval) == (loss_interval - 1):
                    train_loss.append(running_loss / loss_interval)
                    tqdm.write(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / loss_interval:.3f}")
                    running_loss = 0
    finally:
        run.finish()

    tqdm.write("Training complete!")

    return train_loss