In [1]:
# Use the %pip magic rather than `!pip3`: it installs into the environment of
# the running kernel, whereas `!pip3` uses whichever pip is first on the PATH.
%pip install torch numpy pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import time
import numpy as np
from src.utils import save_model, load_model

# Device used for every tensor created in this notebook: the GPU when CUDA is
# available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class MovieLensDataset(Dataset):
    """Tensor-backed dataset built from a ratings DataFrame.

    The `user_id`, `movie_id` and `rating` columns of `df` are copied into
    three tensors that are created directly on the module-level `device`.
    Indexing returns a `(user, item, rating)` tuple.
    """

    def __init__(self, df):
        def column_tensor(name, dtype):
            # Materialise one DataFrame column as a tensor on `device`.
            return torch.tensor(df[name].values, dtype=dtype, device=device)

        self.users = column_tensor('user_id', torch.long)
        self.items = column_tensor('movie_id', torch.long)
        self.ratings = column_tensor('rating', torch.float)

    def __len__(self):
        # One sample per stored rating.
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.ratings[idx])
        return sample


def prepare_data(data, train_frac=0.8):
    """Index-encode the ID columns of a ratings frame and split it into datasets.

    WARNING: `data` is modified in place. Its `user_id` and `movie_id` columns
    are overwritten with contiguous 0-based indices, so after this call the
    frame no longer holds the original IDs (use the returned mappings to
    translate back).

    The split is positional, not shuffled: the first `train_frac` of the rows
    become the training set and the remainder the test set.

    Args:
        data: DataFrame with `user_id`, `movie_id` and `rating` columns.
        train_frac: Fraction of rows (in frame order) assigned to the training
            set. Must lie in [0, 1]. Defaults to 0.8.

    Returns:
        Tuple `(dataset, train_dataset, test_dataset, mappings)` where the
        first three are `MovieLensDataset` objects over all rows, the training
        rows and the test rows, and `mappings` is a dict with the keys
        `user_to_idx`, `idx_to_user`, `item_to_idx` and `idx_to_item`.

    Raises:
        ValueError: If `train_frac` is outside [0, 1].
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f'train_frac must be in [0, 1], got {train_frac}')

    # Indices are assigned in order of first appearance in the frame.
    users = data['user_id'].unique()
    movies = data['movie_id'].unique()
    user_to_index = {original: idx for idx, original in enumerate(users)}
    movie_to_index = {original: idx for idx, original in enumerate(movies)}

    mappings = {
        'user_to_idx': user_to_index,
        'idx_to_user': dict(enumerate(users)),
        'item_to_idx': movie_to_index,
        'idx_to_item': dict(enumerate(movies)),
    }

    # In-place re-encoding of the caller's frame (see the warning above).
    data['user_id'] = data['user_id'].map(user_to_index)
    data['movie_id'] = data['movie_id'].map(movie_to_index)

    train_size = int(train_frac * len(data))
    train_data = data.iloc[:train_size]
    test_data = data.iloc[train_size:]

    train_dataset = MovieLensDataset(train_data)
    test_dataset = MovieLensDataset(test_data)
    dataset = MovieLensDataset(data)

    return dataset, train_dataset, test_dataset, mappings


class NCF(nn.Module):
    """Neural collaborative filtering: user/item embeddings scored by an MLP.

    The user and item embeddings are concatenated and passed through a stack
    of `Linear` + `ReLU` layers followed by a final `Linear` layer with a
    single output.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embed_dim: Width of each embedding vector.
        linear_dims: Output widths of the hidden `Linear` layers, in order.
        mappings: Optional dict with the keys `idx_to_item`, `item_to_idx`,
            `user_to_idx` and `idx_to_user`. Each is stored on the model
            under the same name; missing entries (or `mappings=None`)
            default to empty dicts.
    """

    def __init__(self, num_users, num_items, embed_dim=32, linear_dims=(64, 32), mappings=None):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embed_dim)
        self.item_embedding = nn.Embedding(num_items, embed_dim)

        # `mappings` defaults to None, so it must not be subscripted blindly.
        mappings = mappings if mappings is not None else {}
        self.idx_to_item = mappings.get('idx_to_item', {})
        self.item_to_idx = mappings.get('item_to_idx', {})
        self.user_to_idx = mappings.get('user_to_idx', {})
        self.idx_to_user = mappings.get('idx_to_user', {})

        # Hidden stack: Linear + ReLU per entry of `linear_dims`.
        layers = []
        input_dim = 2 * embed_dim  # user and item embeddings are concatenated
        for dim in linear_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.ReLU())
            input_dim = dim

        # Final projection to a single score per (user, item) pair.
        layers.append(nn.Linear(input_dim, 1))
        self.mlp = nn.Sequential(*layers)

    def forward(self, user, item):
        """Score each (user, item) index pair.

        Args:
            user: 1-D tensor of user indices.
            item: 1-D tensor of item indices, same length as `user`.

        Returns:
            1-D tensor with one score per pair.
        """
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        x = torch.cat([user_emb, item_emb], dim=1)

        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also collapse a batch of one into a 0-d scalar.
        return self.mlp(x).squeeze(-1)

    def get_model_size(self):
        """
        This function will return the size of the model in bytes.
        Args:
            None
        Returns:
            int: The size of the model in bytes
        """
        # Bytes = element count times per-element storage size, summed over
        # all parameters (buffers are not included).
        return sum(p.numel() * p.element_size() for p in self.parameters())

def train_tf(model, train_loader, optimizer, criterion):
    """Run one optimisation pass over `train_loader`.

    Puts `model` in training mode and performs one optimizer step per batch.

    Args:
        model: Callable as `model(user, item)`.
        train_loader: Iterable yielding `(user, item, rating)` batches.
        optimizer: Optimizer whose `zero_grad`/`step` are called per batch.
        criterion: Loss callable taking `(output, rating)`.

    Returns:
        Tuple `(summed_batch_loss, elapsed_seconds)`; the loss is the sum of
        the per-batch loss values, not their mean.
    """
    started = time.time()
    model.train()

    running_loss = 0
    for batch in train_loader:
        user, item, rating = batch
        optimizer.zero_grad()
        batch_loss = criterion(model(user, item), rating)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()

    elapsed = time.time() - started
    return running_loss, elapsed

def eval_tf(model, test_loader, criterion):
    """Compute the summed loss of `model` over `test_loader` without gradients.

    Puts `model` in evaluation mode; no parameters are updated.

    Args:
        model: Callable as `model(user, item)`.
        test_loader: Iterable yielding `(user, item, rating)` batches.
        criterion: Loss callable taking `(output, rating)`.

    Returns:
        Tuple `(summed_batch_loss, elapsed_seconds)`; the loss is the sum of
        the per-batch loss values, not their mean.
    """
    model.eval()
    started = time.time()

    with torch.no_grad():
        total_loss = sum(
            criterion(model(user, item), rating).item()
            for user, item, rating in test_loader
        )

    elapsed = time.time() - started
    return total_loss, elapsed
    

def train(model, train_data, test_data, epochs=10, lr=0.001, bs=64):
    """Train `model` with Adam on an MSE objective, printing per-epoch losses.

    Args:
        model: Module to optimise; called as `model(user, item)`.
        train_data: Dataset used for optimisation (batches are shuffled).
        test_data: Dataset evaluated after every epoch (not shuffled).
        epochs: Number of passes over `train_data`.
        lr: Adam learning rate.
        bs: Batch size for both loaders.

    Returns:
        None. Progress is reported through `print`; the printed losses are
        the summed batch losses divided by the number of batches.
    """
    train_loader = DataLoader(train_data, batch_size=bs, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=bs, shuffle=False)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    run_started = time.time()

    for epoch_no in range(1, epochs + 1):
        train_total, train_secs = train_tf(model, train_loader, optimizer, criterion)
        test_total, test_secs = eval_tf(model, test_loader, criterion)

        epoch_secs = train_secs + test_secs
        mean_train = train_total / len(train_loader)
        mean_test = test_total / len(test_loader)
        print(f'Epoch {epoch_no}/{epochs}. Time = {epoch_secs} s\n    - Train Loss (MSE): {mean_train}\n    - Test Loss (MSE): {mean_test}')

    total_secs = time.time() - run_started
    print(f'Total Training time: {total_secs} s')


def _model_device(model):
    """Return the device of `model`'s first parameter, or the notebook-level `device` if it has none."""
    params = getattr(model, 'parameters', None)
    first = next(iter(params()), None) if callable(params) else None
    return first.device if first is not None else device


def predict_movies(user_id, model, data, mappings, n):
    """Return the top-`n` predicted ratings for movies `user_id` has no row for.

    Args:
        user_id: Key into `mappings['user_to_idx']`.
        model: Callable as `model(user_tensor, item_tensor)` and exposing an
            `item_to_idx` dict.
        data: DataFrame with `user_id` and `movie_id` columns.
        mappings: Dict providing `user_to_idx` and `idx_to_item`.
        n: Maximum number of predictions to return.

    Returns:
        List of `(movie_id, score)` tuples sorted by descending raw
        prediction, where `score` is a Python float clamped to [1, 5].
        Empty when no candidate movie remains.

    Raises:
        KeyError: If `user_id` is not in `mappings['user_to_idx']`.
    """
    user_idx = mappings['user_to_idx'][user_id]

    all_movies = data['movie_id'].unique()
    # NOTE(review): rows are filtered by the *original* `user_id`, while
    # `movie_id` values are translated through `idx_to_item` below as if they
    # were indices. Confirm which ID space `data` is in at the call site;
    # `prepare_data` rewrites both columns to indices in place.
    watched_movies = data.loc[data['user_id'] == user_id, 'movie_id'].unique()
    not_watched_movies = np.setdiff1d(all_movies, watched_movies)

    # Translate candidate indices back to original movie IDs.
    idx_to_item = mappings['idx_to_item']
    candidates = [idx_to_item[movie] for movie in not_watched_movies if movie in idx_to_item]

    # Keep only movies the trained model has an embedding index for.
    valid_movies = [movie for movie in candidates if movie in model.item_to_idx]
    if not valid_movies:
        # Nothing to score; an empty index tensor must not reach the model.
        return []

    # Build the index tensors on the device that holds the model's weights.
    target = _model_device(model)
    user_tensor = torch.tensor([user_idx] * len(valid_movies), dtype=torch.long, device=target)
    movie_tensors = torch.tensor(
        [model.item_to_idx[movie] for movie in valid_movies],
        dtype=torch.long,
        device=target,
    )

    with torch.no_grad():
        raw = model(user_tensor, movie_tensors)
    # reshape(-1) keeps a single prediction iterable even when the model
    # returns a 0-d tensor for a batch of one.
    predictions = raw.detach().cpu().numpy().reshape(-1)

    ranked = sorted(zip(valid_movies, predictions), key=lambda pair: pair[1], reverse=True)[:n]
    # Clamp to the 1-5 range and normalise every score to a Python float.
    return [(movie, float(min(5, max(1, score)))) for movie, score in ranked]

if __name__ == '__main__':

    # Load the ratings table.
    data = pd.read_csv('data/recommendation_data.csv')

    # Capture the distinct original IDs before prepare_data() overwrites the
    # ID columns of `data` with indices (it modifies the frame in place).
    unique_users = data['user_id'].unique()
    unique_movies = data['movie_id'].unique()

    # Prepare the data. After this call `data` holds index-encoded IDs.
    dataset, train_data, test_data, mappings = prepare_data(data)
    # train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
    # test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

    # Training is disabled; the commented lines below show how the saved
    # model was presumably produced (TODO confirm).
    # model = NCF(len(unique_users), len(unique_movies), 64, [64, 128, 64], mappings=mappings).to(device)
    # train(model, train_data, test_data, epochs=4, lr=0.001, bs=64)

    # Save model
    # save_model(model, 'ncf_model')

    # Load the model via the project helper; presumably the one stored by
    # save_model(model, 'ncf_model') above (verify against src.utils).
    model = load_model('ncf_model')

MSE by Gender: {'F': 0.7565591931343079, 'M': 0.770621657371521}
MSE by Occupation: {'college/grad student': 0.7425245046615601, 'retired': 0.7821750640869141, 'customer service': 0.778343915939331, 'executive/managerial': 0.7775155901908875, 'sales/marketing': 0.8145403265953064, 'other or not specified': 0.7221888303756714, 'scientist': 0.7782127857208252, 'writer': 0.7338643670082092, 'K-12 student': 0.7202019691467285, 'academic/educator': 0.8391637206077576, 'homemaker': 0.6698481440544128, 'artist': 0.8785552382469177, 'technician/engineer': 0.7697812914848328, 'self-employed': 0.7011167407035828, 'unemployed': 1.1300959587097168, 'clerical/admin': 0.9682397246360779, 'programmer': 0.8568381667137146, 'doctor/health care': 0.7279679775238037, 'lawyer': 0.71218341588974, 'tradesman/craftsman': 0.7952181100845337, 'farmer': 1.543031930923462}
MSE by Age Group: {'26-35': 0.7733426690101624, '19-25': 0.7635174989700317, '36-45': 0.7389461994171143, '0-18': 0.7446747422218323, '46-55'

In [6]:
# Load the user table and the recommendation table (the same CSV that was
# loaded for training above) and join them on user_id.
WIT_SAMPLE_SIZE = 100  # maximum number of merged rows handed to the What-If Tool
WIT_SAMPLE_SEED = 42   # fixed seed so Restart & Run All selects the same rows

user_df = pd.read_csv('data/user_data_full.csv')
recommendations = pd.read_csv('data/recommendation_data.csv')
merged_df = pd.merge(user_df, recommendations, on="user_id")

# Sample rows for WIT. The size is capped at the frame length so a small
# merged table does not make sample() raise, and the seed keeps it reproducible.
sample_df = merged_df.sample(
    n=min(WIT_SAMPLE_SIZE, len(merged_df)),
    random_state=WIT_SAMPLE_SEED,
)
wit_inputs = sample_df.to_dict(orient='records')

In [7]:
class DummyModel:
    """Holder that exposes `predict_movies` as an attribute of an object.

    The method is a static method, so it behaves the same whether it is
    reached through the class or through an instance. Without that, calling
    it on an instance would bind the instance itself to `user_id`.
    """

    @staticmethod
    def predict_movies(user_id, model, data, mappings, n):
        """Delegate to the module-level `predict_movies` with the same arguments.

        Args:
            user_id: Key into `mappings['user_to_idx']`.
            model: Trained model passed through unchanged.
            data: DataFrame with `user_id` and `movie_id` columns.
            mappings: Dict providing `user_to_idx` and `idx_to_item`.
            n: Maximum number of predictions to return.

        Returns:
            Whatever the module-level `predict_movies` returns.
        """
        # Inside a method body this name resolves to the module-level
        # function, not to this class attribute, so there is no recursion.
        return predict_movies(user_id, model, data, mappings, n)


# NOTE: this rebinds `model`, replacing the model loaded earlier in the
# notebook. Keep another reference to that model if it is still needed.
model = DummyModel()

In [8]:
# NOTE: importing witwidget pulls in TensorFlow. The recorded run of this cell
# failed under Python 3.13 because the installed TensorFlow build imports the
# `imp` module, which that interpreter does not provide.
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

# NOTE(review): the What-If Tool presumably calls the custom predict function
# with a list of examples, whereas predict_movies expects
# (user_id, model, data, mappings, n). Confirm against the witwidget docs.
config = WitConfigBuilder(wit_inputs).set_custom_predict_fn(model.predict_movies)
wit = WitWidget(config)
wit

ImportError: Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 15, in swig_import_helper
    import imp
ModuleNotFoundError: No module named 'imp'


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/errors

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [13]:
# Reports the `python3` found on the shell PATH, which may differ from the
# interpreter running this kernel.
!python3 --version

Python 3.13.1


In [18]:
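# Shell steps for creating a separate environment for the What-If Tool.
# The virtualenv must be created from the version that was installed; the
# earlier attempt used 3.11.16, which pyenv reported as not installed.
# Run these in a terminal: `pyenv activate` inside a `!` subshell does not
# change the interpreter of the running kernel.
# !pyenv install 3.10.12
# !pyenv virtualenv 3.10.12 wit-env
# !pyenv activate wit-env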

pyenv-virtualenv: `3.11.16' is not installed in pyenv.
It does not look like a valid Python version. See `pyenv install --list' for available versions.
[31;1m
Failed to activate virtualenv.

Perhaps pyenv-virtualenv has not been loaded into your shell properly.
Please restart current shell and try again.

[0m

In [14]:
# Install the pyenv-virtualenv plugin through Homebrew; it provides the
# `pyenv virtualenv` / `pyenv activate` commands used elsewhere in this notebook.
!brew install pyenv-virtualenv

[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
ab-av1                                   lazyjournal
ad                                       lazysql
adapterremoval                           leetgo
adaptivecpp                              libcdio-paranoia
algolia                                  libgoa
alloy                                    libgudev
anyquery                                 libpostal
aqtinstall                               libpostal-rest
aqua                                     ludusavi
arelo                                    lume
astroterm                                mac
async_simple                             martin
bacon-ls                                 md2pdf
bagels      