In [2]:
!pip3 install torch numpy pandas

Collecting torch
  Downloading torch-2.6.0-cp310-none-macosx_11_0_arm64.whl (66.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting numpy
  Downloading numpy-2.2.4-cp310-cp310-macosx_14_0_arm64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pandas
  Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting networkx
  Using cached networkx-3.4.2-py3-none-any.whl (1.7 MB)
Collecting filelock
  Downloading filelock-3.18.0-py3-none-any.whl (16 kB)
Collecting sympy==1.13.1
  Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[

In [3]:
!pip3 install tensorflow witwidget

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp310-cp310-macosx_12_0_arm64.whl (252.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.5/252.5 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting witwidget
  Using cached witwidget-1.8.1-py3-none-any.whl (1.5 MB)
Collecting opt-einsum>=2.3.2
  Using cached opt_einsum-3.4.0-py3-none-any.whl (71 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3
  Downloading protobuf-5.29.4-cp38-abi3-macosx_10_9_universal2.whl (417 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.8/417.8 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting grpcio<2.0,>=1.24.3
  Downloading grpcio-1.71.0-cp310-cp310-macosx_12_0_universal2.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting astunparse>=1.6.0
  Using cached 

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import time
import numpy as np
from src.utils import save_model, load_model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user_id'].values, dtype=torch.long, device=device)
        self.items = torch.tensor(df['movie_id'].values, dtype=torch.long, device=device)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float, device=device)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]    


def prepare_data(data):
    users = data['user_id'].unique()
    movies = data['movie_id'].unique()
    user_to_index = {original: idx for idx, original in enumerate(users)}
    movie_to_index = {original: idx for idx, original in enumerate(movies)}

    index_to_user = {idx: original for original, idx in user_to_index.items()}
    index_to_movie = {idx: original for original, idx in movie_to_index.items()}

    mappings = {
        'user_to_idx': user_to_index,
        'idx_to_user': index_to_user,
        'item_to_idx': movie_to_index,
        'idx_to_item': index_to_movie
    }

    data['user_id'] = data['user_id'].map(user_to_index)
    data['movie_id'] = data['movie_id'].map(movie_to_index)

    train_size = int(0.8 * len(data))
    train_data = data.iloc[:train_size]
    test_data = data.iloc[train_size:]

    train_dataset = MovieLensDataset(train_data)
    test_dataset = MovieLensDataset(test_data)
    dataset = MovieLensDataset(data)

    return dataset, train_dataset, test_dataset, mappings


class NCF(nn.Module):
    def __init__(self, num_users, num_items, embed_dim=32, linear_dims = [64, 32], mappings=None):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embed_dim)
        self.item_embedding = nn.Embedding(num_items, embed_dim)
        self.idx_to_item = mappings['idx_to_item']
        self.item_to_idx = mappings['item_to_idx']
        self.user_to_idx = mappings['user_to_idx']
        self.idx_to_user = mappings['idx_to_user']

        # Add linear + relu layers
        layers = []
        input_dim = 2 * embed_dim
        output_dim = 1
        for dim in linear_dims:
            layers.append(nn.Linear(input_dim, dim))
            layers.append(nn.ReLU())
            input_dim = dim

        layers.append(nn.Linear(input_dim, output_dim))
        self.mlp = nn.Sequential(*layers)


    def forward(self, user, item):
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        x = torch.cat([user_emb, item_emb], dim=1)
        
        return self.mlp(x).squeeze()
    
    
    
    def get_model_size(self):
        """
        This function will return the size of the model in bytes.
        Args:
            None
        Returns:
            int: The size of the model in bytes
        """

        return sum(p.numel() for p in self.parameters())

def train_tf(model, train_loader, optimizer, criterion):
    t0 = time.time()
    model.train()
    train_loss = 0
    for user, item, rating in train_loader:
        optimizer.zero_grad()
        output = model(user, item)
        loss = criterion(output, rating)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    return train_loss, time.time() - t0

def eval_tf(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    t0 = time.time()
    with torch.no_grad():
        for user, item, rating in test_loader:
            output = model(user, item)
            loss = criterion(output, rating)
            test_loss += loss.item()

    return test_loss, time.time() - t0
    

def train(model, train_data, test_data, epochs=10, lr=0.001, bs=64):
    train_loader = DataLoader(train_data, batch_size=bs, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=bs, shuffle=False)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    
    t0 = time.time()

    for epoch in range(epochs):
        train_loss, train_time = train_tf(model, train_loader, optimizer, criterion)
        test_loss, test_time = eval_tf(model, test_loader, criterion)

        print(f'Epoch {epoch + 1}/{epochs}. Time = {train_time+test_time} s\n    - Train Loss (MSE): {train_loss / len(train_loader)}\n    - Test Loss (MSE): {test_loss / len(test_loader)}')
    print(f'Total Training time: {time.time() - t0} s')


def predict_movies(user_id, model, data, mappings, n):
    user_idx = mappings['user_to_idx'][user_id]

    all_movies = data['movie_id'].unique()
    watched_movies = data[data['user_id'] == user_id]['movie_id'].unique()
    not_watched_movies = np.setdiff1d(all_movies, watched_movies)

    # Convert numerical movie IDs back to their original titles
    not_watched_movies = [mappings['idx_to_item'][movie_id] for movie_id in not_watched_movies if movie_id in mappings['idx_to_item']]

    # Ensure movie IDs match the trained model's item_to_idx
    valid_movies = [movie_id for movie_id in not_watched_movies if movie_id in model.item_to_idx]

    user_tensor = torch.tensor([user_idx] * len(valid_movies), device=device)
    movie_tensors = torch.tensor([model.item_to_idx[movie_id] for movie_id in valid_movies], device=device)

    predictions = model(user_tensor, movie_tensors).detach().cpu().numpy()

    top_predictions = sorted(zip(valid_movies, predictions), key=lambda x: x[1], reverse=True)[:n]
    top_predictions = [(idx, min(5, max(1, pred))) for idx, pred in top_predictions]

    return top_predictions

if __name__ == '__main__':
        
    # Load the data
    data = pd.read_csv('data/recommendation_data.csv')

    unique_users = data['user_id'].unique()
    unique_movies = data['movie_id'].unique()
    
    # # Prepare the data
    dataset, train_data, test_data, mappings = prepare_data(data)
    # train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
    # test_loader = DataLoader(test_data, batch_size=64, shuffle=False)
    
    # # Train the model
    # model = NCF(len(unique_users), len(unique_movies), 64, [64, 128, 64], mappings=mappings).to(device)
    # train(model, train_data, test_data, epochs=4, lr=0.001, bs=64)

    # # Save model
    # save_model(model, 'ncf_model')

    model = load_model('ncf_model')

In [5]:
# Load and merge user data with model predictions
user_df = pd.read_csv('data/user_data_full.csv')
recommendations = pd.read_csv('data/recommendation_data.csv')
merged_df = pd.merge(user_df, recommendations, on="user_id")

# Sample some users for WIT
sample_df = merged_df.sample(100)
wit_inputs = sample_df.to_dict(orient='records')

In [6]:
class DummyModel:
    def predict_movies(user_id, model, data, mappings, n):
        user_idx = mappings['user_to_idx'][user_id]
    
        all_movies = data['movie_id'].unique()
        watched_movies = data[data['user_id'] == user_id]['movie_id'].unique()
        not_watched_movies = np.setdiff1d(all_movies, watched_movies)
    
        # Convert numerical movie IDs back to their original titles
        not_watched_movies = [mappings['idx_to_item'][movie_id] for movie_id in not_watched_movies if movie_id in mappings['idx_to_item']]
    
        # Ensure movie IDs match the trained model's item_to_idx
        valid_movies = [movie_id for movie_id in not_watched_movies if movie_id in model.item_to_idx]
    
        user_tensor = torch.tensor([user_idx] * len(valid_movies), device=device)
        movie_tensors = torch.tensor([model.item_to_idx[movie_id] for movie_id in valid_movies], device=device)
    
        predictions = model(user_tensor, movie_tensors).detach().cpu().numpy()
    
        top_predictions = sorted(zip(valid_movies, predictions), key=lambda x: x[1], reverse=True)[:n]
        top_predictions = [(idx, min(5, max(1, pred))) for idx, pred in top_predictions]

        return top_predictions
        
model = DummyModel()

In [14]:
!pip install tensorflow witwidget pandas numpy torch notebook

Collecting notebook
  Downloading notebook-7.3.3-py3-none-any.whl (13.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting jupyterlab<4.4,>=4.3.6
  Downloading jupyterlab-4.3.6-py3-none-any.whl (11.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting jupyterlab-server<3,>=2.27.1
  Downloading jupyterlab_server-2.27.3-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tornado>=6.2.0
  Downloading tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl (436 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m436.3/436.3 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jupyter-server<3,>=2.4.0
  Downloading jupyter_server-2.15.0-py3-none-any.whl (3

In [15]:
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

examples = [{'feature': 5, 'label': 1}, {'feature': 2, 'label': 0}]
config = WitConfigBuilder(examples).set_label_feature('label')
WitWidget(config)


ImportError: Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 15, in swig_import_helper
    import imp
ModuleNotFoundError: No module named 'imp'


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/errors

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [12]:
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

config = WitConfigBuilder(wit_inputs).set_custom_predict_fn(model.predict_movies)
wit = WitWidget(config)
wit

ImportError: Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 15, in swig_import_helper
    import imp
ModuleNotFoundError: No module named 'imp'


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/errors

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [8]:
!python3 --version

Python 3.10.12
