# Neural Collaborative Filtering (NCF) - Vollständige Modulare Implementation

Ein komplettes Neural Collaborative Filtering System für Empfehlungssysteme:
- **User & Item Embeddings** für Matrix Factorization
- **Generalized Matrix Factorization (GMF)** 
- **Multi-Layer Perceptron (MLP)** für nicht-lineare Interaktionen
- **Neural Matrix Factorization (NeuMF)** - Kombination von GMF + MLP
- Modular mit expliziten Layern und ausführlicher Dokumentation

In [25]:
# Imports / Bibliotheken importieren
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from typing import Optional, Tuple, List
from sklearn.model_selection import train_test_split

# Select the computation device: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
Device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {Device}")

Using device: cpu


In [26]:
# ============================================================================
# USER EMBEDDING LAYER
# Erstellt Embedding-Vektoren für User-IDs
# ============================================================================

class UserEmbeddingLayer(nn.Module):
    """Learnable lookup table that maps integer user IDs to dense vectors.

    Attributes:
        NumUsers: Number of rows in the embedding table.
        EmbeddingDim: Length of each embedding vector.
        UserEmbedding: The underlying ``nn.Embedding`` module.
    """

    def __init__(self, NumUsers: int, EmbeddingDim: int):
        """Build the table and draw its weights from N(0, 0.01^2).

        Args:
            NumUsers: Number of distinct user IDs the table can hold.
            EmbeddingDim: Dimensionality of every user vector.
        """
        super().__init__()

        # Keep the configuration around for introspection.
        self.NumUsers, self.EmbeddingDim = NumUsers, EmbeddingDim

        # Create the table first, then overwrite the default initialisation
        # with a small-variance normal distribution.
        Table = nn.Embedding(NumUsers, EmbeddingDim)
        nn.init.normal_(Table.weight, mean=0.0, std=0.01)
        self.UserEmbedding = Table

    def forward(self, UserIDs: torch.Tensor) -> torch.Tensor:
        """Look up the embedding of every ID in ``UserIDs``.

        Args:
            UserIDs: Integer tensor of user indices.

        Returns:
            Tensor with one trailing ``EmbeddingDim`` axis appended to the
            shape of ``UserIDs``.
        """
        return self.UserEmbedding(UserIDs)

In [27]:
# ============================================================================
# ITEM EMBEDDING LAYER
# Erstellt Embedding-Vektoren für Item-IDs
# ============================================================================

class ItemEmbeddingLayer(nn.Module):
    """Learnable lookup table that maps integer item IDs to dense vectors.

    Attributes:
        NumItems: Number of rows in the embedding table.
        EmbeddingDim: Length of each embedding vector.
        ItemEmbedding: The underlying ``nn.Embedding`` module.
    """

    def __init__(self, NumItems: int, EmbeddingDim: int):
        """Build the table and draw its weights from N(0, 0.01^2).

        Args:
            NumItems: Number of distinct item IDs the table can hold.
            EmbeddingDim: Dimensionality of every item vector.
        """
        super().__init__()

        # Keep the configuration around for introspection.
        self.NumItems, self.EmbeddingDim = NumItems, EmbeddingDim

        # Create the table first, then overwrite the default initialisation
        # with a small-variance normal distribution.
        Table = nn.Embedding(NumItems, EmbeddingDim)
        nn.init.normal_(Table.weight, mean=0.0, std=0.01)
        self.ItemEmbedding = Table

    def forward(self, ItemIDs: torch.Tensor) -> torch.Tensor:
        """Look up the embedding of every ID in ``ItemIDs``.

        Args:
            ItemIDs: Integer tensor of item indices.

        Returns:
            Tensor with one trailing ``EmbeddingDim`` axis appended to the
            shape of ``ItemIDs``.
        """
        return self.ItemEmbedding(ItemIDs)

In [28]:
# ============================================================================
# GENERALIZED MATRIX FACTORIZATION (GMF)
# Element-wise Produkt von User- und Item-Embeddings
# ============================================================================

class GeneralizedMatrixFactorization(nn.Module):
    """GMF branch: element-wise product of a user and an item embedding.

    Attributes:
        NumUsers: Size of the user embedding table.
        NumItems: Size of the item embedding table.
        EmbeddingDim: Shared dimensionality of both embeddings.
        GMFUserEmbeddingLayer: ``UserEmbeddingLayer`` owned by this branch.
        GMFItemEmbeddingLayer: ``ItemEmbeddingLayer`` owned by this branch.
    """

    def __init__(self, NumUsers: int, NumItems: int, EmbeddingDim: int):
        """Create the two embedding tables used by the GMF branch.

        Args:
            NumUsers: Number of distinct users.
            NumItems: Number of distinct items.
            EmbeddingDim: Dimensionality of both embedding tables.
        """
        super().__init__()

        # Remember the configuration.
        self.NumUsers = NumUsers
        self.NumItems = NumItems
        self.EmbeddingDim = EmbeddingDim

        # The user table is created before the item table; both have the
        # same width so they can be multiplied element by element.
        self.GMFUserEmbeddingLayer = UserEmbeddingLayer(NumUsers, EmbeddingDim)
        self.GMFItemEmbeddingLayer = ItemEmbeddingLayer(NumItems, EmbeddingDim)

    def forward(self, UserIDs: torch.Tensor, ItemIDs: torch.Tensor) -> torch.Tensor:
        """Return the Hadamard product of the looked-up embeddings.

        Args:
            UserIDs: Integer tensor of user indices.
            ItemIDs: Integer tensor of item indices, aligned with ``UserIDs``.

        Returns:
            Tensor holding ``user_vector * item_vector`` for every pair; its
            last axis has length ``EmbeddingDim``.
        """
        UserVectors = self.GMFUserEmbeddingLayer(UserIDs)
        ItemVectors = self.GMFItemEmbeddingLayer(ItemIDs)
        return torch.mul(UserVectors, ItemVectors)

In [29]:
# ============================================================================
# MULTI-LAYER PERCEPTRON (MLP) COMPONENT
# Nicht-lineare Interaktionen zwischen User und Item
# ============================================================================

class MultiLayerPerceptron(nn.Module):
    """MLP branch: feed-forward network over concatenated user/item embeddings.

    One ``nn.Linear`` is built per entry of ``MLPLayers``. The layers are
    registered as ``MLPLinearLayer1``, ``MLPLinearLayer2``, ... so the
    parameter names of a four-layer configuration are unchanged. ReLU and
    dropout follow every layer except the last one, whose raw output is
    returned.

    Attributes:
        NumUsers: Size of the user embedding table.
        NumItems: Size of the item embedding table.
        EmbeddingDim: Dimensionality of each embedding table.
        MLPLayers: Output width of every linear layer, in order.
        MLPUserEmbeddingLayer: ``UserEmbeddingLayer`` owned by this branch.
        MLPItemEmbeddingLayer: ``ItemEmbeddingLayer`` owned by this branch.
        DropoutLayer: Dropout module shared by all hidden layers.
    """

    def __init__(self, NumUsers: int, NumItems: int, EmbeddingDim: int,
                 MLPLayers: List[int], Dropout: float = 0.2):
        """Build embeddings, the linear stack and the dropout module.

        Args:
            NumUsers: Number of distinct users.
            NumItems: Number of distinct items.
            EmbeddingDim: Dimensionality of the user and item embeddings.
            MLPLayers: Output width of each linear layer; must not be empty.
            Dropout: Dropout probability applied after each hidden layer.

        Raises:
            ValueError: If ``MLPLayers`` is empty.
        """
        super().__init__()

        if not MLPLayers:
            raise ValueError("MLPLayers must contain at least one layer size")

        # Remember the configuration; copy the list so later changes made by
        # the caller cannot desynchronise it from the layers built below.
        self.NumUsers = NumUsers
        self.NumItems = NumItems
        self.EmbeddingDim = EmbeddingDim
        self.MLPLayers = list(MLPLayers)

        # Embedding tables dedicated to the MLP branch.
        self.MLPUserEmbeddingLayer = UserEmbeddingLayer(
            NumUsers=NumUsers,
            EmbeddingDim=EmbeddingDim
        )
        self.MLPItemEmbeddingLayer = ItemEmbeddingLayer(
            NumItems=NumItems,
            EmbeddingDim=EmbeddingDim
        )

        # The first layer consumes the concatenated user + item embeddings;
        # every following layer consumes the previous layer's output.
        LayerDims = [EmbeddingDim * 2] + self.MLPLayers
        LayerNames = []
        for Index, (InDim, OutDim) in enumerate(zip(LayerDims[:-1], LayerDims[1:]), start=1):
            LayerName = f"MLPLinearLayer{Index}"
            self.add_module(
                LayerName,
                nn.Linear(in_features=InDim, out_features=OutDim, bias=True)
            )
            LayerNames.append(LayerName)
        # Ordered names of the registered linear layers, used by forward().
        self._LinearLayerNames = tuple(LayerNames)

        # A single dropout module is reused after every hidden layer.
        self.DropoutLayer = nn.Dropout(p=Dropout)

        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-uniform weights and zero biases for every linear layer."""
        for Module in self.modules():
            if isinstance(Module, nn.Linear):
                nn.init.xavier_uniform_(Module.weight)
                if Module.bias is not None:
                    nn.init.zeros_(Module.bias)

    def forward(self, UserIDs: torch.Tensor, ItemIDs: torch.Tensor) -> torch.Tensor:
        """Run the MLP on the concatenated embeddings of each pair.

        Args:
            UserIDs: Integer tensor of user indices, shape [BatchSize].
            ItemIDs: Integer tensor of item indices, shape [BatchSize].

        Returns:
            Tensor of shape [BatchSize, MLPLayers[-1]] (output of the last
            linear layer, without activation).
        """
        UserEmbeddings = self.MLPUserEmbeddingLayer(UserIDs)
        ItemEmbeddings = self.MLPItemEmbeddingLayer(ItemIDs)

        # Concatenate along the feature axis: [BatchSize, EmbeddingDim * 2].
        Hidden = torch.cat([UserEmbeddings, ItemEmbeddings], dim=1)

        LastIndex = len(self._LinearLayerNames) - 1
        for Index, LayerName in enumerate(self._LinearLayerNames):
            Hidden = getattr(self, LayerName)(Hidden)
            # The final layer stays linear; hidden layers get ReLU + dropout.
            if Index < LastIndex:
                Hidden = self.DropoutLayer(F.relu(Hidden))

        return Hidden

In [30]:
# ============================================================================
# NEURAL MATRIX FACTORIZATION (NeuMF)
# Kombiniert GMF und MLP für leistungsstarke Empfehlungen
# ============================================================================

class NeuralMatrixFactorization(nn.Module):
    """Neural Matrix Factorization (NeuMF): fuses a GMF and an MLP branch.

    The features of both branches are concatenated and passed through one
    linear layer followed by a sigmoid, giving one score in [0, 1] per
    (user, item) pair. The module owns its Adam optimizer and its binary
    cross-entropy loss.

    Attributes:
        NumUsers, NumItems: Sizes of the embedding tables.
        GMFEmbeddingDim, MLPEmbeddingDim: Embedding widths of the branches.
        LearningRate: Learning rate handed to Adam.
        GMFComponent: ``GeneralizedMatrixFactorization`` branch.
        MLPComponent: ``MultiLayerPerceptron`` branch.
        FinalPredictionLayer: Linear layer mapping fused features to a logit.
        Optimizer: ``torch.optim.Adam`` over all parameters (weight decay 1e-5).
        LossFunction: ``nn.BCELoss`` with mean reduction.
        ValidationAccuracyHistory: Per-epoch validation accuracy of the most
            recent ``train_model`` call (set by ``train_model``).
    """

    def __init__(self, NumUsers: int, NumItems: int, GMFEmbeddingDim: int = 64,
                 MLPEmbeddingDim: int = 64, MLPLayers: Optional[List[int]] = None,
                 Dropout: float = 0.2, LearningRate: float = 0.001):
        """Build both branches, the prediction head, optimizer and loss.

        Args:
            NumUsers: Number of distinct users.
            NumItems: Number of distinct items.
            GMFEmbeddingDim: Embedding width of the GMF branch.
            MLPEmbeddingDim: Embedding width of the MLP branch.
            MLPLayers: Layer widths of the MLP branch; ``None`` selects
                ``[128, 64, 32, 16]``.
            Dropout: Dropout probability used inside the MLP branch.
            LearningRate: Learning rate for Adam.
        """
        super().__init__()

        # A fresh list per instance avoids sharing one mutable default
        # object between all models.
        if MLPLayers is None:
            MLPLayers = [128, 64, 32, 16]
        else:
            MLPLayers = list(MLPLayers)

        # Remember the configuration.
        self.NumUsers = NumUsers
        self.NumItems = NumItems
        self.GMFEmbeddingDim = GMFEmbeddingDim
        self.MLPEmbeddingDim = MLPEmbeddingDim
        self.LearningRate = LearningRate

        # GMF branch.
        self.GMFComponent = GeneralizedMatrixFactorization(
            NumUsers=NumUsers,
            NumItems=NumItems,
            EmbeddingDim=GMFEmbeddingDim
        )

        # MLP branch.
        self.MLPComponent = MultiLayerPerceptron(
            NumUsers=NumUsers,
            NumItems=NumItems,
            EmbeddingDim=MLPEmbeddingDim,
            MLPLayers=MLPLayers,
            Dropout=Dropout
        )

        # The head sees the GMF features next to the MLP output features.
        ConcatenatedDim = GMFEmbeddingDim + MLPLayers[-1]

        # One output unit: a single logit per (user, item) pair.
        self.FinalPredictionLayer = nn.Linear(
            in_features=ConcatenatedDim,
            out_features=1,
            bias=True
        )
        nn.init.xavier_uniform_(self.FinalPredictionLayer.weight)
        nn.init.zeros_(self.FinalPredictionLayer.bias)

        # Created last so that self.parameters() already contains everything.
        self.Optimizer = torch.optim.Adam(
            params=self.parameters(),
            lr=LearningRate,
            weight_decay=1e-5  # L2 regularisation
        )

        # forward() returns probabilities, hence plain BCE (not the logits
        # variant).
        self.LossFunction = nn.BCELoss(reduction='mean')

    def forward(self, UserIDs: torch.Tensor, ItemIDs: torch.Tensor) -> torch.Tensor:
        """Score every (user, item) pair.

        Args:
            UserIDs: Integer tensor of user indices, shape [BatchSize].
            ItemIDs: Integer tensor of item indices, shape [BatchSize].

        Returns:
            Tensor of shape [BatchSize, 1] with sigmoid probabilities.
        """
        GMFFeatures = self.GMFComponent(UserIDs, ItemIDs)
        MLPFeatures = self.MLPComponent(UserIDs, ItemIDs)

        # Fuse both branches along the feature axis.
        CombinedFeatures = torch.cat([GMFFeatures, MLPFeatures], dim=1)

        Logits = self.FinalPredictionLayer(CombinedFeatures)
        return torch.sigmoid(Logits)

    @staticmethod
    def _prepare_batch(UserIDs: torch.Tensor, ItemIDs: torch.Tensor,
                       Labels: torch.Tensor, Device: torch.device):
        """Move a batch to ``Device`` and shape labels as float [BatchSize, 1]."""
        return (
            UserIDs.to(Device),
            ItemIDs.to(Device),
            # reshape(-1, 1) matches forward()'s output shape.
            Labels.to(Device).float().reshape(-1, 1),
        )

    def train_step(self,
                   UserIDs: torch.Tensor,
                   ItemIDs: torch.Tensor,
                   Labels: torch.Tensor,
                   Device: torch.device) -> float:
        """Run one optimisation step (forward, backward, optimizer update).

        Args:
            UserIDs: Batch of user IDs [BatchSize].
            ItemIDs: Batch of item IDs [BatchSize].
            Labels: Batch of labels [BatchSize] (1 = interaction, 0 = none).
            Device: Device the batch is moved to before the forward pass.

        Returns:
            The loss value of this batch as a Python float.
        """
        # Enable dropout; validation/recommend may have switched to eval().
        self.train()

        UserIDs, ItemIDs, Labels = self._prepare_batch(UserIDs, ItemIDs, Labels, Device)

        self.Optimizer.zero_grad()

        # Call the module itself (not .forward) so registered hooks run.
        Predictions = self(UserIDs, ItemIDs)
        Loss = self.LossFunction(Predictions, Labels)

        Loss.backward()
        self.Optimizer.step()

        return Loss.item()

    def _evaluate(self, Loader, Device: torch.device) -> Tuple[float, float]:
        """Compute mean per-batch loss and accuracy over ``Loader``.

        Predictions above 0.5 count as the positive class.

        Raises:
            ValueError: If ``Loader`` yields no samples.
        """
        self.eval()

        TotalLoss = 0.0
        BatchCount = 0
        CorrectPredictions = 0
        TotalSamples = 0

        with torch.no_grad():
            for UserIDs, ItemIDs, Labels in Loader:
                UserIDs, ItemIDs, Labels = self._prepare_batch(
                    UserIDs, ItemIDs, Labels, Device
                )
                Predictions = self(UserIDs, ItemIDs)

                TotalLoss += self.LossFunction(Predictions, Labels).item()

                BinaryPredictions = (Predictions > 0.5).float()
                CorrectPredictions += (BinaryPredictions == Labels).sum().item()
                TotalSamples += Labels.size(0)
                BatchCount += 1

        if BatchCount == 0 or TotalSamples == 0:
            raise ValueError("ValidationDataLoader yielded no samples")

        return TotalLoss / BatchCount, CorrectPredictions / TotalSamples

    def train_model(self,
                    TrainDataLoader,
                    NumEpochs: int,
                    Device: torch.device,
                    ValidationDataLoader = None,
                    PrintEverySteps: int = 100) -> Tuple[List[float], List[float]]:
        """Train the model for ``NumEpochs`` epochs.

        Args:
            TrainDataLoader: Iterable of (UserIDs, ItemIDs, Labels) batches.
            NumEpochs: Number of passes over ``TrainDataLoader``.
            Device: Device every batch is moved to.
            ValidationDataLoader: Optional iterable of validation batches,
                evaluated after every epoch.
            PrintEverySteps: Print running statistics every N batches; values
                <= 0 disable the per-batch output.

        Returns:
            ``(TrainingLossHistory, ValidationLossHistory)`` with one entry
            per epoch. The validation list stays empty when no validation
            loader is given. Validation accuracies are additionally stored in
            ``self.ValidationAccuracyHistory``.

        Raises:
            ValueError: If a data loader yields no batches.
        """
        TrainingLossHistory = []
        ValidationLossHistory = []
        ValidationAccuracyHistory = []

        for Epoch in range(NumEpochs):
            EpochLoss = 0.0
            NumBatches = 0

            for BatchIdx, (UserIDs, ItemIDs, Labels) in enumerate(TrainDataLoader):
                BatchLoss = self.train_step(
                    UserIDs=UserIDs,
                    ItemIDs=ItemIDs,
                    Labels=Labels,
                    Device=Device
                )

                EpochLoss += BatchLoss
                NumBatches += 1

                # Guard against PrintEverySteps == 0 (modulo by zero).
                if PrintEverySteps > 0 and (BatchIdx + 1) % PrintEverySteps == 0:
                    AvgLoss = EpochLoss / NumBatches
                    print(f"Epoch [{Epoch+1}/{NumEpochs}] | "
                          f"Batch [{BatchIdx+1}/{len(TrainDataLoader)}] | "
                          f"Loss: {BatchLoss:.4f} | "
                          f"Avg Loss: {AvgLoss:.4f}")

            # An empty loader would otherwise surface as a ZeroDivisionError.
            if NumBatches == 0:
                raise ValueError("TrainDataLoader yielded no batches")

            AvgEpochLoss = EpochLoss / NumBatches
            TrainingLossHistory.append(AvgEpochLoss)

            if ValidationDataLoader is not None:
                AvgValLoss, ValAccuracy = self._evaluate(ValidationDataLoader, Device)

                ValidationLossHistory.append(AvgValLoss)
                ValidationAccuracyHistory.append(ValAccuracy)

                print(f"\nEpoch [{Epoch+1}/{NumEpochs}] Summary:")
                print(f"  Training Loss:     {AvgEpochLoss:.4f}")
                print(f"  Validation Loss:   {AvgValLoss:.4f}")
                print(f"  Validation Acc:    {ValAccuracy:.4f}")
            else:
                print(f"\nEpoch [{Epoch+1}/{NumEpochs}] Summary:")
                print(f"  Training Loss: {AvgEpochLoss:.4f}")

        print("Training completed!")

        # The return type is fixed to two lists, so the accuracies are kept
        # on the instance instead of being dropped.
        self.ValidationAccuracyHistory = ValidationAccuracyHistory

        return TrainingLossHistory, ValidationLossHistory

    def recommend(self,
                  UserID: int,
                  NumItems: int,
                  TopK: int,
                  Device: torch.device) -> List[Tuple[int, float]]:
        """Return the ``TopK`` highest-scoring items for one user.

        Every item ID in ``range(NumItems)`` is scored; items the user has
        already interacted with are not filtered out.

        Args:
            UserID: ID of the user to recommend for.
            NumItems: Number of items to score (IDs 0 .. NumItems - 1).
            TopK: Maximum number of recommendations to return.
            Device: Device the ID tensors are created on.

        Returns:
            List of ``(ItemID, Score)`` tuples sorted by descending score;
            empty when ``NumItems`` or ``TopK`` is not positive.
        """
        self.eval()

        if NumItems <= 0 or TopK <= 0:
            return []

        ItemIDs = torch.arange(NumItems, dtype=torch.long, device=Device)
        UserIDs = torch.full_like(ItemIDs, int(UserID))

        with torch.no_grad():
            # reshape(-1) keeps a 1-D array even for a single item, where a
            # bare squeeze() would produce a 0-d array that cannot be zipped.
            Scores = self(UserIDs, ItemIDs).reshape(-1).cpu().numpy()

        ItemScores = list(zip(range(NumItems), Scores))

        # Stable sort: items with equal scores keep ascending ID order.
        ItemScores.sort(key=lambda Pair: Pair[1], reverse=True)

        return ItemScores[:TopK]

## Anwendungsbeispiel: Fake-Empfehlungssystem

Fake-Daten erstellen

In [31]:
# ============================================================================
# SCHRITT 1: SYNTHETISCHE DATEN GENERIEREN
# Erstellt einen fake Film-Bewertungs-Datensatz
# ============================================================================

# Build a synthetic interaction data set. The global NumPy seed is fixed so
# that every run produces the same data.
np.random.seed(42)

# Configuration
NumUsers = 1000           # number of users
NumItems = 500            # number of items (movies)
NumInteractions = 10000   # number of sampled (user, item) pairs

# Draw random user/item pairs (duplicates are possible and removed below).
UserIDs = np.random.randint(0, NumUsers, size=NumInteractions)
ItemIDs = np.random.randint(0, NumItems, size=NumInteractions)

# Per-item probability of a positive interaction, drawn from a skewed
# Beta(2, 5) distribution so that only some items are "popular".
ItemPopularity = np.random.beta(2, 5, size=NumItems)

# Label is 1 (positive interaction) when a uniform draw falls below the
# item's popularity, else 0. One vectorised draw of NumInteractions uniforms
# consumes the legacy NumPy random stream exactly like one scalar draw per
# row would, so the labels equal those of a per-row loop.
Labels = (np.random.random(NumInteractions) < ItemPopularity[ItemIDs]).astype(int)

# Assemble the interactions into a dataframe.
Data = pd.DataFrame({
    'UserID': UserIDs,
    'ItemID': ItemIDs,
    'Label': Labels
})

# Keep only the first occurrence of every (user, item) pair.
Data = Data.drop_duplicates(subset=['UserID', 'ItemID'])


print(f"Anzahl Users:        {NumUsers:,}")
print(f"Anzahl Items:        {NumItems:,}")
print(f"Anzahl Interaktionen: {len(Data):,}")
print(f"Positive Samples:    {(Data['Label'] == 1).sum():,}")
print(f"Negative Samples:    {(Data['Label'] == 0).sum():,}")
print(f"Positive Rate:       {(Data['Label'] == 1).mean():.2%}")
print("\nErste 10 Zeilen:")
print(Data.head(10))

Anzahl Users:        1,000
Anzahl Items:        500
Anzahl Interaktionen: 9,910
Positive Samples:    2,710
Negative Samples:    7,200
Positive Rate:       27.35%

Erste 10 Zeilen:
   UserID  ItemID  Label
0     102     441      0
1     435     278      0
2     860     250      0
3     270     309      0
4     106     207      1
5      71     320      0
6     700     139      0
7      20     279      0
8     614      33      0
9     121     308      0


In [32]:
# ============================================================================
# SCHRITT 2: DATEN VORBEREITEN UND AUFTEILEN
# Train-Test-Split und DataLoader erstellen
# ============================================================================

# Hold out 20% of the interactions as a test set. Stratifying on the label
# keeps the positive/negative ratio equal in both partitions, and the fixed
# random_state makes the split reproducible.
TrainData, TestData = train_test_split(
    Data, test_size=0.2, random_state=42, stratify=Data['Label']
)

print(f"Training-Samples: {len(TrainData):,}")
print(f"Test-Samples:     {len(TestData):,}")

# PyTorch dataset wrapping the interaction dataframe
class NCFDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(UserID, ItemID, Label)`` triples.

    The three dataframe columns ``'UserID'``, ``'ItemID'`` and ``'Label'``
    are copied into ``torch.long`` tensors once, at construction time.
    """

    def __init__(self, DataFrame):
        """Convert the three required columns of ``DataFrame`` to tensors."""
        def ColumnAsLong(ColumnName):
            # torch.tensor copies the column values into a new int64 tensor.
            return torch.tensor(DataFrame[ColumnName].values, dtype=torch.long)

        self.UserIDs = ColumnAsLong('UserID')
        self.ItemIDs = ColumnAsLong('ItemID')
        self.Labels = ColumnAsLong('Label')

    def __len__(self):
        """Number of samples, i.e. the length of the user-ID tensor."""
        return self.UserIDs.shape[0]

    def __getitem__(self, Index):
        """Return the ``(UserID, ItemID, Label)`` triple at ``Index``."""
        Sample = (self.UserIDs[Index], self.ItemIDs[Index], self.Labels[Index])
        return Sample

# Wrap both splits as tensor-backed datasets
TrainDataset, TestDataset = (NCFDataset(Split) for Split in (TrainData, TestData))

# Mini-batch size shared by both loaders
BatchSize = 256

# Training loader: shuffling is enabled, no worker subprocesses are used
TrainDataLoader = torch.utils.data.DataLoader(
    dataset=TrainDataset, batch_size=BatchSize, shuffle=True, num_workers=0
)

# Test loader: shuffling is disabled so the sample order stays fixed
TestDataLoader = torch.utils.data.DataLoader(
    dataset=TestDataset, batch_size=BatchSize, shuffle=False, num_workers=0
)

# Report batch size and the number of batches per loader
print(
    "\n DataLoader erstellt",
    f"  Batch-Größe:        {BatchSize}",
    f"  Training-Batches:   {len(TrainDataLoader)}",
    f"  Test-Batches:       {len(TestDataLoader)}",
    sep="\n",
)

Training-Samples: 7,928
Test-Samples:     1,982

 DataLoader erstellt
  Batch-Größe:        256
  Training-Batches:   31
  Test-Batches:       8


In [33]:
# ============================================================================
# SCHRITT 3: MODELL INITIALISIEREN
# Erstellt das NeuMF-Modell mit Hyperparametern
# ============================================================================

# Hyperparameters
GMFEmbeddingDim = 64             # embedding width of the GMF branch
MLPEmbeddingDim = 64             # embedding width of the MLP branch
MLPLayers = [128, 64, 32, 16]    # MLP layer sizes
Dropout = 0.2                    # dropout rate
LearningRate = 0.001             # learning rate

# Build the NeuMF model. NOTE(review): the original comment states that the
# optimizer and loss are created automatically from LearningRate — confirm
# against the NeuralMatrixFactorization constructor.
NCFModel = NeuralMatrixFactorization(
    NumUsers=NumUsers,
    NumItems=NumItems,
    GMFEmbeddingDim=GMFEmbeddingDim,
    MLPEmbeddingDim=MLPEmbeddingDim,
    MLPLayers=MLPLayers,
    Dropout=Dropout,
    LearningRate=LearningRate,
)

# Count all parameter elements, and separately those that receive gradients
TotalParams = sum(Weights.numel() for Weights in NCFModel.parameters())
TrainableParams = sum(
    Weights.numel()
    for Weights in NCFModel.parameters()
    if Weights.requires_grad
)


Das braucht man meistens nicht, aber es kann ja nicht schaden.

In [34]:
# Sum the element counts of every parameter tensor owned by Module
def _ParameterCount(Module):
    return sum(Weights.numel() for Weights in Module.parameters())


# Headline configuration and overall parameter totals
print(
    "NEURAL COLLABORATIVE FILTERING MODEL INITIALISIERT",
    f"Number of Users:          {NumUsers:,}",
    f"Number of Items:          {NumItems:,}",
    f"GMF Embedding Dimension:  {GMFEmbeddingDim}",
    f"MLP Embedding Dimension:  {MLPEmbeddingDim}",
    f"MLP Layers:               {MLPLayers}",
    f"Dropout Rate:             {Dropout}",
    f"Learning Rate:            {LearningRate}",
    f"\nTotal Parameters:         {TotalParams:,}",
    f"Trainable Parameters:     {TrainableParams:,}",
    sep="\n",
)

# Per-component parameter breakdown, drawn as a tree
print("\n MODELL-KOMPONENTEN:")
print("  ├─ GMF Component:")
print(f"  │   ├─ GMFUserEmbeddingLayer:  {_ParameterCount(NCFModel.GMFComponent.GMFUserEmbeddingLayer):,} Parameter")
print(f"  │   └─ GMFItemEmbeddingLayer:  {_ParameterCount(NCFModel.GMFComponent.GMFItemEmbeddingLayer):,} Parameter")
print("  │")
print("  ├─ MLP Component:")
print(f"  │   ├─ MLPUserEmbeddingLayer:  {_ParameterCount(NCFModel.MLPComponent.MLPUserEmbeddingLayer):,} Parameter")
print(f"  │   ├─ MLPItemEmbeddingLayer:  {_ParameterCount(NCFModel.MLPComponent.MLPItemEmbeddingLayer):,} Parameter")
print(f"  │   ├─ MLPLinearLayer1:        {_ParameterCount(NCFModel.MLPComponent.MLPLinearLayer1):,} Parameter")
print(f"  │   ├─ MLPLinearLayer2:        {_ParameterCount(NCFModel.MLPComponent.MLPLinearLayer2):,} Parameter")
print(f"  │   ├─ MLPLinearLayer3:        {_ParameterCount(NCFModel.MLPComponent.MLPLinearLayer3):,} Parameter")
print(f"  │   └─ MLPLinearLayer4:        {_ParameterCount(NCFModel.MLPComponent.MLPLinearLayer4):,} Parameter")
print("  │")
print(f"  └─ FinalPredictionLayer:       {_ParameterCount(NCFModel.FinalPredictionLayer):,} Parameter")


NEURAL COLLABORATIVE FILTERING MODEL INITIALISIERT
Number of Users:          1,000
Number of Items:          500
GMF Embedding Dimension:  64
MLP Embedding Dimension:  64
MLP Layers:               [128, 64, 32, 16]
Dropout Rate:             0.2
Learning Rate:            0.001

Total Parameters:         219,457
Trainable Parameters:     219,457

 MODELL-KOMPONENTEN:
  ├─ GMF Component:
  │   ├─ GMFUserEmbeddingLayer:  64,000 Parameter
  │   └─ GMFItemEmbeddingLayer:  32,000 Parameter
  │
  ├─ MLP Component:
  │   ├─ MLPUserEmbeddingLayer:  64,000 Parameter
  │   ├─ MLPItemEmbeddingLayer:  32,000 Parameter
  │   ├─ MLPLinearLayer1:        16,512 Parameter
  │   ├─ MLPLinearLayer2:        8,256 Parameter
  │   ├─ MLPLinearLayer3:        2,080 Parameter
  │   └─ MLPLinearLayer4:        528 Parameter
  │
  └─ FinalPredictionLayer:       81 Parameter


In [35]:
# ============================================================================
# SCHRITT 4: TRAINING AUSFÜHREN
# Trainiert das NCF-Modell mit train_model Methode
# ============================================================================

# Run training for five epochs on the training loader. The test loader is
# passed in as the validation loader, so the validation figures printed during
# training are computed on the test split. The two returned values are bound
# to the loss-history names below (presumably one entry per epoch — confirm
# against train_model).
TrainingLossHistory, ValidationLossHistory = NCFModel.train_model(
    TrainDataLoader=TrainDataLoader,
    ValidationDataLoader=TestDataLoader,
    NumEpochs=5,
    PrintEverySteps=10,
    Device=Device,
)

Epoch [1/5] | Batch [10/31] | Loss: 0.6762 | Avg Loss: 0.6863
Epoch [1/5] | Batch [20/31] | Loss: 0.6585 | Avg Loss: 0.6749
Epoch [1/5] | Batch [30/31] | Loss: 0.5914 | Avg Loss: 0.6578

Epoch [1/5] Summary:
  Training Loss:     0.6555
  Validation Loss:   0.5807
  Validation Acc:    0.7265
Epoch [2/5] | Batch [10/31] | Loss: 0.5613 | Avg Loss: 0.5613
Epoch [2/5] | Batch [20/31] | Loss: 0.5585 | Avg Loss: 0.5481
Epoch [2/5] | Batch [30/31] | Loss: 0.5162 | Avg Loss: 0.5451

Epoch [2/5] Summary:
  Training Loss:     0.5465
  Validation Loss:   0.5517
  Validation Acc:    0.7265
Epoch [3/5] | Batch [10/31] | Loss: 0.4936 | Avg Loss: 0.4952
Epoch [3/5] | Batch [20/31] | Loss: 0.4906 | Avg Loss: 0.4871
Epoch [3/5] | Batch [30/31] | Loss: 0.4418 | Avg Loss: 0.4920

Epoch [3/5] Summary:
  Training Loss:     0.4911
  Validation Loss:   0.5858
  Validation Acc:    0.7265
Epoch [2/5] | Batch [10/31] | Loss: 0.5613 | Avg Loss: 0.5613
Epoch [2/5] | Batch [20/31] | Loss: 0.5585 | Avg Loss: 0.5481


In [36]:
# ============================================================================
# SCHRITT 5: EMPFEHLUNGEN GENERIEREN FÜR BEISPIELBENUTZER
# Demonstration der Top-10 Empfehlungen mit recommend Methode
# ============================================================================


# Five hand-picked example user IDs
BeispielBenutzer = [0, 50, 100, 500, 999]

for BenutzerID in BeispielBenutzer:
    print(f"Top-10 Empfehlungen für Benutzer {BenutzerID}:")

    # Ask the model for this user's ten best items
    Empfehlungen = NCFModel.recommend(
        UserID=BenutzerID, NumItems=NumItems, TopK=10, Device=Device
    )

    # Print the ranked list with its scores, numbering ranks from 1
    for Rang, (ItemID, Score) in enumerate(Empfehlungen, 1):
        print(f"   {Rang:2d}. Item {ItemID:3d} → Konfidenz: {Score:.4f}")

Top-10 Empfehlungen für Benutzer 0:
    1. Item   7 → Konfidenz: 0.6506
    2. Item 322 → Konfidenz: 0.6398
    3. Item 325 → Konfidenz: 0.6383
    4. Item 225 → Konfidenz: 0.6367
    5. Item 335 → Konfidenz: 0.6358
    6. Item  25 → Konfidenz: 0.6285
    7. Item 326 → Konfidenz: 0.6283
    8. Item 129 → Konfidenz: 0.6249
    9. Item 356 → Konfidenz: 0.6235
   10. Item 479 → Konfidenz: 0.6195
Top-10 Empfehlungen für Benutzer 50:
    1. Item   7 → Konfidenz: 0.6710
    2. Item 325 → Konfidenz: 0.6637
    3. Item 225 → Konfidenz: 0.6610
    4. Item 322 → Konfidenz: 0.6588
    5. Item 335 → Konfidenz: 0.6577
    6. Item 326 → Konfidenz: 0.6538
    7. Item  25 → Konfidenz: 0.6517
    8. Item 129 → Konfidenz: 0.6487
    9. Item 479 → Konfidenz: 0.6442
   10. Item 356 → Konfidenz: 0.6408
Top-10 Empfehlungen für Benutzer 100:
    1. Item   7 → Konfidenz: 0.6526
    2. Item 322 → Konfidenz: 0.6407
    3. Item 325 → Konfidenz: 0.6405
    4. Item 225 → Konfidenz: 0.6394
    5. Item 335 → Konfide