# Partner Recommender Training Pipeline

This notebook demonstrates how to train the HugeCTR two-tower model for startup partner matching.

## Architecture Overview

The model uses a two-tower architecture:
- **Company A Tower**: Processes features for the query company
- **Company B Tower**: Processes features for candidate partners
- **Similarity Calculation**: Dot product + sigmoid for match probability

## Features
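- Dense: user overlap score (one value per company pair), plus funding amount, employee count, growth rate, and market sentiment for each of the two companies
- Sparse: company_id, industry, stage, technologies (categorical embeddings); the synthetic data in this notebook only includes the two company IDs
- Culture Vector: 128-dimensional embedding per company from NLP analysis (sampled from a random distribution in this notebook's synthetic data)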

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import cudf
import hugectr
from hugectr import Session, solver_parser_helper, get_learning_rate_scheduler
import nvtabular as nvt
from nvtabular.ops import *
from merlin.core.utils import Distributed
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json

# Filesystem layout: everything lives under BASE_DIR
BASE_DIR = "/app"
DATA_DIR = BASE_DIR + "/data"
MODEL_DIR = BASE_DIR + "/models"
TRAINING_DIR = DATA_DIR + "/training"

# Make sure every output directory exists before anything is written to it
for _directory in (DATA_DIR, MODEL_DIR, TRAINING_DIR):
    os.makedirs(_directory, exist_ok=True)

# Record when the run started and which GPUs the process is allowed to see
print(f"Training started at: {datetime.now()}")
print(f"CUDA devices available: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")

## 1. Data Preparation

Generate synthetic training data for partner matching.

In [None]:
def generate_synthetic_data(num_samples=100000, num_companies=1000, culture_dim=128, seed=42):
    """
    Generate synthetic training data for partner matching.

    Each row describes one (company_a, company_b) pair: four dense features
    per company, a pair-level overlap score, one culture vector per company
    and a binary match label derived from a noisy weighted similarity score.

    Args:
        num_samples: Number of company pairs to generate. Zero (or a negative
            value) yields an empty frame that still carries every column.
        num_companies: Size of the company ID pool ("company_0" ...). Must be
            at least 2 because each pair is drawn without replacement.
        culture_dim: Length of each culture vector. Must be at least 1.
        seed: Value passed to ``np.random.seed`` before sampling. Note that
            this reseeds the global NumPy RNG. Pass ``None`` to leave the
            global RNG state untouched.

    Returns:
        pandas.DataFrame with the columns company_a, company_b, funding_a,
        employees_a, growth_a, sentiment_a, funding_b, employees_b, growth_b,
        sentiment_b, overlap_score, label, followed by the interleaved
        culture_a_<i> / culture_b_<i> columns.

    Raises:
        ValueError: If ``num_companies`` < 2 or ``culture_dim`` < 1.
    """
    if num_companies < 2:
        raise ValueError("num_companies must be at least 2 to draw two different companies")
    if culture_dim < 1:
        raise ValueError("culture_dim must be at least 1")

    if seed is not None:
        np.random.seed(seed)

    # Company IDs. Built as an array once so np.random.choice does not have
    # to convert a Python list on every iteration.
    company_ids = np.array([f"company_{i}" for i in range(num_companies)])

    # Column names are fixed up front so the frame has a stable schema even
    # when no rows are generated.
    base_columns = [
        'company_a', 'company_b',
        'funding_a', 'employees_a', 'growth_a', 'sentiment_a',
        'funding_b', 'employees_b', 'growth_b', 'sentiment_b',
        'overlap_score', 'label',
    ]
    culture_a_cols = [f'culture_a_{i}' for i in range(culture_dim)]
    culture_b_cols = [f'culture_b_{i}' for i in range(culture_dim)]
    interleaved_culture_cols = [
        name for pair in zip(culture_a_cols, culture_b_cols) for name in pair
    ]
    all_columns = base_columns + interleaved_culture_cols

    data = []

    # NOTE: the order of the random draws below is kept fixed so that a given
    # seed always reproduces the same dataset.
    for _ in range(num_samples):
        # Select two different companies
        company_a, company_b = np.random.choice(company_ids, 2, replace=False)

        # Generate features for company A
        funding_a = np.random.lognormal(15, 2)  # Log-normal distribution for funding
        employees_a = np.random.randint(10, 1000)
        growth_a = np.random.normal(25, 15)  # Growth rate percentage
        sentiment_a = np.random.normal(0, 0.3)  # Market sentiment
        overlap_score = np.random.beta(2, 5)  # User overlap score

        # Generate features for company B (with some correlation to A)
        funding_b = funding_a * np.random.lognormal(0, 0.5)
        employees_b = employees_a * np.random.lognormal(0, 0.3)
        growth_b = growth_a + np.random.normal(0, 10)
        sentiment_b = sentiment_a + np.random.normal(0, 0.2)

        # Generate culture vectors; B is A plus noise, so the two are correlated
        culture_a = np.random.normal(0, 1, culture_dim)
        culture_b = culture_a + np.random.normal(0, 0.5, culture_dim)

        # Calculate similarity features
        funding_similarity = 1 - abs(np.log(funding_a) - np.log(funding_b)) / 10
        size_similarity = 1 - abs(np.log(employees_a) - np.log(employees_b)) / 5
        growth_similarity = 1 - abs(growth_a - growth_b) / 50
        sentiment_similarity = 1 - abs(sentiment_a - sentiment_b)
        culture_similarity = np.dot(culture_a, culture_b) / (
            np.linalg.norm(culture_a) * np.linalg.norm(culture_b)
        )

        # Weighted blend of the similarities; cosine similarity is mapped
        # from [-1, 1] to [0, 1] before weighting.
        match_probability = (
            funding_similarity * 0.2 +
            size_similarity * 0.15 +
            growth_similarity * 0.15 +
            sentiment_similarity * 0.2 +
            (culture_similarity + 1) / 2 * 0.3
        )

        # Add noise and threshold
        match_probability += np.random.normal(0, 0.1)
        match_probability = np.clip(match_probability, 0, 1)

        label = 1 if match_probability > 0.6 else 0

        # Create record
        record = {
            'company_a': company_a,
            'company_b': company_b,
            'funding_a': funding_a,
            'employees_a': employees_a,
            'growth_a': growth_a,
            'sentiment_a': sentiment_a,
            'funding_b': funding_b,
            'employees_b': employees_b,
            'growth_b': growth_b,
            'sentiment_b': sentiment_b,
            'overlap_score': overlap_score,
            'label': label
        }

        # Add culture vectors (final column order is enforced by all_columns)
        record.update(zip(culture_a_cols, culture_a))
        record.update(zip(culture_b_cols, culture_b))

        data.append(record)

    return pd.DataFrame(data, columns=all_columns)

# Build the synthetic dataset and report its class balance
print("Generating synthetic training data...")
df = generate_synthetic_data(num_samples=100000)

n_generated = len(df)
n_positive = df['label'].sum()
positive_rate = df['label'].mean()

print(f"Generated {n_generated} training samples")
print(f"Positive samples: {n_positive} ({positive_rate:.2%})")
print(f"Negative samples: {n_generated - n_positive} ({1 - positive_rate:.2%})")

# Last expression of the cell: shows the first rows in the notebook output
df.head()

## 2. Data Preprocessing with NVTabular

Use NVTabular for GPU-accelerated data preprocessing.

In [None]:
# Move the pandas frame onto the GPU
gdf = cudf.from_pandas(df)

# Column groups, one per preprocessing treatment
categorical_cols = ['company_a', 'company_b']
continuous_cols = [
    f'{feature}_{side}'
    for side in ('a', 'b')
    for feature in ('funding', 'employees', 'growth', 'sentiment')
] + ['overlap_score']

# One 128-wide culture vector per company, flattened into scalar columns
culture_cols_a, culture_cols_b = (
    [f'culture_{side}_{i}' for i in range(128)] for side in ('a', 'b')
)
culture_cols = [*culture_cols_a, *culture_cols_b]

label_col = ['label']

print(f"Categorical features: {len(categorical_cols)}")
print(f"Continuous features: {len(continuous_cols)}")
print(f"Culture features: {len(culture_cols)}")

# NVTabular pipelines: IDs are categorified, every numeric column is
# imputed then normalized, and the label is cast to float32.
categorical_features = categorical_cols >> Categorify()
continuous_features = continuous_cols >> FillMissing() >> Normalize()
culture_features = culture_cols >> FillMissing() >> Normalize()
label_features = label_col >> LambdaOp(lambda col: col.astype('float32'))

workflow_ops = (
    categorical_features
    + continuous_features
    + culture_features
    + label_features
)
workflow = nvt.Workflow(workflow_ops)

# Wrap the GPU frame so the workflow can consume it
dataset = nvt.Dataset(gdf)

# NOTE(review): the workflow is fitted on the complete dataset, i.e. before
# any train/validation split, so the fitted statistics also cover the rows
# that later end up in the validation set.
print("Fitting NVTabular workflow...")
workflow.fit(dataset)

# Apply the fitted workflow and materialize the result as a single frame
print("Transforming data...")
transformed_dataset = workflow.transform(dataset)
transformed_gdf = transformed_dataset.to_ddf().compute()

print(f"Transformed data shape: {transformed_gdf.shape}")
print("Transformation completed!")

## 3. Train/Validation Split and Data Export

In [None]:
# Positional 80/20 split: the first 80% of rows train, the remainder validates
train_size = int(0.8 * len(transformed_gdf))
train_gdf, val_gdf = (
    transformed_gdf.iloc[:train_size],
    transformed_gdf.iloc[train_size:],
)

print(f"Training samples: {len(train_gdf)}")
print(f"Validation samples: {len(val_gdf)}")

# Persist both splits as parquet files under TRAINING_DIR
train_path = f"{TRAINING_DIR}/train_data.parquet"
val_path = f"{TRAINING_DIR}/val_data.parquet"

for _split_frame, _split_path in ((train_gdf, train_path), (val_gdf, val_path)):
    _split_frame.to_parquet(_split_path)

print(f"Training data saved to: {train_path}")
print(f"Validation data saved to: {val_path}")

# Persist the fitted preprocessing workflow next to the model artifacts
workflow_path = f"{MODEL_DIR}/nvt_workflow"
workflow.save(workflow_path)
print(f"NVTabular workflow saved to: {workflow_path}")

## 4. HugeCTR Model Configuration

Define the two-tower model architecture.

In [None]:
def _sparse_embedding_layer(side):
    """
    Build the sparse-embedding layer definition for one company slot.

    Args:
        side: Tower suffix ("a" or "b"); selects the "company_<side>_cat"
            input and names the layer "sparse_embedding_<side>".

    Returns:
        Layer dict for a DistributedSlotSparseEmbeddingHash layer.
    """
    return {
        "name": f"sparse_embedding_{side}",
        "type": "DistributedSlotSparseEmbeddingHash",
        "bottom": f"company_{side}_cat",
        "top": f"sparse_embedding_{side}",
        "sparse_embedding_hparam": {
            "vocabulary_size": 10000,
            "embedding_vec_size": 128,
            "combiner": "sum"
        }
    }


def _tower_layers(side):
    """
    Build the layer definitions for one tower of the two-tower model.

    Args:
        side: Tower suffix ("a" or "b") appended to every layer name.

    Returns:
        List of layer dicts, in order: Reshape of the sparse embedding,
        Concat with the shared "dense" input, InnerProduct(512) + ReLU +
        Dropout(0.5), InnerProduct(256) + ReLU, InnerProduct(128) named
        "tower_<side>".
    """
    return [
        {
            "name": f"reshape_{side}",
            "type": "Reshape",
            "bottom": f"sparse_embedding_{side}",
            "top": f"reshape_{side}",
            "leading_dim": 128
        },
        {
            "name": f"concat_{side}",
            "type": "Concat",
            "bottom": ["dense", f"reshape_{side}"],
            "top": f"concat_{side}"
        },
        {
            "name": f"fc1_{side}",
            "type": "InnerProduct",
            "bottom": f"concat_{side}",
            "top": f"fc1_{side}",
            "fc_param": {"num_output": 512}
        },
        {
            "name": f"relu1_{side}",
            "type": "ReLU",
            "bottom": f"fc1_{side}",
            "top": f"relu1_{side}"
        },
        {
            "name": f"dropout1_{side}",
            "type": "Dropout",
            "rate": 0.5,
            "bottom": f"relu1_{side}",
            "top": f"dropout1_{side}"
        },
        {
            "name": f"fc2_{side}",
            "type": "InnerProduct",
            "bottom": f"dropout1_{side}",
            "top": f"fc2_{side}",
            "fc_param": {"num_output": 256}
        },
        {
            "name": f"relu2_{side}",
            "type": "ReLU",
            "bottom": f"fc2_{side}",
            "top": f"relu2_{side}"
        },
        {
            "name": f"tower_{side}",
            "type": "InnerProduct",
            "bottom": f"relu2_{side}",
            "top": f"tower_{side}",
            "fc_param": {"num_output": 128}
        },
    ]


def create_hugectr_config(train_source=None, eval_source=None, model_dir=None, dense_dim=None):
    """
    Create HugeCTR configuration for two-tower model.

    Every argument is optional; when omitted, the value is taken from the
    notebook-level variable the original implementation read directly, so a
    bare ``create_hugectr_config()`` call behaves as before.

    Args:
        train_source: Path of the training data. Defaults to ``train_path``.
        eval_source: Path of the evaluation data. Defaults to ``val_path``.
        model_dir: Directory used for the snapshot prefix. Defaults to
            ``MODEL_DIR``.
        dense_dim: Width of the dense input. Defaults to
            ``len(continuous_cols) + len(culture_cols)``.

    Returns:
        dict with the keys "solver", "optimizer" and "layers". "layers" lists
        the data layer, the two sparse embeddings, tower A, tower B, the
        dot product, the sigmoid and the loss, in that order.
    """
    # Fall back to the notebook globals only for values the caller left out
    if train_source is None:
        train_source = train_path
    if eval_source is None:
        eval_source = val_path
    if model_dir is None:
        model_dir = MODEL_DIR
    if dense_dim is None:
        dense_dim = len(continuous_cols) + len(culture_cols)

    sides = ("a", "b")

    # Data layer: one label, one dense block, one single-slot sparse input
    # per company
    data_layer = {
        "name": "data",
        "type": "Data",
        "source": train_source,
        "eval_source": eval_source,
        "check": "None",
        "label": {
            "top": "label",
            "label_dim": 1
        },
        "dense": {
            "top": "dense",
            "dense_dim": dense_dim
        },
        "sparse": [
            {
                "top": f"company_{side}_cat",
                "type": "DistributedSlot",
                "max_feature_num_per_sample": 1,
                "slot_num": 1
            }
            for side in sides
        ]
    }

    layers = [data_layer]

    # Sparse embeddings (both are listed before either tower)
    layers.extend(_sparse_embedding_layer(side) for side in sides)

    # Company A tower, then Company B tower (identical structure)
    for side in sides:
        layers.extend(_tower_layers(side))

    # Similarity calculation and loss
    layers.extend([
        {
            "name": "dot_product",
            "type": "DotProduct",
            "bottom": ["tower_a", "tower_b"],
            "top": "dot_product"
        },
        # NOTE(review): HugeCTR's layer documentation describes
        # BinaryCrossEntropyLoss as fusing the final sigmoid; confirm whether
        # this explicit Sigmoid layer results in the sigmoid being applied
        # twice. Left unchanged here.
        {
            "name": "sigmoid",
            "type": "Sigmoid",
            "bottom": "dot_product",
            "top": "sigmoid"
        },
        {
            "name": "loss",
            "type": "BinaryCrossEntropyLoss",
            "bottom": ["sigmoid", "label"],
            "top": "loss"
        }
    ])

    config = {
        "solver": {
            "lr_policy": "fixed",
            "display": 1000,
            "max_iter": 20000,
            "snapshot": 10000,
            "snapshot_prefix": f"{model_dir}/partner_recommender",
            "eval_interval": 1000,
            "eval_batches": 100,
            "mixed_precision": 1024,
            "batchsize": 1024,
            "batchsize_eval": 1024,
            "lr": 0.001,
            "warmup_steps": 1000,
            "decay_start": 15000,
            "decay_steps": 5000,
            "decay_power": 2.0,
            "end_lr": 0.0
        },
        "optimizer": {
            "type": "Adam",
            "adam_hparam": {
                "learning_rate": 0.001,
                "beta1": 0.9,
                "beta2": 0.999,
                "epsilon": 0.0000001
            }
        },
        "layers": layers
    }

    return config

# Build the model configuration and write it next to the other model artifacts
config = create_hugectr_config()
config_path = f"{MODEL_DIR}/partner_recommender.json"

with open(config_path, 'w') as config_file:
    config_file.write(json.dumps(config, indent=2))

print(f"HugeCTR configuration saved to: {config_path}")

# Short human-readable recap of the architecture that was just written
dense_feature_count = len(continuous_cols) + len(culture_cols)
print("Model architecture:")
print(f"- Input features: {dense_feature_count} dense + 2 sparse")
for _summary_line in (
    "- Tower architecture: 512 -> 256 -> 128",
    "- Output: Dot product + Sigmoid",
    "- Loss: Binary Cross Entropy",
):
    print(_summary_line)

## 5. Model Training

Train the HugeCTR two-tower model.

In [None]:
# Instantiate the model from the configuration built earlier
# NOTE(review): confirm this constructor signature against the installed
# HugeCTR release.
print("Creating HugeCTR model...")
model = hugectr.Model(config, hugectr.Check_t.Sum)

# Compile, then print the layer summary
print("Compiling model...")
model.compile()

print("Model summary:")
model.summary()

# Training schedule: iteration budget plus logging / evaluation / snapshot
# intervals, all expressed in iterations
fit_schedule = {
    "max_iter": 20000,
    "display": 1000,
    "eval_interval": 1000,
    "snapshot": 10000,
}

# Run training and time it with wall-clock timestamps
print("Starting training...")
start_time = datetime.now()
model.fit(**fit_schedule)
end_time = datetime.now()

training_duration = end_time - start_time

print(f"Training completed in: {training_duration}")
print(f"Model saved to: {MODEL_DIR}/partner_recommender_dense_*.model")

## 6. Model Evaluation

Evaluate the trained model performance.

In [None]:
# Load model for inference
print("Loading model for evaluation...")

inference_session = hugectr.inference.CreateInferenceSession(
    model_config_path=config_path,
    model_weights_path=f"{MODEL_DIR}/partner_recommender_dense_20000.model",
    device_id=0,
    cache_size_percentage=0.5,
    i64_input_key=True
)

print("Model loaded successfully for inference!")

# Evaluate on a sample of the validation set. The sample size is capped at
# the number of available rows and the draw is seeded so that repeated runs
# report metrics on the same rows.
eval_sample_size = min(1000, len(val_gdf))
val_sample = val_gdf.sample(n=eval_sample_size, random_state=42).to_pandas()

# Column order of the dense input: continuous features, then culture vectors
dense_input_cols = continuous_cols + culture_cols

predictions = []
true_labels = []
failed_rows = 0

for idx, row in val_sample.iterrows():
    try:
        # Prepare features for inference (batch of one row)
        dense_features = np.array([
            row[dense_input_cols].values
        ], dtype=np.float32)

        sparse_features = np.array([
            [row['company_a'], row['company_b']]
        ], dtype=np.int64)

        # Make prediction
        pred = inference_session.predict(
            dense_features,
            [sparse_features[:, 0:1], sparse_features[:, 1:2]]
        )

        score = pred[0][0]
        label = row['label']

    except Exception as e:
        # Best effort: report the row and keep going
        failed_rows += 1
        print(f"Prediction error for row {idx}: {e}")
        continue

    # Append both values only after the whole row succeeded, so the two
    # lists can never get out of step with each other.
    predictions.append(score)
    true_labels.append(label)

if failed_rows:
    print(f"Skipped {failed_rows} of {len(val_sample)} rows due to prediction errors")

if not predictions:
    # Fail with a clear message instead of an obscure error from the metric
    # functions below.
    raise RuntimeError("No predictions succeeded; cannot compute evaluation metrics")

# Calculate metrics
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

predictions = np.array(predictions)
true_labels = np.array(true_labels)
# Scores above 0.5 count as a predicted match
binary_predictions = (predictions > 0.5).astype(int)

# Cast to plain floats so the dict serializes to JSON regardless of the
# numeric type each metric function returns.
metrics = {
    'auc': float(roc_auc_score(true_labels, predictions)),
    'accuracy': float(accuracy_score(true_labels, binary_predictions)),
    'precision': float(precision_score(true_labels, binary_predictions)),
    'recall': float(recall_score(true_labels, binary_predictions)),
    'f1_score': float(f1_score(true_labels, binary_predictions))
}

print("\nModel Performance Metrics:")
print(f"AUC: {metrics['auc']:.4f}")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"F1 Score: {metrics['f1_score']:.4f}")

# Save metrics
metrics_path = f"{MODEL_DIR}/metrics.json"
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=2)

print(f"\nMetrics saved to: {metrics_path}")

## 7. Visualization

Create visualizations of model performance.

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix

# Compute everything the three panels need before drawing
negative_scores = predictions[true_labels == 0]
positive_scores = predictions[true_labels == 1]
fpr, tpr, _ = roc_curve(true_labels, predictions)
cm = confusion_matrix(true_labels, binary_predictions)

# One wide figure with three side-by-side panels
plt.figure(figsize=(15, 5))

# Panel 1: score histograms, split by the true label
plt.subplot(1, 3, 1)
plt.hist(negative_scores, bins=50, alpha=0.7, label='Negative', color='red')
plt.hist(positive_scores, bins=50, alpha=0.7, label='Positive', color='blue')
plt.title('Prediction Distribution by True Label')
plt.xlabel('Prediction Score')
plt.ylabel('Frequency')
plt.legend()

# Panel 2: ROC curve with the diagonal as the random baseline
plt.subplot(1, 3, 2)
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {metrics["auc"]:.3f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()

# Panel 3: confusion matrix of the thresholded predictions
plt.subplot(1, 3, 3)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Write the figure to disk, then display it in the notebook
plt.tight_layout()
plt.savefig(f"{MODEL_DIR}/model_evaluation.png", dpi=300, bbox_inches='tight')
plt.show()

print(f"Evaluation plots saved to: {MODEL_DIR}/model_evaluation.png")

## 8. Training Summary

Save training summary and metadata.

In [None]:
# Assemble the run summary section by section

# Static description of the model
summary_model_info = {
    "architecture": "Two-tower HugeCTR",
    "framework": "NVIDIA HugeCTR",
    "version": "23.08",
    "embedding_dim": 128
}

# Dataset sizes, class balance and feature counts
summary_training_data = {
    "total_samples": len(df),
    "training_samples": len(train_gdf),
    "validation_samples": len(val_gdf),
    "positive_ratio": float(df['label'].mean()),
    "features": {
        "categorical": len(categorical_cols),
        "continuous": len(continuous_cols),
        "culture_vector": len(culture_cols)
    }
}

# Hyper-parameters, recorded as literals
summary_training_config = {
    "batch_size": 1024,
    "learning_rate": 0.001,
    "max_iterations": 20000,
    "optimizer": "Adam",
    "loss_function": "Binary Cross Entropy"
}

# Wall-clock timing captured around the training call
summary_training_time = {
    "start_time": start_time.isoformat(),
    "end_time": end_time.isoformat(),
    "duration_seconds": training_duration.total_seconds()
}

# Locations of the artifacts produced by this run
summary_model_files = {
    "config": config_path,
    "weights": f"{MODEL_DIR}/partner_recommender_dense_20000.model",
    "workflow": workflow_path,
    "metrics": metrics_path
}

training_summary = {
    "model_info": summary_model_info,
    "training_data": summary_training_data,
    "training_config": summary_training_config,
    "performance_metrics": metrics,
    "training_time": summary_training_time,
    "model_files": summary_model_files
}

# Write the summary; default=str stringifies any value json cannot encode
summary_path = f"{MODEL_DIR}/training_summary.json"
with open(summary_path, 'w') as f:
    json.dump(training_summary, f, indent=2, default=str)

# Closing banner
banner = "=" * 60
print("\n" + banner)
print("TRAINING COMPLETED SUCCESSFULLY!")
print(banner)
print(f"Model files saved in: {MODEL_DIR}")
print(f"Training summary: {summary_path}")
print(f"Final AUC: {metrics['auc']:.4f}")
print(f"Training duration: {training_duration}")
print("\nModel is ready for inference!")
print(banner)