# MLP Classifier - Complete Implementation

Complete MLP classifier implementation including model class definition, training, evaluation, and saving.


In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path().absolute().parent))

import numpy as np
import json
import pickle
from typing import Dict, List, Optional, Tuple
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Model Class Definition


In [None]:
class PatentNoveltyClassifier:
    """
    Binary MLP classifier: a ``StandardScaler`` followed by scikit-learn's
    ``MLPClassifier``.

    - Hidden layer(s) sized by ``hidden_layer_sizes`` (one layer of 64 units
      by default), using MLPClassifier's default ReLU activation
    - Logistic (sigmoid) output and cross-entropy loss for binary targets
    - Optimised with MLPClassifier's default solver. No ``solver`` argument
      is passed, so this is scikit-learn's default (``'adam'``, a minibatch
      stochastic gradient-based optimiser), not plain SGD.

    Parameters
    ----------
    hidden_layer_sizes : tuple of int, default=(64,)
        Number of hidden units in each layer
    alpha : float, default=1e-5
        L2 regularization parameter
    learning_rate_init : float, default=0.005
        Initial learning rate
    max_iter : int, default=500
        Maximum number of iterations
    early_stopping : bool, default=True
        Whether to use early stopping
    n_iter_no_change : int, default=20
        Number of iterations with no improvement before stopping
    random_state : int, default=42
        Random seed

    Attributes
    ----------
    model : MLPClassifier
        The underlying estimator.
    scaler : StandardScaler
        Fitted on the training features in ``fit`` and re-applied to every
        input of ``predict`` / ``predict_proba``.
    feature_names : list of str or None
        Names passed to ``fit``; used by ``get_feature_importance``.
    training_history : dict or None
        Filled by ``fit`` with ``'loss'`` and ``'n_iter'`` (plus
        ``'val_score'`` when validation data is supplied).
    """

    def __init__(
        self,
        hidden_layer_sizes=(64,),
        alpha=1e-5,
        learning_rate_init=0.005,
        max_iter=500,
        early_stopping=True,
        n_iter_no_change=20,
        random_state=42
    ):
        # The hyperparameters are kept on the wrapper as well so that save()
        # can write them to metadata.json without inspecting the estimator.
        self.hidden_layer_sizes = hidden_layer_sizes
        self.alpha = alpha
        self.learning_rate_init = learning_rate_init
        self.max_iter = max_iter
        self.early_stopping = early_stopping
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

        self.model = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            alpha=alpha,
            learning_rate_init=learning_rate_init,
            max_iter=max_iter,
            early_stopping=early_stopping,
            n_iter_no_change=n_iter_no_change,
            random_state=random_state,
            verbose=False
        )

        self.scaler = StandardScaler()
        self.feature_names = None
        self.training_history = None

    def fit(self, X_train, y_train, X_val=None, y_val=None, feature_names=None):
        """
        Fit the scaler and the MLP on the training data.

        Parameters
        ----------
        X_train, y_train : array-like
            Training features and labels.
        X_val, y_val : array-like, optional
            Held-out validation data. It is NOT used for training or for
            early stopping (MLPClassifier carves its own validation split out
            of the training data when ``early_stopping=True``). When both are
            given, the model's accuracy on them is stored under
            ``training_history['val_score']``.
        feature_names : list of str, optional
            Column names of ``X_train``.

        Returns
        -------
        self : PatentNoveltyClassifier
            The fitted instance, following the scikit-learn convention.
        """
        self.feature_names = feature_names

        # The scaler statistics come from the training split only.
        X_train_scaled = self.scaler.fit_transform(X_train)
        self.model.fit(X_train_scaled, y_train)

        history = {}
        if hasattr(self.model, 'loss_curve_'):
            history['loss'] = self.model.loss_curve_
            history['n_iter'] = self.model.n_iter_

        if X_val is not None and y_val is not None:
            X_val_scaled = self.scaler.transform(X_val)
            history['val_score'] = float(self.model.score(X_val_scaled, y_val))

        # Leave any previous history untouched if nothing was recorded.
        if history:
            self.training_history = history

        return self

    def predict(self, X):
        """Predict class for each row in X"""
        return self.model.predict(self.scaler.transform(X))

    def predict_proba(self, X):
        """Predict probabilities for each row in X for each class"""
        return self.model.predict_proba(self.scaler.transform(X))

    def evaluate(self, X, y):
        """
        Compute classification metrics on ``(X, y)``.

        Returns
        -------
        dict
            Keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc``
            and ``brier_score``. ``roc_auc`` is reported as 0.0 when ``y``
            contains a single class, because ROC-AUC is undefined there.
        """
        y_pred = self.predict(X)
        # Column 1 is the probability of the second class in model.classes_.
        y_proba = self.predict_proba(X)[:, 1]

        has_both_classes = len(np.unique(y)) > 1
        metrics = {
            'accuracy': accuracy_score(y, y_pred),
            'precision': precision_score(y, y_pred, zero_division=0),
            'recall': recall_score(y, y_pred, zero_division=0),
            'f1': f1_score(y, y_pred, zero_division=0),
            'roc_auc': roc_auc_score(y, y_proba) if has_both_classes else 0.0
        }

        # Brier score: mean squared error between probability and label.
        metrics['brier_score'] = np.mean((y_proba - np.asarray(y)) ** 2)

        return metrics

    def get_feature_importance(self, n_samples=1000):
        """
        Weight-magnitude heuristic for feature importance.

        Each feature's score is the mean absolute value of its outgoing
        first-layer weights, normalised so the scores sum to 1 (when the
        total is positive).

        Parameters
        ----------
        n_samples : int, default=1000
            Currently unused; kept so existing callers keep working.

        Returns
        -------
        dict
            ``{feature_name: importance}``. Empty when no feature names were
            given to ``fit`` or the model has no ``coefs_`` yet. Names beyond
            the number of input weights map to 0.0.
        """
        if self.feature_names is None or not hasattr(self.model, 'coefs_'):
            return {}

        # coefs_[0] holds one row of weights per input feature.
        scores = np.abs(self.model.coefs_[0]).mean(axis=1)

        total = scores.sum()
        if total > 0:
            scores = scores / total

        return {
            name: float(scores[i]) if i < len(scores) else 0.0
            for i, name in enumerate(self.feature_names)
        }

    @staticmethod
    def _save_or_show(output_path=None):
        """Write the current figure to ``output_path`` and close it, or show it."""
        if output_path:
            plt.savefig(output_path, dpi=150, bbox_inches='tight')
            plt.close()
        else:
            plt.show()

    def plot_training_curve(self, output_path=None):
        """Plot the recorded training loss; save to ``output_path`` if given."""
        if self.training_history is None or 'loss' not in self.training_history:
            print("No training history available")
            return

        plt.figure(figsize=(10, 6))
        plt.plot(self.training_history['loss'], label='Training Loss')
        plt.xlabel('Iteration')
        plt.ylabel('Loss')
        plt.title('Training Loss Curve')
        plt.legend()
        plt.grid(True, alpha=0.3)

        self._save_or_show(output_path)

    def plot_roc_curve(self, X, y, output_path=None):
        """Plot the ROC curve on ``(X, y)``; save to ``output_path`` if given."""
        y_proba = self.predict_proba(X)[:, 1]
        fpr, tpr, _ = roc_curve(y, y_proba)
        auc = roc_auc_score(y, y_proba)

        plt.figure(figsize=(8, 8))
        plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})', linewidth=2)
        plt.plot([0, 1], [0, 1], 'k--', label='Random')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend()
        plt.grid(True, alpha=0.3)

        self._save_or_show(output_path)

    def plot_confusion_matrix(self, X, y, output_path=None):
        """Plot the confusion matrix on ``(X, y)``; save to ``output_path`` if given."""
        cm = confusion_matrix(y, self.predict(X))

        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')

        self._save_or_show(output_path)

    def save(self, models_dir: str):
        """
        Persist the classifier under ``<models_dir>/mlp``.

        Writes ``mlp_model.pkl`` and ``scaler.pkl`` (pickles) and
        ``metadata.json`` (hyperparameters, feature names, training history).
        """
        mlp_dir = Path(models_dir) / 'mlp'
        # parents=True also creates models_dir itself when it is missing.
        mlp_dir.mkdir(parents=True, exist_ok=True)

        with open(mlp_dir / 'mlp_model.pkl', 'wb') as f:
            pickle.dump(self.model, f)

        with open(mlp_dir / 'scaler.pkl', 'wb') as f:
            pickle.dump(self.scaler, f)

        metadata = {
            'hidden_layer_sizes': self.hidden_layer_sizes,
            'alpha': self.alpha,
            'learning_rate_init': self.learning_rate_init,
            'max_iter': self.max_iter,
            'early_stopping': self.early_stopping,
            'n_iter_no_change': self.n_iter_no_change,
            'random_state': self.random_state,
            'feature_names': self.feature_names,
            'training_history': self.training_history
        }

        # default=str is a catch-all for values json cannot encode natively.
        with open(mlp_dir / 'metadata.json', 'w') as f:
            json.dump(metadata, f, indent=2, default=str)

    @classmethod
    def load(cls, models_dir: str):
        """
        Rebuild a classifier previously written by ``save``.

        Raises
        ------
        FileNotFoundError
            If ``<models_dir>/mlp`` does not exist.
        """
        mlp_dir = Path(models_dir) / 'mlp'

        if not mlp_dir.exists():
            raise FileNotFoundError(f"MLP model directory not found: {mlp_dir}")

        with open(mlp_dir / 'metadata.json', 'r') as f:
            metadata = json.load(f)

        instance = cls(
            # JSON stores the tuple as a list; convert it back.
            hidden_layer_sizes=tuple(metadata['hidden_layer_sizes']),
            alpha=metadata['alpha'],
            learning_rate_init=metadata['learning_rate_init'],
            max_iter=metadata['max_iter'],
            early_stopping=metadata['early_stopping'],
            n_iter_no_change=metadata['n_iter_no_change'],
            random_state=metadata['random_state']
        )

        # SECURITY: unpickling executes arbitrary code from the file. Only
        # load model directories that come from a trusted source.
        with open(mlp_dir / 'mlp_model.pkl', 'rb') as f:
            instance.model = pickle.load(f)

        with open(mlp_dir / 'scaler.pkl', 'rb') as f:
            instance.scaler = pickle.load(f)

        instance.feature_names = metadata.get('feature_names')
        instance.training_history = metadata.get('training_history')

        return instance


## Load Features


In [3]:
# Each split is stored as a pair of .npy files: <split>_features.X.npy holds
# the feature matrix and <split>_features.y.npy the labels.
features_dir = Path('data/features')

data = {}
for split in ('train', 'val', 'test'):
    X = np.load(features_dir / f'{split}_features.X.npy')
    y = np.load(features_dir / f'{split}_features.y.npy')
    data[split] = dict(X=X, y=y)
    n_rows, n_cols = X.shape[0], X.shape[1]
    print(f"Loaded {split}: {n_rows} samples, {n_cols} features")

# Column names for the feature matrices, in column order.
feature_names = json.loads((features_dir / 'feature_names.json').read_text())

# Unpack the three splits into the flat names the later cells use.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (data[part]['X'], data[part]['y']) for part in ('train', 'val', 'test')
)

print(f"\nFeature names: {feature_names}")
for label, matrix in (('Training', X_train), ('Validation', X_val), ('Test', X_test)):
    print(f"{label} set: {matrix.shape[0]} samples")


Loaded train: 39979 samples, 12 features
Loaded val: 8567 samples, 12 features
Loaded test: 8568 samples, 12 features

Feature names: ['bm25_doc_score', 'bm25_best_claim_score', 'cosine_doc_similarity', 'cosine_max_claim_similarity', 'embedding_diff_mean', 'embedding_diff_std', 'cpc_jaccard', 'year_diff', 'title_jaccard', 'abstract_length_ratio', 'claim_count_ratio', 'shared_rare_terms_ratio']
Training set: 39979 samples
Validation set: 8567 samples
Test set: 8568 samples

## Feature Analysis


In [4]:
# Per-feature summary of the training split; constant columns are flagged
# because they carry no information for the classifier.
for col_idx, name in enumerate(feature_names):
    train_col = X_train[:, col_idx]
    col_std = train_col.std()
    if col_std != 0:
        print(f"{name}: mean={train_col.mean():.3f}, std={col_std:.3f}")
        continue
    print(f"Warning: {name}: zero variance (all {train_col[0]:.2f})")


cosine_doc_similarity: mean=0.572, std=0.243
cosine_max_claim_similarity: mean=0.572, std=0.243
embedding_diff_mean: mean=0.024, std=0.010
embedding_diff_std: mean=0.019, std=0.008
year_diff: mean=0.181, std=0.118
title_jaccard: mean=0.382, std=0.421
abstract_length_ratio: mean=0.804, std=0.363
claim_count_ratio: mean=0.678, std=0.328
shared_rare_terms_ratio: mean=0.282, std=0.380

## Initialize and Train Model


In [None]:
# Hyperparameters for this run, spelled out explicitly even where they match
# the class defaults so the notebook records exactly what was trained.
mlp_params = dict(
    hidden_layer_sizes=(64,),
    alpha=1e-5,
    learning_rate_init=0.005,
    max_iter=500,
    early_stopping=True,
    n_iter_no_change=20,
)
clf = PatentNoveltyClassifier(**mlp_params)

clf.fit(
    X_train,
    y_train,
    X_val=X_val,
    y_val=y_val,
    feature_names=feature_names,
)

## Evaluate Model


In [6]:
# Score every split first, then print the reports.
train_metrics = clf.evaluate(X_train, y_train)
val_metrics = clf.evaluate(X_val, y_val)
test_metrics = clf.evaluate(X_test, y_test)

# (printed label, key in the metrics dict) for the rows shared by all splits.
metric_rows = (
    ("  Accuracy:  ", 'accuracy'),
    ("  Precision: ", 'precision'),
    ("  Recall:    ", 'recall'),
    ("  F1:        ", 'f1'),
    ("  ROC-AUC:   ", 'roc_auc'),
)

reports = (
    ("Training Metrics:", train_metrics),
    ("\nValidation Metrics:", val_metrics),
    ("\nTest Metrics:", test_metrics),
)

for header, split_metrics in reports:
    print(header)
    for row_label, key in metric_rows:
        print(f"{row_label}{split_metrics[key]:.4f}")

# The Brier score is reported for the test split only.
print(f"  Brier:     {test_metrics['brier_score']:.4f}")

Training Metrics:
  Accuracy:  0.8720
  Precision: 0.8943
  Recall:    0.8432
  F1:        0.8680
  ROC-AUC:   0.9406

Validation Metrics:
  Accuracy:  0.8760
  Precision: 0.9013
  Recall:    0.8486
  F1:        0.8742
  ROC-AUC:   0.9444

Test Metrics:
  Accuracy:  0.8758
  Precision: 0.8928
  Recall:    0.8516
  F1:        0.8717
  ROC-AUC:   0.9453
  Brier:     0.0905

## Save Model


In [8]:
# Output locations: pickled model under models/, figures and metrics under
# results/.
models_dir = Path('models')
results_dir = Path('results')
plots_dir = results_dir / 'plots'
metrics_dir = results_dir / 'metrics'

for d in [models_dir, plots_dir, metrics_dir]:
    d.mkdir(parents=True, exist_ok=True)

clf.save(models_dir)

clf.plot_training_curve(plots_dir / 'training_curve.png')
clf.plot_roc_curve(X_test, y_test, plots_dir / 'roc_curve.png')
clf.plot_confusion_matrix(X_test, y_test, plots_dir / 'confusion_matrix.png')

# Compute the importances here so this cell does not depend on a variable
# from another cell; `importance` was previously referenced without being
# defined, which raised NameError on a clean top-to-bottom run.
importance = clf.get_feature_importance()

all_metrics = {
    'train': train_metrics,
    'val': val_metrics,
    'test': test_metrics,
    'feature_importance': importance
}

# default=str is a catch-all for values json cannot encode natively.
with open(metrics_dir / 'all_metrics.json', 'w') as f:
    json.dump(all_metrics, f, indent=2, default=str)

print(f"Model saved to: {models_dir}")
print(f"Plots saved to: {plots_dir}")
print(f"Metrics saved to: {metrics_dir}")

Model saved to: models
Plots saved to: results/plots
Metrics saved to: results/metrics