This Jupyter Notebook implements a BERT-based similarity model using MLflow for tracking, managing, and deploying the model. It loads a pre-trained BERT model, computes sentence embeddings, and retrieves the most similar stock information from a stored corpus based on cosine similarity.

# 🔧 Import Dependencies

In [14]:
!pip install importlib_metadata



In [15]:
!pip install --upgrade mlflow



In [16]:
import os
import json
import shutil
import torch
import numpy as np
import pandas as pd
from tabulate import tabulate
import mlflow
import mlflow.pyfunc

from mlflow import MlflowClient
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec, TensorSpec, ParamSchema, ParamSpec

from transformers import pipeline, AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

from nemo.collections.nlp.models.language_modeling import BERTLMModel

ModuleNotFoundError: No module named 'importlib_metadata'

In [None]:
import os
import json
import shutil
import torch
import numpy as np
import pandas as pd
from tabulate import tabulate
import mlflow
import mlflow.pyfunc

from mlflow import MlflowClient
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec, TensorSpec, ParamSchema, ParamSpec

from transformers import pipeline, AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

from nemo.collections.nlp.models.language_modeling import BERTLMModel

# 🏗️ Defining the BERT Similarity Model Class for Stock Data

In [None]:
class StockBERTSimilarityModel(mlflow.pyfunc.PythonModel):
    """
    MLflow pyfunc model that ranks stock records by BERT similarity to a query.

    ``load_context`` reads a NumPy embeddings file, a CSV corpus and a NeMo
    BERT checkpoint from the logged artifacts. ``predict`` embeds a single
    query string and returns the corpus rows whose stored embeddings have the
    highest cosine similarity to it.
    """

    # Hugging Face tokenizer used for queries.
    # NOTE(review): presumably matches the vocabulary of the .nemo checkpoint - confirm.
    TOKENIZER_NAME = "bert-large-uncased"

    # Queries longer than this many tokens are truncated by the tokenizer.
    MAX_QUERY_TOKENS = 128

    # Number of most-similar corpus rows returned by ``predict``.
    TOP_K = 5

    def load_context(self, context):
        """
        Load precomputed embeddings, stock data, and the pre-trained BERT model.

        Args:
            context: MLflow model context; its ``artifacts`` mapping must
                provide ``embeddings_file``, ``stock_data`` and ``bert_model``.
        """
        # Load precomputed embeddings from NumPy file
        # NOTE(review): rows are assumed to line up with the rows of the stock
        # CSV, since predict() indexes both with the same positions - confirm.
        self.embeddings = np.load(context.artifacts['embeddings_file'])

        # Load stock data corpus
        self.stock_df = pd.read_csv(context.artifacts['stock_data'])

        # Load tokenizer for BERT
        self.tokenizer = AutoTokenizer.from_pretrained(self.TOKENIZER_NAME)

        # Set device to GPU if available, otherwise use CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load pre-trained BERT model
        self.bert_model = BERTLMModel.restore_from(
            context.artifacts['bert_model'], strict=False
        ).to(self.device)

    def generate_query_embedding(self, query):
        """
        Generate BERT embeddings for the input query.

        Args:
            query: Text to embed (a string, or whatever the tokenizer accepts).

        Returns:
            NumPy array holding, for each input sequence, the encoder output at
            token position 0 (the [CLS] token).
        """
        self.bert_model.eval()  # Set model to evaluation mode

        # Tokenize the input query and move tensors to the selected device
        encoded_input = self.tokenizer(
            query,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=self.MAX_QUERY_TOKENS,
        )
        encoded_input = {key: val.to(self.device) for key, val in encoded_input.items()}

        # Get the model's output embedding; gradients are not needed for inference
        with torch.no_grad():
            output = self.bert_model.bert_model(**encoded_input)

        # Return the [CLS] token embedding as a NumPy array
        # (the indexing assumes a 3-D tensor of per-token outputs)
        return output[:, 0, :].cpu().numpy()

    def predict(self, context, model_input, params=None):
        """
        Compute similarity between query and precomputed stock embeddings,
        then return the most similar results (``TOP_K`` rows, 5 by default).

        Args:
            context: MLflow model context (unused here).
            model_input: Mapping or DataFrame with a ``"query"`` column; only
                the first entry is used.
            params: Optional inference parameters. Accepted so the method
                matches the logged signature; the declared ``show_score``
                parameter is currently not read, and the ``Similarity`` field
                is always included.

        Returns:
            List of dicts, one per selected row of the stock corpus, each with
            an added ``"Similarity"`` entry, ordered from most to least similar.
        """
        # Extract the query string from model input. Use positional access for
        # pandas objects so a non-default index does not break the lookup.
        queries = model_input["query"]
        query = queries.iloc[0] if hasattr(queries, "iloc") else queries[0]

        # Generate query embedding
        query_embedding = self.generate_query_embedding(query)

        # Compute cosine similarity between query and precomputed embeddings;
        # row 0 holds the scores for the single query
        similarities = cosine_similarity(query_embedding, self.embeddings)[0]

        # Get indices of the most similar results, best first
        top_indices = np.argsort(similarities)[::-1][:self.TOP_K]

        # Retrieve corresponding results from the stock data corpus
        results = self.stock_df.iloc[top_indices].copy()
        results.loc[:, 'Similarity'] = similarities[top_indices]

        # Return results as a list of per-row dictionaries
        return results.to_dict(orient="records")

    @classmethod
    def log_model(
        cls,
        model_name,
        demo_folder="demo",
        embeddings_path="data/embeddings.npy",
        stock_data_path="data/stock_data.csv",
        bert_model_path="/home/jovyan/datafabric/Bertlargeuncased/bertlargeuncased.nemo",
    ):
        """
        Logs the model to MLflow with appropriate artifacts and schema.

        Args:
            model_name: Artifact path under which the model is logged.
            demo_folder: Folder logged as the ``demo`` artifact.
            embeddings_path: Location of the precomputed embeddings (.npy).
            stock_data_path: Location of the stock corpus CSV.
            bert_model_path: Location of the NeMo BERT checkpoint (.nemo).

        Returns:
            Whatever ``mlflow.pyfunc.log_model`` returns for the logged model.
        """
        # Define input and output schema
        input_schema = Schema([ColSpec("string", "query")])
        output_schema = Schema([
            TensorSpec(np.dtype("object"), (-1,), "Stock Recommendations and Similarities")
        ])
        params_schema = ParamSchema([ParamSpec("show_score", "boolean", False)])

        # Define model signature
        signature = ModelSignature(inputs=input_schema, outputs=output_schema, params=params_schema)

        # Define necessary package requirements
        # NOTE(review): pip_requirements overrides MLflow's requirement
        # inference, so torch, NeMo, scikit-learn, pandas and numpy are not
        # recorded here - confirm the serving environment provides them, and
        # that this transformers/huggingface-hub pin pair resolves together.
        requirements = ["transformers==4.47.0", "huggingface-hub==0.20.2"]

        # Log the model in MLflow
        return mlflow.pyfunc.log_model(
            model_name,
            python_model=cls(),
            artifacts={
                "embeddings_file": embeddings_path,
                "stock_data": stock_data_path,
                "bert_model": bert_model_path,
                "demo": demo_folder,
            },
            signature=signature,
            pip_requirements=requirements,
        )

# 📜 Logging Model to MLflow

In [None]:
# Select (or create) the experiment that will own the run below
mlflow.set_experiment(experiment_name="Stock BERT Similarity Model")

# Everything inside the `with` block is recorded against a single MLflow run
with mlflow.start_run(run_name="Stock_BERT_Similarity_Run") as run:
    # Show where this run stores its artifacts
    print(f"Run's Artifact URI: {run.info.artifact_uri}")

    # Log the pyfunc model under the artifact path "Stock_BERT_Similarity"
    StockBERTSimilarityModel.log_model(model_name="Stock_BERT_Similarity")

    # Point the Model Registry at the model that was just logged in this run
    current_run_id = run.info.run_id
    logged_model_uri = f"runs:/{current_run_id}/Stock_BERT_Similarity"
    mlflow.register_model(model_uri=logged_model_uri, name="Stock_BERT_Similarity")

# 📦 Fetching the Latest Model Version from MLflow

In [None]:
# Initialize the MLflow client
client = MlflowClient()

# Retrieve every registered version of the "Stock_BERT_Similarity" model.
# search_model_versions is used instead of get_latest_versions(..., stages=[...]),
# which MLflow has deprecated together with model stages.
model_metadata = client.search_model_versions("name='Stock_BERT_Similarity'")
if not model_metadata:
    # Fail with an actionable message instead of an IndexError further down
    raise RuntimeError(
        "No registered versions found for model 'Stock_BERT_Similarity'; "
        "run the logging/registration cell first."
    )

# Pick the highest version number; compare as integers because the registry
# may report versions as strings
latest_model_version = max(model_metadata, key=lambda mv: int(mv.version)).version

# Fetch model information, including its signature
model_info = mlflow.models.get_model_info(f"models:/Stock_BERT_Similarity/{latest_model_version}")

# Print the latest model version and its signature
print(f"Latest Model Version: {latest_model_version}")
print(f"Model Signature: {model_info.signature}")

# 🛠️ Loading the Model and Running Inference

In [None]:
# Build the registry URI for the version selected above and load it as a pyfunc model
registry_uri = f"models:/Stock_BERT_Similarity/{latest_model_version}"
model = mlflow.pyfunc.load_model(model_uri=registry_uri)

# Sample query used to exercise the model
query = "Find technology stocks with high growth potential"

# The model reads a "query" column, so wrap the text in a one-element list
request_payload = {"query": [query]}
result = model.predict(request_payload)

# 📜 Displaying Results for the Input Query

In [None]:
# Build a DataFrame from the prediction records and give the score column a
# friendlier name in one step.
# Note: the remaining column names depend on the contents of stock_data.csv
df = pd.DataFrame(result).rename(columns={"Similarity": "Relevance Score"})

# Render the table as a bordered grid, using the column names as headers
table_text = tabulate(df, headers="keys", tablefmt="fancy_grid")
print(table_text)

# 📈 Optional: Visualizing Top Stock Recommendations

In [None]:
# Plotting libraries are only needed for this optional cell
import matplotlib.pyplot as plt
import seaborn as sns

# Global look and feel for the chart
plt.style.use('ggplot')
sns.set_palette('viridis')

# Choose the x-axis labels: prefer a 'Symbol' column, then 'Name',
# and fall back to the DataFrame index when neither exists
if 'Symbol' in df:
    labels = df['Symbol']
elif 'Name' in df:
    labels = df['Name']
else:
    labels = df.index

# Draw one bar per recommended stock, with height equal to its relevance score
plt.figure(figsize=(10, 6))
sns.barplot(x=labels, y='Relevance Score', data=df)
plt.title('Top Stock Recommendations Based on Query')

# Tilt the tick labels so longer names stay readable, then render
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()