In [42]:
import os
import sys
import json
import subprocess
import mlflow
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV
import pickle

In [43]:
#import data
data_path = os.path.join("..", "..", "data", "kafka_log_(2024-02-11T16_2024-02-11T16).csv")

xx, user_rating_data_df, xxx = utilities.process_csv(data_path)

In [61]:
#Set up ML Flow
MLFLOW_TRACKING_TOKEN = os.environ.get("MLFLOW_TRACKING_TOKEN")
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [74]:
def train_and_evaluate_model(user_rating_data_df):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(user_rating_data_df[['UserID', 'Title', 'Rating']], reader)

    param_grid = {
        'n_factors': [50, 100, 150],
        'n_epochs': [20, 30],
        'lr_all': [0.005, 0.010],
        'reg_all': [0.02, 0.05]
    }

    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
    gs.fit(data)

    best_model_params = gs.best_params['rmse']
    print(f"Best model parameters: {best_model_params}")

    # Now, let's evaluate the best model on a separate test set
    trainset, testset = train_test_split(data, test_size=0.25)
    best_model = SVD(**best_model_params)
    best_model.fit(trainset.build_full_trainset())

    predictions = best_model.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    mae = accuracy.mae(predictions, verbose=True)

    print(f"Test Set RMSE: {rmse}")
    print(f"Test Set MAE: {mae}")


    ### Log best model ###

    #Set up the experiment and the run in that experiment
    mlflow.set_experiment(experiment_name=f'SVD_v0')
    mlflow.start_run(run_name=f"Candidate {2}")

    # Log the best hyperparameters
    for param, value in best_model_params.items():
        mlflow.log_param(f"best_{param}", value)

    # Log performance of model 
    mlflow.log_metric(f"RMSE", rmse)
    mlflow.log_metric(f"MAE", mae)

    # Log dataset path 
    csv_file_path = "data/kafka_log_(2024-02-11T16_2024-02-11T16).csv"
    mlflow.log_param("dataset_path", csv_file_path)

    # Log Python version
    mlflow.log_param("python_version", sys.version)

    # Log requirements.txt and make a requirements.txt if not there
    requirements_file = "requirements.txt"
    if not os.path.exists(requirements_file):
        print("requirements.txt not found, generating...")
        requirements = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze']).decode('utf-8')
        with open(requirements_file, "w") as f:
            f.write(requirements)
    mlflow.log_artifact(requirements_file)

    # Serialize and log the best model
    model_filename = "best_SVD_model.pkl"
    with open(model_filename, "wb") as f:
        pickle.dump(best_model, f)
    mlflow.log_artifact(model_filename)

    #end the logging 
    mlflow.end_run()
    
    return best_model


In [75]:

best_model = train_and_evaluate_model(user_rating_data_df)

AttributeError: 'Trainset' object has no attribute 'build_full_trainset'

In [55]:
mlflow.end_run()

In [66]:
mlflow.set_experiment(experiment_name=f"A nice demo")
mlflow.start_run(run_name=f"Candidate {2}")
mlflow.log_param("l1_ratio", 3)
mlflow.log_metric("rmse", 5)
mlflow.end_run()