In [2]:
import random
import implicit
import mlflow
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import pickle
import wandb
import optuna
from sklearn.model_selection import train_test_split

In [5]:
# Data Versioning with DVC and DagsHub
import dvc.api
import neptune.new as neptune

  import neptune.new as neptune


In [8]:
# Data Quality with Great Expectations and Deepchecks
import great_expectations as ge
from deepchecks import Dataset, 



ImportError: cannot import name 'check_distribution' from 'deepchecks' (/Users/aditshrimal/opt/anaconda3/envs/mlops/lib/python3.9/site-packages/deepchecks/__init__.py)

In [2]:
def create_user_item_matrix(model):
    return model.user_factors.dot(model.item_factors.T)

In [3]:
def mse(user_item_matrix, sparse_matrix):
    user_item_array = np.asarray(user_item_matrix)
    test_indices = np.asarray(sparse_matrix.nonzero()).T
    true_ratings = sparse_matrix[test_indices[:, 0], test_indices[:, 1]].A1
    predicted_ratings = user_item_array[test_indices[:, 0], test_indices[:, 1]]

    mse_sum = np.sum((true_ratings - predicted_ratings) ** 2)
    n_total = len(test_indices)
    return mse_sum / n_total

In [4]:
df = pd.read_csv("data/Movies_and_TV.csv", header=None)
df = df.sample(frac=0.2, random_state=42)
df.columns = ["asin", "reviewerId", "overallRating", "timestamp"]

In [5]:
df.head()

Unnamed: 0,asin,reviewerId,overallRating,timestamp
1725463,B000068M9Q,A2YNL0PAKAQPRQ,5.0,1337904000
119791,0790751143,A3AY3EV2MFRFGG,4.0,1007942400
506989,6302734924,A2YXSFEPARKGGW,5.0,1517011200
412413,6301966554,ARQFX2LPCQMNZ,5.0,1198368000
1409559,B000059TON,A1DH4WGH5NZ1I0,1.0,1024617600


In [6]:
df.sort_values("timestamp", inplace=True)

In [7]:
df["user_id"] = df["reviewerId"]
df["item_id"] = df["asin"]

df["reviewerId"] = df["reviewerId"].astype("category").cat.codes.values
df["asin"] = df["asin"].astype("category").cat.codes.values

In [8]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
sparse_item_user_train = sparse.csr_matrix(
    (train_df["overallRating"], (train_df["asin"], train_df["reviewerId"]))
)
sparse_user_item_train = sparse.csr_matrix(
    (train_df["overallRating"], (train_df["reviewerId"], train_df["asin"]))
)

sparse_item_user_test = sparse.csr_matrix(
    (test_df["overallRating"], (test_df["asin"], test_df["reviewerId"]))
)
sparse_user_item_test = sparse.csr_matrix(
    (test_df["overallRating"], (test_df["reviewerId"], test_df["asin"]))
)

In [10]:
def objective(trial):
    factors = trial.suggest_int('factors', 10, 50)
    regularization = trial.suggest_loguniform('regularization', 1e-5, 1e-1)
    iterations = trial.suggest_int('iterations', 10, 50)
    alpha_val = trial.suggest_int('alpha_val', 10, 100)

    model = implicit.als.AlternatingLeastSquares(
        factors=factors, regularization=regularization, iterations=iterations
    )
    data_conf = (sparse_user_item_train * alpha_val).astype("double")
    model.fit(data_conf)

    user_item_matrix = create_user_item_matrix(model)
    test_mse = mse(user_item_matrix, sparse_user_item_test)

    return test_mse

In [13]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

[32m[I 2023-05-04 14:28:18,671][0m A new study created in memory with name: no-name-2896b639-4c54-475d-961e-3421f7e1d95f[0m
  regularization = trial.suggest_loguniform('regularization', 1e-5, 1e-1)
100%|██████████| 12/12 [00:06<00:00,  1.92it/s]
[32m[I 2023-05-04 14:28:37,173][0m Trial 0 finished with value: 19.546539547290937 and parameters: {'factors': 10, 'regularization': 0.00010815507989440974, 'iterations': 12, 'alpha_val': 86}. Best is trial 0 with value: 19.546539547290937.[0m
100%|██████████| 25/25 [00:14<00:00,  1.74it/s]
[32m[I 2023-05-04 14:29:03,867][0m Trial 1 finished with value: 19.538747404151714 and parameters: {'factors': 13, 'regularization': 0.00038579792933126, 'iterations': 25, 'alpha_val': 66}. Best is trial 1 with value: 19.538747404151714.[0m
100%|██████████| 32/32 [00:31<00:00,  1.00it/s]
[32m[I 2023-05-04 14:29:51,055][0m Trial 2 finished with value: 19.70043752446118 and parameters: {'factors': 50, 'regularization': 0.016611645140109862, 'iterati

In [14]:
best_params = study.best_params

In [15]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('aws-rec-experiment')

2023/05/04 14:32:23 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/05/04 14:32:23 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/Users/aditshrimal/Desktop/MSDS/Spring2/case_studies_ml/project/mlruns/1', creation_time=1680035050222, experiment_id='1', last_update_time=1680035050222, lifecycle_stage='active', name='aws-rec-experiment', tags={}>

In [16]:
# Initialize a new run
wandb.init(project="movie_recommendation", config=best_params)

with mlflow.start_run():
    factors = best_params['factors']
    regularization = best_params['regularization']
    iterations = best_params['iterations']
    alpha_val = best_params['alpha_val']

    model = implicit.als.AlternatingLeastSquares(
        factors=factors, regularization=regularization, iterations=iterations
    )
    data_conf = (sparse_user_item_train * alpha_val).astype("double")
    model.fit(data_conf)

    mlflow.set_tags({"Model":"ALS", "Train Data": "all-data"})    
    mlflow.log_params(best_params)

    user_item_matrix = create_user_item_matrix(model)
    test_mse = mse(user_item_matrix, sparse_user_item_test)

    # Log the performance metrics
    mlflow.log_metric("test_mse", test_mse)
    wandb.log({"test_mse": test_mse})

    model_save_path = "model.pkl"
    with open(model_save_path, "wb") as f:
        pickle.dump(model, f)

    mlflow.log_artifact(model_save_path)

    # Log the model artifact
    artifact = wandb.Artifact("trained_model", type="model")
    artifact.add_file(model_save_path)
    wandb.log_artifact(artifact)
    
mlflow.end_run()
wandb.finish()

ERROR [wandb.jupyter] Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mashrimal2[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 38/38 [00:19<00:00,  1.91it/s]


0,1
test_mse,▁

0,1
test_mse,19.52293


model = implicit.als.AlternatingLeastSquares(
    factors=20, regularization=0.1, iterations=20
)
alpha_val = 40
data_conf = (sparse_user_item * alpha_val).astype("double")
model.fit(data_conf)

In [17]:
product_metadata = pd.read_json("data/meta_Movies_and_TV.json", lines=True)

In [19]:
product_metadata = pd.merge(
    df[["item_id"]],
    product_metadata[["asin", "title"]],
    left_on="item_id",
    right_on="asin",
    how="left",
)

In [20]:
product_metadata.head()

Unnamed: 0,item_id,asin,title
0,6304174616,6304174616,Star Blazers Series 1: The Quest for Iscandar ...
1,6301049284,6301049284,Night in the Life of Jimmy Reardon VHS
2,6301049284,6301049284,Night in the Life of Jimmy Reardon VHS
3,6301049284,6301049284,Night in the Life of Jimmy Reardon VHS
4,6301049284,6301049284,Night in the Life of Jimmy Reardon VHS


In [21]:
user_id2token = pd.Series(df.user_id.values, index=df.reviewerId).to_dict()
item_id2token = pd.Series(df.item_id.values, index=df.asin).to_dict()

In [22]:
user_token2id = pd.Series(df.reviewerId.values, index=df.user_id).to_dict()
item_token2id = pd.Series(df.asin.values, index=df.item_id).to_dict()

In [23]:
def get_recommendations(user_id, N=10):
    user_id = user_token2id[user_id]
    print("Top recommended movies:")
    recommended = model.recommend(user_id, sparse_user_item_train[user_id], N)[0]
    for i in recommended:
        p_id = i
        try:
            p_name = product_metadata.loc[
                product_metadata["asin"] == item_id2token[p_id], "title"
            ].values[0]
            print("Id:{} Name:{}".format(p_id, p_name))
        except:
            pass

In [24]:
get_recommendations("A2YNL0PAKAQPRQ")

Top recommended movies:
Id:4902 Name:Willy Wonka &amp; the Chocolate Factory VHS
Id:10618 Name:Kate &amp; Leopold
Id:4503 Name:Persuasion VHS
Id:10803 Name:The Sopranos - The Complete Third Season VHS
Id:9563 Name:Lucky Number Slevin
Id:9437 Name:Love Actually
Id:2294 Name:Point Break VHS
Id:9660 Name:Someone Like You
Id:9637 Name:The Spiderwick Chronicles Field Guide Edition
Id:1929 Name:Christmas Carol VHS


In [25]:
def get_similar_items(item_id, N=10):
    item_id = item_token2id[item_id]
    print("Query item:")
    p_name = product_metadata.loc[
        product_metadata["asin"] == item_id2token[item_id], "title"
    ].values[0]
    print("Id:{} Name:{}".format(item_id2token[item_id], p_name))
    print()
    print("Top Recommendations:")
    recommended = model.similar_items(item_id, N)[0]
    for i in recommended:
        if i != item_id:
            p_id = i
            try:
                p_name = product_metadata.loc[
                    product_metadata["asin"] == item_id2token[p_id], "title"
                ].values[0]
                print("Id:{} Name:{}".format(item_id2token[p_id], p_name))
            except:
                pass

In [26]:
get_similar_items("6304174616")

Query item:
Id:6304174616 Name:Star Blazers Series 1: The Quest for Iscandar 6pc  Coll  VHS

Top Recommendations:
Id:0005419263 Name:Steve Green: Hide 'em in Your Heart Volume 2: 13 Bible Memory Music Videos for Children VHS
Id:0005164850 Name:Crusade: The Life of Billy Graham VHS
Id:0005123968 Name:Cedarmont Kids: Action Bible Songs - 17 Classic Christian Songs for Kids Over 30 Minutes of Live Action Sing-A-Long Video  VHS
Id:0005119367 Name:Joseph VHS
Id:0005092663 Name:A NATION ADRIFT  A Chronicle of America's Providential Heritage VHS
Id:0005089549 Name:Cathedral Quartet: A Reunion VHS
Id:000503860X Name:Chapter X Live [VHS]
Id:0005019281 Name:An American Christmas Carol VHS
Id:0005000009 Name:Where Jesus Walked VHS
Id:0001526863 Name:Steve Green: Hide 'em in Your Heart: 13 Bible Memory Music Videos for Children of All Ages VHS
