#### Install MLFlow

In [1]:
!pip install mlflow -q -U

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 15.0.2 which is incompatible.
beatrix-jupyterlab 2023.128.151533 requires jupyterlab~=3.6.0, but you have jupyterlab 4.2.1 which is incompatible.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.
tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.3.3 which is incompatible.[0m[31m
[0m

#### Add imports

In [2]:
import pandas as pd
import boto3
import warnings
from io import StringIO
import ast
import os
import pickle
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from kaggle_secrets import UserSecretsClient
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#### Suppress user warnings from sklearn

In [3]:
warnings.filterwarnings("ignore", category=UserWarning)

#### Move configuration files for Databricks and AWS to root

In [4]:
!cp /kaggle/input/config/.databrickscfg /root/
!sudo mkdir /root/.aws
!cp /kaggle/input/config/config /root/.aws
!cp /kaggle/input/config/credentials /root/.aws

#### Load dataset from S3

In [5]:
BUCKET_NAME = 'mlops-ucu-2024'
DATA_KEY = 'popular_movies.csv'
MODEL_KEY = 'lr_model.pkl'
REGION_NAME = 'eu-north-1'

In [6]:
user_secrets = UserSecretsClient()
AWS_ACCESS_KEY_ID = user_secrets.get_secret("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = user_secrets.get_secret("AWS_SECRET_ACCESS_KEY")

In [7]:
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=REGION_NAME
)

In [8]:
obj = s3.get_object(Bucket=BUCKET_NAME, Key=DATA_KEY)
string_data = obj['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(string_data))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1440 entries, 0 to 1439
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1440 non-null   int64  
 1   Title      1440 non-null   object 
 2   Year       1440 non-null   float64
 3   Rating     1414 non-null   float64
 4   Views      1406 non-null   float64
 5   Likes      1406 non-null   float64
 6   Minutes    1440 non-null   float64
 7   Tagline    1329 non-null   object 
 8   Language   1440 non-null   object 
 9   Genres     1440 non-null   object 
 10  Countries  1440 non-null   object 
 11  Directors  1440 non-null   object 
 12  Cast       1440 non-null   object 
dtypes: float64(5), int64(1), object(7)
memory usage: 146.4+ KB


#### Dataset preprocessing

In [9]:
df_subset = df[['Title','Year','Rating', 'Minutes', 'Language','Genres','Countries','Directors','Cast']].dropna()
df_subset['Genres'] = df_subset['Genres'].apply(ast.literal_eval)
df_subset['Countries'] = df_subset['Countries'].apply(ast.literal_eval)
df_subset['Directors'] = df_subset['Directors'].apply(ast.literal_eval)
df_subset['Cast'] = df_subset['Cast'].apply(ast.literal_eval)
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1414 entries, 0 to 1439
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Title      1414 non-null   object 
 1   Year       1414 non-null   float64
 2   Rating     1414 non-null   float64
 3   Minutes    1414 non-null   float64
 4   Language   1414 non-null   object 
 5   Genres     1414 non-null   object 
 6   Countries  1414 non-null   object 
 7   Directors  1414 non-null   object 
 8   Cast       1414 non-null   object 
dtypes: float64(3), object(6)
memory usage: 110.5+ KB


In [10]:
def get_top_n(elements_list, n=10):
    return pd.Series([item for sublist in elements_list for item in sublist]).value_counts().nlargest(n).index.tolist()

In [11]:
top_directors = get_top_n(df_subset['Directors'])
top_cast = get_top_n(df_subset['Cast'])

In [12]:
mlb_countries = MultiLabelBinarizer()
mlb_genres = MultiLabelBinarizer()
mlb_genres = MultiLabelBinarizer()
mlb_directors = MultiLabelBinarizer(classes=top_directors)
mlb_cast = MultiLabelBinarizer(classes=top_cast)
onehot_language = OneHotEncoder(sparse_output=False)

In [13]:
countries_encoded = mlb_countries.fit_transform(df_subset['Countries'])
genres_encoded = mlb_genres.fit_transform(df_subset['Genres'])
directors_encoded = mlb_directors.fit_transform(df_subset['Directors'])
cast_encoded = mlb_cast.fit_transform(df_subset['Cast'])
language_encoded = onehot_language.fit_transform(df_subset[['Language']])

In [14]:
encoded_data = pd.concat([
    df_subset[['Title','Year','Rating', 'Minutes']].reset_index(),
    pd.DataFrame(countries_encoded, columns=mlb_countries.classes_),
    pd.DataFrame(genres_encoded, columns=mlb_genres.classes_),
    pd.DataFrame(directors_encoded, columns=top_directors),
    pd.DataFrame(cast_encoded, columns=top_cast),
    pd.DataFrame(language_encoded, columns=onehot_language.get_feature_names_out())
], axis=1)
encoded_data.head()

Unnamed: 0,index,Title,Year,Rating,Minutes,Argentina,Australia,Austria,Belgium,Brazil,...,Language_Korean,Language_No spoken language,Language_Norwegian,Language_Persian (Farsi),Language_Polish,Language_Portuguese,Language_Russian,Language_Spanish,Language_Swedish,Language_Telugu
0,0,Barbie,2023.0,3.9,114.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Parasite,2019.0,4.6,133.0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Everything Everywhere All at Once,2022.0,4.3,140.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Fight Club,1999.0,4.3,139.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Oppenheimer,2023.0,4.2,181.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Model training

In [15]:
X = encoded_data.drop(columns=['Rating', 'Title'])
y = encoded_data['Rating']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

In [17]:
lr=LinearRegression().fit(X_train, y_train)
lr.score(X_test,y_test)

0.5489031553834142

In [18]:
y_pred = lr.predict(X_test)

In [19]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [20]:
def encoder_to_file(encoder, file_name):
    with open(f'{file_name}.pkl', 'wb') as f:
        pickle.dump(encoder, f)

In [21]:
encoder_to_file(mlb_countries,'mlb_countries')
encoder_to_file(mlb_genres,'mlb_genres')
encoder_to_file(mlb_directors,'mlb_directors')
encoder_to_file(mlb_cast,'mlb_cast')
encoder_to_file(onehot_language,'onehot_language')

In [22]:
encoder_files_names_list=[
    'mlb_countries',
    'mlb_genres',
    'mlb_directors',
    'mlb_cast',
    'onehot_language',
]

#### Set up MLFlow experiment and artifacts path

In [23]:
artifact_uri = f"s3://mlops-ucu-2024/mlflow-artifacts"
experiment_name="/Users/andriankrav@gmail.com/letterboxd-predictions"
model_folder_name = 'letterboxd-predictions'
encoders_folder = 'encoders'

In [24]:
mlflow.set_tracking_uri('databricks')

# Uncomment on first run for experiment
# mlflow.create_experiment(experiment_name, artifact_location=artifact_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://mlops-ucu-2024/mlflow-artifacts', creation_time=1718051967359, experiment_id='3619140286810292', last_update_time=1718052144643, lifecycle_stage='active', name='/Users/andriankrav@gmail.com/letterboxd-predictions', tags={'mlflow.experiment.sourceName': '/Users/andriankrav@gmail.com/letterboxd-predictions',
 'mlflow.experimentType': 'MLFLOW_EXPERIMENT',
 'mlflow.ownerEmail': 'andriankrav@gmail.com',
 'mlflow.ownerId': '5372028643946937'}>

#### Load metrics to Databricks and artifacts to S3

In [25]:
signature = mlflow.models.infer_signature(X_train, lr.predict(X_train))

with mlflow.start_run(run_name=f'Run {type(lr).__name__}') as run:
    print(f'Run id: {run.info.run_id}')
    model_info =mlflow.sklearn.log_model(
        lr, 
        model_folder_name, 
        signature=signature, 
        input_example=X_train.head(1))
    mlflow.log_metric('R2', r2)
    mlflow.log_metric('MAE', mae)
    mlflow.log_metric('MSE',  mse)
    for file_name in encoder_files_names_list:
        mlflow.log_artifact(f'{file_name}.pkl', encoders_folder)
    print('Successfully logged metrics and uploaded artifacts')

Run id: f0ac99d16eb34232ad77c59812a84405
Successfully logged metrics and uploaded artifacts
