In [2]:
import importlib

In [3]:
import pandas as pd
import os
import pickle
import validation_data as vd

def save_pickle(my_obj, filepath):
    """
    Save an object to a pickle file.

    Args:
        my_obj: The object to be saved.
        filepath (str): The path where the pickle file will be saved.
    """
    if os.path.isfile(filepath):
        print(f"File {filepath} already exists. Doing nothing")
        return None
    else:
        with open(filepath, 'wb') as file:
            pickle.dump(my_obj, file)

def load_pickle(filepath):
    """
    Load an object from a pickle file.

    Args:
        filepath (str): The path to the pickle file.

    Returns:
        The loaded object.
    """
    if not os.path.isfile(filepath):
        print(f"File {filepath} does not exist.")
        return None
    with open(filepath, 'rb') as file:
        return pickle.load(file)

In [4]:
import model as m
importlib.reload(m)

<module 'model' from '/home/rositsa/code/aybik/movie_picker/moviepicker/ml_logic/model.py'>

In [4]:
encoded_data = load_pickle('../../artifacts/data_encode.pkl')

In [5]:
# tfidf_matrix = m.vectorize_descriptions(encoded_data, 'description')

In [6]:
# save_pickle(tfidf_matrix, '../../artifacts/baseline_tfidf_matrix.pkl')

In [7]:
tfidf_matrix = load_pickle('../../artifacts/baseline_tfidf_matrix.pkl')

In [8]:
# knn_baseline = m.knn_fit(tfidf_matrix)

In [9]:
# save_pickle(knn_baseline, '../../artifacts/fitted_knn_baseline_model.pkl')

In [10]:
knn_baseline = load_pickle('../../artifacts/fitted_knn_baseline_model.pkl')

In [11]:
import evaluation as e
importlib.reload(e)

<module 'evaluation' from '/home/rositsa/code/aybik/movie_picker/moviepicker/ml_logic/evaluation.py'>

In [12]:
filtered_validation_data = load_pickle('../../artifacts/filtered_validation_data.pkl')

In [13]:
results = e.get_evaluation_score(knn_baseline, tfidf_matrix, encoded_data, filtered_validation_data, db_path='../../artifacts/moviepicker.sqlite', top_movies_path = '../../artifacts/top25k_rated_movies.pkl', similar_movies_path = '../../artifacts/knn_baseline_similar_movies_top25k.pkl', results_path = '../../artifacts/results_knn_baseline_top25k.pkl')

Loading top 25,000 movies from pickle...
Loading KNN baseline similar movies from pickle...
Indexes created successfully.
Processing evaluation metrics with database...


100%|██████████| 3000/3000 [49:29<00:00,  1.01it/s]
100%|██████████| 3000/3000 [17:22<00:00,  2.88it/s]
100%|██████████| 3000/3000 [09:42<00:00,  5.15it/s]
100%|██████████| 3000/3000 [06:09<00:00,  8.11it/s]
100%|██████████| 3000/3000 [04:11<00:00, 11.92it/s]
100%|██████████| 3000/3000 [03:02<00:00, 16.47it/s]
100%|██████████| 3000/3000 [02:19<00:00, 21.51it/s]
100%|██████████| 3000/3000 [01:48<00:00, 27.69it/s]
100%|██████████| 998/998 [00:30<00:00, 32.69it/s]
100%|██████████| 9/9 [1:34:35<00:00, 630.66s/it]


In [24]:
#save_pickle(results, 'results_25.02.25.0200.pkl')

pd.Series([results[k][0] for k in results.keys()]).describe()

count    20215.000000
mean         3.047724
std          0.655330
min          0.500000
25%          2.666667
50%          3.062500
75%          3.500000
max          5.000000
dtype: float64

In [5]:
results = load_pickle('../../artifacts/results_knn_baseline_top25k.pkl')

In [6]:
results

{'Knives Out (2019)': (1.75,),
 'Parasite (2019)': (3.3967391304347827,),
 'Get Out (2017)': (3.475,),
 'Everything Everywhere All at Once (2022)': (2.3333333333333335,),
 'Barbie (2023)': (3.3378995433789953,),
 'Spider-Man: Into the Spider-Verse (2018)': (4.232290166295666,),
 'The Batman (2022)': (3.4922779922779923,),
 'Nope (2022)': (2.3663793103448274,),
 'Midsommar (2019)': (3.107142857142857,),
 'Joker (2019)': (4.0,),
 'Lady Bird (2017)': (3.36,),
 'La La Land (2016)': (3.2614503816793894,),
 'Once Upon a Time… in Hollywood (2019)': (1.0635593220338984,),
 'Hereditary (2018)': (2.3873873873873874,),
 'Scream (1996)': (2.54228855721393,),
 'The Shining (1980)': (3.150757228086278,),
 'Glass Onion (2022)': (2.767924528301887,),
 'Gone Girl (2014)': (2.7916666666666665,),
 'Whiplash (2014)': (4.12280701754386,),
 'The Menu (2022)': (3.1554621848739495,),
 'Us (2019)': (3.4027149321266967,),
 'The Grand Budapest Hotel (2014)': (3.0,),
 'The Dark Knight (2008)': (3.5330188679245285