In [2]:
import pandas as pd
import os

## Test model.py from Aybike

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# vectorize_descriptions now returns a dense array rather than a sparse matrix
def vectorize_descriptions(df, text_column, tfidf_dim=2500):
    """
    Vectorize movie descriptions using TF-IDF.

    Args:
        df: The DataFrame containing movie descriptions.
        text_column: The column in the DataFrame that contains descriptions.
        tfidf_dim: Upper bound on the vocabulary size; forwarded to
            TfidfVectorizer as ``max_features``. Default is 2500.

    Returns:
        tfidf_array: Dense 2-D array of TF-IDF weights, one row per row of
            ``df``. Only the array is returned; the fitted vectorizer is
            discarded when the function exits.
    """
    vectorizer = TfidfVectorizer(max_features=tfidf_dim)
    # fit_transform produces a sparse matrix; it is densified here because the
    # rest of this file passes the result around as a plain array.
    tfidf_array = vectorizer.fit_transform(df[text_column]).toarray()
    return tfidf_array

# Create np arrays for tfidf, language, genre to be utilized in autoencoder
def prepare_model_inputs(df, tfidf_dim=2500, genre_columns=None):
    """
    Prepares input features for the autoencoder model.

    Parameters:
        df (pd.DataFrame): The dataset. Must contain a 'description' column, a
            'language_encoded' column of integer codes, and one column per genre.
        tfidf_dim (int): The maximum number of features for TF-IDF vectorization.
        genre_columns (list[str] | None): Names of the genre indicator columns.
            When None, the 19 genre columns used throughout this notebook are
            assumed.

    Returns:
        tuple: (tfidf_array, num_languages, language_data_np, genres_data_np, num_genres)
            num_languages is ``max(language_encoded) + 1`` so that every code is
            a valid index for an embedding/softmax of that size, even when the
            codes present in ``df`` are not contiguous.
    """
    if genre_columns is None:
        genre_columns = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary',
                         'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery',
                         'romance', 'science_fiction', 'thriller', 'tv_movie', 'war', 'western']
    else:
        genre_columns = list(genre_columns)

    # ---------------------------
    # 1. TF-IDF Vectorization
    # ---------------------------
    tfidf_array = vectorize_descriptions(df, 'description', tfidf_dim)

    # ---------------------------
    # 2. Language Encoding
    # ---------------------------
    language_codes = df['language_encoded']
    # nunique() undercounts whenever some codes are missing from df (e.g. after
    # filtering rows); the largest code decides the required vocabulary size.
    num_languages = 0 if language_codes.empty else int(language_codes.max()) + 1
    language_data_np = language_codes.values.reshape(-1, 1).astype(np.int32)

    # ---------------------------
    # 3. Genre Extraction
    # ---------------------------
    genres_data_np = df[genre_columns].values.astype(np.int32)
    num_genres = len(genre_columns)

    return tfidf_array, num_languages, language_data_np, genres_data_np, num_genres


# Create encoder which will be used in autoencoder function and optimized during autoencoder.fit
def build_encoder(tfidf_dim, num_languages, num_genres):
    """
    Assemble the three-branch encoder that fuses text, language and genres.

    The returned model takes, in this order:
      - "tfidf_input": a TF-IDF vector (continuous, shape: [tfidf_dim])
      - "language_input": a language index (integer, shape: [1])
      - "genre_input": a genre indicator vector (shape: [num_genres])

    Parameters:
      tfidf_dim (int): Dimensionality of the TF-IDF vector (e.g., 2500).
      num_languages (int): Total number of language categories (max language index + 1).
      num_genres (int): Number of genre columns (19 for the columns used here).

    Returns:
      encoder_model (tf.keras.Model): Model whose output is the 32-unit
          "final_embedding" layer.
    """
    layers = tf.keras.layers

    # --- TF-IDF branch: compress the text vector to 128 units.
    tfidf_input = layers.Input(shape=(tfidf_dim,), name="tfidf_input")
    tfidf_projection = layers.Dense(128, activation='relu', name="tfidf_dense")
    tfidf_dense = tfidf_projection(tfidf_input)

    # --- Language branch: look up an 8-d embedding and drop the length-1 axis.
    language_input = layers.Input(shape=(1,), name="language_input")
    language_lookup = layers.Embedding(
        input_dim=num_languages,
        output_dim=8,
        name="language_embedding",
    )
    language_vector = layers.Flatten(name="language_flatten")(language_lookup(language_input))

    # --- Genre branch: learn a 32-unit representation of the indicator vector.
    genre_input = layers.Input(shape=(num_genres,), name="genre_input")
    genre_projection = layers.Dense(32, activation='relu', name="genre_dense")
    genre_dense = genre_projection(genre_input)

    # --- Fusion: concatenate the branches, then narrow 64 -> 32.
    merged = layers.concatenate(
        [tfidf_dense, language_vector, genre_dense],
        name="merged_features",
    )
    hidden = layers.Dense(64, activation='relu', name="dense_1")(merged)
    final_embedding = layers.Dense(32, activation='relu', name="final_embedding")(hidden)

    return tf.keras.models.Model(
        inputs=[tfidf_input, language_input, genre_input],
        outputs=final_embedding,
    )


# Create autoencoder
def build_autoencoder(tfidf_dim, num_languages, num_genres):
    """
    Wrap the encoder from build_encoder with three reconstruction heads.

    The 32-d latent embedding feeds three decoder branches that reconstruct:
      A. the original TF-IDF vector ("tfidf_output"),
      B. the language, as a probability distribution over num_languages
         ("language_output"),
      C. the genre indicator vector ("genre_output").

    The model is compiled with Adam, using MSE for TF-IDF, sparse categorical
    crossentropy for language and binary crossentropy for genres, all with
    weight 1.0.

    Returns:
      tuple: (autoencoder_model, encoder). The encoder is the same object used
          inside the autoencoder, so training the autoencoder trains it too.
    """
    layers = tf.keras.layers

    # Inputs of the outer model; the same tensors double as training targets.
    tfidf_input = layers.Input(shape=(tfidf_dim,), name="tfidf_input")
    language_input = layers.Input(shape=(1,), name="language_input")
    genre_input = layers.Input(shape=(num_genres,), name="genre_input")

    encoder = build_encoder(tfidf_dim, num_languages, num_genres)
    latent = encoder([tfidf_input, language_input, genre_input])

    def _decoder_head(hidden_units, hidden_name, out_units, out_activation, out_name):
        # Each head is one ReLU hidden layer followed by the output layer.
        hidden = layers.Dense(hidden_units, activation='relu', name=hidden_name)(latent)
        return layers.Dense(out_units, activation=out_activation, name=out_name)(hidden)

    tfidf_output = _decoder_head(
        64, "decoder_tfidf_dense", tfidf_dim, 'relu', "tfidf_output"
    )
    # Softmax: one probability per language class.
    language_output = _decoder_head(
        16, "decoder_language_dense", num_languages, 'softmax', "language_output"
    )
    # Sigmoid: genres are multi-label, so each unit is an independent probability.
    genre_output = _decoder_head(
        16, "decoder_genre_dense", num_genres, 'sigmoid', "genre_output"
    )

    autoencoder_model = tf.keras.models.Model(
        inputs=[tfidf_input, language_input, genre_input],
        outputs=[tfidf_output, language_output, genre_output],
        name="autoencoder",
    )

    # Losses are keyed by output-layer name; the language target is an integer
    # class index, hence the sparse variant of categorical crossentropy.
    losses = {
        'tfidf_output': 'mse',
        'language_output': 'sparse_categorical_crossentropy',
        'genre_output': 'binary_crossentropy',
    }
    weights = {
        'tfidf_output': 1.0,
        'language_output': 1.0,
        'genre_output': 1.0,
    }
    autoencoder_model.compile(optimizer='adam', loss=losses, loss_weights=weights)

    return autoencoder_model, encoder


def train_autoencoder(autoencoder_model, tfidf_array, language_data_np, genres_data_np,
                      batch_size=16, epochs=50, validation_split=0.1):
    """
    Trains the autoencoder model using the given input data.

    The same three arrays are used as inputs and as reconstruction targets.

    Parameters:
        autoencoder_model (tf.keras.Model): The compiled autoencoder model.
        tfidf_array (np.array): TF-IDF input data.
        language_data_np (np.array): Encoded language data.
        genres_data_np (np.array): One-hot encoded genres data.
        batch_size (int): Batch size for training. Default is 16.
        epochs (int): Number of training epochs. Default is 50.
        validation_split (float): Fraction of the samples held out for
            validation (Keras takes them from the end of the arrays, before
            shuffling). Default is 0.1. Pass 0 to train on every sample; early
            stopping then falls back to monitoring the training loss.

    Returns:
        history (tf.keras.callbacks.History): Training history object containing loss values.
    """
    # 'val_loss' only exists when fit() is given validation data. Without it
    # the callback can never fire, so fall back to 'loss' in that case.
    monitored_metric = 'val_loss' if validation_split else 'loss'
    early_stop = EarlyStopping(
        monitor=monitored_metric,
        patience=3,                # Stop training if no improvement for 3 epochs
        restore_best_weights=True  # Restore the best model weights
    )

    history = autoencoder_model.fit(
        x=[tfidf_array, language_data_np, genres_data_np],
        y=[tfidf_array, language_data_np, genres_data_np],
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split or 0.0,
        callbacks=[early_stop]
    )

    return history

def extract_latent_embeddings(encoder_model, tfidf_array, language_data_np, genres_data_np):
    """
    Run the encoder over the three input arrays and return its output.

    Parameters:
        encoder_model (tf.keras.Model): The trained encoder model.
        tfidf_array (np.array): TF-IDF input data.
        language_data_np (np.array): Encoded language data.
        genres_data_np (np.array): One-hot encoded genres data.

    Returns:
        latent_embeddings (np.array): Whatever ``encoder_model.predict`` yields
            for the inputs, i.e. the latent representations.
    """
    # Input order must match the encoder's input order: tfidf, language, genres.
    encoder_inputs = [tfidf_array, language_data_np, genres_data_np]
    return encoder_model.predict(encoder_inputs)

def knn_fit(latent_embeddings, n_neighbors=10, metric='cosine'):
    """
    Fits a KNN model for similarity search using the latent embeddings.

    Parameters:
        latent_embeddings (np.array): The extracted latent embeddings from the encoder.
        n_neighbors (int): Number of nearest neighbors to find. Default is 10.
        metric (str): Distance metric for KNN. Default is 'cosine'.

    Returns:
        knn_model (NearestNeighbors): The trained KNN model. It is configured
            for ``n_neighbors + 1`` neighbors (the extra one leaves room to drop
            the queried movie later), capped at the number of samples so that
            queries on a small dataset do not fail.
    """
    n_samples = np.shape(latent_embeddings)[0]
    # +1 to exclude the queried movie later; never ask for more than exist.
    effective_neighbors = max(1, min(n_neighbors + 1, n_samples))

    knn_model = NearestNeighbors(n_neighbors=effective_neighbors, metric=metric)
    knn_model.fit(latent_embeddings)

    return knn_model

def get_movie_recommendations(user_input, df, knn_model, latent_embeddings, n_recommendations=5):
    """
    Finds similar movies based on the KNN model and latent embeddings.

    Rows of ``df`` and rows of ``latent_embeddings`` are matched by position,
    so the function works whatever index ``df`` carries (e.g. after filtering
    without ``reset_index``).

    Parameters:
        user_input (str): The name of the movie to find recommendations for
            (matched case-insensitively against ``df["name"]``).
        df (pd.DataFrame): DataFrame containing movie names, row-aligned with
            ``latent_embeddings``.
        knn_model (NearestNeighbors): The trained KNN model.
        latent_embeddings (np.array): The extracted latent embeddings.
        n_recommendations (int): Number of movie recommendations to return. Default is 5.

    Returns:
        list: A list of (movie name, distance) tuples, nearest first. Empty when
            the movie is not found or no neighbour other than the movie itself
            is returned. At most as many entries as the KNN model yields.
    """
    # Case-insensitive match; missing names compare as "no match".
    name_matches = (df["name"].str.lower() == user_input.lower()).fillna(False)
    matched_positions = np.flatnonzero(name_matches.to_numpy(dtype=bool))

    if matched_positions.size == 0:
        print("Movie not found.")
        return []

    # Row position (not index label) of the first match: the embeddings array
    # and the KNN results are both positional.
    sample_position = int(matched_positions[0])
    print(f"Found movie '{user_input}'.")

    distances, indices = knn_model.kneighbors(
        latent_embeddings[sample_position].reshape(1, -1)
    )
    indices = indices.flatten()
    distances = distances.flatten()

    # Drop the queried movie itself from its own neighbour list.
    filtered_recs = [
        (idx, dist) for idx, dist in zip(indices, distances) if idx != sample_position
    ]

    if not filtered_recs:
        print("No recommendations found after filtering out the queried movie.")
        return []

    filtered_recs = filtered_recs[:n_recommendations]

    # Positional lookup, consistent with how the neighbours were computed.
    names = df["name"]
    recommendations = [(names.iloc[idx], dist) for idx, dist in filtered_recs]

    return recommendations

2025-02-04 09:52:25.290588: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-04 09:52:25.298469: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-04 09:52:25.363329: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-04 09:52:25.427210: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738659145.479218    1982 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738659145.49

In [15]:
# Smoke test: TF-IDF array for the 'description' column of `data` (default tfidf_dim).
result0 = vectorize_descriptions(data,'description')

In [17]:
# Smoke test: tuple of (tfidf_array, num_languages, language_data_np, genres_data_np, num_genres).
result1 = prepare_model_inputs(data)

In [24]:
# Smoke test: standalone encoder with tfidf_dim=2500, num_languages=163, num_genres=19.
result2 = build_encoder(2500, 163, 19)

W0000 00:00:1738660537.771787    1982 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [26]:
# Smoke test: result3 is (autoencoder_model, encoder) for the same dimensions as above.
result3 = build_autoencoder(2500, 163, 19)

In [30]:
# Train the autoencoder (result3[0]) on the TF-IDF array (result0), the language
# codes (result1[2]) and the genre matrix (result1[3]); result4 is the Keras History.
result4 = train_autoencoder(result3[0], result0, result1[2], result1[3]) # in Vertex

: 

## Load data

In [10]:
# Paths used below to locate data files; everything is resolved relative to parent_dir.
current_dir = os.getcwd() # current working directory
parent_dir = os.path.dirname(current_dir) # its parent directory

In [None]:
# SEQUENCE OF EXECUTION
# - csv NOT USE: countries, releases, themes, posters
# - movies name: kill rows where NaN, Untitled (in various languages), "" [KILL MOVIES]
# - look for minute first: kill NaN, kick out-of-range (too short and too long) [KILL MOVIES]
# - concatenate name + year: rank by ratings, keep ones with higher rating, ones without ratings keep the one with low movieid (LOGIC!!!) [KILL MOVIES]
# - unique identifier: "name (year)", if no year "name"
# - drop columns: tagline (movies)
# - cut actors short: no actors less than 12 frequency
# - no actors: empty list for now
# - drop columns: role (actors)
# - cut crew short: only Director
# - no crew: empty dict
# - genre lower case, make list
# - no genre: empty list
# - no studio: empty list
# - languages: type language, primary language
# - drop columns: type (languages)
# - no language: "NA"
# - DOUBLECHECK: if model crashes, think again on cutting unpopular languages, no year movies, no actors movies, .... TO_BE_DISCUSSED

In [8]:
# STREAMLIT
# - layout: collapse buttons, workflow, default selections
# - URL: Rositsa set B (letterboxd) links
# - tiny posters (technically: HOW???) --we have link to posters in posters.csv

In [5]:
# final = pd.read_csv(os.path.join(parent_dir,'moviepicker/final_set_a.csv'))
# final.shape

In [103]:
# Raw table loading. Only genres.csv is read in this version of the cell; the
# other tables are commented out and must already be in memory for later cells.
# movies = pd.read_csv(os.path.join(parent_dir,'raw_data/movies.csv'))
# actors = pd.read_csv(os.path.join(parent_dir,'raw_data/actors.csv'))
# crew = pd.read_csv(os.path.join(parent_dir,'raw_data/crew.csv'))
# languages = pd.read_csv(os.path.join(parent_dir,'raw_data/languages.csv'))
genres = pd.read_csv(os.path.join(parent_dir,'raw_data/genres.csv'))
# studios = pd.read_csv(os.path.join(parent_dir,'raw_data/studios.csv'))
# countries = pd.read_csv(os.path.join(parent_dir,'raw_data/countries.csv'))

In [84]:
# Clean the movies table: drop the tagline, remove rows without a usable name
# or description, normalise year/minute, and build a "name (year)" key.
movies = movies.drop(columns='tagline')
has_name = movies['name'].notnull() & ~movies['name'].isin(['', 'No Title'])
has_description = movies['description'].notnull() & (movies['description'] != '')  # Remove movies without description
# .copy() after each filter so the column assignments below write to an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
movies = movies[has_name & has_description].copy()
movies = movies.rename(columns={"date": "year"})
# Year becomes a string: "2023" for 2023.0, "" when missing.
movies["year"] = movies["year"].astype(float).apply(lambda x: str(int(x)) if not pd.isna(x) else "")
movies = movies.dropna(subset=['minute']).copy()  # Remove NaN values in minute
movies['minute'] = movies['minute'].astype(int)  # Change minute dtype to int
movies = movies[(movies['minute'] > 40) & (movies['minute'] <= 240)].copy()  # Remove short and too long movies
# Year is already an integer-formatted string here, so it can be embedded as is.
movies['key'] = movies['name'] + movies['year'].apply(lambda x: f" ({x})" if x != '' else '')
movies.shape
#OK

(406445, 7)

In [101]:
# Clean the actors table and collect, per movie id, the list of frequent actors.
actors = actors[actors['role'].notnull() & (actors['role'] != '')]  # Remove actors without role
pattern = r'footage|uncredited|Ensemble/|\d'  # Matches specific terms or any digit
actors = actors[~actors['role'].str.contains(pattern, case=False, regex=True)]
actors = actors.drop(columns='role')  # Drop column role
# Name the columns explicitly: what value_counts().reset_index() calls them
# differs between pandas 1.x and 2.x.
name_counts = actors['name'].value_counts().rename_axis('name').reset_index(name='count')  # Count frequency
name_counts = name_counts[name_counts['count'] >= 12]  # Take only those appearing >= 12 times
actors = actors[actors['name'].isin(name_counts['name'])]  # Remove unpopular actors
new_actors = (
    actors.groupby('id')['name']
    .apply(list)  # Aggregates each movie's actor names into a list
    .reset_index(name='actor_list')  # Converts to DataFrame and renames the column
)
#OK

In [104]:
# Normalise genre labels (lower case, underscores instead of spaces) ...
genres['genre'] = genres['genre'].map(lambda label: label.lower().replace(' ', '_'))
# ... then gather them into one list per movie id, stored in 'genre_list'.
new_genres = genres.groupby('id')['genre'].apply(list).reset_index(name='genre_list')

In [105]:
new_genres

Unnamed: 0,id,genre_list
0,1000001,"[comedy, adventure]"
1,1000002,"[comedy, thriller, drama]"
2,1000003,"[science_fiction, adventure, comedy, action]"
3,1000004,[drama]
4,1000005,"[drama, comedy, music, romance]"
...,...,...
676486,1941559,[comedy]
676487,1941563,[drama]
676488,1941566,[crime]
676489,1941569,[crime]


In [82]:
# Keep only the crew roles listed below ('Songs' and 'Producer' are currently excluded).
crew = crew[crew['role'].isin(['Director', 'Writer', 'Cinematography', 'Composer'])] #'Songs', 'Producer',
new_crew = (
    # Selecting the value columns explicitly keeps the grouping column out of
    # apply(), which is what pandas' DataFrameGroupBy.apply deprecation asks for.
    crew.groupby('id')[['role', 'name']]
    .apply(lambda x: x.groupby('role')['name'].apply(list).to_dict())  # {role: [names]} per movie
    .reset_index(name='crew_dict')
)

# Clean up languages df
languages = languages[languages['type'].isin(['Language', 'Primary language'])].drop(columns='type')
# Clean up genres df
genres['genre'] = genres['genre'].apply(lambda x: x.lower().replace(' ', '_'))
new_genres = (
    genres.groupby('id')['genre']
    .apply(list)  # Aggregates genres into a list
    .reset_index(name='genre_list')  # Converts to DataFrame and renames the column
)

# Clean up studios df
new_studios = (
    studios.groupby('id')['studio']
    .apply(list)  # Aggregates studios into a list
    .reset_index(name='studio_list')  # Converts to DataFrame and renames the column
)

  .apply(lambda x: x.groupby('role')['name'].apply(list).to_dict())


In [85]:
# Variant of new_genres where each movie's genres form one space-separated
# string; spaces inside a genre name become underscores so tokens stay intact.
new_genres = (
    genres
    .assign(genre=lambda frame: frame['genre'].str.lower().str.replace(' ', '_'))
    .groupby('id')['genre']
    .apply(lambda parts: ' '.join(parts))
    .reset_index(name='genre_list')
)

In [88]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406445 entries, 0 to 406444
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           406445 non-null  int64  
 1   name         406445 non-null  object 
 2   year         406445 non-null  object 
 3   description  406445 non-null  object 
 4   minute       406445 non-null  int64  
 5   rating       75223 non-null   float64
 6   key          406445 non-null  object 
 7   genre_list   347516 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 24.8+ MB


In [86]:
# Left join: every movie is kept; movies without genres get NaN in genre_list.
data = movies.merge(new_genres, how='left', on='id')

In [92]:
new_genres

Unnamed: 0,id,genre_list
0,1000001,comedy adventure
1,1000002,comedy thriller drama
2,1000003,science_fiction adventure comedy action
3,1000004,drama
4,1000005,drama comedy music romance
...,...,...
676486,1941559,comedy
676487,1941563,drama
676488,1941566,crime
676489,1941569,crime


In [93]:
from sklearn.preprocessing import MultiLabelBinarizer
# Multi-hot encode the space-separated genre strings: one 0/1 column per genre.
encoder = MultiLabelBinarizer()
encoded_genres = pd.DataFrame(
    data=encoder.fit_transform(new_genres['genre_list'].str.split(' ')),
    index=new_genres.index,
    columns=encoder.classes_,  # read after fitting, so the classes are populated
)
# Put the movie id next to its genre indicator columns (joined on the row index).
final = new_genres[['id']].join(encoded_genres)

In [94]:
final

Unnamed: 0,id,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,history,horror,music,mystery,romance,science_fiction,thriller,tv_movie,war,western
0,1000001,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1000002,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,1000003,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,1000004,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,1000005,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676486,1941559,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
676487,1941563,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
676488,1941566,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
676489,1941569,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [75]:
# Left join: every movie is kept; movies without listed actors get NaN in actor_list.
data = movies.merge(new_actors, how='left', on='id')

In [76]:
data

(406445, 8)

In [55]:
movies.head()

Unnamed: 0,id,name,year,description,minute,rating,key
0,1000001,Barbie,2023,Barbie and Ken are having the time of their li...,114,3.86,Barbie (2023)
1,1000002,Parasite,2019,"All unemployed, Ki-taek's family takes peculia...",133,4.56,Parasite (2019)
2,1000003,Everything Everywhere All at Once,2022,An aging Chinese immigrant is swept up in an i...,140,4.3,Everything Everywhere All at Once (2022)
3,1000004,Fight Club,1999,A ticking-time-bomb insomniac and a slippery s...,139,4.27,Fight Club (1999)
4,1000005,La La Land,2016,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,La La Land (2016)


In [25]:
# Preview only (result is not assigned): rows whose name is present and is
# neither an empty string nor 'No Title'.
movies[movies['name'].notnull() & ~movies['name'].isin(['', 'No Title'])]

Unnamed: 0,id,name,date,tagline,description,minute,rating
0,1000001,Barbie,2023.0,She's everything. He's just Ken.,Barbie and Ken are having the time of their li...,114.0,3.86
1,1000002,Parasite,2019.0,Act like you own the place.,"All unemployed, Ki-taek's family takes peculia...",133.0,4.56
2,1000003,Everything Everywhere All at Once,2022.0,The universe is so much bigger than you realize.,An aging Chinese immigrant is swept up in an i...,140.0,4.30
3,1000004,Fight Club,1999.0,Mischief. Mayhem. Soap.,A ticking-time-bomb insomniac and a slippery s...,139.0,4.27
4,1000005,La La Land,2016.0,Here's to the fools who dream.,"Mia, an aspiring actress, serves lattes to mov...",129.0,4.09
...,...,...,...,...,...,...,...
941592,1941593,神笛,,,,,
941593,1941594,蟲極道蜜団子抗争編 壱ノ巻,,,Shinjuku forest at night. In the sap taverns o...,30.0,
941594,1941595,蟲極道蜜団子抗争編 弐ノ巻,,,"The city that never sleeps, where insects gath...",30.0,
941595,1941596,重生,,,"In a world where order has broken down, darkne...",,


In [4]:
# Taglines are not used downstream.
movies = movies.drop(columns='tagline')

# Drop the role column, then any actor row that still has a missing value.
actors = actors.drop(columns='role').dropna()

# Restrict crew to the four roles below.
crew = crew.loc[crew['role'].isin(['Director', 'Writer', 'Cinematography', 'Composer'])] #'Songs', 'Producer',

# Keep spoken/primary language rows only; the type column is then redundant.
languages = languages.loc[languages['type'].isin(['Language', 'Primary language'])].drop(columns='type')

In [5]:
# One row per movie id with a {role: [names]} dictionary in 'crew_dict'.
new_crew = (
    # Selecting the value columns explicitly keeps the grouping column out of
    # apply(), which is what pandas' DataFrameGroupBy.apply deprecation asks for.
    crew.groupby('id')[['role', 'name']]
    .apply(lambda x: x.groupby('role')['name'].apply(list).to_dict())
    .reset_index(name='crew_dict')
)

  .apply(lambda x: x.groupby('role')['name'].apply(list).to_dict())


In [6]:
# Each table is collapsed to one row per movie id, with the values gathered
# into a Python list stored in a new '*_list' column.

# Genres of each movie.
new_genres = genres.groupby('id')['genre'].apply(list).reset_index(name='genre_list')

# Studios of each movie.
new_studios = studios.groupby('id')['studio'].apply(list).reset_index(name='studio_list')

# Actor names of each movie.
new_actors = actors.groupby('id')['name'].apply(list).reset_index(name='actor_list')

In [7]:
# Attach every side table to movies by id. All joins are left joins, so no
# movie is dropped; missing side data shows up as NaN.
data = (
    movies
    .merge(new_genres, how='left', on='id')
    .merge(new_actors, how='left', on='id')
    .merge(languages, how='left', on='id')
    .merge(new_studios, how='left', on='id')
    .merge(new_crew, how='left', on='id')
)

# data = pd.read_csv(os.path.join(parent_dir,'compiled_movies.csv'))

In [8]:
# Keep dated movies up to 2024 and build the "name (year)" identifier key_b.
# .copy() after each filter so the assignments write to an independent frame
# rather than a slice (avoids SettingWithCopyWarning).
data = data.dropna(subset=['date']).copy()
data['date'] = data['date'].astype(int)
data = data.loc[data['name'] != 'Untitled'] #to be completed
data = data.loc[data['date'] <= 2024].copy()
# date is a non-null integer at this point, so no NaN branch is needed.
data['key_b'] = data['name'] + ' (' + data['date'].astype(str) + ')'

In [12]:
# check_data = data.groupby('key_b').size().reset_index(name='frequency').sort_values(by='frequency', ascending=False)
# Drop every movie whose key_b is shared with another row: only keys that
# occur exactly once survive.
unique_keys = data['key_b'].value_counts()
unique_keys = unique_keys.loc[unique_keys.eq(1)].index
data = data.loc[data['key_b'].isin(unique_keys)]

## Process dataset A

In [1]:
import os
# Current working directory of the running process.
current_dir = os.getcwd()

In [1]:
import pandas as pd
import os
# Load dataset A from <parent of cwd>/moviepicker/final_set_a.csv.
current_dir = os.getcwd() # current working directory
parent_dir = os.path.dirname(current_dir) # its parent directory
data = pd.read_csv(os.path.join(parent_dir, 'moviepicker/final_set_a.csv'))

In [44]:
# Build a cleaned working copy of `data`; `data` itself is left untouched.
clean_up_data = data.copy().drop(columns=['id', 'Unnamed: 0'])
# clean_up_data = clean_up_data.dropna(subset='crew_dict')
# Require a non-empty description ###ADJUSTED
clean_up_data = clean_up_data.loc[
    clean_up_data['description'].notna() & clean_up_data['description'].ne('')
]
# Any remaining row with a missing value in any column is discarded.
clean_up_data = clean_up_data.dropna()
clean_up_data = clean_up_data.assign(minute=clean_up_data['minute'].astype(int))
# Runtime window: longer than 40 minutes, at most 240.
clean_up_data = clean_up_data.loc[
    clean_up_data['minute'].gt(40) & clean_up_data['minute'].le(240)
]
clean_up_data = clean_up_data.assign(
    genre_list=clean_up_data['genre_list'].map(lambda text: text.lower())
)
clean_up_data = clean_up_data.reset_index(drop=True)

In [77]:
# Reload the raw genres table.
genres = pd.read_csv(os.path.join(parent_dir,'raw_data/genres.csv'))
# Preview only: the lower-cased labels are displayed, not written back to `genres`.
genres['genre'].apply(lambda x: x.lower())

0             comedy
1          adventure
2             comedy
3           thriller
4              drama
             ...    
1046844        drama
1046845        crime
1046846        crime
1046847       action
1046848        crime
Name: genre, Length: 1046849, dtype: object

In [73]:
# NOTE(review): the line that creates test_save is commented out, so this cell
# relies on test_save already existing in the kernel — confirm before re-running.
# test_save = data_preproc(clean_up_data)
# Drop rows with a missing/empty description, then round-trip through CSV.
test_save = test_save[test_save['description'].notnull() & (test_save['description'] != '')]
test_save.to_csv(os.path.join(parent_dir, 'test_processed.csv'), index=False)
test_load_from_csv = pd.read_csv(os.path.join(parent_dir, 'test_processed.csv'))

- https://www.youtube.com/results?search_query=parasite+trailer
- https://letterboxd.com/film/parasite-2019/
- Add posters (open challenge: data size)

In [19]:
data.actor_list

<bound method Series.max of 0         ['Song Kang-ho', 'Lee Sun-kyun', 'Cho Yeo-jeon...
1         ['Michelle Yeoh', 'Ke Huy Quan', 'Stephanie Hs...
2         ['Edward Norton', 'Brad Pitt', 'Helena Bonham ...
3         ['Ryan Gosling', 'Emma Stone', 'John Legend', ...
4         ['Cillian Murphy', 'Emily Blunt', 'Matt Damon'...
                                ...                        
279046    ['Masaya Kikawada', 'Tomoko Sekihara', '源九郎判官義経']
279047    ['Viktor Tsymbalist', 'Olga Kusenko', 'Dmytro ...
279048    ['Uttam Kumar', 'Biswajeet Chatterjee', 'Sabit...
279049    ['Irene Fenwick', 'Cyril Keightley', 'Malcolm ...
279050    ['William Larsson', 'Carl Barcklind', 'Hedvig ...
Name: actor_list, Length: 279051, dtype: object>

In [66]:
# # name_counts = actors_cleaned['name'].value_counts().reset_index()
# pd.options.display.float_format = '{:,.0f}'.format

# # Print the statistics without scientific notation
# print(name_counts['count'].describe(percentiles=[0.25, 0.5, 0.75, 0.90, 0.95, 0.99]))

count   1,243,795
mean            3
std             9
min             1
25%             1
50%             1
75%             2
90%             6
95%            12
99%            40
max           796
Name: count, dtype: float64


In [None]:
# import pandas as pd
# from collections import Counter
# import ast

# all_actors = [actor for sublist in data['actor_list'] for actor in ast.literal_eval(sublist)] # Flatten actor lists into a single list

# actor_counts = Counter(all_actors) # Count frequency of each actor
# actor_frequency_df = pd.DataFrame(actor_counts.items(), columns=['Actor', 'Frequency']) # Create a new DataFrame with actor names and their frequencies
# actor_frequency_df = actor_frequency_df.sort_values(by='Frequency', ascending=False).reset_index(drop=True) # Sort the DataFrame by frequency in descending order
# actor_frequency_df.head()

Unnamed: 0,Actor,Frequency
0,Bess Flowers,657
1,Eric Roberts,449
2,Jagathy Sreekumar,418
3,Brahmanandam,400
4,Nassar,391


In [None]:
actor_frequency_df

In [17]:
# test_load_from_csv = test_load_from_csv.drop(columns=['Unnamed: 0'])
# test_load_from_csv.isnull().sum()
# data.head(5)
import ast
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder


def _as_actor_list(value):
    """Return one actor_list cell as a real list of names.

    Cells loaded from CSV are string representations such as
    "['A', 'B']"; they are parsed with ast.literal_eval (safe, literals only).
    Anything that is not a list/tuple after parsing (e.g. NaN) becomes [].
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    return list(value) if isinstance(value, (list, tuple)) else []


# Parse once so that iterating a cell yields actor names, not characters.
actor_lists = data['actor_list'].apply(_as_actor_list)

all_actors = [actor for sublist in actor_lists for actor in sublist]
actor_encoder = LabelEncoder()
actor_encoder.fit(all_actors)

# classes_ is the sorted list of unique names; an actor's position in it is the
# code transform() would return, so a dict lookup replaces one transform() call
# per row.
actor_to_index = {actor: position for position, actor in enumerate(actor_encoder.classes_)}
data['actor_indices'] = actor_lists.apply(lambda actors: [actor_to_index[actor] for actor in actors])

Unnamed: 0,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b,comedy,...,family,documentary,action,mystery,animation,thriller,music,drama,war,romance
0,Parasite,0.388889,unemployed kitaeks family take peculiar intere...,0.462312,"['comedy', 'thriller', 'drama']","['Song Kang-ho', 'Lee Sun-kyun', 'Cho Yeo-jeon...",Korean,"{'Cinematography': ['Hong Kyung-pyo'], 'Compos...",Parasite (2019),1,...,0,0,0,0,0,1,0,1,0,0
1,Everything Everywhere All at Once,0.472222,age chinese immigrant sweep insane adventure a...,0.497487,"['science fiction', 'adventure', 'comedy', 'ac...","['Michelle Yeoh', 'Ke Huy Quan', 'Stephanie Hs...",English,"{'Cinematography': ['Larkin Seiple'], 'Compose...",Everything Everywhere All at Once (2022),1,...,0,0,1,0,0,0,0,0,0,0
2,Fight Club,-0.166667,tickingtimebomb insomniac slippery soap salesm...,0.492462,['drama'],"['Edward Norton', 'Brad Pitt', 'Helena Bonham ...",English,"{'Cinematography': ['Jeff Cronenweth'], 'Compo...",Fight Club (1999),0,...,0,0,0,0,0,0,0,1,0,0
3,La La Land,0.305556,mia aspire actress serve latte movie star audi...,0.442211,"['drama', 'comedy', 'music', 'romance']","['Ryan Gosling', 'Emma Stone', 'John Legend', ...",English,"{'Cinematography': ['Linus Sandgren'], 'Compos...",La La Land (2016),1,...,0,0,0,0,0,0,1,1,0,1
4,Oppenheimer,0.5,story j robert oppenheimer role development at...,0.703518,"['drama', 'history']","['Cillian Murphy', 'Emily Blunt', 'Matt Damon'...",English,"{'Cinematography': ['Hoyte van Hoytema'], 'Com...",Oppenheimer (2023),0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
# from test import *
# clean_up_data = clean_up_data.sample(n=100)
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, Flatten

2025-01-30 15:01:41.909596: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-30 15:01:41.911824: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-30 15:01:41.915176: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-30 15:01:41.920474: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738245701.928693   81511 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738245701.93

In [15]:
# Numerical branch of the model: a 2-feature input followed by a 32-unit ReLU layer.
# NOTE(review): the two features are presumably the scaled `date` and `minute`
# columns — confirm against the code that feeds this input.
numerical_input = Input(shape=(2,))
numerical_dense = Dense(32, activation="relu")(numerical_input)

W0000 00:00:1738246446.322489   81511 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [16]:
# Actor branch of the model: one integer actor index -> 32-d embedding -> flat vector.
# NOTE(review): `num_actors` is defined outside this excerpt; presumably the
# number of classes learned by `actor_encoder` — confirm.
actor_input = Input(shape=(1,))  # Single integer index for actor
actor_embedding = Embedding(input_dim=num_actors, output_dim=32)(actor_input)
# Flatten removes the length-1 sequence axis (the cell output shows shape (None, 32)).
actor_flatten = Flatten()(actor_embedding)

<KerasTensor shape=(None, 32), dtype=float32, sparse=False, name=keras_tensor_1>

In [None]:
# test_data = clean_up_data.sample(n=10000)
# processed_test_data = data_preproc(test_data)
# processed_test_data = data_preproc(data)
# processed_test_data.set_index('key_b', inplace=True)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Step 1: TF-IDF representation of every description (English stop words removed)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_test_data['description'])

# Step 2: Brute-force nearest-neighbour search with the cosine metric
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(tfidf_matrix)

# Step 3: Ask for 6 neighbours per movie, because each movie is expected to
# come back as its own closest neighbour
n_neighbors = 6

# Step 4: Query every movie against the whole collection
distances, indices = knn.kneighbors(tfidf_matrix, n_neighbors=n_neighbors)

# Step 5: Cosine similarity = 1 - cosine distance
similarity_scores = 1 - distances

# Step 6: One row per movie, one column per neighbour rank; the first result
# column (the movie itself) is dropped from both tables
neighbor_columns = [f"Neighbor_{rank}" for rank in range(1, n_neighbors)]

# Translate the positional neighbour indices into index labels in one lookup
neighbor_labels = processed_test_data.index.to_numpy()[indices[:, 1:]]

neighbors_df = pd.DataFrame(neighbor_labels,
                            columns=neighbor_columns,
                            index=processed_test_data.index)

similarity_df = pd.DataFrame(similarity_scores[:, 1:],
                             columns=neighbor_columns,
                             index=processed_test_data.index)

# similarity_df.to_csv(os.path.join(parent_dir,'similarity_df.csv'))
# neighbors_df.to_csv(os.path.join(parent_dir,'neighbors_df.csv'))

In [None]:
# data_preproc(test_data)
# save_check = cat_processing_genre(clean_up_data, 'genre_list')
# cat_processing_lan(save_check, 'language')
# text_preprocess(clean_up_data['description'][2])
# num_preprocess_year(clean_up_data[['date']])
# num_preprocess_min(clean_up_data[['minute']])

In [2]:
# FROM AYBIKE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

def vectorize_descriptions(df, text_column):
    """
    Build a TF-IDF representation of the movie descriptions.

    A new TfidfVectorizer with default settings is fitted on the column and
    discarded afterwards; only the transformed matrix is handed back.

    Args:
        df: The DataFrame containing movie descriptions.
        text_column: The column in the DataFrame that contains descriptions.

    Returns:
        tfidf_matrix: The TF-IDF matrix, one row per row of ``df``.

    NOTE(review): rows with a missing description should be removed before
    calling this — the notebook shows such rows exist in the data.
    """
    descriptions = df[text_column]
    return TfidfVectorizer().fit_transform(descriptions)

def knn_fit(tfidf_matrix):
    """
    Fit a nearest-neighbours model on the TF-IDF matrix.

    Args:
        tfidf_matrix: The TF-IDF matrix to index.

    Returns:
        The fitted NearestNeighbors model (cosine metric, brute-force search).
    """
    neighbors_model = NearestNeighbors(metric='cosine', algorithm='brute')
    return neighbors_model.fit(tfidf_matrix)

def verify_input(df, input_name, name_column):
    """
    Check that a movie name is present in the DataFrame.

    Args:
        df: The DataFrame containing movie names.
        input_name: The movie name to look for.
        name_column: The column in the DataFrame that contains movie names.

    Raises:
        ValueError: If ``input_name`` does not occur in ``df[name_column]``.
    """
    known_names = df[name_column].values
    if input_name in known_names:
        return
    raise ValueError(f"Movie '{input_name}' not found in the DataFrame.")

def get_similar_movies_knn(knn, tfidf_matrix, df, input_name, name_column, n_neighbors=5):
    """
    Find similar movies using KNN based on a TF-IDF matrix.

    Row ``i`` of ``tfidf_matrix`` must describe row ``i`` of ``df`` by
    position; the DataFrame's index labels are never used for the lookup, so
    a sampled, filtered or re-indexed DataFrame works as well.

    Args:
        knn: The fitted nearest-neighbours model (must provide ``kneighbors``).
        tfidf_matrix: The TF-IDF matrix (the notebook passes the matrix the
            model was fitted on).
        df: The DataFrame containing movie names and descriptions.
        input_name: The name of the movie to find similarities for. If the
            name occurs several times, the first occurrence is used.
        name_column: The column in the DataFrame that contains movie names.
        n_neighbors: The number of similar movies to retrieve (default is 5).

    Returns:
        A list of at most ``n_neighbors`` dictionaries, closest first. The key
        'input_name' holds the name of the *similar* movie (key name kept for
        existing callers) and 'similarity_score' is ``1 - distance``.

    Raises:
        ValueError: If ``input_name`` is not found in ``df[name_column]``
            (raised by ``verify_input``).
    """
    verify_input(df, input_name, name_column)

    # Positional row of the first match. Using the position (not the index
    # label) keeps the matrix lookup and the later `.iloc` lookups consistent.
    query_pos = (df[name_column] == input_name).to_numpy().nonzero()[0][0]

    # One extra neighbour is requested because the query movie is normally
    # returned as its own nearest neighbour. The slice keeps the query 2-D for
    # both sparse and dense matrices.
    distances, indices = knn.kneighbors(
        tfidf_matrix[query_pos:query_pos + 1], n_neighbors=n_neighbors + 1
    )

    names = df[name_column]
    similar_movies = []
    for distance, pos in zip(distances.ravel(), indices.ravel()):
        # Skip the query movie wherever it shows up: with duplicate or empty
        # descriptions it is not guaranteed to be the first hit.
        if pos == query_pos:
            continue
        similar_movies.append({
            'input_name': names.iloc[pos],
            'similarity_score': 1 - distance  # Convert distance to similarity
        })

    # If the query itself was not among the hits, drop the farthest extra one.
    return similar_movies[:n_neighbors]

In [3]:
# TF-IDF matrix of the `description` column of `data` (one row per movie).
matrix = vectorize_descriptions(data,'description')

In [4]:
# Cosine nearest-neighbours model fitted on the TF-IDF matrix above.
model = knn_fit(matrix)

In [5]:
# Movie to query and the column of `data` that holds the titles
movie_name = "Parasite"
name_column = 'name'
# Look up the closest movies (default: 5) by description similarity
similar_movies = get_similar_movies_knn(model, matrix, data, movie_name, name_column)
# Print one line per hit; the 'input_name' key holds the *similar* movie's name
for movie in similar_movies:
    print(f"Movie: {movie['input_name']}, Similarity Score: {movie['similarity_score']:.2f}")

Movie: Oligor Brothers, Similarity Score: 0.25
Movie: Present Company Excluded, Similarity Score: 0.22
Movie: Der Traum ihres Lebens, Similarity Score: 0.22
Movie: The Escape, Similarity Score: 0.21
Movie: Ningal Camera Nireekshanathilaanu, Similarity Score: 0.21


In [88]:
# os.path.dirname('movie_picker/moviepicker/model.py')

'movie_picker/moviepicker'

In [84]:
# matrix --> pickle as well!

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2591 stored elements and shape (100, 1671)>

In [7]:
import pickle
# Persist the TF-IDF matrix so it can be reloaded together with the KNN model.
# NOTE(review): `parent_dir` is defined outside this excerpt and the target
# directory is assumed to exist already.
with open(os.path.join(parent_dir,"moviepicker/models/matrix.pkl"), "wb") as file:
    pickle.dump(matrix, file)

In [9]:
# Persist the fitted KNN model next to the pickled TF-IDF matrix.
with open(os.path.join(parent_dir,"moviepicker/models/knn_model.pkl"), "wb") as file:
    pickle.dump(model, file)

In [82]:
# Reload the pickled KNN model. The context manager closes the file handle,
# which the bare `open(...)` call never did.
# NOTE(review): the dump cells write to "moviepicker/models/..." while this
# reads "models/..."; confirm `parent_dir` differed between those runs.
# Only unpickle files produced by this project — pickle can execute code.
with open(os.path.join(parent_dir,"models/knn_model.pkl"), "rb") as file:
    load_model = pickle.load(file)

In [83]:
load_model

In [19]:
# Move the current index back into a regular column and restore a 0..n-1 index,
# in place. NOTE(review): not idempotent — re-running the cell adds another column.
processed_test_data.reset_index(inplace=True)

In [20]:
# Rebuild the TF-IDF matrix and the KNN model on the preprocessed data;
# this overwrites the `matrix` / `model` objects created from `data` earlier.
matrix = vectorize_descriptions(processed_test_data, 'description')
model = knn_fit(matrix)

In [24]:
# Movie to query, identified by its "Title (Year)" key, and the column holding that key
movie_name = "The Boy and the Fog (1953)"
name_column = 'key_b'

# Look up the closest movies (default: 5) in the preprocessed data
similar_movies = get_similar_movies_knn(model, matrix, processed_test_data, movie_name, name_column)

# Print one line per hit; the 'input_name' key holds the *similar* movie's key
for movie in similar_movies:
    print(f"Movie: {movie['input_name']}, Similarity Score: {movie['similarity_score']:.2f}")

Movie: Son of Manjeet Singh (2018), Similarity Score: 0.08
Movie: El Tamalon Navideño (2018), Similarity Score: 0.08
Movie: La Güera Rodríguez (1978), Similarity Score: 0.08
Movie: Patrol (2014), Similarity Score: 0.08
Movie: The Report (2019), Similarity Score: 0.06


In [49]:
# test_load_from_csv.isnull().sum()
# clean_up_data.isnull().sum()
# Show the rows whose description is missing after the CSV round-trip.
test_load_from_csv[test_load_from_csv.description.isnull()]

Unnamed: 0,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b,comedy,...,family,documentary,action,mystery,animation,thriller,music,drama,war,romance
170451,Garrison,0.083333,,0.301508,"['drama', 'thriller']","['Elizabeth Ingalls', 'Jason Cox', 'Brent Boll...",German,"{'Director': ['Kerry Valderrama'], 'Writer': [...",Garrison (2008),0,...,0,0,0,0,0,1,0,1,0,0
185854,The House of Pop 6,-0.388889,,0.266332,"['horror', 'comedy']",['Nuttanee Sittisamarn'],Thai,{'Director': ['Saiyon Srisawat']},The House of Pop 6 (1991),1,...,0,0,0,0,0,0,0,0,0,0
271072,heart eyes,0.5,,0.396985,['romance'],"['nadjib', 'imene']",English,{'Director': ['nadjib']},heart eyes (2023),0,...,0,0,0,0,0,0,0,0,0,1
274771,Mayer Odhikar,-0.25,,0.542714,"['family', 'drama', 'action']","['Salman Shah', 'Shahnaz Sumi', 'Alamgir', 'Bo...","Bengali, Bangla","{'Director': ['Shibli Sadique'], 'Writer': ['S...",Mayer Odhikar (1996),0,...,1,0,1,0,0,0,0,1,0,0


In [None]:
test_load_from_csv

In [59]:
# Keep the row(s) of one movie with a missing description for closer inspection.
item_to_check = clean_up_data.loc[clean_up_data['name'] == "The House of Pop 6"]

In [66]:
item_to_check

Unnamed: 0,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b,comedy,...,family,documentary,action,mystery,animation,thriller,music,drama,war,romance
185854,The House of Pop 6,-0.388889,,0.266332,"['horror', 'comedy']",['Nuttanee Sittisamarn'],Thai,{'Director': ['Saiyon Srisawat']},The House of Pop 6 (1991),1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Display the cleaned-up row(s) for "The House of Pop 6".
clean_up_data.loc[clean_up_data['name'] == "The House of Pop 6"]

In [67]:
# Display the cleaned-up row(s) for "Garrison".
clean_up_data.loc[clean_up_data['name'] == "Garrison"]

Unnamed: 0,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b,comedy,...,family,documentary,action,mystery,animation,thriller,music,drama,war,romance
170451,Garrison,0.083333,,0.301508,"['drama', 'thriller']","['Elizabeth Ingalls', 'Jason Cox', 'Brent Boll...",German,"{'Director': ['Kerry Valderrama'], 'Writer': [...",Garrison (2008),0,...,0,0,0,0,0,1,0,1,0,0


In [69]:
# Display the raw row(s) for "Garrison" to compare with the cleaned-up version.
data.loc[data['name'] == "Garrison"]

Unnamed: 0.1,Unnamed: 0,id,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b
203457,270000,1282791,Garrison,2008,,101.0,"['Drama', 'Thriller']","['Elizabeth Ingalls', 'Jason Cox', 'Brent Boll...",German,"{'Director': ['Kerry Valderrama'], 'Writer': [...",Garrison (2008)


In [75]:
# Same experiment on the data reloaded from CSV, with its own matrix and model.
# NOTE(review): another cell shows rows with a missing description in
# `test_load_from_csv`; presumably they were dropped before this ran — confirm.
matrix_1 = vectorize_descriptions(test_load_from_csv, 'description')
model_1 = knn_fit(matrix_1)

# Movie to query, identified by its "Title (Year)" key, and the column holding that key
movie_name = "The Boy and the Fog (1953)"
name_column = 'key_b'

# Look up the closest movies (default: 5) in the reloaded data
similar_movies = get_similar_movies_knn(model_1, matrix_1, test_load_from_csv, movie_name, name_column)

# Print one line per hit; the 'input_name' key holds the *similar* movie's key
for movie in similar_movies:
    print(f"Movie: {movie['input_name']}, Similarity Score: {movie['similarity_score']:.2f}")

Movie: Canvas (2006), Similarity Score: 0.38
Movie: Rain Beau's End (2020), Similarity Score: 0.29
Movie: New Brooklyn (2009), Similarity Score: 0.25
Movie: Manduka (2023), Similarity Score: 0.22
Movie: Incomplete Eclipse (1983), Similarity Score: 0.22


In [23]:
similar_movies

[{'input_name': 'Son of Manjeet Singh (2018)',
  'similarity_score': np.float64(0.08164263598051247)},
 {'input_name': 'El Tamalon Navideño (2018)',
  'similarity_score': np.float64(0.08079736313471475)},
 {'input_name': 'La Güera Rodríguez (1978)',
  'similarity_score': np.float64(0.07697786690098152)},
 {'input_name': 'Patrol (2014)',
  'similarity_score': np.float64(0.07508092827475188)},
 {'input_name': 'The Report (2019)',
  'similarity_score': np.float64(0.06036441232949241)}]

## Process dataset B

In [17]:
# Load the two "Set B" tables: movie metadata and user ratings.
B_movie = pd.read_csv(os.path.join(parent_dir,'raw_data/Set B/movie.csv'))
B_rating = pd.read_csv(os.path.join(parent_dir,'raw_data/Set B/rating.csv'))

# Attach ratings to movies; the left join keeps movies that have no rating.
B_data = pd.merge(B_movie, B_rating, how='left', on='movieId')

In [18]:
# Release year: the first four-digit number in parentheses,
# e.g. "Fight Club (1999)" -> "1999" (NaN when the title carries no year).
B_data['year'] = B_data['title'].str.extract(r'\((\d{4})\)')
# Title without the "(YYYY)" group(s) and the whitespace in front of them.
B_data['new_title'] = B_data['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)
# Move a trailing ", The" to the front: "Dark Knight, The" -> "The Dark Knight".
# Done as one vectorised regex instead of a per-row lambda: same result for
# every string, but a missing title stays NaN instead of raising, and the
# ratings-expanded frame is not walked row by row in Python.
B_data['new_title'] = B_data['new_title'].str.replace(
    r'(?s)\A(.*), The\Z', r'The \1', regex=True
)

In [26]:
# One row per distinct (title, cleaned title, year) combination.
new_B = B_data[['title', 'new_title', 'year']].drop_duplicates()
# Join key "Title (Year)", or just "Title" when no year was found.
# Built from new_B's own rows: the previous version computed the key for every
# row of B_data and only lined up with new_B through implicit index alignment.
new_B['key_a'] = new_B['new_title'] + new_B['year'].map(
    lambda year: '' if pd.isna(year) else f" ({int(year)})"
)

## Join A and B

In [32]:
# Match dataset A to dataset B on the "Title (Year)" key; the left join keeps
# every row of A, with NaN in the B columns where nothing matched.
merge_A_and_B = pd.merge(new_A, new_B, how='left', left_on='key_b', right_on='key_a')

In [33]:
# Rows of A that found a partner in B (a B title is present).
merge_A_and_B.dropna(subset=['title'])

Unnamed: 0,key_b,date,title,new_title,year,key_a
2,Fight Club (1999),1999,Fight Club (1999),Fight Club,1999,Fight Club (1999)
5,Interstellar (2014),2014,Interstellar (2014),Interstellar,2014,Interstellar (2014)
8,Pulp Fiction (1994),1994,Pulp Fiction (1994),Pulp Fiction,1994,Pulp Fiction (1994)
10,Whiplash (2014),2014,Whiplash (2014),Whiplash,2014,Whiplash (2014)
15,The Dark Knight (2008),2008,"Dark Knight, The (2008)",The Dark Knight,2008,The Dark Knight (2008)
...,...,...,...,...,...,...
719244,Die Frau des Frisörs (2008),2008,Die Frau des Frisörs (2008),Die Frau des Frisörs,2008,Die Frau des Frisörs (2008)
735576,Confessions of a Gangsta (2006),2006,Confessions of a Gangsta (2006),Confessions of a Gangsta,2006,Confessions of a Gangsta (2006)
752416,Apollo Zero (2009),2009,Apollo Zero (2009),Apollo Zero,2009,Apollo Zero (2009)
764628,Family Meeting (2007),2007,Family Meeting (2007),Family Meeting,2007,Family Meeting (2007)


# SUMMARY

ADVICE: high-cardinality features --> it may (or may not) make sense to drop the rare categories

ADVICE FROM TEACHERS
- first drop rows where everything is missing
- for Wed: drop rows where the description is null; do not impute it from other text columns --> future: use an API / web scraping to fill in the description
- for Wed: take a snapshot of the dataset (random/first/last 10k rows)

## ARCHIVE CODE

In [None]:
# Remove the temporary `unique_keys` object from the notebook namespace.
# NOTE(review): raises NameError if the cell that defines it has not been run.
del unique_keys

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Plot histogram of 'minute' column
# sns.histplot(clean_up_data['minute'], kde=False)
# plt.title('Histogram of Minute Column')
# plt.xlabel('Minute')
# plt.ylabel('Frequency')
# plt.xlim(0, 240)
# plt.show()

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer =TfidfVectorizer()

# X = vectorizer.fit_transform(processed_test_data['description'])
# X = pd.DataFrame(
#     X.toarray(),
#     columns = vectorizer.get_feature_names_out()
# )

# X

In [None]:
# neighbors_df.loc['Agnisnaan (1985)']

In [None]:
# clean_up_data.minute #impute mean or median
# clean_up_data.genre_list #impute with new genre category "Other" or "Unknown"
# clean_up_data.actor_list #drop rows

# SUPER SOLUTION: scraping via an API can fill in all missing values
# some fancy graphs in the frontend if possible

In [None]:
# temp = crew[crew['id']==1000001]
#consider relabel those with small proportion to new category "Others"
#embeddings make sense, but we will have little control
#TODO: we check later how we deal with this feature!

# crew.isnull().sum()
# crew.role.unique()

# genres.genre.unique() #OneHotEncoder

# data.date.isnull().count()

# studios.studio.value_counts()/studios.studio.value_counts().sum()
#consider relabel those with small proportion to new category "Others" or drop them
#embeddings make sense, but we will have little control
#TODO: we check later how we deal with this feature!

# countries # categorical, high-cardinality, embeddings
# countries # if we chose studios, this one can be left out due to collinearity

# Incremental Recall
# python surprise
# Qualitative not

# - dropna, fillna: imputer
# - handle outliers (year range): drop those that have

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# # Compute the similarity matrix
# similarity_matrix = cosine_similarity(list(processed_test_data['description_vector']))

# # Convert similarity matrix to a DataFrame
# similarity_df = pd.DataFrame(similarity_matrix,
#                              index=processed_test_data.index,
#                              columns=processed_test_data.index)