In [1]:
import pandas as pd
import os

In [2]:
current_dir = os.getcwd()
parent_dir=os.path.dirname(current_dir)
data= pd.read_csv(os.path.join(parent_dir, 'processed_data/final_data.csv'))
data.head()

Unnamed: 0,id,name,year,description,minute,rating,key,genre_list,actor_list,language,studio_list,crew_dict
0,1000001,Barbie,2023.0,Barbie and Ken are having the time of their li...,114,3.86,Barbie (2023),comedy adventure,"['Margot Robbie', 'Ryan Gosling', 'America Fer...",English,"['LuckyChap Entertainment', 'Heyday Films', 'N...","{'Cinematography': ['Rodrigo Prieto'], 'Compos..."
1,1000002,Parasite,2019.0,"All unemployed, Ki-taek's family takes peculia...",133,4.56,Parasite (2019),comedy thriller drama,"['Song Kang-ho', 'Lee Sun-kyun', 'Cho Yeo-jeon...",Korean,['Barunson E&A'],"{'Cinematography': ['Hong Kyung-pyo'], 'Compos..."
2,1000003,Everything Everywhere All at Once,2022.0,An aging Chinese immigrant is swept up in an i...,140,4.3,Everything Everywhere All at Once (2022),science_fiction adventure comedy action,"['Michelle Yeoh', 'Ke Huy Quan', 'Stephanie Hs...",English,"['IAC Films', 'AGBO', 'Ley Line Entertainment'...","{'Cinematography': ['Larkin Seiple'], 'Compose..."
3,1000004,Fight Club,1999.0,A ticking-time-bomb insomniac and a slippery s...,139,4.27,Fight Club (1999),drama,"['Edward Norton', 'Brad Pitt', 'Helena Bonham ...",English,"['Fox 2000 Pictures', 'Regency Enterprises', '...","{'Cinematography': ['Jeff Cronenweth'], 'Compo..."
4,1000005,La La Land,2016.0,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,La La Land (2016),drama comedy music romance,"['Ryan Gosling', 'Emma Stone', 'John Legend', ...",English,"['Summit Entertainment', 'Black Label Media', ...","{'Cinematography': ['Linus Sandgren'], 'Compos..."


In [3]:
import string
import numpy as np
import re
import ast
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
# from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.sequence import pad_sequences

def text_preprocess(sentence):
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers #TODO
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenize
    stop_words = set(stopwords.words('english'))
    stopwords_removed = [w for w in tokenized_sentence if not w in stop_words]
    v_lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v")
        for word in stopwords_removed
    ]
    n_lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "n")
        for word in v_lemmatized
    ]
    cleaned_sentence = ' '.join(word for word in n_lemmatized)
    return cleaned_sentence

def num_preprocess_year(value):
    scaler = RobustScaler()
    result = scaler.fit_transform(value)
    return result

def num_preprocess_min(value):
    scaler = MinMaxScaler()
    result = scaler.fit_transform(value)
    return result

def fix_data_from_csv(df):
    df[["language", "genre_list"]] = df[["language", "genre_list"]].fillna("")
    return df

######################### NEW INPUT #########################

# changed this function ##
def cat_processing_genre(df, column="genre_list"):
    # Initialize MultiLabelBinarizer and transform the data
    encoder = MultiLabelBinarizer()
    genre_df = pd.DataFrame(encoder.fit_transform(df[column].str.split(' ')),
                                  columns=encoder.classes_,
                                  index=df.index)
    df = pd.concat([df, genre_df], axis=1)

    return df

def cat_processing_lan(df, column="language"):
    """
    Cleans and encodes a single categorical column (e.g., language) using LabelEncoder.
    - Keeps only the first value before delimiters (comma, slash, semicolon, pipe).
    - Encodes categorical values into numerical labels.
    """

    df[column] = df[column].astype(str).str.split(r",|/|;|\|").str[0].str.strip()

    encoder = LabelEncoder()
    df[f"{column}_encoded"] = encoder.fit_transform(df[column])

    return df

def safe_eval_column(df, column_name="crew_dict"):
    """
    Safely converts a column containing string representations of dictionaries into actual dictionaries.
    - If the value is already a dictionary, it remains unchanged.
    - If the value is a valid string dictionary, it is converted using `ast.literal_eval`.
    - If conversion fails, an empty dictionary `{}` is returned.
    """
    def safe_eval(val):
        if isinstance(val, str):
            try:
                return ast.literal_eval(val)  # Convert only if it's a valid string dictionary
            except (ValueError, SyntaxError):
                return {}  # Return empty dictionary if parsing fails
        return val  # Return as is if already a dict

    df[column_name] = df[column_name].apply(safe_eval)
    return df


def extract_roles(df, column_name="crew_dict", roles=None):
    """
    Extracts specific roles (e.g., Director, Writer) from a dictionary column.
    Creates new columns for each role with lists of names.
    """
    if roles is None:
        roles = ["Director", "Writer", "Cinematography", "Composer"]

    for role in roles:
        df[role.lower()] = df[column_name].apply(
            lambda x: x.get(role, []) if isinstance(x, dict) else []
        )

    return df


#def encode_list_column_with_padding(df, column_name, padding_value=0, max_length=2):
    """
    Encodes a column containing lists of categorical values (e.g., directors) and applies padding.
    - Uses LabelEncoder to encode unique values.
# - Pads sequences to a fixed length.
# """
# # Flatten unique values for encoding
# unique_values = sorted(set(value for sublist in df[column_name] for value in sublist))
#
# # Fit LabelEncoder once
# encoder = LabelEncoder()
# encoder.fit(unique_values)
#
# # Create mapping dictionary for faster lookup
# encoding_map = {label: idx for idx, label in enumerate(encoder.classes_)}
#
# # Apply encoding efficiently
# df[f"{column_name}_encoded"] = df[column_name].apply(lambda x: [encoding_map[v] for v in x])
#
# # Apply padding to ensure fixed-length sequences
# df[f"{column_name}_encoded_padded"] = list(
#     pad_sequences(df[f"{column_name}_encoded"], maxlen=max_length, padding='pre', value=padding_value)
# )
#
# return df, len(unique_values)

def data_preproc(df):
    df = fix_data_from_csv(df)
    df['description'] = df['description'].apply(text_preprocess)
    df['year'] = num_preprocess_year(df[['year']])
    df['minute'] = num_preprocess_min(df[['minute']])
    df = cat_processing_genre(df,'genre_list') ## df equal added
    df = cat_processing_lan(df, 'language')
    return df

def data_encode(df):
    # Dictionary Processing
    df = safe_eval_column(df, column_name="crew_dict")
    df = extract_roles(df, column_name="crew_dict")

    # Encoding list columns with padding
    #df, director_length = encode_list_column_with_padding(df, "director")
    #df, writer_length = encode_list_column_with_padding(df, "writer")
    #df, cinematography_length = encode_list_column_with_padding(df, "cinematography")
    #df, composer_length = encode_list_column_with_padding(df, "composer")
#
    return df


In [4]:
data_processed = data_preproc(data)

In [5]:
data_processed.columns

Index(['id', 'name', 'year', 'description', 'minute', 'rating', 'key',
       'genre_list', 'actor_list', 'language', 'studio_list', 'crew_dict', '',
       'action', 'adventure', 'animation', 'comedy', 'crime', 'documentary',
       'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery',
       'romance', 'science_fiction', 'thriller', 'tv_movie', 'war', 'western',
       'language_encoded'],
      dtype='object')

In [6]:
data_encoded = data_encode(data_processed)

In [7]:
data_encoded

Unnamed: 0,id,name,year,description,minute,rating,key,genre_list,actor_list,language,...,science_fiction,thriller,tv_movie,war,western,language_encoded,director,writer,cinematography,composer
0,1000001,Barbie,0.50000,barbie ken time live colorful seemingly perfec...,0.366834,3.86,Barbie (2023),comedy adventure,"['Margot Robbie', 'Ryan Gosling', 'America Fer...",English,...,0,0,0,0,0,38,[Greta Gerwig],"[Noah Baumbach, Greta Gerwig]",[Rodrigo Prieto],"[Mark Ronson, Andrew Wyatt]"
1,1000002,Parasite,0.37500,unemployed kitaeks family take peculiar intere...,0.462312,4.56,Parasite (2019),comedy thriller drama,"['Song Kang-ho', 'Lee Sun-kyun', 'Cho Yeo-jeon...",Korean,...,0,1,0,0,0,77,[Bong Joon-ho],"[Kim Dae-hwan, Bong Joon-ho, Han Jin-won]",[Hong Kyung-pyo],[Jung Jae-il]
2,1000003,Everything Everywhere All at Once,0.46875,age chinese immigrant sweep insane adventure a...,0.497487,4.30,Everything Everywhere All at Once (2022),science_fiction adventure comedy action,"['Michelle Yeoh', 'Ke Huy Quan', 'Stephanie Hs...",English,...,1,0,0,0,0,38,"[Daniel Scheinert, Daniel Kwan]","[Daniel Kwan, Daniel Scheinert]",[Larkin Seiple],"[Ryan Lott, Rafiq Bhatia, Ian Chang]"
3,1000004,Fight Club,-0.25000,tickingtimebomb insomniac slippery soap salesm...,0.492462,4.27,Fight Club (1999),drama,"['Edward Norton', 'Brad Pitt', 'Helena Bonham ...",English,...,0,0,0,0,0,38,[David Fincher],"[Jim Uhls, Andrew Kevin Walker]",[Jeff Cronenweth],"[John King, Michael Simpson]"
4,1000005,La La Land,0.28125,mia aspire actress serve latte movie star audi...,0.442211,4.09,La La Land (2016),drama comedy music romance,"['Ryan Gosling', 'Emma Stone', 'John Legend', ...",English,...,0,0,0,0,0,38,[Damien Chazelle],[Damien Chazelle],[Linus Sandgren],[Justin Hurwitz]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406440,1941583,日本統一56,,one day special investigation team suddenly be...,0.145729,,日本統一56,,[],English,...,0,0,0,0,0,38,[],[],[],[]
406441,1941584,日本統一57,,hamadas death seem yuseikais problem would res...,0.145729,,日本統一57,,[],English,...,0,0,0,0,0,38,[],[],[],[]
406442,1941585,日本統一58,,makimoto tomohiro waki arimura tasuku nagaoka ...,0.145729,,日本統一58,,[],English,...,0,0,0,0,0,38,[],[],[],[]
406443,1941586,日本統一59,,one day himuro yasufu motomiya tamura yoshiyuk...,0.150754,,日本統一59,,[],English,...,0,0,0,0,0,38,[],[],[],[]


In [8]:
#df = pd.DataFrame(data_encoded[0])
df = data_encoded

In [9]:
df.isnull().sum()

id                       0
name                     0
year                 15719
description              0
minute                   0
rating              331222
key                      0
genre_list               0
actor_list               0
language                 0
studio_list              0
crew_dict            38988
                         0
action                   0
adventure                0
animation                0
comedy                   0
crime                    0
documentary              0
drama                    0
family                   0
fantasy                  0
history                  0
horror                   0
music                    0
mystery                  0
romance                  0
science_fiction          0
thriller                 0
tv_movie                 0
war                      0
western                  0
language_encoded         0
director                 0
writer                   0
cinematography           0
composer                 0
d

# Get Features Ready

## Create TFIDF Matrix

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

tfidf_dim=2500

def vectorize_descriptions(df, text_column):
    """
    Vectorize movie descriptions using TF-IDF.

    Args:
        df: The DataFrame containing movie descriptions.
        text_column: The column in the DataFrame that contains descriptions.

    Returns:
        tfidf_matrix: The TF-IDF matrix.
        vectorizer: The fitted TfidfVectorizer object (useful if needed later).
    """
    vectorizer = TfidfVectorizer(max_features=tfidf_dim)
    tfidf_matrix = vectorizer.fit_transform(df[text_column])
    tfidf_array = tfidf_matrix.toarray()
    return tfidf_array


In [13]:
text_df = df[['key', 'description']]
text_df

Unnamed: 0,key,description
0,Barbie (2023),barbie ken time live colorful seemingly perfec...
1,Parasite (2019),unemployed kitaeks family take peculiar intere...
2,Everything Everywhere All at Once (2022),age chinese immigrant sweep insane adventure a...
3,Fight Club (1999),tickingtimebomb insomniac slippery soap salesm...
4,La La Land (2016),mia aspire actress serve latte movie star audi...
...,...,...
406440,日本統一56,one day special investigation team suddenly be...
406441,日本統一57,hamadas death seem yuseikais problem would res...
406442,日本統一58,makimoto tomohiro waki arimura tasuku nagaoka ...
406443,日本統一59,one day himuro yasufu motomiya tamura yoshiyuk...


In [14]:
tfidf_array = vectorize_descriptions(text_df, 'description')

## Language

In [15]:
num_languages = df.language_encoded.nunique()
num_languages

172

In [16]:
language_data = df.language_encoded
language_data_np = np.array(language_data, dtype=np.int32).reshape(-1, 1)
print(language_data_np)

[[38]
 [77]
 [38]
 ...
 [38]
 [38]
 [38]]


## Genre

In [17]:
num_genres= 19

In [None]:
genre_columns = ['action', 'adventure', 'animation', 'comedy', 'crime', 'documentary',
                 'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery',
                 'romance', 'science_fiction', 'thriller', 'tv_movie', 'war', 'western']
genres_data_np = df[genre_columns].values
print(genres_data_np)

[[0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [19]:
import tensorflow as tf

def build_encoder(tfidf_dim, num_languages, num_genres):
    """
    Builds an encoder model that fuses:
      - A TF-IDF vector input (continuous, shape: [tfidf_dim])
      - A language input (integer, shape: [1])
      - A one-hot encoded genres input (shape: [num_genres])

    Parameters:
      tfidf_dim (int): Dimensionality of the TF-IDF vector (e.g., 2500).
      num_languages (int): Total number of language categories (max language index + 1).
      num_genres (int): Number of genres (should be 19 for your columns).

    Returns:
      encoder_model (tf.keras.Model): A model that outputs a fused latent embedding.
    """

    # -------------------------
    # TF-IDF Branch
    # -------------------------
    tfidf_input = tf.keras.layers.Input(shape=(tfidf_dim,), name="tfidf_input")
    tfidf_dense = tf.keras.layers.Dense(128, activation='relu', name="tfidf_dense")(tfidf_input)

    # -------------------------
    # Language Branch
    # -------------------------
    language_input = tf.keras.layers.Input(shape=(1,), name="language_input")
    language_embedding = tf.keras.layers.Embedding(
        input_dim=num_languages,
        output_dim=8,
        name="language_embedding"
    )(language_input)
    language_vector = tf.keras.layers.Flatten(name="language_flatten")(language_embedding)

    # -------------------------
    # Genres Branch (One-hot encoded)
    # -------------------------
    genre_input = tf.keras.layers.Input(shape=(num_genres,), name="genre_input")
    # Optionally, pass the one-hot vector through a dense layer to learn a compressed representation.
    genre_dense = tf.keras.layers.Dense(32, activation='relu', name="genre_dense")(genre_input)

    # -------------------------
    # Merge Branches
    # -------------------------
    # Concatenate the outputs of all branches.
    merged = tf.keras.layers.concatenate([tfidf_dense, language_vector, genre_dense], name="merged_features")
    x = tf.keras.layers.Dense(64, activation='relu', name="dense_1")(merged)
    final_embedding = tf.keras.layers.Dense(32, activation='relu', name="final_embedding")(x)

    # Build the encoder model
    encoder_model = tf.keras.models.Model(
        inputs=[tfidf_input, language_input, genre_input],
        outputs=final_embedding
    )

    return encoder_model


In [20]:
import tensorflow as tf

def build_autoencoder(tfidf_dim, num_languages, num_genres):
    """
    Builds an autoencoder that uses:
      - The encoder from build_encoder to produce a 32-d latent embedding.
      - Three decoder branches to reconstruct:
          A. The original TF-IDF vector.
          B. The language (as a probability distribution over num_languages).
          C. The one-hot encoded genres vector.

    The autoencoder is compiled with MSE loss for TF-IDF, sparse categorical crossentropy for language,
    and binary crossentropy for genres.
    """

    # Define the inputs (they will be passed to both encoder and as targets later)
    tfidf_input = tf.keras.layers.Input(shape=(tfidf_dim,), name="tfidf_input")
    language_input = tf.keras.layers.Input(shape=(1,), name="language_input")
    genre_input = tf.keras.layers.Input(shape=(num_genres,), name="genre_input")

    # Build the encoder and get the latent representation.
    encoder = build_encoder(tfidf_dim, num_languages, num_genres)
    latent = encoder([tfidf_input, language_input, genre_input])

    # -------------------------
    # Decoder for TF-IDF reconstruction
    # -------------------------
    decoder_tfidf = tf.keras.layers.Dense(64, activation='relu', name="decoder_tfidf_dense")(latent)
    tfidf_output = tf.keras.layers.Dense(tfidf_dim, activation='relu', name="tfidf_output")(decoder_tfidf)

    # -------------------------
    # Decoder for Language reconstruction
    # -------------------------
    decoder_language = tf.keras.layers.Dense(16, activation='relu', name="decoder_language_dense")(latent)
    # Output is a probability distribution over languages
    language_output = tf.keras.layers.Dense(num_languages, activation='softmax', name="language_output")(decoder_language)

    # -------------------------
    # Decoder for Genres reconstruction
    # -------------------------
    decoder_genre = tf.keras.layers.Dense(16, activation='relu', name="decoder_genre_dense")(latent)
    # For multi-label, we use sigmoid activation; if it's strictly one-hot, you could use softmax.
    genre_output = tf.keras.layers.Dense(num_genres, activation='sigmoid', name="genre_output")(decoder_genre)

    # Build the autoencoder model.
    autoencoder_model = tf.keras.models.Model(
        inputs=[tfidf_input, language_input, genre_input],
        outputs=[tfidf_output, language_output, genre_output],
        name="autoencoder"
    )

    # Compile the autoencoder:
    # - For TF-IDF, we use mean squared error.
    # - For language, we use sparse categorical crossentropy (the target should be an integer).
    # - For genres, binary crossentropy is appropriate for multi-label reconstruction.
    autoencoder_model.compile(
        optimizer='adam',
        loss={
            'tfidf_output': 'mse',
            'language_output': 'sparse_categorical_crossentropy',
            'genre_output': 'binary_crossentropy'
        },
        loss_weights={
            'tfidf_output': 1.0,
            'language_output': 1.0,
            'genre_output': 1.0
        }
    )

    return autoencoder_model, encoder

# Example usage:
tfidf_dim = 2500      # Dimensionality of your TF-IDF vectors
num_languages = 172   # For example, if your full dataset has 172 unique languages (0..171)
num_genres = 19       # One column per genre: action, adventure, ..., western


In [22]:
autoencoder_model, encoder_model = build_autoencoder(tfidf_dim, num_languages, num_genres)
autoencoder_model.summary()

Model: "autoencoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 tfidf_input (InputLayer)       [(None, 2500)]       0           []                               
                                                                                                  
 language_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 genre_input (InputLayer)       [(None, 19)]         0           []                               
                                                                                                  
 model_1 (Functional)           (None, 32)           335040      ['tfidf_input[0][0]',            
                                                                  'language_input[0][0]'

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Define the early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',       # Monitor the validation loss
    patience=3,               # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity
)

history = autoencoder_model.fit(
    x=[tfidf_array, language_data_np, genres_data_np],
    y=[tfidf_array, language_data_np, genres_data_np],
    batch_size=16,
    epochs=50,
    callbacks=[early_stop]
)

Epoch 1/10


2025-02-02 21:00:11.556632: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x54a7c30d0>

In [24]:
# xtract latent embeddings using the encoder model.
latent_embeddings = encoder_model.predict([tfidf_array, language_data_np, genres_data_np])
print("Latent embeddings shape:", latent_embeddings.shape)

Latent embeddings shape: (406445, 32)


In [32]:
import pickle

file_path = os.path.join(parent_dir, "processed_data", "latent_embeddings.pkl")

with open(file_path, "wb") as f:
    pickle.dump(latent_embeddings, f)

In [25]:
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

In [55]:
# Build a KNN model for similarity search using the latent embeddings.
n_neighbors = 5
knn_model = NearestNeighbors(n_neighbors=n_neighbors+1, metric='cosine')
knn_model.fit(latent_embeddings)

In [73]:
user_input = "The bourne identity"  # Example input

# Convert both the user input and the DataFrame names to lowercase for case-insensitive matching.
matched_rows = df[df["name"].str.lower() == user_input.lower()]

if matched_rows.empty:
    print("Movie not found.")
else:
    # Get the first matching index (handle multiple matches as needed)
    sample_index = matched_rows.index[0]
    print(f"Found movie '{user_input}'.")

    # Retrieve KNN results.
    distances, indices = knn_model.kneighbors(latent_embeddings[sample_index].reshape(1, -1))

    # Convert to 1D arrays.
    indices = indices.flatten()
    distances = distances.flatten()

    # Filter out the queried movie (its index should match sample_index).
    filtered_recs = [(idx, dist) for idx, dist in zip(indices, distances) if idx != sample_index]

    # If the filtered recommendations list is empty or too short, you may want to handle that.
    if not filtered_recs:
        print("No recommendations found after filtering out the queried movie.")
    else:
        print("Recommended Movies:")
        for idx, dist in filtered_recs:
            movie_name = df.loc[idx, "name"]
            print(f"- {movie_name} (Distance: {dist:.6f})")


Found movie 'The bourne identity'.
Recommended Movies:
- Conspiracy Theory (Distance: 0.000006)
- The Stranger (Distance: 0.000006)
- Oldboy (Distance: 0.000006)
- The Bourne Ultimatum (Distance: 0.000006)
- Flashpoint (Distance: 0.000006)


### Directors

In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_list_column_first(df, column_name, reserve_zero=True, default_value=0):
    """
    Encodes a column containing lists of categorical values (e.g., directors) and
    extracts only the first encoded value from each list.

    - Uses LabelEncoder to encode unique values.
    - Optionally offsets encoding so that 0 is reserved for a default value
      (i.e., valid values start at 1).
    - Instead of padding, extracts only the first director from each list.

    Parameters:
      df: pandas DataFrame.
      column_name: Name of the column containing lists to be encoded.
      reserve_zero: If True, adds 1 to each encoded value so that 0 is reserved for default.
      default_value: The value to use if the list is empty (default is 0).

    Returns:
      The modified DataFrame with a new column <column_name>_encoded_first.
    """
    # Flatten unique values for encoding
    unique_values = sorted(set(value for sublist in df[column_name] for value in sublist))

    # Fit LabelEncoder on the unique values
    encoder = LabelEncoder()
    encoder.fit(unique_values)

    # Create a mapping dictionary for faster lookup
    encoding_map = {label: idx for idx, label in enumerate(encoder.classes_)}

    if reserve_zero:
        # Offset each encoded value by 1 so that 0 is reserved for default.
        df[f"{column_name}_encoded"] = df[column_name].apply(lambda x: [encoding_map[v] + 1 for v in x])
    else:
        df[f"{column_name}_encoded"] = df[column_name].apply(lambda x: [encoding_map[v] for v in x])

    # Instead of padding, take only the first element if available; otherwise, use default_value.
    df[f"{column_name}_encoded_first"] = df[f"{column_name}_encoded"].apply(
        lambda x: x[0] if len(x) > 0 else default_value
    )

    return df


In [None]:
def calculate_num_unique(df, column_name):
    # Assuming your encoded values are stored in <column_name>_encoded
    # and you offset them (if reserve_zero was True).
    unique_vals = set(val for sublist in df[f"{column_name}_encoded"] for val in sublist)
    return len(unique_vals)


In [None]:
df = encode_list_column_first(df, "director", reserve_zero=True, default_value=0)
df.head()

In [None]:
num_directors = calculate_num_unique(df, "director")
print("Number of unique directors:", num_directors)

In [None]:
director_data = df.director_encoded_first
director_data_np = np.array(director_data, dtype=np.int32).reshape(-1, 1)
print(director_data_np)

# Encode: Directors, Tfidf, language, genre

In [None]:
import tensorflow as tf

def build_encoder2(tfidf_dim, num_languages, num_genres, num_directors):
    """
    Builds an encoder model that fuses:
      - A TF-IDF vector input (continuous, shape: [tfidf_dim])
      - A language input (integer, shape: [1])
      - A one-hot encoded genres input (shape: [num_genres])
      - A director input (integer, shape: [1])

    Parameters:
      tfidf_dim (int): Dimensionality of the TF-IDF vector (e.g., 2500).
      num_languages (int): Total number of language categories (max language index + 1).
      num_genres (int): Number of genres (should be 19 for your columns).
      num_directors (int): Total number of unique director tokens (max director index + 1).

    Returns:
      encoder_model (tf.keras.Model): A model that outputs a fused latent embedding.
    """
    # -------------------------
    # TF-IDF Branch
    # -------------------------
    tfidf_input = tf.keras.layers.Input(shape=(tfidf_dim,), name="tfidf_input")
    tfidf_dense = tf.keras.layers.Dense(128, activation='relu', name="tfidf_dense")(tfidf_input)

    # -------------------------
    # Language Branch
    # -------------------------
    language_input = tf.keras.layers.Input(shape=(1,), name="language_input")
    language_embedding = tf.keras.layers.Embedding(
        input_dim=num_languages,
        output_dim=8,
        name="language_embedding"
    )(language_input)
    language_vector = tf.keras.layers.Flatten(name="language_flatten")(language_embedding)

    # -------------------------
    # Genres Branch (One-hot encoded)
    # -------------------------
    genre_input = tf.keras.layers.Input(shape=(num_genres,), name="genre_input")
    # Optionally, pass the one-hot vector through a dense layer.
    genre_dense = tf.keras.layers.Dense(32, activation='relu', name="genre_dense")(genre_input)

    # -------------------------
    # Director Branch
    # -------------------------
    director_input = tf.keras.layers.Input(shape=(1,), name="director_input")
    director_embedding = tf.keras.layers.Embedding(
        input_dim=num_directors,
        output_dim=16,
        name="director_embedding"
    )(director_input)
    director_vector = tf.keras.layers.Flatten(name="director_flatten")(director_embedding)

    # -------------------------
    # Merge All Branches
    # -------------------------
    merged = tf.keras.layers.concatenate(
        [tfidf_dense, language_vector, genre_dense, director_vector],
        name="merged_features"
    )
    x = tf.keras.layers.Dense(64, activation='relu', name="dense_1")(merged)
    final_embedding = tf.keras.layers.Dense(32, activation='relu', name="final_embedding")(x)

    # Build the encoder model with 4 inputs.
    encoder_model = tf.keras.models.Model(
        inputs=[tfidf_input, language_input, genre_input, director_input],
        outputs=final_embedding,
        name="encoder"
    )

    return encoder_model


def build_autoencoder2(tfidf_dim, num_languages, num_genres, num_directors):
    """
    Builds an autoencoder that uses:
      - The encoder from build_encoder to produce a 32-d latent embedding.
      - Four decoder branches to reconstruct:
          A. The original TF-IDF vector.
          B. The language (as a probability distribution over num_languages).
          C. The one-hot encoded genres vector.
          D. The director (as a probability distribution over num_directors).

    The autoencoder is compiled with:
      - MSE loss for TF-IDF.
      - Sparse categorical crossentropy for language and director.
      - Binary crossentropy for genres.
    """
    # Define inputs (to be used as both inputs and targets).
    tfidf_input = tf.keras.layers.Input(shape=(tfidf_dim,), name="tfidf_input")
    language_input = tf.keras.layers.Input(shape=(1,), name="language_input")
    genre_input = tf.keras.layers.Input(shape=(num_genres,), name="genre_input")
    director_input = tf.keras.layers.Input(shape=(1,), name="director_input")

    # Build the encoder and get the latent representation.
    encoder = build_encoder2(tfidf_dim, num_languages, num_genres, num_directors)
    latent = encoder([tfidf_input, language_input, genre_input, director_input])

    # -------------------------
    # Decoder for TF-IDF reconstruction
    # -------------------------
    decoder_tfidf = tf.keras.layers.Dense(64, activation='relu', name="decoder_tfidf_dense")(latent)
    tfidf_output = tf.keras.layers.Dense(tfidf_dim, activation='relu', name="tfidf_output")(decoder_tfidf)

    # -------------------------
    # Decoder for Language reconstruction
    # -------------------------
    decoder_language = tf.keras.layers.Dense(16, activation='relu', name="decoder_language_dense")(latent)
    language_output = tf.keras.layers.Dense(num_languages, activation='softmax', name="language_output")(decoder_language)

    # -------------------------
    # Decoder for Genres reconstruction
    # -------------------------
    decoder_genre = tf.keras.layers.Dense(16, activation='relu', name="decoder_genre_dense")(latent)
    genre_output = tf.keras.layers.Dense(num_genres, activation='sigmoid', name="genre_output")(decoder_genre)

    # -------------------------
    # Decoder for Director reconstruction
    # -------------------------
    decoder_director = tf.keras.layers.Dense(16, activation='relu', name="decoder_director_dense")(latent)
    director_output = tf.keras.layers.Dense(num_directors, activation='softmax', name="director_output")(decoder_director)

    # Build the autoencoder model with four inputs and four outputs.
    autoencoder_model = tf.keras.models.Model(
        inputs=[tfidf_input, language_input, genre_input, director_input],
        outputs=[tfidf_output, language_output, genre_output, director_output],
        name="autoencoder"
    )

    # Compile the model.
    autoencoder_model.compile(
        optimizer='adam',
        loss={
            'tfidf_output': 'mse',
            'language_output': 'sparse_categorical_crossentropy',
            'genre_output': 'binary_crossentropy',
            'director_output': 'sparse_categorical_crossentropy'
        },
        loss_weights={
            'tfidf_output': 1.0,
            'language_output': 1.0,
            'genre_output': 1.0,
            'director_output': 1.0
        }
    )

    return autoencoder_model, encoder

# Example usage:
tfidf_dim = 2500      # Dimensionality of your TF-IDF vectors
num_languages = 172   # e.g., if your full dataset has 172 unique languages (indices 0..171)
num_genres = 19       # One column per genre: action, adventure, ..., western
num_directors = num_directors

autoencoder_model2, encoder_model2 = build_autoencoder2(tfidf_dim, num_languages, num_genres, num_directors)
autoencoder_model2.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Define the early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',       # Monitor the validation loss
    patience=3,               # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity
)

history2 = autoencoder_model2.fit(
    x=[tfidf_array, language_data_np, genres_data_np, director_data_np],  # Inputs
    y=[tfidf_array, language_data_np, genres_data_np, director_data_np],  # Targets
    batch_size=16,
    epochs=50,
    callbacks=[early_stop]
)

# K-means and Knn

🚀 Full Workflow for K-Means + KNN Movie Recommendation
This approach:

Prepares data (TF-IDF + numerical/categorical features) ✅
Clusters movies using K-Means ✅
Applies KNN only within the closest cluster ✅
Returns top-K similar movies efficiently ✅

**Use sparse instead of df**
✅ Keeps TF-IDF sparse (NO toarray() conversion) → Saves RAM
✅ Uses hstack() for memory-efficient feature merging
✅ Clusters movies using MiniBatchKMeans to reduce computation
✅ Limits KNN search to relevant clusters → 10x faster!

In [15]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

def expand_list_columns(df, list_columns, max_elements=2):
    """
    Expand list-type columns into separate numerical columns.

    Args:
        df: Pandas DataFrame
        list_columns: List of column names that contain lists
        max_elements: Number of elements to extract from each list (default=2)

    Returns:
        DataFrame with expanded columns.
    """
    for col in list_columns:
        df[col] = df[col].apply(lambda x: x if isinstance(x, list) else [0] * max_elements)  # Handle NaNs
        for i in range(max_elements):
            df[f'{col}_{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else 0)  # Extract element i
        df.drop(columns=[col], inplace=True)  # Drop original column
    return df

def preprocess_features_sparse(tfidf_matrix, X):
    """
    Process and concatenate TF-IDF features with numerical/categorical features (sparse version).
    """
    # ✅ Keep TF-IDF sparse
    tfidf_sparse = csr_matrix(tfidf_matrix)  # No conversion to dense!

    # ✅ Drop 'key' column before merging
    X_numeric = X.drop(columns=['key', 'name'])

    # ✅ Convert list-type columns into separate numerical columns
    list_columns = [
        'director_encoded_padded', 'writer_encoded_padded',
        'cinematography_encoded_padded', 'composer_encoded_padded'
    ]
    X_numeric = expand_list_columns(X_numeric, list_columns)

    # ✅ Convert X_numeric to sparse matrix
    X_numeric_sparse = csr_matrix(X_numeric.values)

    # ✅ Concatenate using Scipy `hstack()` (efficient!)
    X_final = hstack([tfidf_sparse, X_numeric_sparse])

    return X_final


In [19]:
from sklearn.cluster import MiniBatchKMeans

def cluster_movies_kmeans_sparse(X_final, n_clusters=200):
    """
    Cluster movies using K-Means on sparse matrix.
    """
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=1024)
    clusters = kmeans.fit_predict(X_final)

    return kmeans, clusters


In [20]:
def get_similar_movies_kmeans_knn_sparse(movie_name, X_final, df, kmeans, name_column, n_neighbors=5):
    """
    Find similar movies using KNN within the assigned cluster from K-Means (Sparse Matrix version).
    Handles case-insensitive movie name search.
    """
    # ✅ Convert movie names in DataFrame to lowercase for case-insensitive search
    df['lowercase_name'] = df[name_column].str.lower()

    # ✅ Convert input movie name to lowercase
    movie_name_lower = movie_name.lower()

    # ✅ Verify movie exists (case-insensitive search)
    if movie_name_lower not in df['lowercase_name'].values:
        raise ValueError(f"Movie '{movie_name}' not found in the DataFrame.")

    # ✅ Get index of input movie
    idx = df[df['lowercase_name'] == movie_name_lower].index[0]

    # ✅ Predict the cluster for the input movie
    movie_cluster = kmeans.predict(X_final[idx].reshape(1, -1))[0]

    # ✅ Get indices of movies in the same cluster
    cluster_indices = df[df["cluster"] == movie_cluster].index

    # ✅ Apply KNN only within the cluster
    knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
    knn_model.fit(X_final[cluster_indices])

    # ✅ Find K-nearest neighbors
    distances, indices = knn_model.kneighbors(X_final[idx].reshape(1, -1), n_neighbors=n_neighbors + 1)

    return df.iloc[indices.flatten()[1:]][name_column].tolist()  # Exclude input movie itself


In [44]:
X_final = preprocess_features_sparse(tfidf_matrix, X)  # This should NOT crash

In [45]:
kmeans, clusters = cluster_movies_kmeans_sparse(X_final)


  super()._check_params_vs_input(X, default_n_init=3)


In [47]:
X["cluster"] = clusters

In [50]:
recommendations = get_similar_movies_kmeans_knn_sparse("the bourne ultimatum", X_final, X, kmeans, "name", n_neighbors=5)

for movie in recommendations:
    print(f"Similar Movie: {movie}")


Similar Movie: The Wolf of Wall Street
Similar Movie: Requiem for a Dream
Similar Movie: Easy A
Similar Movie: Bring It On
Similar Movie: Zoolander


## Feature Selection

In [33]:
X.columns

Index(['name', 'minute', 'key', 'action', 'adventure', 'animation', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'history',
       'horror', 'music', 'mystery', 'romance', 'science_fiction', 'thriller',
       'tv_movie', 'war', 'western', 'language_encoded',
       'director_encoded_padded', 'writer_encoded_padded',
       'cinematography_encoded_padded', 'composer_encoded_padded', 'cluster',
       'lowercase_name'],
      dtype='object')

In [29]:
X_new = X[['name', 'key', 'action', 'adventure', 'animation', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'history',
       'horror', 'music', 'mystery', 'romance', 'science_fiction', 'thriller',
       'tv_movie', 'war', 'western']]

In [30]:
def preprocess_features_sparse_wolist(tfidf_matrix, X):
    """
    Process and concatenate TF-IDF features with numerical/categorical features (sparse version).
    """
    # ✅ Keep TF-IDF sparse
    tfidf_sparse = csr_matrix(tfidf_matrix)  # No conversion to dense!

    # ✅ Drop 'key' column before merging
    X_numeric = X.drop(columns=['key', 'name'])

    # ✅ Convert X_numeric to sparse matrix
    X_numeric_sparse = csr_matrix(X_numeric.values)

    # ✅ Concatenate using Scipy `hstack()` (efficient!)
    X_final = hstack([tfidf_sparse, X_numeric_sparse])

    return X_final


In [31]:
X_final_new = preprocess_features_sparse_wolist(tfidf_matrix, X_new)
kmeans, clusters = cluster_movies_kmeans_sparse(X_final_new)
X["cluster"] = clusters


  super()._check_params_vs_input(X, default_n_init=3)


In [37]:
recommendations = get_similar_movies_kmeans_knn_sparse("the Bourne supremacy", X_final_new, X, kmeans, "name", n_neighbors=5)

for movie in recommendations:
    print(f"Similar Movie: {movie}")

Similar Movie: Hell or High Water
Similar Movie: Pig
Similar Movie: Civil War
Similar Movie: Total Recall
Similar Movie: A Beautiful Mind


In [None]:
df[df["name"]]

In [None]:
#import tensorflow as tf
from keras.layers import Input, Dense, Embedding, Flatten, Concatenate, BatchNormalization, Dropout
from keras.models import Model

def build_autoencoder(num_actors, num_directors, num_numeric, num_tfidf, num_genres, num_languages, embedding_dim=50, encoding_dim=64):
    """
    Build an autoencoder model incorporating embeddings for categorical features,
    dense layers for numerical and high-dimensional TF-IDF features, and separate
    inputs for one-hot encoded genres and languages.
    """
    # Input layers
    actor_input = Input(shape=(1,), name="actor_input")
    director_input = Input(shape=(1,), name="director_input")
    numeric_input = Input(shape=(num_numeric,), name="numeric_features")
    tfidf_input = Input(shape=(num_tfidf,), name="tfidf_features")
    genres_input = Input(shape=(num_genres,), name="genres_features")
    languages_input = Input(shape=(num_languages,), name="languages_features")

    # Embedding layers for categorical variables
    actor_embedding = Embedding(input_dim=num_actors + 1, output_dim=embedding_dim, name="actor_embedding")(actor_input)
    director_embedding = Embedding(input_dim=num_directors + 1, output_dim=embedding_dim, name="director_embedding")(director_input)

    # Flatten embeddings
    actor_embedding_flat = Flatten()(actor_embedding)
    director_embedding_flat = Flatten()(director_embedding)

    # Dense layer for TF-IDF features (dimensionality reduction)
    tfidf_dense = Dense(128, activation='relu', name="tfidf_dense_layer")(tfidf_input)

    # Concatenate all features
    concatenated = Concatenate()([
        actor_embedding_flat,
        director_embedding_flat,
        numeric_input,
        tfidf_dense,
        genres_input,
        languages_input
    ])

    # Encoder
    encoded = Dense(256, activation='relu')(concatenated)
    encoded = BatchNormalization()(encoded)
    encoded = Dropout(0.3)(encoded)
    encoded = Dense(128, activation='relu')(encoded)
    bottleneck = Dense(encoding_dim, activation='relu', name="bottleneck_layer")(encoded)  # Latent space

    # Decoder
    decoded = Dense(128, activation='relu')(bottleneck)
    decoded = BatchNormalization()(decoded)
    decoded = Dropout(0.3)(decoded)
    decoded = Dense(256, activation='relu')(decoded)
    output_layer = Dense(num_numeric + num_tfidf + num_genres + num_languages, activation='sigmoid')(decoded)  # Reconstruct all inputs except categorical IDs

    # Define models
    autoencoder = Model(inputs=[actor_input, director_input, numeric_input, tfidf_input, genres_input, languages_input], outputs=output_layer)
    encoder = Model(inputs=[actor_input, director_input, numeric_input, tfidf_input, genres_input, languages_input], outputs=bottleneck)

    # Compile the model
    autoencoder.compile(optimizer='adam', loss='mse')

    return autoencoder, encoder


ModuleNotFoundError: No module named 'tensorflow.keras'

In [51]:
tfidf_matrix

<406445x335304 sparse matrix of type '<class 'numpy.float64'>'
	with 10796785 stored elements in Compressed Sparse Row format>