In [1]:
import pandas as pd
import os

In [2]:
current_dir = os.getcwd()
parent_dir=os.path.dirname(current_dir)
data= pd.read_csv(os.path.join(parent_dir, 'processed_data/final_data.csv'))
data.head()

Unnamed: 0,id,name,year,description,minute,rating,key,genre_list,actor_list,language,studio_list,crew_dict
0,1000001,Barbie,2023.0,Barbie and Ken are having the time of their li...,114,3.86,Barbie (2023),comedy adventure,"['Margot Robbie', 'Ryan Gosling', 'America Fer...",English,"['LuckyChap Entertainment', 'Heyday Films', 'N...","{'Cinematography': ['Rodrigo Prieto'], 'Compos..."
1,1000002,Parasite,2019.0,"All unemployed, Ki-taek's family takes peculia...",133,4.56,Parasite (2019),comedy thriller drama,"['Song Kang-ho', 'Lee Sun-kyun', 'Cho Yeo-jeon...",Korean,['Barunson E&A'],"{'Cinematography': ['Hong Kyung-pyo'], 'Compos..."
2,1000003,Everything Everywhere All at Once,2022.0,An aging Chinese immigrant is swept up in an i...,140,4.3,Everything Everywhere All at Once (2022),science_fiction adventure comedy action,"['Michelle Yeoh', 'Ke Huy Quan', 'Stephanie Hs...",English,"['IAC Films', 'AGBO', 'Ley Line Entertainment'...","{'Cinematography': ['Larkin Seiple'], 'Compose..."
3,1000004,Fight Club,1999.0,A ticking-time-bomb insomniac and a slippery s...,139,4.27,Fight Club (1999),drama,"['Edward Norton', 'Brad Pitt', 'Helena Bonham ...",English,"['Fox 2000 Pictures', 'Regency Enterprises', '...","{'Cinematography': ['Jeff Cronenweth'], 'Compo..."
4,1000005,La La Land,2016.0,"Mia, an aspiring actress, serves lattes to mov...",129,4.09,La La Land (2016),drama comedy music romance,"['Ryan Gosling', 'Emma Stone', 'John Legend', ...",English,"['Summit Entertainment', 'Black Label Media', ...","{'Cinematography': ['Linus Sandgren'], 'Compos..."


In [3]:
import string
import numpy as np
import re
import ast
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
# from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.sequence import pad_sequences

def text_preprocess(sentence):
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers #TODO
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenize
    stop_words = set(stopwords.words('english'))
    stopwords_removed = [w for w in tokenized_sentence if not w in stop_words]
    v_lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v")
        for word in stopwords_removed
    ]
    n_lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "n")
        for word in v_lemmatized
    ]
    cleaned_sentence = ' '.join(word for word in n_lemmatized)
    return cleaned_sentence

def num_preprocess_year(value):
    scaler = RobustScaler()
    result = scaler.fit_transform(value)
    return result

def num_preprocess_min(value):
    scaler = MinMaxScaler()
    result = scaler.fit_transform(value)
    return result

def fix_data_from_csv(df):
    df[["language", "genre_list"]] = df[["language", "genre_list"]].fillna("")
    return df

######################### NEW INPUT #########################

# changed this function ##
def cat_processing_genre(df, column="genre_list"):
    # Initialize MultiLabelBinarizer and transform the data
    encoder = MultiLabelBinarizer()
    genre_df = pd.DataFrame(encoder.fit_transform(df[column].str.split(' ')),
                                  columns=encoder.classes_,
                                  index=df.index)
    df = pd.concat([df, genre_df], axis=1)

    return df

def cat_processing_lan(df, column="language"):
    """
    Cleans and encodes a single categorical column (e.g., language) using LabelEncoder.
    - Keeps only the first value before delimiters (comma, slash, semicolon, pipe).
    - Encodes categorical values into numerical labels.
    """

    df[column] = df[column].astype(str).str.split(r",|/|;|\|").str[0].str.strip()

    encoder = LabelEncoder()
    df[f"{column}_encoded"] = encoder.fit_transform(df[column])

    return df

def safe_eval_column(df, column_name="crew_dict"):
    """
    Safely converts a column containing string representations of dictionaries into actual dictionaries.
    - If the value is already a dictionary, it remains unchanged.
    - If the value is a valid string dictionary, it is converted using `ast.literal_eval`.
    - If conversion fails, an empty dictionary `{}` is returned.
    """
    def safe_eval(val):
        if isinstance(val, str):
            try:
                return ast.literal_eval(val)  # Convert only if it's a valid string dictionary
            except (ValueError, SyntaxError):
                return {}  # Return empty dictionary if parsing fails
        return val  # Return as is if already a dict

    df[column_name] = df[column_name].apply(safe_eval)
    return df


def extract_roles(df, column_name="crew_dict", roles=None):
    """
    Extracts specific roles (e.g., Director, Writer) from a dictionary column.
    Creates new columns for each role with lists of names.
    """
    if roles is None:
        roles = ["Director", "Writer", "Cinematography", "Composer"]

    for role in roles:
        df[role.lower()] = df[column_name].apply(
            lambda x: x.get(role, []) if isinstance(x, dict) else []
        )

    return df


def encode_list_column_with_padding(df, column_name, padding_value=0, max_length=2):
    """
    Encodes a column containing lists of categorical values (e.g., directors) and applies padding.
    - Uses LabelEncoder to encode unique values.
    - Pads sequences to a fixed length.
    """
    # Flatten unique values for encoding
    unique_values = sorted(set(value for sublist in df[column_name] for value in sublist))

    # Fit LabelEncoder once
    encoder = LabelEncoder()
    encoder.fit(unique_values)

    # Create mapping dictionary for faster lookup
    encoding_map = {label: idx for idx, label in enumerate(encoder.classes_)}

    # Apply encoding efficiently
    df[f"{column_name}_encoded"] = df[column_name].apply(lambda x: [encoding_map[v] for v in x])

    # Apply padding to ensure fixed-length sequences
    df[f"{column_name}_encoded_padded"] = list(
        pad_sequences(df[f"{column_name}_encoded"], maxlen=max_length, padding='pre', value=padding_value)
    )

    return df, len(unique_values)

def data_preproc(df):
    df = fix_data_from_csv(df)
    df['description'] = df['description'].apply(text_preprocess)
    df['year'] = num_preprocess_year(df[['year']])
    df['minute'] = num_preprocess_min(df[['minute']])
    df = cat_processing_genre(df,'genre_list') ## df equal added
    df = cat_processing_lan(df, 'language')
    return df

def data_encode(df):
    # Dictionary Processing
    df = safe_eval_column(df, column_name="crew_dict")
    df = extract_roles(df, column_name="crew_dict")

    # Encoding list columns with padding
    df, director_length = encode_list_column_with_padding(df, "director")
    df, writer_length = encode_list_column_with_padding(df, "writer")
    df, cinematography_length = encode_list_column_with_padding(df, "cinematography")
    df, composer_length = encode_list_column_with_padding(df, "composer")

    return df, {
        "director_length": director_length,
        "writer_length": writer_length,
        "cinematography_length": cinematography_length,
        "composer_length": composer_length,
    }


In [4]:
data_processed = data_preproc(data)

In [5]:
data_processed.columns

Index(['id', 'name', 'year', 'description', 'minute', 'rating', 'key',
       'genre_list', 'actor_list', 'language', 'studio_list', 'crew_dict', '',
       'action', 'adventure', 'animation', 'comedy', 'crime', 'documentary',
       'drama', 'family', 'fantasy', 'history', 'horror', 'music', 'mystery',
       'romance', 'science_fiction', 'thriller', 'tv_movie', 'war', 'western',
       'language_encoded'],
      dtype='object')

In [6]:
data_encoded = data_encode(data_processed)

In [7]:
df = pd.DataFrame(data_encoded[0])

In [8]:
df.isnull().sum()

id                                    0
name                                  0
year                              15719
description                           0
minute                                0
rating                           331222
key                                   0
genre_list                            0
actor_list                            0
language                              0
studio_list                           0
crew_dict                         38988
                                      0
action                                0
adventure                             0
animation                             0
comedy                                0
crime                                 0
documentary                           0
drama                                 0
family                                0
fantasy                               0
history                               0
horror                                0
music                                 0


In [9]:
dict_crew = data_encoded[1]
dict_crew

{'director_length': 155188,
 'writer_length': 167862,
 'cinematography_length': 59282,
 'composer_length': 50527}

# Create X

In [10]:
X = df.drop(columns=['crew_dict',
                     'description',
                     'language',
                     'studio_list',
                     'rating',
                     'year',
                     'id',
                     'actor_list',
                     'cinematography_encoded',
                     'composer_encoded',
                     'director_encoded',
                     'writer_encoded',
                     'genre_list',
                     'director',
                     'writer',
                     'cinematography',
                     'composer',
                     ''], axis=1)

In [11]:
X.columns

Index(['name', 'minute', 'key', 'action', 'adventure', 'animation', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'history',
       'horror', 'music', 'mystery', 'romance', 'science_fiction', 'thriller',
       'tv_movie', 'war', 'western', 'language_encoded',
       'director_encoded_padded', 'writer_encoded_padded',
       'cinematography_encoded_padded', 'composer_encoded_padded'],
      dtype='object')

In [12]:
X.dtypes

name                              object
minute                           float64
key                               object
action                             int64
adventure                          int64
animation                          int64
comedy                             int64
crime                              int64
documentary                        int64
drama                              int64
family                             int64
fantasy                            int64
history                            int64
horror                             int64
music                              int64
mystery                            int64
romance                            int64
science_fiction                    int64
thriller                           int64
tv_movie                           int64
war                                int64
western                            int64
language_encoded                   int64
director_encoded_padded           object
writer_encoded_p

In [13]:
X

Unnamed: 0,name,minute,key,action,adventure,animation,comedy,crime,documentary,drama,...,science_fiction,thriller,tv_movie,war,western,language_encoded,director_encoded_padded,writer_encoded_padded,cinematography_encoded_padded,composer_encoded_padded
0,Barbie,0.366834,Barbie (2023),0,1,0,1,0,0,0,...,0,0,0,0,0,38,"[0, 50864]","[116613, 55438]","[0, 46905]","[30073, 2873]"
1,Parasite,0.462312,Parasite (2019),0,0,0,1,0,0,1,...,0,1,0,0,0,77,"[0, 17922]","[19597, 57357]","[0, 21591]","[0, 24603]"
2,Everything Everywhere All at Once,0.497487,Everything Everywhere All at Once (2022),1,1,0,1,0,0,0,...,1,0,0,0,0,38,"[30580, 30343]","[32629, 32861]","[0, 31693]","[38090, 19026]"
3,Fight Club,0.492462,Fight Club (1999),0,0,0,0,0,0,1,...,0,0,0,0,0,38,"[0, 32213]","[72859, 9194]","[0, 24994]","[22882, 31970]"
4,La La Land,0.442211,La La Land (2016),0,0,0,1,0,0,1,...,0,0,0,0,0,38,"[0, 29358]","[0, 31657]","[0, 32660]","[0, 24690]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406440,日本統一56,0.145729,日本統一56,0,0,0,0,0,0,0,...,0,0,0,0,0,38,"[0, 0]","[0, 0]","[0, 0]","[0, 0]"
406441,日本統一57,0.145729,日本統一57,0,0,0,0,0,0,0,...,0,0,0,0,0,38,"[0, 0]","[0, 0]","[0, 0]","[0, 0]"
406442,日本統一58,0.145729,日本統一58,0,0,0,0,0,0,0,...,0,0,0,0,0,38,"[0, 0]","[0, 0]","[0, 0]","[0, 0]"
406443,日本統一59,0.150754,日本統一59,0,0,0,0,0,0,0,...,0,0,0,0,0,38,"[0, 0]","[0, 0]","[0, 0]","[0, 0]"


### Question

Do we want to keep last movies?

# Create TFIDF Matrix

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

def vectorize_descriptions(df, text_column):
    """
    Vectorize movie descriptions using TF-IDF.

    Args:
        df: The DataFrame containing movie descriptions.
        text_column: The column in the DataFrame that contains descriptions.

    Returns:
        tfidf_matrix: The TF-IDF matrix.
        vectorizer: The fitted TfidfVectorizer object (useful if needed later).
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[text_column])
    return tfidf_matrix


In [15]:
text_df = df[['key', 'description']]
text_df

Unnamed: 0,key,description
0,Barbie (2023),barbie ken time live colorful seemingly perfec...
1,Parasite (2019),unemployed kitaeks family take peculiar intere...
2,Everything Everywhere All at Once (2022),age chinese immigrant sweep insane adventure a...
3,Fight Club (1999),tickingtimebomb insomniac slippery soap salesm...
4,La La Land (2016),mia aspire actress serve latte movie star audi...
...,...,...
406440,日本統一56,one day special investigation team suddenly be...
406441,日本統一57,hamadas death seem yuseikais problem would res...
406442,日本統一58,makimoto tomohiro waki arimura tasuku nagaoka ...
406443,日本統一59,one day himuro yasufu motomiya tamura yoshiyuk...


In [16]:
tfidf_matrix = vectorize_descriptions(text_df, 'description')

# Simple KNN

This workflow:

Converts TF-IDF (sparse) into a Pandas DataFrame ✅
Expands list-based columns into numerical features ✅
Uses pandas concat() instead of hstack ✅
Retrieves similar movies using KNN (cosine similarity) ✅

In [23]:
import pandas as pd

def convert_tfidf_to_df(tfidf_matrix, index):
    """
    Convert a sparse TF-IDF matrix to a Pandas DataFrame.

    Args:
        tfidf_matrix: Sparse TF-IDF matrix from TfidfVectorizer.
        index: Index for the resulting DataFrame.

    Returns:
        Pandas DataFrame with TF-IDF features.
    """
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=index)  # Convert sparse to dense
    return tfidf_df

def expand_list_columns(df, list_columns, max_elements=2):
    """
    Expand list-type columns into separate numerical columns.

    Args:
        df: Pandas DataFrame
        list_columns: List of column names that contain lists
        max_elements: Number of elements to extract from each list (default=2)

    Returns:
        DataFrame with expanded columns.
    """
    for col in list_columns:
        df[col] = df[col].apply(lambda x: x if isinstance(x, list) else [0] * max_elements)  # Handle NaNs
        for i in range(max_elements):
            df[f'{col}_{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else 0)  # Extract element i
        df.drop(columns=[col], inplace=True)  # Drop original column
    return df


In [45]:
def knn_fit_pandas(tfidf_matrix, X):
    """
    Fit KNN on a pandas-based feature space (TF-IDF + numerical/categorical features).

    Args:
        tfidf_matrix: Precomputed TF-IDF sparse matrix.
        X: DataFrame with numerical/categorical features (excluding 'key').

    Returns:
        Trained KNN model, Processed feature DataFrame.
    """
    # Convert TF-IDF matrix to DataFrame
    tfidf_df = convert_tfidf_to_df(tfidf_matrix, X.index)

    # Drop 'key' before concatenation
    X_numeric = X.drop(columns=['key'])

    # Convert list-type columns into separate numerical columns
    list_columns = [
        'director_encoded_padded', 'writer_encoded_padded',
        'cinematography_encoded_padded', 'composer_encoded_padded'
    ]
    X_numeric = expand_list_columns(X_numeric, list_columns)

    # Concatenate using pandas
    X_final = pd.concat([tfidf_df, X_numeric], axis=1)

    # Initialize & Fit KNN
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(X_final)

    return knn, X_final


In [46]:
def get_similar_movies_knn_pandas(knn, X_final, df, input_name, name_column, n_neighbors=5):
    """
    Find similar movies using KNN based on a pandas DataFrame.

    Args:
        knn: Trained KNN model.
        X_final: Feature DataFrame (TF-IDF + numerical/categorical features).
        df: DataFrame containing movie names and metadata.
        input_name: The name of the movie to find similarities for.
        name_column: Column in the DataFrame that contains movie names.
        n_neighbors: The number of similar movies to retrieve.

    Returns:
        A list of dictionaries with movie names and similarity scores.
    """
    # Verify movie exists
    verify_input(df, input_name, name_column)

    # Get index of input movie
    idx = df[df[name_column] == input_name].index[0]

    # Directly use the feature vector from X_final
    input_movie_vector = X_final.loc[idx].values.reshape(1, -1)

    # Find nearest neighbors
    distances, indices = knn.kneighbors(input_movie_vector, n_neighbors=n_neighbors + 1)  # +1 to exclude itself

    # Exclude the input movie itself
    similar_movies = []
    for i in range(1, len(indices.flatten())):
        similar_movies.append({
            'movie': df.iloc[indices.flatten()[i]][name_column],
            'similarity_score': 1 - distances.flatten()[i]  # Convert distance to similarity
        })

    return similar_movies


In [47]:
knn, X_final = knn_fit_pandas(tfidf_matrix, X)


: 

In [None]:
recommendations = get_similar_movies_knn_pandas(knn, X_final, X, "Inception", "key", n_neighbors=5)

# Print recommendations
for rec in recommendations:
    print(f"Movie: {rec['movie']} | Similarity Score: {rec['similarity_score']:.4f}")


# K-means and Knn

🚀 Full Workflow for K-Means + KNN Movie Recommendation
This approach:

Prepares data (TF-IDF + numerical/categorical features) ✅
Clusters movies using K-Means ✅
Applies KNN only within the closest cluster ✅
Returns top-K similar movies efficiently ✅

**Use sparse instead of df**
✅ Keeps TF-IDF sparse (NO toarray() conversion) → Saves RAM
✅ Uses hstack() for memory-efficient feature merging
✅ Clusters movies using MiniBatchKMeans to reduce computation
✅ Limits KNN search to relevant clusters → 10x faster!

In [17]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

def expand_list_columns(df, list_columns, max_elements=2):
    """
    Expand list-type columns into separate numerical columns.

    Args:
        df: Pandas DataFrame
        list_columns: List of column names that contain lists
        max_elements: Number of elements to extract from each list (default=2)

    Returns:
        DataFrame with expanded columns.
    """
    for col in list_columns:
        df[col] = df[col].apply(lambda x: x if isinstance(x, list) else [0] * max_elements)  # Handle NaNs
        for i in range(max_elements):
            df[f'{col}_{i}'] = df[col].apply(lambda x: x[i] if len(x) > i else 0)  # Extract element i
        df.drop(columns=[col], inplace=True)  # Drop original column
    return df

def preprocess_features_sparse(tfidf_matrix, X):
    """
    Process and concatenate TF-IDF features with numerical/categorical features (sparse version).
    """
    # ✅ Keep TF-IDF sparse
    tfidf_sparse = csr_matrix(tfidf_matrix)  # No conversion to dense!

    # ✅ Drop 'key' column before merging
    X_numeric = X.drop(columns=['key', 'name'])

    # ✅ Convert list-type columns into separate numerical columns
    list_columns = [
        'director_encoded_padded', 'writer_encoded_padded',
        'cinematography_encoded_padded', 'composer_encoded_padded'
    ]
    X_numeric = expand_list_columns(X_numeric, list_columns)

    # ✅ Convert X_numeric to sparse matrix
    X_numeric_sparse = csr_matrix(X_numeric.values)

    # ✅ Concatenate using Scipy `hstack()` (efficient!)
    X_final = hstack([tfidf_sparse, X_numeric_sparse])

    return X_final


In [18]:
from sklearn.cluster import MiniBatchKMeans

def cluster_movies_kmeans_sparse(X_final, n_clusters=1000):
    """
    Cluster movies using K-Means on sparse matrix.
    """
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42, batch_size=1024)
    clusters = kmeans.fit_predict(X_final)

    return kmeans, clusters


In [19]:
def get_similar_movies_kmeans_knn_sparse(movie_name, X_final, df, kmeans, name_column, n_neighbors=5):
    """
    Find similar movies using KNN within the assigned cluster from K-Means (Sparse Matrix version).
    Handles case-insensitive movie name search.
    """
    # ✅ Convert movie names in DataFrame to lowercase for case-insensitive search
    df['lowercase_name'] = df[name_column].str.lower()

    # ✅ Convert input movie name to lowercase
    movie_name_lower = movie_name.lower()

    # ✅ Verify movie exists (case-insensitive search)
    if movie_name_lower not in df['lowercase_name'].values:
        raise ValueError(f"Movie '{movie_name}' not found in the DataFrame.")

    # ✅ Get index of input movie
    idx = df[df['lowercase_name'] == movie_name_lower].index[0]

    # ✅ Predict the cluster for the input movie
    movie_cluster = kmeans.predict(X_final[idx].reshape(1, -1))[0]

    # ✅ Get indices of movies in the same cluster
    cluster_indices = df[df["cluster"] == movie_cluster].index

    # ✅ Apply KNN only within the cluster
    knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
    knn_model.fit(X_final[cluster_indices])

    # ✅ Find K-nearest neighbors
    distances, indices = knn_model.kneighbors(X_final[idx].reshape(1, -1), n_neighbors=n_neighbors + 1)

    return df.iloc[indices.flatten()[1:]][name_column].tolist()  # Exclude input movie itself


In [20]:
X_final = preprocess_features_sparse(tfidf_matrix, X)  # This should NOT crash

In [23]:
recommendations = get_similar_movies_kmeans_knn_sparse("My dinner with Andre", X_final, X, kmeans, "name", n_neighbors=5)

for movie in recommendations:
    print(f"Similar Movie: {movie}")


Similar Movie: Motocrossed
Similar Movie: Leprechaun 4: In Space
Similar Movie: Three Lives and Only One Death
Similar Movie: Love Affair(s)
Similar Movie: They Drive by Night


## Feature Selection

In [33]:
X.columns

Index(['name', 'minute', 'key', 'action', 'adventure', 'animation', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'history',
       'horror', 'music', 'mystery', 'romance', 'science_fiction', 'thriller',
       'tv_movie', 'war', 'western', 'language_encoded',
       'director_encoded_padded', 'writer_encoded_padded',
       'cinematography_encoded_padded', 'composer_encoded_padded', 'cluster',
       'lowercase_name'],
      dtype='object')

In [35]:
X_new = X[['name', 'key', 'action', 'adventure', 'animation', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'history',
       'horror', 'music', 'mystery', 'romance', 'science_fiction', 'thriller',
       'tv_movie', 'war', 'western']]

In [37]:
def preprocess_features_sparse_wolist(tfidf_matrix, X):
    """
    Process and concatenate TF-IDF features with numerical/categorical features (sparse version).
    """
    # ✅ Keep TF-IDF sparse
    tfidf_sparse = csr_matrix(tfidf_matrix)  # No conversion to dense!

    # ✅ Drop 'key' column before merging
    X_numeric = X.drop(columns=['key', 'name'])

    # ✅ Convert X_numeric to sparse matrix
    X_numeric_sparse = csr_matrix(X_numeric.values)

    # ✅ Concatenate using Scipy `hstack()` (efficient!)
    X_final = hstack([tfidf_sparse, X_numeric_sparse])

    return X_final


In [39]:
X_final_new = preprocess_features_sparse_wolist(tfidf_matrix, X_new)
kmeans, clusters = cluster_movies_kmeans_sparse(X_final_new)
X["cluster"] = clusters


  super()._check_params_vs_input(X, default_n_init=3)


Similar Movie: Mirror Game
Similar Movie: Welcome to the Dollhouse
Similar Movie: Poetical Refugee
Similar Movie: Free Willy: Escape from Pirate's Cove
Similar Movie: Prince and the Revolution: Live


In [43]:
recommendations = get_similar_movies_kmeans_knn_sparse("amadeus", X_final, X, kmeans, "name", n_neighbors=5)

for movie in recommendations:
    print(f"Similar Movie: {movie}")

Similar Movie: Ritual
Similar Movie: The Big Sick
Similar Movie: Meet the Robinsons
Similar Movie: Shiva Baby
Similar Movie: The Color of Pomegranates


In [None]:
#import tensorflow as tf
from keras.layers import Input, Dense, Embedding, Flatten, Concatenate, BatchNormalization, Dropout
from keras.models import Model

def build_autoencoder(num_actors, num_directors, num_numeric, num_tfidf, num_genres, num_languages, embedding_dim=50, encoding_dim=64):
    """
    Build an autoencoder model incorporating embeddings for categorical features,
    dense layers for numerical and high-dimensional TF-IDF features, and separate
    inputs for one-hot encoded genres and languages.
    """
    # Input layers
    actor_input = Input(shape=(1,), name="actor_input")
    director_input = Input(shape=(1,), name="director_input")
    numeric_input = Input(shape=(num_numeric,), name="numeric_features")
    tfidf_input = Input(shape=(num_tfidf,), name="tfidf_features")
    genres_input = Input(shape=(num_genres,), name="genres_features")
    languages_input = Input(shape=(num_languages,), name="languages_features")

    # Embedding layers for categorical variables
    actor_embedding = Embedding(input_dim=num_actors + 1, output_dim=embedding_dim, name="actor_embedding")(actor_input)
    director_embedding = Embedding(input_dim=num_directors + 1, output_dim=embedding_dim, name="director_embedding")(director_input)

    # Flatten embeddings
    actor_embedding_flat = Flatten()(actor_embedding)
    director_embedding_flat = Flatten()(director_embedding)

    # Dense layer for TF-IDF features (dimensionality reduction)
    tfidf_dense = Dense(128, activation='relu', name="tfidf_dense_layer")(tfidf_input)

    # Concatenate all features
    concatenated = Concatenate()([
        actor_embedding_flat,
        director_embedding_flat,
        numeric_input,
        tfidf_dense,
        genres_input,
        languages_input
    ])

    # Encoder
    encoded = Dense(256, activation='relu')(concatenated)
    encoded = BatchNormalization()(encoded)
    encoded = Dropout(0.3)(encoded)
    encoded = Dense(128, activation='relu')(encoded)
    bottleneck = Dense(encoding_dim, activation='relu', name="bottleneck_layer")(encoded)  # Latent space

    # Decoder
    decoded = Dense(128, activation='relu')(bottleneck)
    decoded = BatchNormalization()(decoded)
    decoded = Dropout(0.3)(decoded)
    decoded = Dense(256, activation='relu')(decoded)
    output_layer = Dense(num_numeric + num_tfidf + num_genres + num_languages, activation='sigmoid')(decoded)  # Reconstruct all inputs except categorical IDs

    # Define models
    autoencoder = Model(inputs=[actor_input, director_input, numeric_input, tfidf_input, genres_input, languages_input], outputs=output_layer)
    encoder = Model(inputs=[actor_input, director_input, numeric_input, tfidf_input, genres_input, languages_input], outputs=bottleneck)

    # Compile the model
    autoencoder.compile(optimizer='adam', loss='mse')

    return autoencoder, encoder


ModuleNotFoundError: No module named 'tensorflow.keras'