In [46]:
import pandas as pd
import os

In [None]:
current_dir = os.getcwd()
parent_dir=os.path.dirname(current_dir)
data= pd.read_csv(os.path.join(parent_dir, 'processed_data/clean_compile.csv'))

In [16]:
data = data.drop(columns=["id", "Unnamed: 0"]).reset_index(drop=True)

In [17]:
data_new = data[(data['minute'] > 40) & (data['minute'] <= 240)]
data_new


## Preproccessing 

In [18]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer

def text_preprocess(sentence):
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers #TODO
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    tokenized_sentence = word_tokenize(sentence) ## tokenize
    stop_words = set(stopwords.words('english'))
    stopwords_removed = [w for w in tokenized_sentence if not w in stop_words]
    v_lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v")
        for word in stopwords_removed
    ]
    n_lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "n")
        for word in v_lemmatized
    ]
    cleaned_sentence = ' '.join(word for word in n_lemmatized)
    return cleaned_sentence

def num_preprocess_year(value):
    scaler = RobustScaler()
    result = scaler.fit_transform(value)
    return result

def num_preprocess_min(value):
    scaler = MinMaxScaler()
    result = scaler.fit_transform(value)
    return result

def cat_processing_genre(df, value):
    unique_genres = set(genre for genres in df[value] for genre in eval(genres))
    for genre in unique_genres:
        df[genre] = df[value].apply(lambda x: 1 if genre in x else 0)
    return df.rename(columns={
    'science fiction': 'science_fiction',
    'tv movie': 'tv_movie'
    })

def cat_processing_lan(df, value):
    temp_lang = (df[value].value_counts()/df[value].value_counts().sum())*100
    temp = list(temp_lang.head(20).keys())
    df_filtered = df[df[value].isin(temp)]
    return df_filtered

def data_preproc(df):
    df['description'] = df['description'].apply(text_preprocess)
    df['date'] = num_preprocess_year(df[['date']])
    df['minute'] = num_preprocess_min(df[['minute']])
    cat_processing_genre(df,'genre_list')
    cat_processing_lan(df, 'language')
    return df

def text_encode(df):
    vectorizer =TfidfVectorizer()
    X = vectorizer.fit_transform(df['description'])
    X = pd.DataFrame(
        X.toarray(),
        columns = vectorizer.get_feature_names_out()
        )
    return X

In [19]:
data_sampled = data_new.sample(n=100)
#ata_sampled = data_new[(data_new['date'] > 1980) & (data_new['minute'] > 90)]
data_sampled.shape


(100, 9)

In [20]:
cleaned_data = data_preproc(data_new)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['description'] = df['description'].apply(text_preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = num_preprocess_year(df[['date']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['minute'] = num_preprocess_min(df[['minute']])
A value is trying to be set on a copy of a slic

In [21]:
cleaned_data

Unnamed: 0,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b,History,...,Comedy,Animation,War,Thriller,Drama,Documentary,Crime,TV Movie,Science Fiction,Horror
0,Parasite,0.388889,unemployed kitaeks family take peculiar intere...,0.462312,"['Comedy', 'Thriller', 'Drama']","['Song Kang-ho', 'Lee Sun-kyun', 'Cho Yeo-jeon...",Korean,"{'Cinematography': ['Hong Kyung-pyo'], 'Compos...",Parasite (2019),0,...,1,0,0,1,1,0,0,0,0,0
1,Everything Everywhere All at Once,0.472222,age chinese immigrant sweep insane adventure a...,0.497487,"['Science Fiction', 'Adventure', 'Comedy', 'Ac...","['Michelle Yeoh', 'Ke Huy Quan', 'Stephanie Hs...",English,"{'Cinematography': ['Larkin Seiple'], 'Compose...",Everything Everywhere All at Once (2022),0,...,1,0,0,0,0,0,0,0,1,0
2,Fight Club,-0.166667,tickingtimebomb insomniac slippery soap salesm...,0.492462,['Drama'],"['Edward Norton', 'Brad Pitt', 'Helena Bonham ...",English,"{'Cinematography': ['Jeff Cronenweth'], 'Compo...",Fight Club (1999),0,...,0,0,0,0,1,0,0,0,0,0
3,La La Land,0.305556,mia aspire actress serve latte movie star audi...,0.442211,"['Drama', 'Comedy', 'Music', 'Romance']","['Ryan Gosling', 'Emma Stone', 'John Legend', ...",English,"{'Cinematography': ['Linus Sandgren'], 'Compos...",La La Land (2016),0,...,1,0,0,0,1,0,0,0,0,0
4,Oppenheimer,0.500000,story j robert oppenheimer role development at...,0.703518,"['Drama', 'History']","['Cillian Murphy', 'Emily Blunt', 'Matt Damon'...",English,"{'Cinematography': ['Hoyte van Hoytema'], 'Com...",Oppenheimer (2023),1,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379195,Red Ink Ghost Stories,-1.027778,vampire yoshitsune appear rashomon every night...,0.020101,"['Horror', 'TV Movie']","['Masaya Kikawada', 'Tomoko Sekihara', '源九郎判官義経']",Japanese,{'Director': ['Kentaro Uchida']},Red Ink Ghost Stories (1968),0,...,0,0,0,0,0,0,0,1,0,1
379196,Svichka's Wedding,-1.194444,spectacle version ivan franko theater base “ c...,0.271357,['Drama'],"['Viktor Tsymbalist', 'Olga Kusenko', 'Dmytro ...",Ukrainian,"{'Cinematography': ['Serhiy Revenko'], 'Compos...",Svichka's Wedding (1962),0,...,0,0,0,0,1,0,0,0,0,0
379197,Two Brothers,-1.222222,two brother utpal elder kamal younger love see...,0.412060,"['Drama', 'Romance']","['Uttam Kumar', 'Biswajeet Chatterjee', 'Sabit...","Bengali, Bangla","{'Cinematography': ['Bibhuti Chakraborty'], 'D...",Two Brothers (1961),0,...,0,0,0,0,1,0,0,0,0,0
379200,The Spendthrift,-2.500000,remarkable sixpart adaptation sensational stag...,0.125628,['Drama'],"['Irene Fenwick', 'Cyril Keightley', 'Malcolm ...",English,"{'Director': ['Walter Edwin'], 'Writer': ['Por...",The Spendthrift (1915),0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
#cleaned_data.to_csv(os.path.join(parent_dir, cleaned.csv))

## Experimenting after application of data_preproc

### TF-IDF

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

def vectorize_descriptions(df, text_column):
    """
    Vectorize movie descriptions using TF-IDF.

    Args:
        df: The DataFrame containing movie descriptions.
        text_column: The column in the DataFrame that contains descriptions.

    Returns:
        tfidf_matrix: The TF-IDF matrix.
        vectorizer: The fitted TfidfVectorizer object (useful if needed later).
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[text_column])
    return tfidf_matrix

def get_similar_movies_knn(tfidf_matrix, df, movie_name, name_column, n_neighbors=5):
    """
    Find similar movies using KNN based on a TF-IDF matrix.

    Args:
        tfidf_matrix: The TF-IDF matrix.
        df: The DataFrame containing movie names and descriptions.
        movie_name: The name of the movie to find similarities for.
        name_column: The column in the DataFrame that contains movie names.
        n_neighbors: The number of similar movies to retrieve (default is 5).

    Returns:
        A list of dictionaries with movie names and similarity scores.
    """
    # Fit KNN on the TF-IDF matrix
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(tfidf_matrix)

    # Get the index of the input movie
    if movie_name not in df[name_column].values:
        raise ValueError(f"Movie '{movie_name}' not found in the DataFrame.")

    idx = df[df[name_column] == movie_name].index[0]

    # Find nearest neighbors
    distances, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=n_neighbors + 1)

    # Exclude the input movie itself
    similar_movies = []
    for i in range(1, len(indices.flatten())):
        similar_movies.append({
            'movie_name': df.iloc[indices.flatten()[i]][name_column],
            'similarity_score': 1 - distances.flatten()[i]  # Convert distance to similarity
        })

    return similar_movies

In [23]:
#Example Usage
# Vectorize descriptions
tfidf_matrix = vectorize_descriptions(cleaned_data, 'description')

In [54]:
# Specify the movie name and the column for movie titles
movie_name = "Oppenheimer"
name_column = 'name'

# Get similar movies
similar_movies = get_similar_movies_knn(tfidf_matrix, cleaned_data, movie_name, name_column)

# Print similar movies
for movie in similar_movies:
    print(f"Movie: {movie['movie_name']}, Similarity Score: {movie['similarity_score']:.2f}")

Movie: Oppenheimer: Genius or Madman?, Similarity Score: 0.53
Movie: The Oppenheimer Case, Similarity Score: 0.50
Movie: Oppenheimer: The Real Story, Similarity Score: 0.48
Movie: Oppenheimer After Trinity, Similarity Score: 0.47
Movie: The Moment in Time: The Manhattan Project, Similarity Score: 0.44


### BERT

In [25]:
from sklearn.neighbors import NearestNeighbors
from transformers import BertTokenizer, TFBertModel
import numpy as np
import tensorflow as tf


# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Function to generate BERT embeddings for text
def get_bert_embeddings(texts):
    """
    Generate BERT embeddings for a list of texts.

    Args:
        texts: List of text descriptions.

    Returns:
        Numpy array of BERT embeddings.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="tf", max_length=512)
    outputs = bert_model(inputs)
    # Use [CLS] token embeddings for the entire text
    embeddings = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, embedding_dim)
    return embeddings.numpy()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on,

In [26]:
def find_similar_movies_from_df(query, df, name_column, description_column, n_neighbors=5):
    """
    Find movies with similar descriptions using KNN and BERT embeddings.

    Args:
        query: The query description as a string.
        df: DataFrame containing movie titles and descriptions.
        name_column: Column name for movie titles in the DataFrame.
        description_column: Column name for movie descriptions in the DataFrame.
        n_neighbors: Number of neighbors to retrieve.

    Returns:
        A list of the most similar movies with their titles, descriptions, and similarity scores.
    """
    # Generate BERT embeddings for all movie descriptions
    descriptions = df[description_column].tolist()
    bert_embeddings = get_bert_embeddings(descriptions)

    # Fit KNN on the embeddings
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(bert_embeddings)

    # Generate embedding for the query description
    query_embedding = get_bert_embeddings([query])

    # Find the nearest neighbors
    distances, indices = knn.kneighbors(query_embedding, n_neighbors=n_neighbors)

    # Retrieve the top matches
    similar_movies = [
        {
            'movie_title': df.iloc[idx][name_column],
            'description': df.iloc[idx][description_column],
            'similarity_score': 1 - distances[0][i]  # Convert distance to similarity
        }
        for i, idx in enumerate(indices[0])
    ]

    return similar_movies


In [None]:
# Example Usage
query_description = "A detective solving mysteries in a small town."
name_column = 'name'  # Replace with your column name for movie titles
description_column = 'description'  # Replace with your column name for descriptions

# Call the function directly
similar_movies = find_similar_movies_from_df(query_description, cleaned_data, name_column, description_column)

# Print the results
for movie in similar_movies:
    print(f"Title: {movie['movie_title']}, Description: {movie['description']}, Similarity Score: {movie['similarity_score']:.2f}")

In [44]:
'''
cleaned_data.set_index("key_b", inplace=True)
### WORKING CELL 3 --FINAL
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Step 1: Vectorize the text using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_data['description'])

# Step 2: Initialize NearestNeighbors with cosine metric
knn = NearestNeighbors(metric='cosine', algorithm='brute')  # Using brute-force to handle cosine similarity
knn.fit(tfidf_matrix)

# Step 3: Specify the number of neighbors (including the movie itself, so we set n_neighbors=6)
n_neighbors = 6

# Step 4: Compute the nearest neighbors (including itself)
distances, indices = knn.kneighbors(tfidf_matrix, n_neighbors=n_neighbors)

# Step 5: Convert distances to similarity (1 - cosine distance)
similarity_scores = 1 - distances

# Step 6: Create DataFrame for similarity scores and closest neighbors
# For each movie, find the 5 closest neighbors (excluding itself)
neighbors_df = pd.DataFrame(indices[:, 1:],  # Exclude the movie itself (index 0)
                             columns=[f"Neighbor_{i+1}" for i in range(n_neighbors-1)],
                             index=cleaned_data.index)

similarity_df = pd.DataFrame(similarity_scores[:, 1:],  # Exclude the movie itself (index 0)
                             columns=[f"Neighbor_{i+1}" for i in range(n_neighbors-1)],
                             index=cleaned_data.index)

# Map the indices of the neighbors to the actual movie names
for col in neighbors_df.columns:
    neighbors_df[col] = neighbors_df[col].map(lambda idx: cleaned_data.index[idx])

# Display the similarity score and closest neighbors for each movie
#print("Similarity Scores (1: most similar, 0: least similar):")
#print(similarity_df)

print("\n5 Most Similar Movies (without self):")
print(neighbors_df)

'''


5 Most Similar Movies (without self):
                                                                          Neighbor_1  \
key_b                                                                                  
Underdogs (2018)                                             Audition Tape 13 (2022)   
The Men in Black (1992)                                     The Next 365 Days (2022)   
Lovers, Liars and Thieves (1997)                               The Magic Hour (2008)   
First Love (2007)                                    Nigdy tu juz nie powróce (1990)   
Prey of the Chameleon (1992)                                So Sweet, So Dead (1972)   
...                                                                              ...   
Strange Holiday (1945)                                                Calibre (2018)   
عمالقة البحار (1960)                                                    Nadia (1986)   
Le Cœur sous le paillasson (1976)  Disturbed Married Woman Embarrassing Caress (1

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

def fit_tfidf_model(df, text_column, max_features=5000):
    """
    Fit a TF-IDF model and transform movie descriptions into vectors.

    Args:
        df: DataFrame containing movie descriptions.
        text_column: Column name containing text data.
        max_features: Maximum number of words in the TF-IDF vectorizer.

    Returns:
        tfidf_matrix: Transformed TF-IDF matrix.
        tfidf_vectorizer: The trained TF-IDF vectorizer.
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[text_column].fillna(''))
    return tfidf_matrix, tfidf_vectorizer

def fit_knn_model(tfidf_matrix, metric='cosine'):
    """
    Fit a KNN model on the TF-IDF matrix.

    Args:
        tfidf_matrix: The TF-IDF matrix.
        metric: Distance metric for KNN (default: 'cosine').

    Returns:
        Trained NearestNeighbors model.
    """
    knn = NearestNeighbors(metric=metric, algorithm='brute')
    knn.fit(tfidf_matrix)
    return knn

def get_similar_movies(knn_model, tfidf_matrix, movie_names, movie_name, n_neighbors=5):
    """
    Find similar movies using a pre-trained KNN model.

    Args:
        knn_model: The trained NearestNeighbors model.
        tfidf_matrix: The TF-IDF matrix.
        movie_names: The Series containing movie names.
        movie_name: The name of the movie to find similarities for.
        n_neighbors: The number of similar movies to retrieve (default is 5).

    Returns:
        A list of dictionaries with movie names and similarity scores.
    """
    try:
        # Get the index of the input movie
        idx = movie_names[movie_names == movie_name].index[0]
    except IndexError:
        raise ValueError(f"Movie '{movie_name}' not found in the dataset.")

    # Find nearest neighbors
    distances, indices = knn_model.kneighbors(tfidf_matrix[idx], n_neighbors=n_neighbors + 1)

    # Exclude the input movie itself
    similar_movies = [
        {
            'movie_name': movie_names.iloc[indices.flatten()[i]],
            'similarity_score': 1 - distances.flatten()[i]  # Convert distance to similarity
        }
        for i in range(1, len(indices.flatten()))
    ]

    return similar_movies

# Example Usage:
if __name__ == "__main__":
    # Load dataset (Example)
    df = pd.DataFrame({
        'movie_title': ['Inception', 'Interstellar', 'The Matrix', 'The Dark Knight', 'Memento'],
        'description': [
            "A thief who enters the dreams of others to steal their secrets.",
            "A team of explorers travel through a wormhole in space.",
            "A computer hacker learns about the true nature of reality.",
            "A vigilante fights crime in Gotham City.",
            "A man with short-term memory loss seeks revenge."
        ]
    })

    # Step 1: Fit TF-IDF Model
    tfidf_matrix, tfidf_vectorizer = fit_tfidf_model(df, 'description')

    # Step 2: Fit KNN Model
    knn_model = fit_knn_model(tfidf_matrix)

    # Step 3: Get Similar Movies
    movie_name = "Inception"
    similar_movies = get_similar_movies(knn_model, tfidf_matrix, df["movie_title"], movie_name, n_neighbors=3)
