In [9]:
import pandas as pd
import os

## Load data

In [10]:
current_dir = os.getcwd() #current dir
parent_dir = os.path.dirname(current_dir) #parent of current dir

In [3]:
movies = pd.read_csv(os.path.join(parent_dir,'raw_data/movies.csv'))
actors = pd.read_csv(os.path.join(parent_dir,'raw_data/actors.csv'))
crew = pd.read_csv(os.path.join(parent_dir,'raw_data/crew.csv'))
languages = pd.read_csv(os.path.join(parent_dir,'raw_data/languages.csv'))
genres = pd.read_csv(os.path.join(parent_dir,'raw_data/genres.csv'))
studios = pd.read_csv(os.path.join(parent_dir,'raw_data/studios.csv'))
countries = pd.read_csv(os.path.join(parent_dir,'raw_data/countries.csv'))

In [4]:
movies = movies.drop(columns='tagline', axis=1)

actors = actors.drop(columns='role', axis=1)
actors = actors.dropna()

crew = crew[crew['role'].isin(['Director', 'Writer', 'Cinematography', 'Composer'])] #'Songs', 'Producer',

languages = languages[languages['type'].isin(['Language', 'Primary language'])].drop(columns='type')

In [5]:
new_crew = (
    crew.groupby('id')
    .apply(lambda x: x.groupby('role')['name'].apply(list).to_dict())
    .reset_index(name='crew_dict')
)

  .apply(lambda x: x.groupby('role')['name'].apply(list).to_dict())


In [6]:
new_genres = (
    genres.groupby('id')['genre']
    .apply(list)  # Aggregates genres into a list
    .reset_index(name='genre_list')  # Converts to DataFrame and renames the column
)

new_studios = (
    studios.groupby('id')['studio']
    .apply(list)  # Aggregates genres into a list
    .reset_index(name='studio_list')  # Converts to DataFrame and renames the column
)

new_actors = (
    actors.groupby('id')['name']
    .apply(list)  # Aggregates genres into a list
    .reset_index(name='actor_list')  # Converts to DataFrame and renames the column
)

In [7]:
data = movies \
    .merge(new_genres, how='left', on='id') \
    .merge(new_actors, how='left', on='id') \
    .merge(languages, how='left', on='id') \
    .merge(new_studios, how='left', on='id') \
    .merge(new_crew, how='left', on='id')

# data = pd.read_csv(os.path.join(parent_dir,'compiled_movies.csv'))

In [8]:
data = data.dropna(subset=['date'])
data['date'] = data['date'].astype(int)
data = data.loc[data['name'] != 'Untitled'] #to be completed
data = data.loc[data['date'] <= 2024]
data['key_b'] = data['name'] + data['date'].apply(
    lambda x: f" ({int(x)})" if not pd.isna(x) else ''
    )

In [12]:
# check_data = data.groupby('key_b').size().reset_index(name='frequency').sort_values(by='frequency', ascending=False)
unique_keys = data['key_b'].value_counts()  # Count occurrences
unique_keys = unique_keys[unique_keys == 1].index  # Keep only keys with frequency 1
data = data[data['key_b'].isin(unique_keys)] # Filter rows where key_b is unique

## Process dataset A

In [43]:
import pandas as pd
import os
current_dir = os.getcwd() #current dir
parent_dir = os.path.dirname(current_dir) #parent of current dir
data = pd.read_csv(os.path.join(parent_dir, 'clean_compile.csv'))

In [44]:
clean_up_data = data.copy()
# clean_up_data = clean_up_data.dropna(subset='crew_dict')
clean_up_data = clean_up_data.drop(columns=['id', 'Unnamed: 0'])
clean_up_data = clean_up_data[clean_up_data['description'].notnull() & (clean_up_data['description'] != '')] ###ADJUSTED
clean_up_data = clean_up_data.dropna()
clean_up_data['minute'] = clean_up_data['minute'].astype(int)
clean_up_data = clean_up_data[(clean_up_data['minute'] > 40) & (clean_up_data['minute'] <= 240)]
clean_up_data['genre_list'] = clean_up_data['genre_list'].apply(lambda x: x.lower())
clean_up_data = clean_up_data.reset_index(drop=True)

In [73]:
# test_save = data_preproc(clean_up_data)
test_save = test_save[test_save['description'].notnull() & (test_save['description'] != '')]
test_save.to_csv(os.path.join(parent_dir, 'test_processed.csv'), index=False)
test_load_from_csv = pd.read_csv(os.path.join(parent_dir, 'test_processed.csv'))

In [74]:
# test_load_from_csv = test_load_from_csv.drop(columns=['Unnamed: 0'])
test_load_from_csv.isnull().sum()

name               0
date               0
description        0
minute             0
genre_list         0
actor_list         0
language           0
crew_dict          0
key_b              0
comedy             0
history            0
science fiction    0
horror             0
tv movie           0
crime              0
western            0
fantasy            0
adventure          0
family             0
documentary        0
action             0
mystery            0
animation          0
thriller           0
music              0
drama              0
war                0
romance            0
dtype: int64

In [3]:
from test import *
clean_up_data = clean_up_data.sample(n=100)

In [4]:
# test_data = clean_up_data.sample(n=10000)
# processed_test_data = data_preproc(test_data)
processed_test_data = data_preproc(clean_up_data)
# processed_test_data.set_index('key_b', inplace=True)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Step 1: Vectorize the text using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_test_data['description'])

# Step 2: Initialize NearestNeighbors with cosine metric
knn = NearestNeighbors(metric='cosine', algorithm='brute')  # Using brute-force to handle cosine similarity
knn.fit(tfidf_matrix)

# Step 3: Specify the number of neighbors (including the movie itself, so we set n_neighbors=6)
n_neighbors = 6

# Step 4: Compute the nearest neighbors (including itself)
distances, indices = knn.kneighbors(tfidf_matrix, n_neighbors=n_neighbors)

# Step 5: Convert distances to similarity (1 - cosine distance)
similarity_scores = 1 - distances

# Step 6: Create DataFrame for similarity scores and closest neighbors
# For each movie, find the 5 closest neighbors (excluding itself)
neighbors_df = pd.DataFrame(indices[:, 1:],  # Exclude the movie itself (index 0)
                             columns=[f"Neighbor_{i+1}" for i in range(n_neighbors-1)],
                             index=processed_test_data.index)

similarity_df = pd.DataFrame(similarity_scores[:, 1:],  # Exclude the movie itself (index 0)
                             columns=[f"Neighbor_{i+1}" for i in range(n_neighbors-1)],
                             index=processed_test_data.index)

# Map the indices of the neighbors to the actual movie names
for col in neighbors_df.columns:
    neighbors_df[col] = neighbors_df[col].map(lambda idx: processed_test_data.index[idx])

# similarity_df.to_csv(os.path.join(parent_dir,'similarity_df.csv'))
# neighbors_df.to_csv(os.path.join(parent_dir,'neighbors_df.csv'))

In [None]:
# data_preproc(test_data)
# save_check = cat_processing_genre(clean_up_data, 'genre_list')
# cat_processing_lan(save_check, 'language')
# text_preprocess(clean_up_data['description'][2])
# num_preprocess_year(clean_up_data[['date']])
# num_preprocess_min(clean_up_data[['minute']])

In [11]:
# FROM AYBIKE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

def vectorize_descriptions(df, text_column):
    """
    Vectorize movie descriptions using TF-IDF.

    Args:
        df: The DataFrame containing movie descriptions.
        text_column: The column in the DataFrame that contains descriptions.

    Returns:
        tfidf_matrix: The TF-IDF matrix.
        vectorizer: The fitted TfidfVectorizer object (useful if needed later).
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[text_column])
    return tfidf_matrix

def knn_fit(tfidf_matrix):
    # Fit KNN on the TF-IDF matrix
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(tfidf_matrix)
    return knn

def verify_input(df, input_name, name_column):
    if input_name not in df[name_column].values:
        raise ValueError(f"Movie '{input_name}' not found in the DataFrame.")

def get_similar_movies_knn(knn, tfidf_matrix, df, input_name, name_column, n_neighbors=5):
    """
    Find similar movies using KNN based on a TF-IDF matrix.

    Args:
        tfidf_matrix: The TF-IDF matrix.
        df: The DataFrame containing movie names and descriptions.
        input_name: The name of the movie to find similarities for.
        name_column: The column in the DataFrame that contains movie names.
        n_neighbors: The number of similar movies to retrieve (default is 5).

    Returns:
        A list of dictionaries with movie names and similarity scores.
    """
    # Get the index of the input movie
    # if input_name not in df[name_column].values:
    #     raise ValueError(f"Movie '{input_name}' not found in the DataFrame.")
    verify_input(df, input_name, name_column)

    idx = df[df[name_column] == input_name].index[0] #can be moved to verify_input

    # Find nearest neighbors
    distances, indices = knn.kneighbors(tfidf_matrix[idx], n_neighbors=n_neighbors + 1)

    # Exclude the input movie itself
    similar_movies = []
    for i in range(1, len(indices.flatten())):
        similar_movies.append({
            'input_name': df.iloc[indices.flatten()[i]][name_column],
            'similarity_score': 1 - distances.flatten()[i]  # Convert distance to similarity
        })
    return similar_movies

In [78]:
parent_dir
# os.path.dirname(__file__) #absolute path of dir

'/home/duonghaxuyen/code/aybik/movie_picker'

In [94]:
import test
test_func()

NameError: name 'test_func' is not defined

In [90]:
print(os.path.dirname(__file__),"actors.csv")

NameError: name '__file__' is not defined

In [96]:
from pathlib import Path
temp = Path(__file__)

NameError: name '__file__' is not defined

In [88]:
os.path.dirname('movie_picker/moviepicker/model.py')

'movie_picker/moviepicker'

In [84]:
# matrix --> pickle as well!

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2591 stored elements and shape (100, 1671)>

In [97]:
import pickle
with open(os.path.join(parent_dir,"models/matrix.pkl"), "wb") as file:
    pickle.dump(matrix, file)

In [82]:
load_model = pickle.load(open(os.path.join(parent_dir,"models/knn_model.pkl"), "rb"))

In [83]:
load_model

In [19]:
processed_test_data.reset_index(inplace=True)

In [20]:
matrix = vectorize_descriptions(processed_test_data, 'description')
model = knn_fit(matrix)

In [24]:
# Specify the movie name and the column for movie titles
movie_name = "The Boy and the Fog (1953)"
name_column = 'key_b'

# Get similar movies
similar_movies = get_similar_movies_knn(model, matrix, processed_test_data, movie_name, name_column)

# Print similar movies
for movie in similar_movies:
    print(f"Movie: {movie['input_name']}, Similarity Score: {movie['similarity_score']:.2f}")

Movie: Son of Manjeet Singh (2018), Similarity Score: 0.08
Movie: El Tamalon Navideño (2018), Similarity Score: 0.08
Movie: La Güera Rodríguez (1978), Similarity Score: 0.08
Movie: Patrol (2014), Similarity Score: 0.08
Movie: The Report (2019), Similarity Score: 0.06


In [49]:
# test_load_from_csv.isnull().sum()
# clean_up_data.isnull().sum()
test_load_from_csv[test_load_from_csv.description.isnull()]

Unnamed: 0,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b,comedy,...,family,documentary,action,mystery,animation,thriller,music,drama,war,romance
170451,Garrison,0.083333,,0.301508,"['drama', 'thriller']","['Elizabeth Ingalls', 'Jason Cox', 'Brent Boll...",German,"{'Director': ['Kerry Valderrama'], 'Writer': [...",Garrison (2008),0,...,0,0,0,0,0,1,0,1,0,0
185854,The House of Pop 6,-0.388889,,0.266332,"['horror', 'comedy']",['Nuttanee Sittisamarn'],Thai,{'Director': ['Saiyon Srisawat']},The House of Pop 6 (1991),1,...,0,0,0,0,0,0,0,0,0,0
271072,heart eyes,0.5,,0.396985,['romance'],"['nadjib', 'imene']",English,{'Director': ['nadjib']},heart eyes (2023),0,...,0,0,0,0,0,0,0,0,0,1
274771,Mayer Odhikar,-0.25,,0.542714,"['family', 'drama', 'action']","['Salman Shah', 'Shahnaz Sumi', 'Alamgir', 'Bo...","Bengali, Bangla","{'Director': ['Shibli Sadique'], 'Writer': ['S...",Mayer Odhikar (1996),0,...,1,0,1,0,0,0,0,1,0,0


In [None]:
test_load_from_csv

In [59]:
item_to_check = clean_up_data[clean_up_data.name=="The House of Pop 6"]

In [66]:
item_to_check

Unnamed: 0,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b,comedy,...,family,documentary,action,mystery,animation,thriller,music,drama,war,romance
185854,The House of Pop 6,-0.388889,,0.266332,"['horror', 'comedy']",['Nuttanee Sittisamarn'],Thai,{'Director': ['Saiyon Srisawat']},The House of Pop 6 (1991),1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
clean_up_data[clean_up_data.name=="The House of Pop 6"]

In [67]:
clean_up_data[clean_up_data.name=="Garrison"]

Unnamed: 0,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b,comedy,...,family,documentary,action,mystery,animation,thriller,music,drama,war,romance
170451,Garrison,0.083333,,0.301508,"['drama', 'thriller']","['Elizabeth Ingalls', 'Jason Cox', 'Brent Boll...",German,"{'Director': ['Kerry Valderrama'], 'Writer': [...",Garrison (2008),0,...,0,0,0,0,0,1,0,1,0,0


In [69]:
data[data.name=="Garrison"]

Unnamed: 0.1,Unnamed: 0,id,name,date,description,minute,genre_list,actor_list,language,crew_dict,key_b
203457,270000,1282791,Garrison,2008,,101.0,"['Drama', 'Thriller']","['Elizabeth Ingalls', 'Jason Cox', 'Brent Boll...",German,"{'Director': ['Kerry Valderrama'], 'Writer': [...",Garrison (2008)


In [75]:
matrix_1 = vectorize_descriptions(test_load_from_csv, 'description')
model_1 = knn_fit(matrix_1)

# Specify the movie name and the column for movie titles
movie_name = "The Boy and the Fog (1953)"
name_column = 'key_b'

# Get similar movies
similar_movies = get_similar_movies_knn(model_1, matrix_1, test_load_from_csv, movie_name, name_column)

# Print similar movies
for movie in similar_movies:
    print(f"Movie: {movie['input_name']}, Similarity Score: {movie['similarity_score']:.2f}")

Movie: Canvas (2006), Similarity Score: 0.38
Movie: Rain Beau's End (2020), Similarity Score: 0.29
Movie: New Brooklyn (2009), Similarity Score: 0.25
Movie: Manduka (2023), Similarity Score: 0.22
Movie: Incomplete Eclipse (1983), Similarity Score: 0.22


In [23]:
similar_movies

[{'input_name': 'Son of Manjeet Singh (2018)',
  'similarity_score': np.float64(0.08164263598051247)},
 {'input_name': 'El Tamalon Navideño (2018)',
  'similarity_score': np.float64(0.08079736313471475)},
 {'input_name': 'La Güera Rodríguez (1978)',
  'similarity_score': np.float64(0.07697786690098152)},
 {'input_name': 'Patrol (2014)',
  'similarity_score': np.float64(0.07508092827475188)},
 {'input_name': 'The Report (2019)',
  'similarity_score': np.float64(0.06036441232949241)}]

## Process dataset B

In [17]:
B_movie = pd.read_csv(os.path.join(parent_dir,'raw_data/Set B/movie.csv'))
B_rating = pd.read_csv(os.path.join(parent_dir,'raw_data/Set B/rating.csv'))

B_data = B_movie.merge(B_rating, how='left', on='movieId')

In [18]:
B_data['year'] = B_data['title'].str.extract(r'\((\d{4})\)')
B_data['new_title'] = B_data['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)
B_data['new_title'] = B_data['new_title'].apply(
    lambda x: f"The {x[:-5]}" if x.endswith(', The') else x
)

In [26]:
new_B = B_data[['title', 'new_title', 'year']].drop_duplicates()
new_B['key_a'] = B_data['new_title'] + B_data['year'].apply(
    lambda x: f" ({int(x)})" if not pd.isna(x) else ''
    )

## Join A and B

In [32]:
merge_A_and_B = new_A.merge(new_B, how='left', left_on='key_b', right_on='key_a')

In [33]:
merge_A_and_B[merge_A_and_B['title'].notna()]

Unnamed: 0,key_b,date,title,new_title,year,key_a
2,Fight Club (1999),1999,Fight Club (1999),Fight Club,1999,Fight Club (1999)
5,Interstellar (2014),2014,Interstellar (2014),Interstellar,2014,Interstellar (2014)
8,Pulp Fiction (1994),1994,Pulp Fiction (1994),Pulp Fiction,1994,Pulp Fiction (1994)
10,Whiplash (2014),2014,Whiplash (2014),Whiplash,2014,Whiplash (2014)
15,The Dark Knight (2008),2008,"Dark Knight, The (2008)",The Dark Knight,2008,The Dark Knight (2008)
...,...,...,...,...,...,...
719244,Die Frau des Frisörs (2008),2008,Die Frau des Frisörs (2008),Die Frau des Frisörs,2008,Die Frau des Frisörs (2008)
735576,Confessions of a Gangsta (2006),2006,Confessions of a Gangsta (2006),Confessions of a Gangsta,2006,Confessions of a Gangsta (2006)
752416,Apollo Zero (2009),2009,Apollo Zero (2009),Apollo Zero,2009,Apollo Zero (2009)
764628,Family Meeting (2007),2007,Family Meeting (2007),Family Meeting,2007,Family Meeting (2007)


# SUMMARY

ADVISE: high-cardiality --> may make sense to drop the rare guys or not

ADVISE FROM TEACHERS
- first drop rows where everything is missing
- for Wed: drop rows where description isnull, no impute using other text columns --> future: API web_scrapping to fill in description
- for Wed: snapshot dataset (random/first/last 10k)

## ARCHIVE CODE

In [None]:
# Sweep the var away
del unique_keys

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Plot histogram of 'minute' column
# sns.histplot(clean_up_data['minute'], kde=False)
# plt.title('Histogram of Minute Column')
# plt.xlabel('Minute')
# plt.ylabel('Frequency')
# plt.xlim(0, 240)
# plt.show()

In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer =TfidfVectorizer()

# X = vectorizer.fit_transform(processed_test_data['description'])
# X = pd.DataFrame(
#     X.toarray(),
#     columns = vectorizer.get_feature_names_out()
# )

# X

In [None]:
# neighbors_df.loc['Agnisnaan (1985)']

In [None]:
# clean_up_data.minute #impute mean or median
# clean_up_data.genre_list #impute with new genre category "Other" or "Unknown"
# clean_up_data.actor_list #drop rows

# SUPER SOLUTION: API scrapping can fill in all missing values
# some fancy graphs in the frontend if possible

In [None]:
# temp = crew[crew['id']==1000001]
#consider relabel those with small proportion to new category "Others"
#embeddings make sense, but we will have little control
#TODO: we check later how we deal with this feature!

# crew.isnull().sum()
# crew.role.unique()

# genres.genre.unique() #OneHotEncoder

# data.date.isnull().count()

# studios.studio.value_counts()/studios.studio.value_counts().sum()
#consider relabel those with small proportion to new category "Others" or drop them
#embeddings make sense, but we will have little control
#TODO: we check later how we deal with this feature!

# countries # categorical, high-cardinality, embeddings
# countries # if we chose studios, this one can be left out due to collinearity

# Incremental Recall
# python surprise
# Qualitative not

# - dropna, fillna: imputer
# - solve outliners (year range): drop those that have

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# # Compute the similarity matrix
# similarity_matrix = cosine_similarity(list(processed_test_data['description_vector']))

# # Convert similarity matrix to a DataFrame
# similarity_df = pd.DataFrame(similarity_matrix,
#                              index=processed_test_data.index,
#                              columns=processed_test_data.index)