In [1]:
import ast
from datetime import datetime

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

In [2]:
# Read the spreadsheet and write a pickle snapshot of the full, unfiltered table.
# (The pickle is only written here; nothing in this section reads it back.)
df = pd.read_excel('data/movie_data.xlsx')
df.to_pickle('data/movie_data.pkl')

# Columns carried forward into feature engineering; all others are discarded.
useful_columns = [
    '_id',
    'genres',
    'movie_title',
    'original_language',
    'overview',
    'popularity',
    'production_countries',
    'release_date',
    'runtime',
    'spoken_languages',
    'vote_average',
    'vote_count',
    'year_released',
]
df = df[useful_columns]

# Displayed as the cell output to confirm which columns survived the selection.
df.columns

Index(['_id', 'genres', 'movie_title', 'original_language', 'overview',
       'popularity', 'production_countries', 'release_date', 'runtime',
       'spoken_languages', 'vote_average', 'vote_count', 'year_released'],
      dtype='object')

In [3]:
# Keep only rows that have a release date. The explicit .copy() makes the
# filtered frame independent of the one it was selected from, so the column
# assignments below (and in later cells) do not raise SettingWithCopyWarning.
df = df.dropna(subset=['release_date']).copy()

# Parse to datetime64; unparseable values raise here (default errors='raise').
df['release_date'] = pd.to_datetime(df['release_date'])

# Proleptic Gregorian ordinal of each date, giving one integer feature per row.
df['release_date_ordinal'] = df['release_date'].map(pd.Timestamp.toordinal)

In [4]:
def _parse_label_list(cell):
    """Convert one spreadsheet cell into a list of string labels.

    MultiLabelBinarizer iterates over every sample it is given. The list-like
    columns arrive from Excel as text (and missing values are filled with the
    string '[]'), so passing them through unchanged makes the binarizer iterate
    over the *characters* of each string. This helper turns each cell into a
    real list of labels first.

    Parameters
    ----------
    cell : object
        A cell value: an actual list/tuple/set, a string holding a Python/JSON
        style list literal, a plain comma-separated string, or a missing value.

    Returns
    -------
    list of str
        The labels found in the cell; an empty list for missing/blank cells.
    """
    if isinstance(cell, (list, tuple, set)):
        items = list(cell)
    elif pd.isna(cell) or not str(cell).strip():
        return []
    else:
        text = str(cell).strip()
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError):
            # Not a literal (e.g. "Action, Drama"): fall back to comma splitting.
            parsed = [part.strip() for part in text.split(',') if part.strip()]
        items = list(parsed) if isinstance(parsed, (list, tuple, set)) else [parsed]

    # NOTE(review): if the cells hold dicts (e.g. {'id': ..., 'name': ...}), the
    # 'name' field is used as the label -- confirm against the spreadsheet.
    # Everything is cast to str so the labels are hashable and sortable.
    return [
        str(item.get('name', item)) if isinstance(item, dict) else str(item)
        for item in items
    ]


# One binarizer per multi-valued column so each keeps its own fitted classes_.
mlb_genres = MultiLabelBinarizer()
mlb_countries = MultiLabelBinarizer()
mlb_languages = MultiLabelBinarizer()

# Encoder for original_language. A MultiLabelBinarizer fed single-item lists is
# used here in place of OneHotEncoder.
ohe_language = MultiLabelBinarizer()

# Sentence-embedding model used by the overview/title cells further down.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Missing list-like cells become the empty-list literal.
df['genres'] = df['genres'].fillna('[]')
df['production_countries'] = df['production_countries'].fillna('[]')
df['spoken_languages'] = df['spoken_languages'].fillna('[]')

# Parse each cell into a label list, then binarize. list(...) splits the 2-D
# indicator matrix into one row vector per DataFrame row.
df['genres_encoded'] = list(
    mlb_genres.fit_transform(df['genres'].map(_parse_label_list))
)
df['production_countries_encoded'] = list(
    mlb_countries.fit_transform(df['production_countries'].map(_parse_label_list))
)
df['spoken_languages_encoded'] = list(
    mlb_languages.fit_transform(df['spoken_languages'].map(_parse_label_list))
)

In [5]:
# Missing languages become the empty-string label so every row has one label.
df['original_language'] = df['original_language'].fillna('')

# Write the indicator vectors to their own column, matching the other
# '*_encoded' features. The raw 'original_language' column is listed in
# old_cols and dropped before export, so overwriting it would throw the
# encoding away; it would also make this cell fail to re-run cleanly.
df['original_language_encoded'] = list(
    ohe_language.fit_transform(df['original_language'].apply(lambda x: [x]))
)

In [6]:
# df['original_language'] = df['original_language'].fillna('')
# df.dropna(subset=['overview'], inplace=True)

# ohe_language = OneHotEncoder(sparse_output=False)
# inp = df['original_language'].values.reshape(-1, 1)
# encoded_features = ohe_language.fit_transform(inp)
# df['original_language_vector'] = encoded_features.tolist()

In [7]:
# Blank out missing overviews and force every value to text, mirroring the
# movie_title cell: Excel can hand back numeric cells, and the encoder is
# given the raw list of values.
df['overview'] = df['overview'].fillna('').astype(str)

# One embedding per overview, stored as plain Python lists (one list per row).
overview_vectors = model.encode(df['overview'].tolist())
df['overview_vectors'] = overview_vectors.tolist()

In [8]:
# Normalise titles to text: empty string for missing values, str() for the rest.
df['movie_title'] = df['movie_title'].fillna('').astype(str)

# Embed every title and keep the result as one Python list per row.
title_vectors = model.encode(list(df['movie_title']))
df['title_vectors'] = title_vectors.tolist()

In [9]:
# Write only the two embedding columns, without the index, to their own CSV.
df.to_csv(
    'data/nlp_vectors.csv',
    columns=['overview_vectors', 'title_vectors'],
    index=False,
)

In [10]:
# Source columns that are removed from the tabular export further down.
old_cols = [
    'genres',
    'original_language',
    'production_countries',
    'spoken_languages',
    'release_date',
]

In [13]:
# The tabular export leaves out the raw source columns and both embedding
# columns (those are written to data/nlp_vectors.csv by an earlier cell).
columns_to_exclude = [*old_cols, 'overview_vectors', 'title_vectors']
tabular_features = df.drop(columns=columns_to_exclude)
tabular_features.to_csv('data/movie_data_vectorized.csv', index=False)

In [12]:
# Dropping original columns to keep only the features
# df.drop(columns=old_cols).to_excel('data/movie_data_vectorized.xlsx', index=False)