In [1]:
!pip install fasttext


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4246563 sha256=f2ac0b09ddabf352fba9cfdf6d4f8ce0ca6f7a244b517c0587bcf10e37d1c463
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [6]:
import fasttext
import numpy as np
import pandas as pd

In [31]:
df = pd.read_csv('clean_data.csv')


In [10]:
df.head()

Unnamed: 0,id,title,overview,release_date,vote_average,vote_count,genre_names
0,700391,65,"65 million years ago, the only 2 survivors of ...",2023-03-02,6.008,2211,"Science Fiction, Action, Adventure, Thriller"
1,592834,My Spy,A hardened CIA operative finds himself at the ...,2020-01-09,6.9,1136,"Family, Action, Comedy"
2,493529,Dungeons & Dragons: Honor Among Thieves,A charming thief and a band of unlikely advent...,2023-03-23,7.376,3276,"Adventure, Fantasy, Comedy"
3,571625,The Closet,"After moving into a new house, a young girl be...",2020-02-05,7.276,174,"Horror, Thriller"
4,571648,Beasts Clawing at Straws,"A struggling restaurant owner, caring for his ...",2020-02-19,7.119,235,"Mystery, Thriller, Crime, Drama"


In [11]:
# Combine the columns into a single column
df['combined'] = df['title'] + ' ' + df['overview']

# Save the combined text to a new file
df['combined'].to_csv('combined_descriptions.txt', index=False, header=False)

In [12]:
# Train the FastText model and save
model = fasttext.train_unsupervised('combined_descriptions.txt', model='skipgram', dim=100)

save_path = './fasttext_model.bin'

model.save_model(save_path)

In [32]:
fasttext_model = fasttext.load_model('./fasttext_model.bin')

# Function to convert text into FastText embeddings
def embed_text(text, model):
    tokens = text.split()  # Tokenize the text (simple splitting on spaces)
    embeddings = [model.get_word_vector(token) for token in tokens]
    # Take the mean of the word vectors to get a single vector for the sentence
    return np.mean(embeddings, axis=0)

# Apply the embedding function to the 'title', 'overview' and 'release_date' columns
df['title_embedding'] = df['title'].apply(lambda x: embed_text(x, fasttext_model))
df['overview_embedding'] = df['overview'].apply(lambda x: embed_text(str(x), fasttext_model))
df['date_embedding'] = df['release_date'].apply(lambda x: embed_text(str(x), fasttext_model))

In [33]:
# One-hot encoding for the genre_names column, splitting by comma and space
df_one_hot = df['genre_names'].str.get_dummies(sep=', ')


In [34]:
# Extract numeric columns
numeric_columns = df[['id', 'vote_average', 'vote_count']]

# Concatenate the one-hot encoded and numeric data
combined_data = pd.concat([df_one_hot, numeric_columns], axis=1)

In [36]:
# Concatenate with other embeddings and numeric columns
df_final_embeddings = pd.concat([
    pd.DataFrame(df['title_embedding'].tolist()),
    pd.DataFrame(df['overview_embedding'].tolist()),
    pd.DataFrame(df['date_embedding'].tolist()),
    combined_data
], axis=1)

In [37]:
df_final_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,id,vote_average,vote_count
0,-0.026247,0.088760,-0.004074,0.008518,0.034858,0.080920,-0.066916,-0.031542,0.041323,0.016954,...,0,0,1,0,1,0,0,700391,6.008,2211
1,-0.091066,0.436741,-0.137849,0.108118,-0.179357,0.669822,-0.075735,0.160373,0.281742,-0.176827,...,0,0,0,0,0,0,0,592834,6.900,1136
2,-0.047975,0.253489,-0.053783,0.093863,-0.021732,0.454831,-0.231874,0.114748,0.243956,0.031398,...,0,0,0,0,0,0,0,493529,7.376,3276
3,-0.092692,0.525707,-0.102800,0.007783,0.097758,0.390945,-0.353241,-0.163858,0.177330,0.126446,...,0,0,0,0,1,0,0,571625,7.276,174
4,-0.274100,0.430820,-0.018639,0.072753,0.110665,0.227099,-0.336320,0.008596,0.133655,0.174047,...,1,0,0,0,1,0,0,571648,7.119,235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18269,-0.205260,0.297022,-0.102192,0.121868,-0.028642,0.491099,-0.130047,-0.004082,0.277160,0.077399,...,0,0,0,0,0,0,0,756403,5.965,85
18270,0.055350,0.509264,-0.032456,0.207973,0.129462,0.425930,-0.314500,0.105279,0.123801,0.049609,...,1,0,0,0,0,0,0,14584,6.200,82
18271,-0.003106,0.140753,-0.105246,0.073562,0.110365,0.125282,-0.171913,-0.153950,0.156972,0.170226,...,0,1,0,0,0,0,0,10646,5.100,244
18272,-0.197058,0.271913,-0.245451,0.267949,-0.041579,0.395716,0.049152,0.066697,0.194609,0.092598,...,0,0,0,0,0,0,0,9830,5.300,95
