In [None]:
!pip install fasttext


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4246560 sha256=31c9aec6f8f274f1c78ab27b88f21dac259970fca1fe50bf8c7eee670b6bfdcc
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58e02cec2ddb20ce3e59fad8d3c92a
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Success

In [None]:
import fasttext
import numpy as np
import pandas as pd

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('clean_data.csv')


In [None]:
# Preview the first rows to check the columns that the later cells rely on.
df.head()

Unnamed: 0,id,title,overview,release_date,vote_average,vote_count,genre_names
0,700391,65,"65 million years ago, the only 2 survivors of ...",2023-03-02,6.008,2211,"Science Fiction, Action, Adventure, Thriller"
1,592834,My Spy,A hardened CIA operative finds himself at the ...,2020-01-09,6.9,1136,"Family, Action, Comedy"
2,493529,Dungeons & Dragons: Honor Among Thieves,A charming thief and a band of unlikely advent...,2023-03-23,7.376,3276,"Adventure, Fantasy, Comedy"
3,571625,The Closet,"After moving into a new house, a young girl be...",2020-02-05,7.276,174,"Horror, Thriller"
4,571648,Beasts Clawing at Straws,"A struggling restaurant owner, caring for his ...",2020-02-19,7.119,235,"Mystery, Thriller, Crime, Drama"


In [None]:
# Combine the text columns into a single training document per row.
# Missing fields are replaced by empty strings first: with plain `+`
# concatenation a single NaN would turn the whole combined value into NaN and
# the row would contribute nothing to the corpus.
text_parts = df[['title', 'overview', 'release_date']].fillna('').astype(str)
df['combined'] = (
    (text_parts['title'] + ' ' + text_parts['overview'] + ' ' + text_parts['release_date'])
    .str.replace(r'\s+', ' ', regex=True)  # also flattens embedded newlines: one document per line
    .str.strip()
)

# Save the combined text as plain text, one document per line.
# fastText reads raw text, so the file is written directly instead of through
# `to_csv`, which would wrap rows containing commas in double quotes and leave
# quote characters attached to the neighbouring tokens.
with open('combined_descriptions.txt', 'w', encoding='utf-8') as corpus_file:
    for document in df['combined']:
        corpus_file.write(document + '\n')

In [None]:
# Fit a 100-dimensional skip-gram fastText model on the corpus file and persist
# the trained model to disk.
save_path = './fasttext_model.bin'

model = fasttext.train_unsupervised(
    'combined_descriptions.txt',
    model='skipgram',
    dim=100,
)
model.save_model(save_path)

In [None]:
# Reload the trained model from disk; this is the same path the training cell
# stored in `save_path`.
fasttext_model = fasttext.load_model('./fasttext_model.bin')

# Function to convert text into FastText embeddings
def embed_text(text, model):
    """Return the mean word vector of ``text`` as a single 1-D array.

    Parameters
    ----------
    text : str
        Text to embed. It is tokenized by splitting on whitespace. ``None``
        and float NaN (pandas' marker for a missing value) are treated as
        empty text; any other non-string value is converted with ``str``.
    model
        Object exposing ``get_word_vector(token)`` and ``get_dimension()``,
        such as a loaded fastText model.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the token vectors. When ``text`` contains no
        tokens, a float32 zero vector of length ``model.get_dimension()``.
    """
    if text is None or (isinstance(text, float) and np.isnan(text)):
        tokens = []
    else:
        tokens = str(text).split()

    if not tokens:
        # np.mean([]) would return a scalar NaN (and emit a RuntimeWarning),
        # which breaks every later step that expects fixed-length vectors.
        return np.zeros(model.get_dimension(), dtype=np.float32)

    word_vectors = [model.get_word_vector(token) for token in tokens]
    # Average over the token axis to get one vector for the whole text.
    return np.mean(word_vectors, axis=0)

# Apply the embedding function to the 'title', 'overview' and 'release_date' columns.
# Every value is passed through str() so that all three columns are handled the
# same way; previously 'title' was the only one not coerced, so a missing or
# non-string title raised AttributeError inside embed_text.
for text_column in ('title', 'overview', 'release_date'):
    df[f'{text_column}_embedding'] = df[text_column].apply(
        lambda value: embed_text(str(value), fasttext_model)
    )

In [None]:
# One-hot encoding for the genre_names column: each value is split on ', ' and
# every distinct genre becomes its own 0/1 indicator column.
df_one_hot = df['genre_names'].str.get_dummies(sep=', ')


In [None]:
# Read each embedding's width from the first row of its embedding column.
title_dim = len(df['title_embedding'].iloc[0])
overview_dim = len(df['overview_embedding'].iloc[0])
# Measure the embedding vector, not the raw date string: 'release_date' holds
# text such as '2023-03-02', whose length (10) is unrelated to the vector width.
release_date_dim = len(df['release_date_embedding'].iloc[0])
print(f'title_dim: {title_dim}')
print(f'overview_dim: {overview_dim}')
print(f'release_date_dim: {release_date_dim}')

title_dim: 100
overview_dim: 100
releade_date_dim: 10


In [None]:
def _embedding_frame(vectors, prefix):
    """Expand a Series of equal-length vectors into one column per component.

    Columns are named ``{prefix}_0 .. {prefix}_{width-1}``, where ``width`` is
    taken from the first vector rather than from a hard-coded constant. The
    Series' index is kept so the result lines up row-for-row with its source
    when concatenated column-wise.
    """
    rows = vectors.tolist()
    width = len(rows[0]) if rows else 0
    return pd.DataFrame(
        rows,
        columns=[f'{prefix}_{i}' for i in range(width)],
        index=vectors.index,
    )


title_embedding_df = _embedding_frame(df['title_embedding'], 'title_emb')
overview_embedding_df = _embedding_frame(df['overview_embedding'], 'overview_emb')
release_date_embedding_df = _embedding_frame(df['release_date_embedding'], 'release_date_emb')

In [None]:
# Assemble the feature table column-wise: identifiers and vote statistics,
# then the three expanded embeddings, then the genre indicator columns.
feature_blocks = [
    df[['id', 'vote_average', 'vote_count']],
    title_embedding_df,
    overview_embedding_df,
    release_date_embedding_df,
    df_one_hot,
]
df_final = pd.concat(feature_blocks, axis=1)

In [None]:
# Keep the human-readable title, overview and release_date next to the
# features for metadata purposes; `assign` returns a new frame, so `df_final`
# itself is left untouched.
df_final_with_metadata = df_final.assign(
    title=df['title'],
    overview=df['overview'],
    release_date=df['release_date'],
)

In [None]:
# Preview the feature table together with the retained text columns.
df_final_with_metadata.head()

Unnamed: 0,id,vote_average,vote_count,title_emb_0,title_emb_1,title_emb_2,title_emb_3,title_emb_4,title_emb_5,title_emb_6,...,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,title,overview,release_date
0,700391,6.008,2211,0.004806,0.021263,0.014998,0.013826,0.074351,0.052894,-0.038101,...,0,0,1,0,1,0,0,65,"65 million years ago, the only 2 survivors of ...",2023-03-02
1,592834,6.9,1136,0.036632,0.22587,0.264631,0.041697,0.17552,0.311399,-0.32022,...,0,0,0,0,0,0,0,My Spy,A hardened CIA operative finds himself at the ...,2020-01-09
2,493529,7.376,3276,0.12645,0.092999,0.19478,0.098243,0.26279,0.21878,-0.383634,...,0,0,0,0,0,0,0,Dungeons & Dragons: Honor Among Thieves,A charming thief and a band of unlikely advent...,2023-03-23
3,571625,7.276,174,0.170989,0.090268,0.065357,0.02655,0.474886,0.327209,-0.279548,...,0,0,0,0,1,0,0,The Closet,"After moving into a new house, a young girl be...",2020-02-05
4,571648,7.119,235,-0.009752,0.09153,0.281226,-0.015163,0.350808,0.35017,-0.212534,...,1,0,0,0,1,0,0,Beasts Clawing at Straws,"A struggling restaurant owner, caring for his ...",2020-02-19


In [None]:
# Concatenate the title, overview and release-date components (in that order)
# into one list-valued column holding the full vector for each row.
embedding_columns = (
    list(title_embedding_df.columns)
    + list(overview_embedding_df.columns)
    + list(release_date_embedding_df.columns)
)
df_final['combined_embedding'] = df_final[embedding_columns].to_numpy().tolist()

In [None]:
# Prepare the data for Pinecone upsert
def prepare_pinecone_data(row):
    """Convert one row into a vector record of the form ``{id, values, metadata}``.

    Parameters
    ----------
    row : mapping
        Must provide ``'id'``, ``'combined_embedding'`` and the metadata keys
        ``'title'``, ``'overview'``, ``'release_date'``, ``'vote_average'``
        and ``'vote_count'`` (for example a DataFrame row or a plain dict).

    Returns
    -------
    dict
        ``'id'`` as a string, ``'values'`` as a list of Python floats, and
        ``'metadata'`` holding only the non-missing fields, with numpy scalars
        converted to native Python types.
    """
    metadata = {}
    for key in ('title', 'overview', 'release_date', 'vote_average', 'vote_count'):
        value = row[key]
        # Drop missing values instead of sending NaN/None: null metadata values
        # are not accepted, and NaN is not valid JSON.
        if pd.isna(value):
            continue
        # numpy scalars (np.int64, np.float64, ...) are not JSON-serializable.
        if isinstance(value, np.generic):
            value = value.item()
        metadata[key] = value

    return {
        'id': str(row['id']),
        'values': [float(component) for component in row['combined_embedding']],
        'metadata': metadata,
    }

In [None]:
# Apply the function to the dataframe and convert it into a list of dictionaries.
# `df_final_with_metadata` was copied from `df_final` before the
# 'combined_embedding' column was added there, so the column is attached here
# (aligned on the index); without it prepare_pinecone_data raises KeyError.
pinecone_rows = df_final_with_metadata.assign(
    combined_embedding=df_final['combined_embedding']
)
pinecone_data = pinecone_rows.apply(prepare_pinecone_data, axis=1).tolist()