In [1]:
!pip install fasttext


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4246560 sha256=b4162773f15555

In [2]:
import fasttext
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('clean_data.csv')


In [4]:
df.head()

Unnamed: 0,id,title,overview,release_date,vote_average,vote_count,genre_names
0,700391,65,"65 million years ago, the only 2 survivors of ...",2023-03-02,6.008,2211,"Science Fiction, Action, Adventure, Thriller"
1,592834,My Spy,A hardened CIA operative finds himself at the ...,2020-01-09,6.9,1136,"Family, Action, Comedy"
2,493529,Dungeons & Dragons: Honor Among Thieves,A charming thief and a band of unlikely advent...,2023-03-23,7.376,3276,"Adventure, Fantasy, Comedy"
3,571625,The Closet,"After moving into a new house, a young girl be...",2020-02-05,7.276,174,"Horror, Thriller"
4,571648,Beasts Clawing at Straws,"A struggling restaurant owner, caring for his ...",2020-02-19,7.119,235,"Mystery, Thriller, Crime, Drama"


In [5]:
# Combine the columns into a single column
df['combined'] = df['title'] + ' ' + df['overview'] + ' ' + df['release_date']

# Save the combined text to a new file
df['combined'].to_csv('combined_descriptions.txt', index=False, header=False)

In [6]:
# Train the FastText model and save
model = fasttext.train_unsupervised('combined_descriptions.txt', model='skipgram', dim=100)

save_path = './fasttext_model.bin'

model.save_model(save_path)

In [7]:
fasttext_model = fasttext.load_model('./fasttext_model.bin')

# Function to convert text into FastText embeddings
def embed_text(text, model):
    tokens = text.split()  # Tokenize the text (simple splitting on spaces)
    embeddings = [model.get_word_vector(token) for token in tokens]
    # Take the mean of the word vectors to get a single vector for the sentence
    return np.mean(embeddings, axis=0)

# Apply the embedding function to the 'title', 'overview' and 'release_date' columns
df['title_embedding'] = df['title'].apply(lambda x: embed_text(x, fasttext_model))
df['overview_embedding'] = df['overview'].apply(lambda x: embed_text(str(x), fasttext_model))
df['release_date_embedding'] = df['release_date'].apply(lambda x: embed_text(str(x), fasttext_model))

In [8]:
# One-hot encoding for the genre_names column, splitting by comma and space
df_one_hot = df['genre_names'].str.get_dummies(sep=', ')


In [9]:
title_dim = len(df['title_embedding'].iloc[0])
overview_dim = len(df['overview_embedding'].iloc[0])
release_date_dim = len(df['release_date'].iloc[0])
print(f'title_dim: {title_dim}')
print(f'overview_dim: {overview_dim}')
print(f'releade_date_dim: {release_date_dim}')

title_dim: 100
overview_dim: 100
releade_date_dim: 10


In [10]:
title_embedding_df = pd.DataFrame(df['title_embedding'].tolist(),
                                  columns=[f'title_emb_{i}' for i in range(title_dim)])
overview_embedding_df = pd.DataFrame(df['overview_embedding'].tolist(),
                                     columns=[f'overview_emb_{i}' for i in range(overview_dim)])
release_date_embedding_df = pd.DataFrame(df['release_date_embedding'].tolist(),
                                         columns=[f'release_date_emb_{i}' for i in range(100)])

In [11]:
df_final = pd.concat([
    df[['id', 'vote_average', 'vote_count']],
    title_embedding_df,
    overview_embedding_df,
    release_date_embedding_df,
    df_one_hot
], axis=1)

In [12]:
# Retain original title, overview, release_date for metadata purposes
df_final_with_metadata = df_final.copy()
df_final_with_metadata['title'] = df['title']
df_final_with_metadata['overview'] = df['overview']
df_final_with_metadata['release_date'] = df['release_date']

In [13]:
df_final_with_metadata.head()

Unnamed: 0,id,vote_average,vote_count,title_emb_0,title_emb_1,title_emb_2,title_emb_3,title_emb_4,title_emb_5,title_emb_6,...,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,title,overview,release_date
0,700391,6.008,2211,0.004806,0.021263,0.014998,0.013826,0.074351,0.052894,-0.038101,...,0,0,1,0,1,0,0,65,"65 million years ago, the only 2 survivors of ...",2023-03-02
1,592834,6.9,1136,0.036632,0.22587,0.264631,0.041697,0.17552,0.311399,-0.32022,...,0,0,0,0,0,0,0,My Spy,A hardened CIA operative finds himself at the ...,2020-01-09
2,493529,7.376,3276,0.12645,0.092999,0.19478,0.098243,0.26279,0.21878,-0.383634,...,0,0,0,0,0,0,0,Dungeons & Dragons: Honor Among Thieves,A charming thief and a band of unlikely advent...,2023-03-23
3,571625,7.276,174,0.170989,0.090268,0.065357,0.02655,0.474886,0.327209,-0.279548,...,0,0,0,0,1,0,0,The Closet,"After moving into a new house, a young girl be...",2020-02-05
4,571648,7.119,235,-0.009752,0.09153,0.281226,-0.015163,0.350808,0.35017,-0.212534,...,1,0,0,0,1,0,0,Beasts Clawing at Straws,"A struggling restaurant owner, caring for his ...",2020-02-19


In [14]:
# Combine embeddings into a single vector
df_final_with_metadata['combined_embedding'] = df_final[[
    *title_embedding_df.columns,
    *overview_embedding_df.columns,
    *release_date_embedding_df.columns,
]].values.tolist()

In [15]:
# Prepare the data for Pinecone upsert
def prepare_pinecone_data(row):
    return {
        'id': str(row['id']),
        'values': row['combined_embedding'],
        'metadata': {
            'title': row['title'],
            'overview': row['overview'],
            'release_date': row['release_date'],
            'vote_average': row['vote_average'],
            'vote_count': row['vote_count'],
        }
    }

In [16]:
# Apply the function to the dataframe and convert it into a list of dictionaries
pinecone_data = df_final_with_metadata.apply(prepare_pinecone_data, axis=1).tolist()

In [17]:
pdf = pd.DataFrame(pinecone_data)

In [33]:
pdf

Unnamed: 0,id,values,metadata
0,700391,"[0.00480611389502883, 0.021263178437948227, 0....","{'title': '65', 'overview': '65 million years ..."
1,592834,"[0.03663239628076553, 0.2258700430393219, 0.26...","{'title': 'My Spy', 'overview': 'A hardened CI..."
2,493529,"[0.12644962966442108, 0.09299876540899277, 0.1...",{'title': 'Dungeons & Dragons: Honor Among Thi...
3,571625,"[0.1709892749786377, 0.09026770293712616, 0.06...","{'title': 'The Closet', 'overview': 'After mov..."
4,571648,"[-0.009751636534929276, 0.09153008460998535, 0...","{'title': 'Beasts Clawing at Straws', 'overvie..."
...,...,...,...
18269,756403,"[0.1199960857629776, 0.13584935665130615, 0.04...",{'title': 'Riverdance: The Animated Adventure'...
18270,14584,"[0.013468627817928791, 0.13476072251796722, 0....","{'title': 'The Cheap Detective', 'overview': '..."
18271,10646,"[0.14252185821533203, 0.16987906396389008, 0.1...","{'title': 'Tomcats', 'overview': 'College budd..."
18272,9830,"[0.09595801681280136, -0.11051520705223083, -0...","{'title': 'Haven', 'overview': 'During a weeke..."


In [21]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.0.3-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.0.3-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.6/117.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-

In [31]:
len(pdf['values'].iloc[0])

300

In [29]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="1b558c37-2524-47f9-9576-0a2efc720fe8")
index_name = 'movies'

# Check if the index exists
if index_name not in pc.list_indexes():
    pc.create_index(
    name=index_name,
    dimension=300,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

In [30]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="1b558c37-2524-47f9-9576-0a2efc720fe8")
index_name = 'movies'
index = pc.Index(index_name)

In [34]:
# Prepare data for upsert
vectors = []
for _, row in df.iterrows():
    id = str(pdf['id'])
    embedding = pdf['values']
    metadata = pdf['metadata']
    vectors.append((id, embedding, metadata))

# Function to chunk the data into smaller batches
def chunk_data(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

# Set batch size
batch_size = 100

# Upsert data in batches
for batch in chunk_data(vectors, batch_size):
    index.upsert(vectors=batch)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Function to chunk the data into smaller batches
def chunk_data(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

# Prepare data for upsert
vectors = []
for movie_id, title_emb, overview_emb, release_emb, title, overview, release_date, vote_average, vote_count in zip(
        df['id'],
        df[[f'title_emb_{i}' for i in range(title_embedding_dim)]].values.tolist(),
        df[[f'overview_emb_{i}' for i in range(overview_embedding_dim)]].values.tolist(),
        df[[f'release_date_emb_{i}' for i in range(release_date_embedding_dim)]].values.tolist(),
        df['title'],
        df['overview'],
        df['release_date'],
        df['vote_average'],
        df['vote_count']):

    # Combine embeddings into one vector
    combined_embedding = title_emb + overview_emb + release_emb

    # Ensure the embedding is not None and has non-zero values
    if np.count_nonzero(combined_embedding) > 0:
        vectors.append((
            str(movie_id),  # Use movie ID as the unique identifier
            combined_embedding,  # The combined embedding for title, overview, release_date
            {  # Metadata to attach to each vector
                'title': title,
                'overview': overview,
                'release_date': release_date,
                'vote_average': vote_average,
                'vote_count': vote_count
            }
        ))

# Set batch size
batch_size = 100

# Upsert data in batches
for batch in chunk_data(vectors, batch_size):
    index.upsert(vectors=batch)