In [1]:
!pip install fasttext


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/73.4 kB[0m [31m688.2 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m71.7/73.4 kB[0m [31m868.2 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m731.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.

In [2]:
import fasttext
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('clean_data.csv')


In [4]:
df.head()

Unnamed: 0,id,title,overview,release_date,vote_average,vote_count,genre_names
0,700391,65,"65 million years ago, the only 2 survivors of ...",2023-03-02,6.008,2211,"Science Fiction, Action, Adventure, Thriller"
1,592834,My Spy,A hardened CIA operative finds himself at the ...,2020-01-09,6.9,1136,"Family, Action, Comedy"
2,493529,Dungeons & Dragons: Honor Among Thieves,A charming thief and a band of unlikely advent...,2023-03-23,7.376,3276,"Adventure, Fantasy, Comedy"
3,571625,The Closet,"After moving into a new house, a young girl be...",2020-02-05,7.276,174,"Horror, Thriller"
4,571648,Beasts Clawing at Straws,"A struggling restaurant owner, caring for his ...",2020-02-19,7.119,235,"Mystery, Thriller, Crime, Drama"


In [5]:
# Combine the columns into a single column
df['combined'] = df['title'] + ' ' + df['overview'] + ' ' + df['release_date']

# Save the combined text to a new file
df['combined'].to_csv('combined_descriptions.txt', index=False, header=False)

In [6]:
# Train the FastText model and save
model = fasttext.train_unsupervised('combined_descriptions.txt', model='skipgram', dim=100)

save_path = './fasttext_model.bin'

model.save_model(save_path)

In [7]:
fasttext_model = fasttext.load_model('./fasttext_model.bin')

# Function to convert text into FastText embeddings
def embed_text(text, model):
    tokens = text.split()  # Tokenize the text (simple splitting on spaces)
    embeddings = [model.get_word_vector(token) for token in tokens]
    # Take the mean of the word vectors to get a single vector for the sentence
    return np.mean(embeddings, axis=0)

# Apply the embedding function to the 'title', 'overview' and 'release_date' columns
df['title_embedding'] = df['title'].apply(lambda x: embed_text(x, fasttext_model))
df['overview_embedding'] = df['overview'].apply(lambda x: embed_text(str(x), fasttext_model))
df['release_date_embedding'] = df['release_date'].apply(lambda x: embed_text(str(x), fasttext_model))

In [8]:
# One-hot encoding for the genre_names column, splitting by comma and space
df_one_hot = df['genre_names'].str.get_dummies(sep=', ')


In [9]:
title_dim = len(df['title_embedding'].iloc[0])
overview_dim = len(df['overview_embedding'].iloc[0])
release_date_dim = len(df['release_date'].iloc[0])
print(f'title_dim: {title_dim}')
print(f'overview_dim: {overview_dim}')
print(f'releade_date_dim: {release_date_dim}')

title_dim: 100
overview_dim: 100
releade_date_dim: 10


In [10]:
title_embedding_df = pd.DataFrame(df['title_embedding'].tolist(),
                                  columns=[f'title_emb_{i}' for i in range(title_dim)])
overview_embedding_df = pd.DataFrame(df['overview_embedding'].tolist(),
                                     columns=[f'overview_emb_{i}' for i in range(overview_dim)])
release_date_embedding_df = pd.DataFrame(df['release_date_embedding'].tolist(),
                                         columns=[f'release_date_emb_{i}' for i in range(100)])

In [11]:
df_final = pd.concat([
    df[['id', 'vote_average', 'vote_count']],
    title_embedding_df,
    overview_embedding_df,
    release_date_embedding_df,
    df_one_hot
], axis=1)

In [12]:
# Retain original title, overview, release_date for metadata purposes
df_final_with_metadata = df_final.copy()
df_final_with_metadata['title'] = df['title']
df_final_with_metadata['overview'] = df['overview']
df_final_with_metadata['release_date'] = df['release_date']

In [13]:
df_final_with_metadata.head()

Unnamed: 0,id,vote_average,vote_count,title_emb_0,title_emb_1,title_emb_2,title_emb_3,title_emb_4,title_emb_5,title_emb_6,...,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,title,overview,release_date
0,700391,6.008,2211,0.004806,0.021263,0.014998,0.013826,0.074351,0.052894,-0.038101,...,0,0,1,0,1,0,0,65,"65 million years ago, the only 2 survivors of ...",2023-03-02
1,592834,6.9,1136,0.036632,0.22587,0.264631,0.041697,0.17552,0.311399,-0.32022,...,0,0,0,0,0,0,0,My Spy,A hardened CIA operative finds himself at the ...,2020-01-09
2,493529,7.376,3276,0.12645,0.092999,0.19478,0.098243,0.26279,0.21878,-0.383634,...,0,0,0,0,0,0,0,Dungeons & Dragons: Honor Among Thieves,A charming thief and a band of unlikely advent...,2023-03-23
3,571625,7.276,174,0.170989,0.090268,0.065357,0.02655,0.474886,0.327209,-0.279548,...,0,0,0,0,1,0,0,The Closet,"After moving into a new house, a young girl be...",2020-02-05
4,571648,7.119,235,-0.009752,0.09153,0.281226,-0.015163,0.350808,0.35017,-0.212534,...,1,0,0,0,1,0,0,Beasts Clawing at Straws,"A struggling restaurant owner, caring for his ...",2020-02-19


In [14]:
# Combine embeddings into a single vector
df_final_with_metadata['combined_embedding'] = df_final[[
    *title_embedding_df.columns,
    *overview_embedding_df.columns,
    *release_date_embedding_df.columns,
]].values.tolist()

In [15]:
# Prepare the data for Pinecone upsert
def prepare_pinecone_data(row):
    return {
        'id': str(row['id']),
        'values': row['combined_embedding'],
        'metadata': {
            'title': row['title'],
            'overview': row['overview'],
            'release_date': row['release_date'],
            'vote_average': row['vote_average'],
            'vote_count': row['vote_count'],
        }
    }

In [16]:
# Apply the function to the dataframe and convert it into a list of dictionaries
pinecone_data = df_final_with_metadata.apply(prepare_pinecone_data, axis=1).tolist()

In [36]:
pdf = pd.DataFrame(pinecone_data)

In [18]:
pdf

Unnamed: 0,id,values,metadata
0,700391,"[0.00480611389502883, 0.021263178437948227, 0....","{'title': '65', 'overview': '65 million years ..."
1,592834,"[0.03663239628076553, 0.2258700430393219, 0.26...","{'title': 'My Spy', 'overview': 'A hardened CI..."
2,493529,"[0.12644962966442108, 0.09299876540899277, 0.1...",{'title': 'Dungeons & Dragons: Honor Among Thi...
3,571625,"[0.1709892749786377, 0.09026770293712616, 0.06...","{'title': 'The Closet', 'overview': 'After mov..."
4,571648,"[-0.009751636534929276, 0.09153008460998535, 0...","{'title': 'Beasts Clawing at Straws', 'overvie..."
...,...,...,...
18269,756403,"[0.1199960857629776, 0.13584935665130615, 0.04...",{'title': 'Riverdance: The Animated Adventure'...
18270,14584,"[0.013468627817928791, 0.13476072251796722, 0....","{'title': 'The Cheap Detective', 'overview': '..."
18271,10646,"[0.14252185821533203, 0.16987906396389008, 0.1...","{'title': 'Tomcats', 'overview': 'College budd..."
18272,9830,"[0.09595801681280136, -0.11051520705223083, -0...","{'title': 'Haven', 'overview': 'During a weeke..."


In [19]:
!pip install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Downloading pinecone_plugin_inference-1.0.3-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone_client-5.0.1-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.8/244.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_inference-1.0.3-py3-none-any.whl (117 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.6/117.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successfully installed pinecone-client-5

In [20]:
len(pdf['values'].iloc[0])

300

In [22]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="1b558c37-2524-47f9-9576-0a2efc720fe8")
index_name = 'movies'

# Check if the index exists
if index_name not in pc.list_indexes():
    pc.create_index(
    name=index_name,
    dimension=300,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': '06470b866c2d8c1a3dce396ef8c74f63', 'Date': 'Tue, 10 Sep 2024 08:30:24 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [23]:
pc = Pinecone(api_key="1b558c37-2524-47f9-9576-0a2efc720fe8")
index_name = 'movies'
index = pc.Index(index_name)

In [27]:
import numpy as np

# Function to replace NaN values in metadata
def clean_metadata(metadata):
    for key, value in metadata.items():
        if isinstance(value, float) and np.isnan(value):
            metadata[key] = ""  # Replace NaN with empty string
    return metadata

# Prepare data for upsert
vectors = []
for _, row in pdf.iterrows():
    id = str(row['id'])  # Convert id to string
    embedding = row['values']

    metadata = row['metadata']
    metadata = clean_metadata(metadata)

    vectors.append((id, embedding, metadata))

# Function to chunk the data into smaller batches
def chunk_data(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

# Set batch size
batch_size = 100

# Upsert data in batches
for batch in chunk_data(vectors, batch_size):
    index.upsert(vectors=batch)


In [None]:
import ast

# Function to check NaN values in the metadata dictionary
def count_nan_metadata(metadata_str):
    try:
        metadata = ast.literal_eval(metadata_str)
    except ValueError:
        return 1  # Handle cases where metadata is malformed
    return sum(pd.isna(value) for value in metadata.values())

# Count NaN values in the 'metadata' column
nan_metadata_count = pdf['metadata'].apply(count_nan_metadata).sum()

print("Total NaN values in metadata:", nan_metadata_count)

In [47]:
# Initialize a counter
nan_count = 0

# Loop through the 'metadata' column and count NaN values
for key, value in pdf['metadata'].items():
    if isinstance(value, float) and np.isnan(value):
        nan_count += 1

print(f"Number of NaN values in metadata: {nan_count}")

Number of NaN values in metadata: 0


In [45]:
import ast
# Function to safely parse metadata
def get_release_date(metadata):
    try:
        # Check if the metadata is a string, then try to parse it
        if isinstance(metadata, str):
            metadata_dict = ast.literal_eval(metadata)
            return metadata_dict.get('release_date')
    except (ValueError, SyntaxError):
        return None  # Return None if there's an error in parsing
    return None  # Return None for non-string values

# Apply the function and count the NaN release dates
nan_dates_count = pdf['metadata'].apply(lambda x: pd.isna(get_release_date(x))).sum()

print(f"Number of NaN release_dates: {nan_dates_count}")


Number of NaN release_dates: 18274


In [None]:
import numpy as np
import ast  # For safely evaluating strings as dictionaries

# Initialize a counter for NaN release dates
nan_release_date_count = 0

# Loop through the 'metadata' column
for key, value in df['metadata'].items():
    if isinstance(value, str):
        try:
            # Parse the string into a dictionary
            metadata_dict = ast.literal_eval(value)

            # Check if 'release_date' exists and if it's NaN
            release_date = metadata_dict.get('release_date')
            if release_date is None or pd.isna(release_date):
                nan_release_date_count += 1
        except (ValueError, SyntaxError):
            # Skip rows that aren't valid dictionaries
            continue

print(f"Number of NaN release dates in metadata: {nan_release_date_count}")


In [43]:
pdf

Unnamed: 0,id,values,metadata
0,700391,"[0.00480611389502883, 0.021263178437948227, 0....","{'title': '65', 'overview': '65 million years ..."
1,592834,"[0.03663239628076553, 0.2258700430393219, 0.26...","{'title': 'My Spy', 'overview': 'A hardened CI..."
2,493529,"[0.12644962966442108, 0.09299876540899277, 0.1...",{'title': 'Dungeons & Dragons: Honor Among Thi...
3,571625,"[0.1709892749786377, 0.09026770293712616, 0.06...","{'title': 'The Closet', 'overview': 'After mov..."
4,571648,"[-0.009751636534929276, 0.09153008460998535, 0...","{'title': 'Beasts Clawing at Straws', 'overvie..."
...,...,...,...
18269,756403,"[0.1199960857629776, 0.13584935665130615, 0.04...",{'title': 'Riverdance: The Animated Adventure'...
18270,14584,"[0.013468627817928791, 0.13476072251796722, 0....","{'title': 'The Cheap Detective', 'overview': '..."
18271,10646,"[0.14252185821533203, 0.16987906396389008, 0.1...","{'title': 'Tomcats', 'overview': 'College budd..."
18272,9830,"[0.09595801681280136, -0.11051520705223083, -0...","{'title': 'Haven', 'overview': 'During a weeke..."
