In this notebook, we embedd and export our data to [Pinecone.](https://www.pinecone.io/?utm_term=pinecone%20db&utm_campaign=Brand+-+US/Canada&utm_source=adwords&utm_medium=ppc&hsa_acc=3111363649&hsa_cam=21023356007&hsa_grp=156209469342&hsa_ad=690982079000&hsa_src=g&hsa_tgt=kwd-1627713670685&hsa_kw=pinecone%20db&hsa_mt=p&hsa_net=adwords&hsa_ver=3&gad_source=1&gclid=EAIaIQobChMIjYnihIqYhgMVzpdQBh1zmQM9EAAYASAAEgLCGPD_BwE)

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)

In [2]:
df = pd.read_csv(r'../../data/TMDB_tv_dataset_v3.csv')

df.head(3)

Unnamed: 0,id,name,number_of_seasons,number_of_episodes,original_language,vote_count,vote_average,overview,adult,backdrop_path,first_air_date,last_air_date,homepage,in_production,original_name,popularity,poster_path,type,status,tagline,genres,created_by,languages,networks,origin_country,spoken_languages,production_companies,production_countries,episode_run_time
0,1399,Game of Thrones,8,73,en,21857,8.442,Seven noble families fight for control of the ...,False,/2OMB0ynKlyIenMJWI2Dy9IWT4c.jpg,2011-04-17,2019-05-19,http://www.hbo.com/game-of-thrones,False,Game of Thrones,1083.917,/1XS1oqL89opfnbLl8WnZY1O1uJx.jpg,Scripted,Ended,Winter Is Coming,"Sci-Fi & Fantasy, Drama, Action & Adventure","David Benioff, D.B. Weiss",en,HBO,US,English,"Revolution Sun Studios, Television 360, Genera...","United Kingdom, United States of America",0
1,71446,Money Heist,3,41,es,17836,8.257,"To carry out the biggest heist in history, a m...",False,/gFZriCkpJYsApPZEF3jhxL4yLzG.jpg,2017-05-02,2021-12-03,https://www.netflix.com/title/80192098,False,La Casa de Papel,96.354,/reEMJA1uzscCbkpeRJeTT2bjqUp.jpg,Scripted,Ended,The perfect robbery.,"Crime, Drama",Álex Pina,es,"Netflix, Antena 3",ES,Español,Vancouver Media,Spain,70
2,66732,Stranger Things,4,34,en,16161,8.624,"When a young boy vanishes, a small town uncove...",False,/2MaumbgBlW1NoPo3ZJO38A6v7OS.jpg,2016-07-15,2022-07-01,https://www.netflix.com/title/80057281,True,Stranger Things,185.711,/49WJfeN0moxb9IPfGn8AIqMGskD.jpg,Scripted,Returning Series,Every ending has a beginning.,"Drama, Sci-Fi & Fantasy, Mystery","Matt Duffer, Ross Duffer",en,Netflix,US,English,"21 Laps Entertainment, Monkey Massacre Product...",United States of America,0


In [3]:
# Make a subset of the data, where shows are in english language only 

df = df.loc[df['languages']=='en', :]

print(f'Number of shows with english language only : {len(df)}')

Number of shows with english language only : 38671


In [4]:
# Selecting a few columns only 
df = df[['name', 'overview', 'genres', 'status', 'networks']]

df.head()

Unnamed: 0,name,overview,genres,status,networks
0,Game of Thrones,Seven noble families fight for control of the ...,"Sci-Fi & Fantasy, Drama, Action & Adventure",Ended,HBO
2,Stranger Things,"When a young boy vanishes, a small town uncove...","Drama, Sci-Fi & Fantasy, Mystery",Returning Series,Netflix
3,The Walking Dead,Sheriff's deputy Rick Grimes awakens from a co...,"Action & Adventure, Drama, Sci-Fi & Fantasy",Ended,AMC
4,Lucifer,"Bored and unhappy as the Lord of Hell, Lucifer...","Crime, Sci-Fi & Fantasy",Ended,"FOX, Netflix"
5,Riverdale,"Set in the present, the series offers a bold, ...","Crime, Drama, Mystery",Ended,The CW


### Create Documents from data

In [5]:
from langchain.document_loaders import DataFrameLoader

df["page_content"] = df.apply(lambda x : f"Name: {x['name']} \
                              \nGenres: {x['genres']} \
                              \nOverview: {x['overview']}  \
                              \nStatus: {x['status']}  \
                              \nNetwork: {x['networks']} ", \
                                axis=1 \
                                )

pc_df = df[['page_content']]

loader = DataFrameLoader(pc_df, page_content_column='page_content')

docs = loader.load()

In [6]:
print(docs[:3]) 
print(len(docs))

[Document(page_content="Name: Game of Thrones                               \nGenres: Sci-Fi & Fantasy, Drama, Action & Adventure                               \nOverview: Seven noble families fight for control of the mythical land of Westeros. Friction between the houses leads to full-scale war. All while a very ancient evil awakens in the farthest north. Amidst the war, a neglected military order of misfits, the Night's Watch, is all that stands between the realms of men and icy horrors beyond.                                \nStatus: Ended                                \nNetwork: HBO ", metadata={}), Document(page_content='Name: Stranger Things                               \nGenres: Drama, Sci-Fi & Fantasy, Mystery                               \nOverview: When a young boy vanishes, a small town uncovers a mystery involving secret experiments, terrifying supernatural forces, and one strange little girl.                                \nStatus: Returning Series                     

### Embedd documents

In [74]:
import tiktoken 

encoder = tiktoken.get_encoding('cl100k_base')

tokens_per_doc = [len(encoder.encode(doc.page_content)) for doc in docs]

[101, 56, 71]

In [75]:
# Show the estimated cost
# est costs : https://www.linkedin.com/learning/openai-api-embeddings/estimate-embeddings-pricing#:~:text=The%20cost%20of%20return%20and,because%20it's%20under%201000%20tokens.
total_tokens = sum(tokens_per_doc)
cost_per_1k_token = 0.0001
cost = (total_tokens/1000) * cost_per_1k_token

print(f'${cost}')

$0.35307730000000004


### Create Pinecone Index

In [19]:
import sys 
sys.path.append('..\..\src')
from paths import PARENT_DIR

from dotenv import load_dotenv
import os 

load_dotenv(PARENT_DIR / '.env')

True

In [77]:
import pinecone

pinecone.init(api_key = os.environ["PINECONE_API_KEY"],
             environment="gcp-starter")


index_name = "tmdb-tv-shows"

# # Check if the index already exists
# if index_name in pinecone.list_indexes():
#     # Delete the existing index
#     pinecone.delete_index(index_name)
#     print(f"Deleted existing index: {index_name}")


if index_name not in pinecone.list_indexes():
    # Create the index if it does not exist
    pinecone.create_index(
        name = index_name,
        metric = "cosine",
        dimension = 1536,) # dimensions for model we'll use
    print(f"Created index: {index_name}")

Deleted existing index: tmdb-tv-shows
Created index: tmdb-tv-shows


In [20]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone


# Create the embeddings object
embeddings = OpenAIEmbeddings()


docsearch = Pinecone.from_documents(docs,embeddings,index_name = index_name)


In [21]:
embeddings.model

'text-embedding-ada-002'

In [79]:
question = "can you give me some names of shows which are about the police?"

docsearch.as_retriever().get_relevant_documents(question)

[Document(page_content='Name: The Cops                               \nGenres: Drama                               \nOverview: Set in and around Stanton, a faceless and grim Northern enclave, The Cops depicts the daily grind for a group of policemen and women out on the beat as they interact, and sometimes clash, with the local community.                                \nStatus: Ended                                \nNetwork: BBC Two ', metadata={}),
 Document(page_content='Name: Police Station                               \nGenres: Drama                               \nOverview: Police Station is an American TV series that aired in syndication in 1959. Stories were taken from actual files.                                \nStatus: Ended                                \nNetwork: nan ', metadata={}),
 Document(page_content='Name: Police Call                               \nGenres: nan                               \nOverview: Police Call is a 1955 anthology drama television series which