In [4]:
import pandas as pd

In [5]:
df = pd.read_csv(r'Recommend_Sys\data\netflix_titles.csv')

In [6]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [7]:
def create_textual_representation(row):
    txt= f"""Type: {row['type']},
Title: {row['title']},
Director: {row['director']},
Cast: {row['cast']},
Release Year: {row['release_year']},

Description: {row['description']}"""
    return txt

In [8]:
df['textual_representation']=df.apply(create_textual_representation, axis=1)

In [9]:
print(df['textual_representation'].values[0])

Type: Movie,
Title: Dick Johnson Is Dead,
Director: Kirsten Johnson,
Cast: nan,
Release Year: 2020,

Description: As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.


In [10]:
import faiss
import requests
import numpy as np

In [12]:
dim = 3072
index = faiss.IndexFlatIP(dim)
X = np.zeros((len(df['textual_representation']), dim), dtype="float32")


for i , representation in enumerate(df['textual_representation']):
    if i% 10==0:
        print(f"Proces {i} Instances")
        
    response = requests.post("http://localhost:11434/api/embeddings",
                             json={
                                 "model": "llama3.2",
                                 "prompt": representation
                                 }
                             )
    
    embedding = response.json()['embedding']
    
    X[i] = np.array(embedding)
    
index.add(X)

Proces 0 Instances


KeyboardInterrupt: 

In [22]:
from llama_index.embeddings.ollama import OllamaEmbedding
import numpy as np
import faiss
from tqdm import tqdm

# Initialize OllamaEmbedding
ollama_embed = OllamaEmbedding(
    model_name="llama3.2",
    base_url="http://localhost:11434"
)

# Assuming df is your DataFrame with 'textual_representation' column
texts = df['textual_representation'].tolist()

dim = 3072
index = faiss.IndexFlatIP(dim)

# Function to process embeddings in batches
def batch_embed(texts, batch_size=32):
    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        batch = texts[i:i+batch_size]
        embeddings = ollama_embed.get_text_embedding_batch(batch)
        all_embeddings.extend(embeddings)
    return np.array(all_embeddings, dtype="float32")

# Generate embeddings in batches
X = batch_embed(texts)

# Add to FAISS index
index.add(X)

print(f"Added {len(X)} vectors to the FAISS index")

Processing batches: 100%|██████████| 276/276 [07:10<00:00,  1.56s/it]


Added 8807 vectors to the FAISS index


In [23]:
faiss.write_index(index, "index")

In [24]:
index = faiss.read_index("index")

In [25]:
df[df['title'].str.contains('The Matrix')]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,textual_representation
8414,s8415,Movie,The Matrix,"Lilly Wachowski, Lana Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",United States,"November 1, 2019",1999,R,136 min,"Action & Adventure, Sci-Fi & Fantasy",A computer hacker learns that what most people...,"Type: Movie,\nTitle: The Matrix,\nDirector: Li..."
8415,s8416,Movie,The Matrix Reloaded,"Lilly Wachowski, Lana Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",United States,"November 1, 2019",2003,R,138 min,"Action & Adventure, Sci-Fi & Fantasy","The crew must protect Zion, the last outpost s...","Type: Movie,\nTitle: The Matrix Reloaded,\nDir..."
8416,s8417,Movie,The Matrix Revolutions,"Lilly Wachowski, Lana Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...",United States,"November 1, 2019",2003,R,129 min,"Action & Adventure, Sci-Fi & Fantasy",The final installment in the Matrix trilogy fi...,"Type: Movie,\nTitle: The Matrix Revolutions,\n..."


In [36]:
favorite = df.iloc[8415]

In [37]:
res = requests.post("http://localhost:11434/api/embeddings", json={"model": "llama3.2", "prompt": favorite['textual_representation']})

In [38]:
embedding = np.array(res.json()['embedding'], dtype="float32")

D, I = index.search(embedding.reshape(1, -1), 5)

In [39]:
best_matches = np.array(df.iloc[I[0]]['title'])

In [40]:
best_matches

array(['The Matrix Reloaded', 'The Matrix Revolutions',
       'The Lord of the Rings: The Return of the King', 'The Matrix',
       'The Lord of the Rings: The Two Towers'], dtype=object)

In [41]:
best = np.array(df['textual_representation'])[I[0]]

In [42]:
for match in best:
    print(match)
    print("----")

Type: Movie,
Title: The Matrix Reloaded,
Director: Lilly Wachowski, Lana Wachowski,
Cast: Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, Jada Pinkett Smith, Gloria Foster, Harold Perrineau, Monica Bellucci, Harry Lennix, Lambert Wilson, Randall Duk Kim, Nona Gaye,
Release Year: 2003,

Description: The crew must protect Zion, the last outpost still under human control, after it is pinpointed by the machines responsible for the Matrix.
----
Type: Movie,
Title: The Matrix Revolutions,
Director: Lilly Wachowski, Lana Wachowski,
Cast: Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, Jada Pinkett Smith, Mary Alice, Harold Perrineau, Monica Bellucci, Harry Lennix, Lambert Wilson, Nona Gaye,
Release Year: 2003,

Description: The final installment in the Matrix trilogy finds an unconscious Neo trapped in a subway station in a zone between the Matrix and the machine world.
----
Type: Movie,
Title: The Lord of the Rings: The Return of the King,
Director: Peter 

In [44]:
def get_movie_suggestions(favorite_text, num_suggestions=5):
    # Get the embedding for the favorite movie
    res = requests.post("http://localhost:11434/api/embeddings", json={"model": "llama3.2", "prompt": favorite_text})
    embedding = np.array(res.json()['embedding'], dtype="float32")

    # Search for the best matches in the FAISS index
    D, I = index.search(embedding.reshape(1, -1), num_suggestions)

    # Get the titles of the best matches
    best_matches = np.array(df.iloc[I[0]]['title'])
    
    return best_matches



['The Matrix Reloaded' 'The Matrix Revolutions'
 'The Lord of the Rings: The Return of the King' 'The Matrix'
 'The Lord of the Rings: The Two Towers']


In [45]:
# Example usage
def get_suggestions_by_attributes(title=None, director=None, cast_member=None, release_year=None, num_suggestions=5):
    # Filter the DataFrame based on the provided attributes
    filtered_df = df
    if title:
        filtered_df = filtered_df[filtered_df['title'].str.contains(title, case=False, na=False)]
    if director:
        filtered_df = filtered_df[filtered_df['director'].str.contains(director, case=False, na=False)]
    if cast_member:
        filtered_df = filtered_df[filtered_df['cast'].str.contains(cast_member, case=False, na=False)]
    if release_year:
        filtered_df = filtered_df[filtered_df['release_year'] == release_year]
    
    if filtered_df.empty:
        return np.array([])  # Return an empty array if no matches found
    
    # Get the textual representation of the first matching row
    favorite_text = filtered_df.iloc[0]['textual_representation']
    
    # Get movie suggestions
    suggested_movies = get_movie_suggestions(favorite_text, num_suggestions)
    
    return suggested_movies

# Example usage
suggested_movies = get_suggestions_by_attributes(title="The Matrix", director="Wachowski", num_suggestions=5)
print(suggested_movies)


['The Matrix' 'The Net' 'V for Vendetta' 'The Prince' 'The Darkest Hour']
