In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("netflix_titles.csv")

In [3]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [5]:
def create_textual_representation(row):
    textual_representation = f"""Type:{row['type']},
Title:{row['title']},
Director:{row['director']},
Cast:{row['cast']},
Released:{row['release_year']},
Genres:{row['listed_in']},

Description:{row['description']},"""

    return textual_representation

In [6]:
df["textual_representation"] = df.apply(create_textual_representation, axis=1)

In [7]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,textual_representation
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...","Type:Movie,\nTitle:Dick Johnson Is Dead,\nDire..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","Type:TV Show,\nTitle:Blood & Water,\nDirector:..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,"Type:TV Show,\nTitle:Ganglands,\nDirector:Juli..."
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...","Type:TV Show,\nTitle:Jailbirds New Orleans,\nD..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,"Type:TV Show,\nTitle:Kota Factory,\nDirector:n..."


In [8]:
print(df["textual_representation"].values[0])

Type:Movie,
Title:Dick Johnson Is Dead,
Director:Kirsten Johnson,
Cast:nan,
Released:2020,
Genres:Documentaries,

Description:As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.,


In [10]:
import faiss
import requests
import numpy as np

dim = 4096 # What will be returned by llama2

index = faiss.IndexFlatL2(dim) # Essentially our database

X = np.zeros((len(df["textual_representation"]), dim), dtype="float32")

In [11]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [15]:
for i, representation in enumerate(df["textual_representation"]):
    if i % 30 == 0:
        print("Processed", str(i), "instances")
        
    res = requests.post("http://localhost:11434/api/embeddings",
                       json={
                           "model":"llama2",
                           "prompt":representation
                       }
                       )
    embedding = res.json()["embedding"]

    X[i] = np.array(embedding)

index.add(X)

Processed 0 instances
Processed 30 instances
Processed 60 instances
Processed 90 instances
Processed 120 instances
Processed 150 instances
Processed 180 instances
Processed 210 instances
Processed 240 instances
Processed 270 instances
Processed 300 instances
Processed 330 instances
Processed 360 instances
Processed 390 instances
Processed 420 instances
Processed 450 instances
Processed 480 instances
Processed 510 instances
Processed 540 instances
Processed 570 instances
Processed 600 instances
Processed 630 instances
Processed 660 instances
Processed 690 instances
Processed 720 instances
Processed 750 instances
Processed 780 instances
Processed 810 instances
Processed 840 instances
Processed 870 instances
Processed 900 instances
Processed 930 instances
Processed 960 instances
Processed 990 instances
Processed 1020 instances
Processed 1050 instances
Processed 1080 instances
Processed 1110 instances
Processed 1140 instances
Processed 1170 instances
Processed 1200 instances
Processed 1230

In [16]:
faiss.write_index(index, "index")

In [17]:
index = faiss.read_index("index")

In [49]:
df[df.title.str.contains("Rick")]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,textual_representation
597,s598,Movie,Talladega Nights: The Ballad of Ricky Bobby,Adam McKay,"Will Ferrell, John C. Reilly, Sacha Baron Cohe...",United States,"July 1, 2021",2006,PG-13,108 min,"Action & Adventure, Comedies, Sports Movies",NASCAR superstar Ricky Bobby believes he's the...,"Type:Movie,\nTitle:Talladega Nights: The Balla..."
1049,s1050,TV Show,Ricky Zoom,,"Max Fincham, Twinkle Jaiswal, Bradley Bissett,...","China, United Kingdom","April 15, 2021",2019,TV-Y,1 Season,"British TV Shows, Kids' TV",Race along with Ricky Zoom and his loyal Bike ...,"Type:TV Show,\nTitle:Ricky Zoom,\nDirector:nan..."
1081,s1082,TV Show,"Nicky, Ricky, Dicky & Dawn",,"Brian Stepanek, Allison Munn, Aidan Gallagher,...",United States,"April 12, 2021",2018,TV-G,4 Seasons,"Kids' TV, TV Comedies","Just because they're quadruplets, that doesn't...","Type:TV Show,\nTitle:Nicky, Ricky, Dicky & Daw..."
4989,s4990,Movie,Ricky Gervais: Humanity,John L. Spencer,Ricky Gervais,United Kingdom,"March 13, 2018",2018,TV-MA,79 min,Stand-Up Comedy,"In his first special in seven years, Ricky Ger...","Type:Movie,\nTitle:Ricky Gervais: Humanity,\nD..."


In [50]:
favourite_movie = df.iloc[4989]

In [28]:
representation = """Type:Movie,
Title:The Island,
Director:Martin Scorsese,
Cast:Leonardo Dicaprio, Sylvester Stallone,
Released:2020,
Genres:Drama, Thriller,

Description:A group of scientists discover a new species and then find something very shocking,"""

In [51]:
favourite_movie

show_id                                                               s4990
type                                                                  Movie
title                                               Ricky Gervais: Humanity
director                                                    John L. Spencer
cast                                                          Ricky Gervais
country                                                      United Kingdom
date_added                                                   March 13, 2018
release_year                                                           2018
rating                                                                TV-MA
duration                                                             79 min
listed_in                                                   Stand-Up Comedy
description               In his first special in seven years, Ricky Ger...
textual_representation    Type:Movie,\nTitle:Ricky Gervais: Humanity,\nD...
Name: 4989, 

In [52]:
res = requests.post("http://localhost:11434/api/embeddings", json={
    "model":"llama2",
    "prompt":favourite_movie["textual_representation"]
})

In [53]:
embedding = np.array([res.json()["embedding"]], dtype="float32")

D, I = index.search(embedding, 5)

In [54]:
I

array([[4989, 3574, 4232, 5868, 6593]], dtype=int64)

In [55]:
D

array([[   0.    , 1955.9691, 2051.5469, 2098.3086, 2104.2178]],
      dtype=float32)

In [56]:
best_matches = np.array(df["textual_representation"])[I.flatten()]

In [57]:
for match in best_matches:
    print("Next Movie")
    print(match)

Next Movie
Type:Movie,
Title:Ricky Gervais: Humanity,
Director:John L. Spencer,
Cast:Ricky Gervais,
Released:2018,
Genres:Stand-Up Comedy,

Description:In his first special in seven years, Ricky Gervais slings his trademark snark at celebrity, mortality and a society that takes everything personally.,
Next Movie
Type:Movie,
Title:Simon Amstell: Set Free,
Director:Julia Knowles,
Cast:Simon Amstell,
Released:2019,
Genres:Stand-Up Comedy,

Description:Honest, introspective comic Simon Amstell digs deep and delivers a uniquely vulnerable stand-up set on love, ego, intimacy and ayahuasca.,
Next Movie
Type:Movie,
Title:Bill Hicks: Revelations,
Director:Chris Bould,
Cast:Bill Hicks,
Released:1993,
Genres:Stand-Up Comedy,

Description:In his final recorded special, the iconoclastic comedian channels Goat Boy and tackles provocative topics like British porn, pot and the priesthood.,
Next Movie
Type:Movie,
Title:Hannibal Buress: Comedy Camisado,
Director:Lance Bangs,
Cast:Hannibal Buress,
Releas