<a href="https://colab.research.google.com/github/boskidisanalysis/Movies_Recomendation/blob/main/MovieLens_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 8.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 43.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


# Language Model Sentence Embeddings

In [3]:
def get_zip_file(url, chunk_size=1024 * 1024, timeout=60):
  """Download the file at `url` into the current working directory.

  The response body is streamed to disk in chunks, so a large archive is never
  held in memory as a whole.

  Args:
    url: Address of the file. The last segment of its path becomes the local
      file name.
    chunk_size: Number of bytes requested per streamed chunk.
    timeout: Seconds to wait for the server before giving up.

  Raises:
    ValueError: If no file name can be derived from `url`.
    requests.HTTPError: If the server answers with a 4xx/5xx status.
  """
  # Imported locally (as the original did) so the helper is self-contained.
  import os
  from urllib.parse import urlparse

  import requests

  # Use only the path component so a query string or fragment never ends up
  # in the local file name.
  filename = os.path.basename(urlparse(url).path)
  if not filename:
    raise ValueError(f'Cannot derive a file name from URL: {url!r}')

  print('Downloading started')

  # Write under a temporary name and rename at the end, so an interrupted
  # download is never mistaken for a complete archive.
  partial_name = filename + '.part'
  with requests.get(url, stream=True, timeout=timeout) as req:
    # Fail loudly instead of saving an HTML error page as the archive.
    req.raise_for_status()
    with open(partial_name, 'wb') as output_file:
      for chunk in req.iter_content(chunk_size=chunk_size):
        output_file.write(chunk)
  os.replace(partial_name, filename)

  print('Downloading Completed')

In [3]:
# MovieLens 25M archive hosted by GroupLens. get_zip_file stores it in the
# working directory under the URL's last path segment (ml-25m.zip).
url = 'https://files.grouplens.org/datasets/movielens/ml-25m.zip'
get_zip_file(url)

Downloading started
Downloading Completed


In [4]:
!unzip ml-25m.zip

Archive:  ml-25m.zip
   creating: ml-25m/
  inflating: ml-25m/tags.csv         
  inflating: ml-25m/links.csv        
  inflating: ml-25m/README.txt       
  inflating: ml-25m/ratings.csv      
  inflating: ml-25m/genome-tags.csv  
  inflating: ml-25m/genome-scores.csv  
  inflating: ml-25m/movies.csv       


In [4]:
import pandas as pd 
import numpy as np
from transformers import AutoTokenizer, AutoModel

In [5]:
# LaBSE sentence-embedding checkpoint; tokenizer and model must come from the same id.
MODEL_NAME = 'sentence-transformers/LaBSE'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# The archive is extracted into the current working directory, so read the
# movie table relative to it instead of through a hard-coded /content path.
data = pd.read_csv('ml-25m/movies.csv')

In [6]:
# Preview the first rows of the movie table (movieId, title, genres).
data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
import re
def remove_pars(x):
  """Return `str(x)` with every '(' and ')' character deleted."""
  parenthesis = re.compile('[()]')
  return parenthesis.sub("", str(x))

In [8]:
# Parenthesis-free version of every movie title, in table order.
titles = list(map(remove_pars, data['title']))

In [9]:
# Spot-check the first ten cleaned titles.
titles[:10]

['Toy Story 1995',
 'Jumanji 1995',
 'Grumpier Old Men 1995',
 'Waiting to Exhale 1995',
 'Father of the Bride Part II 1995',
 'Heat 1995',
 'Sabrina 1995',
 'Tom and Huck 1995',
 'Sudden Death 1995',
 'GoldenEye 1995']

In [10]:
def remove_pipes(x):
  """Return `str(x)` with every '|' separator replaced by a single space.

  Args:
    x: Value to clean; it is converted with `str()` first, so non-string
      inputs are accepted.

  Returns:
    The cleaned string.
  """
  # A literal replacement needs no regex, which also removes the invalid
  # '\|' escape sequence the old non-raw pattern string contained.
  return str(x).replace('|', ' ')

In [11]:
# Space-separated genre string for every movie, in table order.
genres = list(map(remove_pipes, data['genres']))

In [12]:
def remove_nulls(a,b,i):
  """Build the text for item `i` from title `a[i]` and genres `b[i]`.

  The two strings are joined with a single space. The '(no genres listed)'
  placeholder is removed, and surrounding whitespace is stripped so that an
  item without genres does not end in a dangling space.

  Args:
    a: Sequence of title strings.
    b: Sequence of genre strings, aligned with `a`.
    i: Position of the item to build.

  Returns:
    The combined, cleaned string.
  """
  string_m = a[i] + ' ' + b[i]
  # Literal replacement: avoids the invalid '\(' / '\)' escapes of the old
  # non-raw regex pattern.
  return string_m.replace('(no genres listed)', '').strip()

In [13]:
# One "title genres" string per movie; these are the texts that get embedded.
input_string = list(map(lambda idx: remove_nulls(titles, genres, idx), range(len(genres))))

In [14]:
# Spot-check the first ten combined title/genre strings.
input_string[:10]

['Toy Story 1995 Adventure Animation Children Comedy Fantasy',
 'Jumanji 1995 Adventure Children Fantasy',
 'Grumpier Old Men 1995 Comedy Romance',
 'Waiting to Exhale 1995 Comedy Drama Romance',
 'Father of the Bride Part II 1995 Comedy',
 'Heat 1995 Action Crime Thriller',
 'Sabrina 1995 Comedy Romance',
 'Tom and Huck 1995 Adventure Children',
 'Sudden Death 1995 Action',
 'GoldenEye 1995 Action Adventure Thriller']

In [15]:
import torch
# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
model = model.to(device)

In [24]:
# Embed every movie string with the model.
# Texts are encoded in batches rather than one forward pass per title.
# Padding positions are masked by the tokenizer's attention mask, so each
# row's pooled output is the same as in a single-item pass (up to float noise).
BATCH_SIZE = 100        # texts per forward pass
PROGRESS_EVERY = 10000  # print a progress marker roughly every this many texts

embeddings_list = []
for start in range(0, len(input_string), BATCH_SIZE):
  batch_texts = input_string[start:start + BATCH_SIZE]
  encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=64, return_tensors='pt').to(device)
  with torch.inference_mode():
    model_output = model(**encoded_input)
  # L2-normalise each row (normalize defaults to p=2, dim=1).
  batch_embeddings = torch.nn.functional.normalize(model_output.pooler_output)
  # Keep one (1, hidden) tensor per movie, as before, but move it to the CPU
  # right away so results do not accumulate in GPU memory.
  embeddings_list.extend(batch_embeddings.cpu().split(1))
  # True for the first batch that starts at or just after each multiple of PROGRESS_EVERY.
  if start % PROGRESS_EVERY < BATCH_SIZE:
    print(start)

0
10000
20000
30000
40000
50000
60000


In [25]:
# Take the single row of each stored tensor and convert it to a NumPy vector.
embeddings_list_tensors = [tensor[0].cpu().numpy() for tensor in embeddings_list]

In [28]:
# Stack the per-movie vectors into one matrix (one row per movie) and persist
# it as CSV in the working directory.
embedding_matrix = np.vstack(embeddings_list_tensors)
embeddings = pd.DataFrame(embedding_matrix)
embeddings.to_csv('embedding_data.csv')

In [22]:
import os

# Prefer the CSV written to the working directory by this notebook; fall back
# to the copy on Google Drive (requires Drive to be mounted) when the
# embeddings were not recomputed in this session.
LOCAL_EMBEDDINGS_CSV = 'embedding_data.csv'
DRIVE_EMBEDDINGS_CSV = '/content/drive/MyDrive/embedding_data.csv'

embeddings_csv = LOCAL_EMBEDDINGS_CSV if os.path.exists(LOCAL_EMBEDDINGS_CSV) else DRIVE_EMBEDDINGS_CSV
embeddings = pd.read_csv(embeddings_csv, index_col=0)

In [23]:
# Preview the first rows of the loaded embedding table.
embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.006538,-0.042496,0.014313,0.023817,-0.056576,0.047191,0.004182,0.011531,-0.010112,-0.026587,...,-0.03509,-0.022659,-0.0171,-0.039019,-0.056206,-0.006634,-0.034283,-0.031559,0.035495,0.041886
1,-0.008588,-0.030542,0.001161,0.017724,-0.051122,0.065883,0.001737,0.023441,-0.029678,-0.048359,...,-0.001177,0.046209,0.021217,-0.040132,-0.066025,0.035727,-0.022926,-0.064467,0.030721,0.009855
2,-0.003954,-0.058009,0.000429,0.031793,-0.055129,0.041872,0.00737,0.004464,0.017399,-0.052856,...,-0.024539,0.047893,0.008946,-0.027328,-0.027161,0.045568,-0.020122,-0.057999,-0.041704,0.00874
3,0.015047,-0.061407,-0.016938,-0.013356,-0.024377,0.032822,-0.039581,0.018516,-0.003029,-0.022575,...,-0.047106,0.044167,-0.026389,-0.038091,-0.038319,0.03639,-0.037342,-0.063318,0.016386,0.024849
4,-0.012277,-0.061566,-0.006264,-0.001114,-0.03562,0.005099,-0.065697,0.063327,0.074066,-0.015134,...,-0.007887,0.035019,-0.001221,-0.014433,-0.050452,0.050904,-0.040659,-0.008459,0.023693,0.021617


# Approximate Nearest Neighbor Candidate Generation

In [30]:
!pip install tensorflow-recommenders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-recommenders
  Downloading tensorflow_recommenders-0.7.2-py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 4.1 MB/s 
Installing collected packages: tensorflow-recommenders
Successfully installed tensorflow-recommenders-0.7.2


In [34]:
!pip install scann

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scann
  Downloading scann-1.2.9-cp38-cp38-manylinux_2_27_x86_64.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 7.4 MB/s 
[?25hCollecting tensorflow~=2.11.0
  Downloading tensorflow-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[K     |████████████████████████████████| 588.3 MB 21 kB/s 
Collecting flatbuffers>=2.0
  Downloading flatbuffers-22.12.6-py2.py3-none-any.whl (26 kB)
Collecting tensorboard<2.12,>=2.11
  Downloading tensorboard-2.11.0-py3-none-any.whl (6.0 MB)
[K     |████████████████████████████████| 6.0 MB 53.8 MB/s 
Collecting tensorflow-estimator<2.12,>=2.11.0
  Downloading tensorflow_estimator-2.11.0-py2.py3-none-any.whl (439 kB)
[K     |████████████████████████████████| 439 kB 63.9 MB/s 
Collecting keras<2.12,>=2.11.0
  Downloading keras-2.11.0-py2.py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████

In [1]:
import tensorflow as tf 
import tensorflow_recommenders as tfrs

In [24]:
# Convert the embedding table to a float32 tensor; this is the candidate set
# handed to the ScaNN index.
item_tensor = tf.convert_to_tensor(embeddings, dtype=tf.float32)

In [25]:
# Number of neighbours returned per query: square root of the catalogue size.
num_candidates = round(np.sqrt(len(item_tensor)))

# Approximate nearest-neighbour layer over the movie embeddings.
scann = tfrs.layers.factorized_top_k.ScaNN(
    num_leaves=1000,
    num_leaves_to_search=100,
    k=num_candidates,
)
# No identifiers are passed, so lookups return row positions into item_tensor.
scann.index(item_tensor)

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7f3b90c35a60>

In [19]:
# Free-text query to embed and look up; the trailing #@param annotation is a Colab form-field marker.
search = "Comedy films for Christmas" #@param {type:"string"}

In [27]:
# Embed the search phrase the same way the movie strings were embedded:
# tokenize, take the pooled output, then L2-normalise.
encoded_input = tokenizer(search, padding=True, truncation=True, max_length=64, return_tensors = 'pt')
encoded_input = encoded_input.to(device)
with torch.inference_mode():
  model_output = model(**encoded_input)
query = torch.nn.functional.normalize(model_output.pooler_output)

In [28]:
# Look up the query's nearest neighbours in the ScaNN index.
query_vector = query.cpu().numpy()
test_case = scann(query_vector)

In [44]:
# First element of the lookup result: the scores. Show the ten best for the single query row.
test_case[0][0][0:10].numpy()

array([0.6197646 , 0.6063759 , 0.58653545, 0.5725466 , 0.57201517,
       0.5703147 , 0.56798476, 0.5673812 , 0.5666513 , 0.56088406],
      dtype=float32)

In [30]:
# Second element of the lookup result: row positions of the matches, used to pull the first ten movies from `data`.
data.iloc[test_case[1].numpy()[0]][0:10]

Unnamed: 0,movieId,title,genres
25351,123530,The Christmas Party (2009),Comedy
57672,196897,Christmas Crush (2012),Comedy|Drama
42922,164907,A Star for Christmas (2012),Comedy|Drama|Romance
45183,169804,Internet - O Filme (2017),Comedy
11969,56158,This Christmas (2007),Comedy|Drama|Romance
44336,168010,A Christmas Wish (2011),Children|Comedy|Drama
49948,179953,A Bad Moms Christmas (2017),Comedy
62143,208046,Täydellinen joulu (2019),Comedy
45311,170105,Saattokeikka (2017),Comedy|Drama
13587,70344,Cold Souls (2009),Comedy|Drama


In [45]:
def prediction(text:str, top_n:int=10):
  """Return the movies whose embeddings are closest to a free-text query.

  The query is tokenized, embedded and L2-normalised in the same way as the
  movie strings, then looked up in the ScaNN index.

  Args:
    text: Search phrase typed by the user.
    top_n: Maximum number of rows to return. Defaults to 10, the previously
      hard-coded value. The index returns a fixed number of candidates, so
      larger values are capped at that number.

  Returns:
    A slice of `data` with the matching movies, in the order returned by the
    index. A blank query yields an empty frame with the same columns.
  """
  # A blank query carries no information to embed; return no rows rather than
  # arbitrary neighbours.
  if text is None or not str(text).strip():
    return data.iloc[0:0]

  encoded_input = tokenizer(text, padding=True, truncation=True, max_length=64, return_tensors = 'pt').to(device)
  with torch.inference_mode():
    model_output = model(**encoded_input)
  query = torch.nn.functional.normalize(model_output.pooler_output)

  # The lookup returns (scores, row positions); keep the positions of the
  # single query row and trim them before touching the DataFrame.
  test_case = scann(np.array(query.cpu()))
  row_positions = test_case[1].numpy()[0][:top_n]
  return data.iloc[row_positions]

In [47]:
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.12.0-py3-none-any.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 8.9 MB/s 
[?25hCollecting httpx
  Downloading httpx-0.23.1-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 4.1 MB/s 
Collecting websockets>=10.0
  Downloading websockets-10.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 72.0 MB/s 
Collecting paramiko
  Downloading paramiko-2.12.0-py2.py3-none-any.whl (213 kB)
[K     |████████████████████████████████| 213 kB 72.7 MB/s 
[?25hCollecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting orjson
  Downloading orjson-3.8.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (278 kB)
[K     |████████████████████████████████| 278 kB 68.2 MB/s 
[?25hCollecting 

In [49]:
import gradio as gr

# Two-line text box that collects the user's search phrase.
textbox = gr.Textbox(label="Type your search here:", placeholder="Search for a film", lines=2)

# Wire the text box to `prediction` and render its result as a table.
demo = gr.Interface(fn=prediction, inputs=textbox, outputs="dataframe")
demo.launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

