In [None]:
!pip install tensorflow_recommenders scann uuid librosa ffmpeg-python



In [None]:
!pip install git+https://github.com/openai/whisper.git

from google.colab import drive
drive.mount('/content/drive')

!cp /content/drive/MyDrive/Ahana /content/ -r
!unzip /content/Ahana/data.zip -d /content/ml-25m/
!unzip /content/Ahana/1.zip -d /content/Labse/
!unzip /content/Ahana/2.zip -d /content/Labse/

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-vvujh3h0
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-vvujh3h0
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archive:  /content/Ahana/data.zip
replace /content/ml-25m/ml-25m/ml-25m/genome-scores.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers

# Default width of the user and movie embedding vectors.
EMBEDDING_SIZE = 128

@keras.utils.register_keras_serializable(package="MyLayers")
class RecommenderNet(keras.Model):
    """Embedding-based collaborative-filtering scorer.

    Each user and each movie gets an embedding vector and a scalar bias. The
    model output is ``sigmoid(interaction + user_bias + movie_bias)``.

    Args:
        num_users: Number of rows in the user embedding/bias tables.
        num_movies: Number of rows in the movie embedding/bias tables.
        embedding_size: Width of the user and movie embedding vectors.
        **kwargs: Forwarded to ``keras.Model`` (e.g. ``name``).

    Calling ``RecommenderNet()`` with no arguments builds exactly the same
    tables as before (200000 x 128 for both users and movies), so weights
    saved from the previous definition still fit.
    """

    def __init__(self, num_users=200000, num_movies=200000,
                 embedding_size=EMBEDDING_SIZE, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_movies = num_movies
        self.embedding_size = embedding_size
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.user_bias = layers.Embedding(num_users, 1)
        self.movie_embedding = layers.Embedding(
            num_movies,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        self.movie_bias = layers.Embedding(num_movies, 1)

    def call(self, inputs):
        """Score a batch of (user index, movie index) pairs.

        Args:
            inputs: 2-D tensor; column 0 is looked up in the user tables and
                column 1 in the movie tables.

        Returns:
            Tensor of sigmoid-squashed scores in (0, 1).
        """
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        movie_vector = self.movie_embedding(inputs[:, 1])
        movie_bias = self.movie_bias(inputs[:, 1])
        # NOTE(review): with axes=2 on two rank-2 tensors, tensordot contracts
        # BOTH axes, i.e. it yields a single scalar for the whole batch rather
        # than one dot product per row. Left untouched because the weights
        # loaded elsewhere were presumably trained with this exact form -
        # confirm (and retrain) before switching to a per-row product.
        dot_user_movie = tf.tensordot(user_vector, movie_vector, 2)
        # Add all the components (including bias)
        x = dot_user_movie + user_bias + movie_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)

    def get_config(self):
        """Return the constructor arguments so the model can be re-created."""
        config = super().get_config()
        config.update(
            {
                "num_users": self.num_users,
                "num_movies": self.num_movies,
                "embedding_size": self.embedding_size,
            }
        )
        return config

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import tensorflow_recommenders as tfrs
import tensorflow as tf
from tensorflow import keras

class PersonalisedSearcher():
    """Text search over movies, re-ranked per user by a collaborative filter.

    A free-text query is embedded with a transformer encoder, the nearest
    item embeddings are retrieved with a ScaNN index, movies the user has
    already rated are dropped, and the rest are ordered by the score that
    ``RecommenderNet`` predicts for that user.
    """

    def __init__(self,
                 weights_path='/content/Ahana/CF_Final.keras',
                 movies_path="/content/ml-25m/ml-25m/ml-25m/movies.csv",
                 ratings_path="/content/ml-25m/ml-25m/ml-25m/ratings.csv",
                 embeddings_path="/content/Ahana/data.csv",
                 labse_path="./Labse"):
        """Load the models, data frames and nearest-neighbour index.

        All arguments default to the locations the notebook originally
        hard-coded, so ``PersonalisedSearcher()`` behaves as before.

        Args:
            weights_path: Weights file for ``RecommenderNet``.
            movies_path: CSV with at least ``movieId``, ``title``, ``genres``.
            ratings_path: CSV with at least ``userId`` and ``movieId``.
            embeddings_path: CSV of item embeddings (first column is the index).
            labse_path: Directory holding the query encoder and its tokenizer.
        """
        self.recommender = RecommenderNet()
        self.recommender.load_weights(weights_path)
        self.movies = pd.read_csv(movies_path)
        self.ratings = pd.read_csv(ratings_path)
        self.embeddings = pd.read_csv(embeddings_path, index_col=0)
        self.item_tensor = tf.convert_to_tensor(self.embeddings, dtype=tf.float32)
        # The index returns sqrt(N) neighbours per query.
        self.scann = tfrs.layers.factorized_top_k.ScaNN(
            num_leaves=1000,
            num_leaves_to_search=100,
            k=round(np.sqrt(len(self.item_tensor))),
        )
        self.scann.index(self.item_tensor)
        self.model = AutoModel.from_pretrained(labse_path)
        self.tokenizer = AutoTokenizer.from_pretrained(labse_path)

    def get_user_encodings(self):
        """Return ``(userId -> index, index -> userId)`` dictionaries.

        Indices follow the order of first appearance in ``self.ratings``.
        """
        user_ids = self.ratings["userId"].unique().tolist()
        user2user_encoded = {x: i for i, x in enumerate(user_ids)}
        userencoded2user = {i: x for i, x in enumerate(user_ids)}

        return user2user_encoded, userencoded2user

    def get_movie_encodings(self):
        """Return ``(movieId -> index, index -> movieId)`` dictionaries.

        Indices follow the order of first appearance in ``self.ratings``.
        """
        movie_ids = self.ratings["movieId"].unique().tolist()
        movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)}
        movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)}

        return movie2movie_encoded, movie_encoded2movie

    def update_ratings(self):
        """Add encoded ``user`` and ``movie`` columns to ``self.ratings``.

        The frame is modified in place and also returned.
        """
        user2user_encoded, _ = self.get_user_encodings()
        movie2movie_encoded, _ = self.get_movie_encodings()
        self.ratings["user"] = self.ratings["userId"].map(user2user_encoded)
        self.ratings["movie"] = self.ratings["movieId"].map(movie2movie_encoded)

        return self.ratings

    def get_user_history(self, user_id):
        """Return the rating rows of ``user_id`` (with encoded columns).

        Previously the rows were computed but never returned, so every caller
        received ``None``.
        """
        df = self.update_ratings()
        watched_movies = df[df.userId == user_id]
        return watched_movies

    def get_candidate_movies(self, query, num_candidates=11):
        """Retrieve the movies whose embeddings are nearest to ``query``.

        Args:
            query: Free-text search string.
            num_candidates: How many of the retrieved neighbours to keep
                (defaults to the previously hard-coded 11).

        Returns:
            Slice of ``self.movies`` with at most ``num_candidates`` rows.
        """
        encoded_input = self.tokenizer(query,
                                  padding=True,
                                  truncation=True,
                                  max_length=64,
                                  return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
        query_embeddings = model_output.pooler_output
        query_embeddings = torch.nn.functional.normalize(query_embeddings)
        test_case = self.scann(np.array(query_embeddings))
        # NOTE(review): the neighbour positions index rows of the embeddings
        # table but are applied to self.movies - this assumes both files list
        # the movies in the same order. TODO confirm.
        return self.movies.iloc[test_case[1].numpy()[0]][0:num_candidates]

    def filter_candidates(self, user_id, query):
        """Build the recommender input for the user's unseen candidates.

        Args:
            user_id: Raw ``userId`` as found in the ratings file.
            query: Free-text search string.

        Returns:
            Tuple ``(movie_array, movies_not_watched, movies_watched_by_user)``:
            an ``(n, 2)`` array of ``[user index, movie index]`` rows, the
            list of ``[movie index]`` entries in the same order, and the
            user's rating rows.

        Raises:
            ValueError: If ``user_id`` has no ratings, because there is no
                embedding row to score with.
        """
        user2user_encoded, _ = self.get_user_encodings()
        user_encoder = user2user_encoded.get(user_id)
        if user_encoder is None:
            # Fail early with a clear message instead of feeding None to predict().
            raise ValueError(f"Unknown user id: {user_id!r}")

        movies_watched_by_user = self.ratings[self.ratings.userId == user_id]
        candidates = self.get_candidate_movies(query)
        unwatched_ids = candidates[
            ~candidates["movieId"].isin(movies_watched_by_user.movieId.values)
        ]["movieId"].tolist()
        movie2movie_encoded, _ = self.get_movie_encodings()
        # De-duplicate while keeping retrieval order (a set would scramble it)
        # and drop movies the recommender has no embedding row for.
        movies_not_watched = [
            [movie2movie_encoded[movie_id]]
            for movie_id in dict.fromkeys(unwatched_ids)
            if movie_id in movie2movie_encoded
        ]
        if movies_not_watched:
            movie_array = np.hstack(
                ([[user_encoder]] * len(movies_not_watched), movies_not_watched)
            )
        else:
            # Keep the (n, 2) layout even when nothing is left to score.
            movie_array = np.empty((0, 2), dtype=np.int64)

        return movie_array, movies_not_watched, movies_watched_by_user

    def personalised_search(self, user_id, query):
        """Rank the unseen candidates for ``user_id`` by predicted score.

        Returns:
            Tuple ``(recommended_movie_ids, movies_watched_by_user)`` where the
            ids (at most 10) are ordered best first. The list is empty when
            every candidate has already been rated by the user.
        """
        movie_array, movies_not_watched, movies_watched_by_user = self.filter_candidates(user_id, query)
        if len(movie_array) == 0:
            return [], movies_watched_by_user
        scored_items = self.recommender.predict(movie_array).flatten()
        top_rated = scored_items.argsort()[-10:][::-1]
        _, movie_encoded2movie = self.get_movie_encodings()
        recommended_movie_ids = [movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_rated]

        return recommended_movie_ids, movies_watched_by_user

    def print_recs(self, user_id, query):
        """Print the recommendations and return them as row tuples.

        Rows are printed and returned best first; previously the ``isin``
        filter silently put them back into file order.

        Returns:
            Iterator of named tuples (``movieId``, ``title``, ``genres``, ...).
        """
        recommendations, _ = self.personalised_search(user_id, query)

        print("Showing Top movie recommendations for user:")
        print("====" * 9)
        recommended_movies = self.movies[self.movies["movieId"].isin(recommendations)]
        if not recommended_movies.empty:
            rank = {movie_id: position for position, movie_id in enumerate(recommendations)}
            recommended_movies = recommended_movies.sort_values(
                by="movieId", key=lambda ids: ids.map(rank)
            )
        for row in recommended_movies.itertuples():
            print(row.title, "\t:\t", row.genres)
        return recommended_movies.itertuples()

In [None]:
!pip install gtts
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import whisper
import gtts
import uuid
from IPython.display import Audio, display
import wave
import librosa


# JavaScript that defines `record(time)` in the notebook output frame: it
# captures microphone audio for `time` milliseconds and resolves to the
# recording encoded as a data URL.
# Changes from the earlier snippet: every local is declared (no implicit
# globals), the stop handler is registered before recording starts, and the
# microphone tracks are stopped afterwards so the stream is released instead
# of staying open after each call.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.onstop = async ()=>{
    stream.getTracks().forEach(track => track.stop())
    const blob = new Blob(chunks)
    const text = await b2text(blob)
    resolve(text)
  }
  recorder.start()
  await sleep(time)
  recorder.stop()
})
"""

def record(sec=7):
  """Capture microphone audio in the browser and store it on disk.

  Injects the RECORD script, asks it to record for `sec` seconds, decodes the
  base64 payload of the returned data URL and writes the bytes to
  'audio.wav'.

  Returns:
    The path of the written file ('audio.wav').
  """
  display(Javascript(RECORD))
  data_url = output.eval_js('record(%d)' % (sec*1000))
  # A data URL looks like "<header>,<base64>"; only the payload is needed.
  raw_audio = b64decode(data_url.split(',')[1])
  target = 'audio.wav'
  with open(target,'wb') as sink:
    sink.write(raw_audio)
  return target

def transcribe(audio,model):
  """Run speech-to-text on `audio` with `model` and return lower-cased text.

  `model` must expose `transcribe(audio, language=...)` returning a mapping
  with a 'text' entry; the language is fixed to English.
  """
  transcript = model.transcribe(audio,language="English")['text']
  return transcript.lower()

def _clean_query(query, wakeword):
    """Lower-case, strip wake word/punctuation/stop words, then lemmatize."""
    query = query.lower()
    wakeword = wakeword.lower().strip()
    if wakeword:
        # Match the wake word as a whole word only; a plain substring replace
        # would also eat it out of longer words (e.g. "hello" in "othello").
        query = re.sub(r"\b" + re.escape(wakeword) + r"\b", " ", query)
    query = query.translate(str.maketrans("", "", string.punctuation))
    tokens = nltk.word_tokenize(query)
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


def preprocess_query(query,wakeword):
    """Normalise a transcribed query before it is embedded.

    Args:
        query: Raw transcription.
        wakeword: Wake word to remove from the query; pass "" to remove
            nothing.

    Returns:
        Space-joined lemmatized tokens with punctuation and English stop
        words removed.

    The tokenizer, stop-word list and lemmatizer need NLTK data that a fresh
    runtime does not have. If a lookup fails the data is downloaded once and
    the query is processed again, so the notebook works after a restart.
    """
    try:
        return _clean_query(query, wakeword)
    except LookupError:
        # "punkt_tab" is what newer NLTK releases look for, "punkt" the older
        # ones; requesting a name a given release does not know is harmless.
        for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
            nltk.download(resource, quiet=True)
        return _clean_query(query, wakeword)

def textToSpeech(text):
    """Speak `text` in the notebook and block until playback should be over.

    The speech is synthesized with gTTS, saved under ./audios with a unique
    name, played through an autoplaying audio widget, and the function then
    sleeps for the clip's duration so later output does not talk over it.

    Args:
        text: Sentence(s) to speak, in English.
    """
    # Imported here so the function does not rely on a later cell having
    # imported `time` before it is first called.
    import os
    import time

    audio_dir = "./audios"
    # Do not depend on another cell having created the directory.
    os.makedirs(audio_dir, exist_ok=True)

    tts = gtts.gTTS(text=text, lang='en')
    # gTTS writes MP3 data, so give the file the matching extension.
    path = os.path.join(audio_dir, str(uuid.uuid1()) + '.mp3')
    tts.save(path)
    display(Audio(path, autoplay=True))
    try:
        duration = librosa.get_duration(path=path)
    except TypeError:
        # Older librosa releases only know the `filename` keyword.
        duration = librosa.get_duration(filename=path)
    time.sleep(duration)


In [None]:
# Build the two heavyweight objects once; the listening loop reuses them
# through the global names `searcher` and `model`.
print("Initializing Recommender Model")
searcher = PersonalisedSearcher()

print("Initializing Whisper Model")
model = whisper.load_model("base", in_memory=True)


In [None]:
import warnings
import time
import random

!rm ./audios -r
!mkdir audios

warnings.filterwarnings('ignore')
wakeword="hello"
record_time=2
query_record_time=7


while True:
  print(f"\n<<Listening for wake word 'hello'>>\n")
  time.sleep(1)
  audio=record(record_time)
  text=transcribe(audio,model)
  if text.count(wakeword) > 0:
      display(Audio("./Ahana/hello.wav", autoplay=True))
      time.sleep(4)
      print(f"<<Now Listening for Query for {query_record_time} seconds>>")
      audio=record(query_record_time)
      print("Now Transcribing")
      text=transcribe(audio,model)
      print("\n"+text+"\n")
      print("Now Preprocessing")
      query=preprocess_query(text,"")
      textToSpeech("Finding your movies. Please be patient")
      recommendations=searcher.print_recs(random.randrange(20, 50, 3),query)
      rec_text="Here are some movies you might like."
      for movie in recommendations:
        rec_text+=" ".join(movie.title.split(" ")[:-1])+". "
      print("\n\n")
      textToSpeech(rec_text)
  else:
      print("Wakeword Not Detected")