In [2]:
import os

import numpy as np
import pandas as pd
import moviepy.editor as mp
import glob
import speech_recognition as sr
import csv

## Qdrant client
from qdrant_client import QdrantClient
from qdrant_client.http import models

from transformers import AutoModel, AutoTokenizer
import torch

In [3]:
# specify your path
path = "content/"

In [4]:
# create directory
os.makedirs(path, exist_ok=True)

In [5]:
#directory containing video files
source_videos_file_path = r"content/videos/"

#directory for storing audio files
destination_audio_files_path = r"content/audios"

# CSV file for storing transcripts
csv_file_path = r"content/transcripts.csv"

# Create the destination directory if it doesn't exist
os.makedirs(destination_audio_files_path, exist_ok=True)

# Initialize recognizer class (for recognizing the speech)
r = sr.Recognizer()

# Open the CSV file in write mode
with open(csv_file_path, 'w', newline='') as csvfile:
    # Create a CSV writer
    writer = csv.writer(csvfile)
    # Write the header row
    writer.writerow(["Video File", "Transcript"])

    # Process video frame by frame
    for video_file in glob.glob(os.path.join(source_videos_file_path, '*.mp4')):
        # Convert video to audio
        video_clip = mp.VideoFileClip(video_file)
        audio_file_path = os.path.join(destination_audio_files_path, os.path.basename(video_file).replace("'", "").replace(" ", "_") + '.wav')
        video_clip.audio.write_audiofile(audio_file_path)

        # Transcribe audio to text
        with sr.AudioFile(audio_file_path) as source:
            # read the audio file
            audio_text = r.listen(source)
            # convert speech to text
            try:
                transcript = r.recognize_google(audio_text)
            except sr.UnknownValueError:
                print("Google Speech Recognition could not understand audio")
                transcript = "Error: Could not understand audio"
            except sr.RequestError as e:
                print("Could not request results from Google Speech Recognition service; {0}".format(e))
                transcript = "Error: Could not request results from Google Speech Recognition service; {0}".format(e)

        # Write the transcript to the CSV file
        writer.writerow([video_file, transcript])

MoviePy - Writing audio in content/audios/3_Body_Problem___Official_Trailer___Netflix.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/The_Ministry_Of_Ungentlemanly_Warfare_(2024)_Official_Trailer_-_Starring_Henry_Cavill.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/MOANA_Live_Action_-_Official_Trailer_(2024)_Zendaya,_Dwayne_Johnson___Disney_.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/The_Gentlemen___A_new_series_from_Guy_Ritchie_Official_Teaser___Netflix.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Heeramandi__The_Diamond_Bazaar___First_Look___Sanjay_Leela_Bhansali.mp4.wav


                                                                      

MoviePy - Done.
Google Speech Recognition could not understand audio
MoviePy - Writing audio in content/audios/Avatar__The_Last_Airbender___Official_Trailer___Netflix.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Miller’s_Girl_(2024)_Official_Trailer_-_Martin_Freeman,_Jenna_Ortega.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Despicable_Me_4___Official_Trailer.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Road_House_-_Official_Trailer___Prime_Video.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Griselda___Official_Trailer___Netflix.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Bridgerton_Season_2___Official_Teaser___Netflix.mp4.wav


                                                                     

MoviePy - Done.
MoviePy - Writing audio in content/audios/Gal_Gadot_and_Arnold_Schwarzenegger_Make_Action-Packed_Coffee___Nobody_Hits_Like_Netflix.mp4.wav


                                                                      

MoviePy - Done.
Google Speech Recognition could not understand audio
MoviePy - Writing audio in content/audios/MEGAMIND_2_The_Doom_Syndicate_Trailer_(2024).mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Ricky_Stanicky_-_Official_Trailer___Prime_Video.mp4.wav


                                                                      

MoviePy - Done.
Google Speech Recognition could not understand audio
MoviePy - Writing audio in content/audios/Godzilla_x_Kong___The_New_Empire___Official_Trailer.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Alexander__The_Making_of_a_God___Official_Trailer___Netflix.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/Spaceman___Official_Trailer___Netflix.mp4.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in content/audios/1899___Official_Teaser___Netflix.mp4.wav


                                                                     

MoviePy - Done.
MoviePy - Writing audio in content/audios/Kate_and_Anthonys_Story___Bridgerton___Netflix.mp4.wav


                                                                        

MoviePy - Done.
Could not request results from Google Speech Recognition service; recognition connection failed: [Errno 32] Broken pipe


In [6]:
data = pd.read_csv('content/transcripts.csv')
data.head()

Unnamed: 0,Video File,Transcript
0,content/videos/3 Body Problem _ Official Trail...,I must be 30 dead scientists in the past month...
1,content/videos/The Ministry Of Ungentlemanly W...,gas March Phillips I have a mission I want you...
2,content/videos/MOANA Live Action - Official Tr...,what Generations this peaceful Island has been...
3,content/videos/The Gentlemen _ A new series fr...,do you know what I admire about the British ar...
4,content/videos/Heeramandi_ The Diamond Bazaar ...,Error: Could not understand audio


In [7]:
## See where SpeechRecognition throws error
data = data[~data['Transcript'].str.startswith('Error')]
data.head()

Unnamed: 0,Video File,Transcript
0,content/videos/3 Body Problem _ Official Trail...,I must be 30 dead scientists in the past month...
1,content/videos/The Ministry Of Ungentlemanly W...,gas March Phillips I have a mission I want you...
2,content/videos/MOANA Live Action - Official Tr...,what Generations this peaceful Island has been...
3,content/videos/The Gentlemen _ A new series fr...,do you know what I admire about the British ar...
5,content/videos/Avatar_ The Last Airbender _ Of...,define nation is involved in the dark part in ...


In [8]:
## Initialize Qdrant client
client =  QdrantClient(":memory:")

## Create collection where vector embeddings wil be stored, with distances measured using cosine similarity search
my_collection = "text_collection"
client.recreate_collection(
    collection_name=my_collection,
    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE)
)

## Use pre-trained model to extract the embedding layer from dataset
device =  torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModel.from_pretrained("gpt2")#.to(device) # switch this for GPU

In [25]:
# Extract movie names and create a new column to we can know which embedding belongs to which movie
def extract_movie_name(file_path):
    file_name = file_path.split("/")[-1]  # Get the last part of the path
    movie_name = file_name.replace(".mp4", "").strip()
    return movie_name

# Apply the function to create the new column
data['Movie_Name'] = data['Video File'].apply(extract_movie_name)

# Display the DataFrame
data[['Video File', 'Movie_Name', 'Transcript']]


Unnamed: 0,Video File,Movie_Name,Transcript
0,content/videos/3 Body Problem _ Official Trail...,3 Body Problem _ Official Trailer _ Netflix,I must be 30 dead scientists in the past month...
1,content/videos/The Ministry Of Ungentlemanly W...,The Ministry Of Ungentlemanly Warfare (2024) O...,gas March Phillips I have a mission I want you...
2,content/videos/MOANA Live Action - Official Tr...,MOANA Live Action - Official Trailer (2024) Ze...,what Generations this peaceful Island has been...
3,content/videos/The Gentlemen _ A new series fr...,The Gentlemen _ A new series from Guy Ritchie ...,do you know what I admire about the British ar...
5,content/videos/Avatar_ The Last Airbender _ Of...,Avatar_ The Last Airbender _ Official Trailer ...,define nation is involved in the dark part in ...
6,content/videos/Miller’s Girl (2024) Official T...,Miller’s Girl (2024) Official Trailer - Martin...,what is an adult I'm 18 languishing in the wil...
7,content/videos/Despicable Me 4 _ Official Trai...,Despicable Me 4 _ Official Trailer,hello everybody something is missing this guy ...
8,content/videos/Road House - Official Trailer _...,Road House - Official Trailer _ Prime Video,before we start do you have insurance what you...
9,content/videos/Griselda _ Official Trailer _ N...,Griselda _ Official Trailer _ Netflix,for the last 3 years Griselda Blanco has owned...
10,content/videos/Bridgerton Season 2 _ Official ...,Bridgerton Season 2 _ Official Teaser _ Netflix,dearest gentle reader did you miss me as the m...


In [16]:
# Helper function to get embeddings for each movie trailer transcript
def get_embeddings(row):
    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    inputs = tokenizer(row['Transcript'], padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Disable gradient computation for the following operations.
    with torch.no_grad():
      outputs = model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()

    # Return the computed embeddings.
    return outputs

In [17]:
## Apply embedding function to our dataset
data['embeddings'] = data.apply(get_embeddings, axis=1)

## Save embeddings
np.save("vectors", np.array(data["embeddings"]))

In [18]:
data['embeddings']

0     [[-0.14311637, 0.081827395, -0.36023882, 0.162...
1     [[-0.12195916, -0.19920853, 0.08275758, 0.1441...
2     [[-0.12880684, 0.13759172, -0.35998973, 0.2543...
3     [[-0.24391694, -0.26156387, -0.24205457, 0.132...
5     [[-0.07681858, -0.09796931, -0.26582593, 0.071...
6     [[-0.005122041, 0.06611002, -0.1784917, 0.0797...
7     [[-0.10958139, -0.1089645, -0.18594687, 0.0468...
8     [[-0.123638, -0.14618993, -0.025281252, -0.141...
9     [[0.009990929, 0.01571223, 0.050433688, -0.081...
10    [[-0.0072631813, -0.109210595, -0.76424086, -0...
12    [[-0.14027737, -0.043337416, 0.1824955, 0.1192...
14    [[-0.16945867, 0.229838, -0.78439784, -0.01423...
15    [[-0.43779612, 0.08806487, -0.18701823, -0.076...
16    [[0.17439526, -0.09582436, -0.35456678, -0.018...
17    [[-0.070134774, -0.014191317, -0.5359613, 0.01...
Name: embeddings, dtype: object

In [19]:
## Create payload with metadata for each movie transcript
payload = data[['Transcript', 'Movie_Name', 'embeddings']].to_dict(orient="records")
payload

KeyError: "['Movie_Name'] not in index"

In [None]:
## Helper function for mean pooling for token embedding

# Set the expected size for the vector embeddings
expected_vector_size = 768

# Define a function for mean pooling of token embeddings
def mean_pooling(model_output, attention_mask):
    # Extract token embeddings from the model output
    token_embeddings = model_output[0]

    # Expand the attention mask to match the size of token embeddings
    input_mask_expanded = (attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float())

    # Calculate the sum of token embeddings, considering the attention mask
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)

    # Calculate the sum of the attention mask (clamped to avoid division by zero)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    # Return the mean-pooled embeddings
    return sum_embeddings / sum_mask

# Initialize a list to store text embeddings
text_embeddings = []

# Loop through each transcript in the 'Transcript' column of the 'data' variable
for transcript in data['Transcript']:
    # Tokenize the transcript, ensuring padding and truncation, and return PyTorch tensors
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    inputs = tokenizer(transcript, padding=True, truncation=True, max_length=128, return_tensors="pt")

    # Perform inference using the model with the tokenized inputs
    with torch.no_grad():
        embs = model(**inputs)

    # Calculate mean-pooled embeddings using the defined function
    embedding = mean_pooling(embs, inputs["attention_mask"])

    # Ensure the embeddings are of the correct size by trimming or padding
    embedding = embedding[:, :expected_vector_size]
    
    # Append the resulting embedding to the list
    text_embeddings.append(embedding)

In [None]:
## Assign each transcript an explicit ID within the  Qdrant database collection
## Create a list of ids and then upsert the combination of IDs, vectors and payloads

ids = list(range(len(data)))

# Convert PyTorch tensors to lists of floats
text_embeddings_list = [[float(num) for num in emb.numpy().flatten().tolist()[:expected_vector_size]] for emb in text_embeddings]

client.upsert(collection_name=my_collection,
              points=models.Batch(
                  ids=ids,
                  vectors=text_embeddings_list,
                  payloads=payload
                  )
              )

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
## Using a sentiment analysis model, generate a sentiment scort where sentiment polarity between -1 and 1 will be calculated
## A score of -1 will indicare negative sentiments
## A score of 0 will indicare neutral sentiment
## A score of 1 will indicare positive sentiment

from textblob import TextBlob

def calculate_sentiment_score(text):
    # Create a TextBlob object
    blob = TextBlob(text)

    # Get the sentiment polarity (-1 to 1, where -1 is negative, 0 is neutral, and 1 is positive)
    sentiment_score = blob.sentiment.polarity

    return sentiment_score

# Example usage:
text_example = data['Transcript'].iloc[10]
sentiment_score_example = calculate_sentiment_score(text_example)
print(f"Sentiment Score: {sentiment_score_example}")

Sentiment Score: 0.2004040404040404


In [None]:
## Apply sentiment helper function to data dataframe
data['Sentiment Score'] = data['Transcript'].apply(calculate_sentiment_score)
data.head()

Unnamed: 0,Video File,Transcript,Movie_Name,embeddings,Sentiment Score
0,content/videos/3 Body Problem _ Official Trail...,I must be 30 dead scientists in the past month...,3 Body Problem _ Official Trailer _ Netflix,"[[-0.14311637, 0.081827395, -0.36023882, 0.162...",0.061786
1,content/videos/The Ministry Of Ungentlemanly W...,gas March Phillips I have a mission I want you...,The Ministry Of Ungentlemanly Warfare (2024) O...,"[[-0.12388463, -0.2054558, 0.08573259, 0.14563...",0.094872
2,content/videos/MOANA Live Action - Official Tr...,what Generations this peaceful Island has been...,MOANA Live Action - Official Trailer (2024) Ze...,"[[-0.12880684, 0.13759172, -0.35998973, 0.2543...",0.374306
3,content/videos/The Gentlemen _ A new series fr...,do you know what I admire about the British ar...,The Gentlemen _ A new series from Guy Ritchie ...,"[[-0.24391694, -0.26156387, -0.24205457, 0.132...",0.1875
5,content/videos/Avatar_ The Last Airbender _ Of...,define nation is involved in the dark part in ...,Avatar_ The Last Airbender _ Official Trailer ...,"[[-0.07681858, -0.09796931, -0.26582593, 0.071...",0.117143


In [None]:
## Take the average of the vector embeddings of each movie transcript and combne it with sentiment score and we get the final opinion score
data['avg_embeddings'] = data['embeddings'].apply(lambda x: np.mean(x, axis=0))
data['Opinion_Score'] = 0.7 * data['avg_embeddings'] + 0.3 * data['Sentiment Score']
data['Opinion_Score']

0     [-0.08164574, 0.07581489, -0.23363145, 0.13234...
1     [-0.058257706, -0.11535751, 0.08847435, 0.1304...
2     [0.022126876, 0.20860586, -0.13970116, 0.29034...
3     [-0.11449186, -0.1268447, -0.1131882, 0.149115...
5     [-0.018630147, -0.033435654, -0.15093529, 0.08...
6     [0.06541457, 0.115277015, -0.05594419, 0.12482...
7     [0.0068644583, 0.007296279, -0.04659138, 0.116...
8     [-0.05581608, -0.071602434, 0.013033643, -0.06...
9     [0.046148412, 0.050153323, 0.074458346, -0.017...
10    [0.09991577, 0.028552584, -0.42996863, 0.03016...
12    [-0.038072947, 0.029785022, 0.18786806, 0.1436...
14    [-0.028621063, 0.25088662, -0.45907846, 0.0800...
15    [-0.26218605, 0.108838774, -0.11150808, -0.018...
16    [0.17307669, -0.01607705, -0.19719675, 0.03807...
17    [0.0034056567, 0.042566076, -0.32267287, 0.060...
Name: Opinion_Score, dtype: object

In [None]:
## Movie recommender function
def get_recommendations(movie_name):
    # Find the row corresponding to the given movie name
    query_row = data[data['Movie_Name'] == movie_name]

    if not query_row.empty:
      # Convert the 'Opinion_Score' column to a NumPy array
      opinion_scores_array = np.array(data['Opinion_Score'].tolist())
      # Upsert the 'Opinion_Score' vectors to the Qdrant collection
      opinion_scores_ids = list(range(len(data)))
      # Convert the 'Opinion_Score' array to a list of lists
      opinion_scores_list = opinion_scores_array.reshape(-1, 1).tolist()

      client.upsert(
          collection_name=my_collection,
          points=models.Batch(
              ids=opinion_scores_ids,
              vectors=opinion_scores_list
              )
          )
      # Define a query vector based on the opinion score you want to find similar movies for
      query_opinion_score = np.array([0.8] * 768)  # Adjust as needed

      # Perform a similarity search
      search_results = client.search(
          collection_name=my_collection,
          query_vector=query_opinion_score.tolist(),
          limit=3)

       # Extract movie recommendations from search results
      recommended_movie_ids = [result.id for result in search_results]
      recommended_movies = data.loc[data.index.isin(recommended_movie_ids)]

      # Display recommended movies
      print("Recommended Movies:")
      print(recommended_movies[['Movie_Name', 'Opinion_Score']])
    else:
      print(f"Movie '{movie_name}' not found in the dataset.")

# Example usage:
get_recommendations("Miller’s Girl (2024) Official Trailer - Martin Freeman, Jenna Ortega")

Recommended Movies:
                                           Movie_Name  \
10    Bridgerton Season 2 _ Official Teaser _ Netflix   
14  Godzilla x Kong _ The New Empire _ Official Tr...   

                                        Opinion_Score  
10  [0.09991577, 0.028552584, -0.42996863, 0.03016...  
14  [-0.028621063, 0.25088662, -0.45907846, 0.0800...  
