****Important – Do not use in production, for demonstration purposes only – please review the legal notices before continuing****

# Building a Video Recommender Model

This notebook builds a video recommendation model using video transcripts.

<iframe width="1280" height="720" src="https://www.videoindexer.ai/embed/player/c1928055-b882-4ec3-916c-eebcd801ac98/bcc23802ee/?&locale=en&location=eastus" frameborder="0" allowfullscreen></iframe>

#### Importing required libraries

In [1]:
import os
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import find
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json
from nltk.stem import WordNetLemmatizer 
import pandas as pd

#### Downloading required datasets for cleaning our data

In [2]:
# One-time setup: fetch the NLTK resources this notebook downloads.
# Running it again is harmless; packages already present are reported as up-to-date.
for _nltk_package in ("stopwords", "punkt", "wordnet", "averaged_perceptron_tagger"):
    nltk.download(_nltk_package)

# The transcript JSON files are read from a `video_files` folder located in
# the current working directory.
transcripts_location = os.path.join(os.getcwd(), 'video_files')
transcripts_location

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


'/mnt/batch/tasks/shared/LS_root/mounts/clusters/mlw-retailprodcompute/code/Users/dreamdemoretail2.0/Notebooks/video_files'

#### Define function for tokenizing documents

In [3]:
def tokenize(text):
    """Split ``text`` into tokens and reduce each token to its Porter stem.

    Args:
        text: The string to tokenize.

    Returns:
        A list holding the stem of every token produced by
        ``nltk.word_tokenize(text)``, in the original token order.
    """
    # Build the stemmer once per call rather than once per token.
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

#### Indexing all files with video data 

In [4]:
# Names of the files sitting directly inside the transcripts folder
# (sub-directories are not descended into).
_,_,file_names = next(os.walk(transcripts_location))

transcript_files = {}      # file name -> list of whitespace-separated words
raw_text = []              # one word list per successfully loaded video
file_index_mapping = {}    # video_id  -> file name
original_transcript = []   # untouched transcript string per loaded video
ids = []                   # video_id per loaded video

# raw_text, original_transcript and ids are parallel lists: position i in each
# of them must describe the same video, because later cells index them together.

# Going over all files in the chosen directory
for fname in file_names:
    json_file = os.path.join(transcripts_location, fname)
    try:
        # JSON is UTF-8 by specification, so do not rely on the platform default.
        with open(json_file, encoding='utf-8') as f:
            json_obj = json.load(f)

        # Read every required field BEFORE touching the shared containers, so
        # a file that fails half-way cannot leave the parallel lists misaligned.
        text = json_obj['transcript']
        video_id = json_obj['video_id']
        words = text.split()
    except (OSError, ValueError, KeyError, TypeError, AttributeError):
        # Unreadable file, invalid JSON / encoding, missing key, or unexpected
        # structure: report the file name and skip it.
        print(fname)
        continue

    transcript_files[fname] = words
    raw_text.append(words)
    original_transcript.append(text)
    file_index_mapping[video_id] = fname
    ids.append(video_id)

.amlignore
.amlignore.amltmp


#### Remove stopwords from the corpus

In [5]:
stopwords_english = stopwords.words('english')
# Set copy of the stop-word list for constant-time membership tests.
_stopword_lookup = set(stopwords_english)

# Removing stop words from video transcripts
#
# The cleaned text is rebuilt from `original_transcript` every time, so this
# cell gives the same result when it is re-run (it no longer consumes and then
# overwrites the word lists stored in `raw_text`).
_cleaned_transcripts = []
for _video_id, _transcript in zip(ids, original_transcript):
    _words = [word.lower() for word in _transcript.split()]
    # `word not in string.punctuation` is a substring test against the
    # punctuation string: it drops tokens that are themselves punctuation, but
    # leaves punctuation that is attached to a word (e.g. "hello,") in place.
    _words = [
        word for word in _words
        if word not in string.punctuation and word not in _stopword_lookup
    ]
    transcript_files[file_index_mapping[_video_id]] = _words
    _cleaned_transcripts.append(' '.join(_words))

# One cleaned, space-joined string per video, in the same order as `ids`.
raw_text = _cleaned_transcripts


#### Building the model using video transcript data

In [6]:
# Turn every cleaned transcript into a TF-IDF vector, using the notebook's
# own `tokenize` function to split and stem the text.
tfidf = TfidfVectorizer(tokenizer=tokenize)
video_tfidf = tfidf.fit_transform(raw_text)

# Cosine similarity of every transcript against every other transcript; when
# only one matrix is supplied it is compared with itself.
cos_sim = cosine_similarity(video_tfidf)
cos_sim.shape

(9, 9)

#### Defining the recommender function

In [7]:
def recomender(index, similarity=None, topk=5):
    """Return the files of the ``topk`` videos most similar to one video.

    Args:
        index: Position of the query video in ``ids`` (and therefore its row
            in the similarity matrix). Negative positions count from the end.
        similarity: Square matrix of pairwise similarity scores whose rows and
            columns follow the order of ``ids``. When omitted, the notebook's
            current ``cos_sim`` matrix is used.
        topk: Maximum number of recommendations to return.

    Returns:
        A list of file names taken from ``file_index_mapping``, ordered from
        most to least similar. The query video itself is never included. The
        list is shorter than ``topk`` when fewer other videos exist.

    Raises:
        IndexError: If ``index`` is outside the similarity matrix.
    """
    if similarity is None:
        # Resolved at call time so a recomputed `cos_sim` is picked up.
        similarity = cos_sim

    scores = np.asarray(similarity)[index]
    # Normalise a negative position so it can be compared with argsort output.
    query = range(len(scores))[index]

    # Highest score first; a stable sort keeps tied videos in `ids` order.
    ranked = np.argsort(-1 * scores, kind="stable")

    # Exclude the query explicitly instead of assuming it is ranked first:
    # a tie or a zero self-similarity can push it further down the ranking.
    return [file_index_mapping[ids[i]] for i in ranked if i != query][:topk]


#### Testing recommender model

In [8]:
# Video whose neighbours we want to look up.
id_to_search = "07c796c0ba"    # Change this to change the video being searched

# The recommender works on list positions, so translate the id into its
# position within `ids`.
index_in_list = ids.index(id_to_search)
print("Video that is being used to recommend: ", file_index_mapping[id_to_search])

# File names of the most similar videos (the query video itself is left out).
recomendations = recomender(index=index_in_list)
recomendations

Video that is being used to recommend:  07c796c0ba.json


['09427df377.json',
 '168bac451a.json',
 'e376c3dc1c.json',
 '88a3d79c82.json',
 'bcc23802ee.json']

#### Exporting model to a file

In [9]:
# Write the similarity matrix to disk: one column per video, labelled with its
# video id, and no index column.
pd.DataFrame(cos_sim, columns=ids).to_csv(
    'similarity.csv',
    index=False,
)