In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
from nltk.corpus import stopwords
# import nltk; nltk.download('stopwords')  # Uncomment on the first run to fetch the stop-word corpus
import ipywidgets as widgets
from IPython.display import display

In [2]:
# Read the CSV into a DataFrame, keep the first occurrence of each title, and
# renumber the rows so the index stays contiguous if duplicates were dropped.
data = (
    pd.read_csv("video_info.csv")
    .drop_duplicates(subset='Title', keep="first")
    .reset_index(drop=True)
)

# Confirm no null and duplicate values
assert not data.isnull().any().any(), "Please review input csv file. Null values detected."

assert not data.duplicated(subset='Title').any(), "Remove duplicative values in the input csv file."

In [3]:
# Print a summary of the loaded frame: columns, non-null counts, and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   45 non-null     object
 1   Genre   45 non-null     object
 2   URL     45 non-null     object
dtypes: object(3)
memory usage: 1.2+ KB


In [4]:
# Stemming is currently disabled. NOTE(review): enabling the line below also
# needs `import nltk`, which the visible import cell does not provide.
# stemmer = nltk.SnowballStemmer("english")
# English stop words from NLTK, held in a set for the membership test in clean().
stopword=set(stopwords.words('english'))

def clean(text, stop_words=None):
    """Clean a show title.

    The title is title-cased, then bracketed segments, URLs, HTML-style tags,
    punctuation, newlines, and every token containing a digit are stripped.
    Remaining words that are stop words are dropped and the survivors are
    joined with single spaces.

    Args:
        text: Raw title; any value is accepted and coerced with ``str``.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stopword`` set is used.

    Returns:
        The cleaned, title-cased string (possibly empty).
    """
    if stop_words is None:
        stop_words = stopword

    text = str(text).title()
    text = re.sub(r'\[.*?\]', '', text)
    # The text is already title-cased ("Https://", "Www."), so the URL pattern
    # must ignore case or it would never match.
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Stop words are lowercase while the words here are title-cased, so the
    # comparison is done on the lowercased word. Empty strings left behind by
    # the substitutions above are discarded as well.
    words = [
        word for word in text.split(' ')
        if word and word.lower() not in stop_words
    ]
    return " ".join(words)
# Replace each raw title with its cleaned form.
data["Title"] = data["Title"].apply(clean)

# Text that gets vectorised: the cleaned title and its genre, separated by one space.
data["title_n_genre"] = data["Title"].str.cat(data["Genre"], sep=" ")

In [5]:
# Preview the first two rows, including the derived title_n_genre column.
data.head(2)

Unnamed: 0,Title,Genre,URL,title_n_genre
0,Must Watch Top New Special Comedy Video Amazin...,Comedy,https://www.youtube.com/watch?v=w72ZPyzhjFw,Must Watch Top New Special Comedy Video Amazin...
1,Must Watch Eid Special New Comedy Video Amazin...,Comedy,https://www.youtube.com/watch?v=tz4Ome72eOI,Must Watch Eid Special New Comedy Video Amazin...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Learn the vocabulary from the combined title/genre text, then encode that
# same text as the matrix that queries are compared against.
tfidf = vectorizer.fit(data["title_n_genre"]).transform(data["title_n_genre"])

In [7]:
def search(title, top_n=10):
    """Return the catalogue entries most similar to ``title``, best match first.

    The query is cleaned with ``clean``, vectorised with the fitted
    ``vectorizer``, and compared with every row of ``tfidf`` by cosine
    similarity.

    Args:
        title: Free-text query.
        top_n: Maximum number of rows to return (default 10). Values larger
            than the catalogue are clamped to its size.

    Returns:
        A DataFrame with columns ``Title``, ``Genre`` and ``URL``, ordered by
        descending similarity and re-indexed from 0.
    """
    title = clean(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp so a catalogue with fewer than top_n rows does not raise.
    top_n = max(0, min(top_n, similarity.size))
    # A full descending sort ranks the results; argpartition only selects the
    # top-k without ordering them. The stable sort keeps ties in catalogue order.
    indices = np.argsort(-similarity, kind="stable")[:top_n]
    results = data.iloc[indices].reset_index(drop=True)

    return results[['Title','Genre','URL']]

In [8]:
# Text box the user types the query into.
movie_input = widgets.Text(
    description='Video Title:',
    value='Enter Here',
    disabled=False,
)

# Output area that holds the most recent search results.
movie_list = widgets.Output()

def on_type(data):
    """Redraw the results area for a change notification from the text box.

    ``data`` is the change mapping passed by ``observe``; its ``"new"`` entry
    is used as the search query.
    """
    with movie_list:
        # Drop the previous results before showing the new ones.
        movie_list.clear_output()
        display(search(data["new"]))

# Call on_type whenever the text box's `value` trait changes.
movie_input.observe(on_type, names='value')


# Show the text box followed by the output area that on_type writes into.
display(movie_input, movie_list)

Text(value='Enter Here', description='Video Title:')

Output()