In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
import string
import re
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords') # Uncomment to download for initial run

In [2]:
# Read into CSV
data = pd.read_csv("video_info.csv")

# Confirm no null values
assert (data.isnull().sum() == 0).all(
) == True, "Please review input csv file. Null values detected."


In [3]:
data

Unnamed: 0,Title,Genre,URL
0,"Im Nervous, Insecure and Squishy - Mark Norman...",Comedy,https://www.youtube.com/watch?v=E3KN_Mhq3UQ
1,Insurance Companies vs God with Ricky Gervais ...,Comedy,https://www.youtube.com/watch?v=z-2-Oru9sR0
2,Must Watch Challenging New Comedy Video 2023 ...,Comedy,https://www.youtube.com/watch?v=dSqWmvLXDsA
3,Funniest Fun Amazing videos must Entertainment...,Comedy,https://www.youtube.com/watch?v=zVsgBdV36ts
4,Baby Police & Armed Robbers,Comedy,https://www.youtube.com/watch?v=I73D3lQU1f0
...,...,...,...
146,3 TRUE Scary Boating Horror Stories,Horror,https://www.youtube.com/watch?v=5AIqIzANSls
147,10 MORE Horror Movies You Must Never Pause,Horror,https://www.youtube.com/watch?v=KB-XnNRbyN4
148,Patrick Star the HORROR GAME! (Full Game),Horror,https://www.youtube.com/watch?v=3R7C4upbmZM
149,Transformation || Silent Horror || #viral #com...,Horror,https://www.youtube.com/watch?v=3CJCv8qRNt8


In [4]:
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

def clean(text):
    "Clean show titles."
    text = str(text).title()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["Title"] = data["Title"].apply(clean)

In [5]:
data

Unnamed: 0,Title,Genre,URL
0,im nervous insecur and squishi mark normand ...,Comedy,https://www.youtube.com/watch?v=E3KN_Mhq3UQ
1,insur compani vs god with ricki gervai univer...,Comedy,https://www.youtube.com/watch?v=z-2-Oru9sR0
2,must watch challeng new comedi video nonstop...,Comedy,https://www.youtube.com/watch?v=dSqWmvLXDsA
3,funniest fun amaz video must entertain comedi ...,Comedy,https://www.youtube.com/watch?v=zVsgBdV36ts
4,babi polic arm robber,Comedy,https://www.youtube.com/watch?v=I73D3lQU1f0
...,...,...,...
146,true scari boat horror stori,Horror,https://www.youtube.com/watch?v=5AIqIzANSls
147,more horror movi you must never paus,Horror,https://www.youtube.com/watch?v=KB-XnNRbyN4
148,patrick star the horror game full game,Horror,https://www.youtube.com/watch?v=3R7C4upbmZM
149,transform silent horror viral comic short fy...,Horror,https://www.youtube.com/watch?v=3CJCv8qRNt8


In [6]:
feature = data["Genre"].tolist()
tfidf = text.TfidfVectorizer(input=feature, stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix)

InvalidParameterError: The 'input' parameter of TfidfVectorizer must be a str among {'file', 'filename', 'content'}. Got ['Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Comedy', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Drama', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Action', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror', 'Horror'] instead.

In [7]:
feature = data["Genre"].tolist()
tfidf = text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix)

In [13]:
indices = pd.Series(data.index, 
                    index=data['Title']).drop_duplicates()


In [18]:
indices

Title
im nervous insecur and squishi  mark normand  full special                                  0
insur compani vs god with ricki gervai  univers comedi                                      1
must watch challeng new comedi video   nonstop maha funniest video episod  by topfuntv      2
funniest fun amaz video must entertain comedi  to not laugh episod  by my famili            3
babi polic  arm robber                                                                      4
                                                                                         ... 
 true scari boat horror stori                                                             146
 more horror movi you must never paus                                                     147
patrick star the horror game full game                                                    148
transform  silent horror  viral comic short fyp silenthorrorstori                         149
horror game charact rank by how christian they are    

In [16]:
indices["babi polic  arm robber"]

4

In [17]:
def netFlix_recommendation(title, similarity = similarity):
    index = indices[title]
    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[0:10]
    movieindices = [i[0] for i in similarity_scores]
    return data['Title'].iloc[movieindices]

print(netFlix_recommendation("babi polic  arm robber"))

0    im nervous insecur and squishi  mark normand  ...
1    insur compani vs god with ricki gervai  univer...
2    must watch challeng new comedi video   nonstop...
3    funniest fun amaz video must entertain comedi ...
4                               babi polic  arm robber
5     accident tri crack  will foskey  stand up comedi
6              matt rife  onli fan full comedi special
7    i look like a senat nephew  andi hayn  full sp...
8       comput witch  live with dad  mark angel comedi
9    credit card affair  two and a half men  comedi...
Name: Title, dtype: object
