# Youtube Recommender System

### Group Meeting: 12/3/2019

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

import pandas as pd
import nltk

from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

import json

import warnings
warnings.filterwarnings('ignore')

In [2]:
us_video_data = pd.read_csv("dataset/USvideos.csv")
ca_video_data = pd.read_csv("dataset/CAvideos.csv")
gb_video_data = pd.read_csv("dataset/GBvideos.csv")

video_data = us_video_data.append(ca_video_data, ignore_index=True)
video_data = video_data.append(gb_video_data, ignore_index=True)
video_data.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


In [3]:
len(video_data)

120746

In [4]:
clean_vid_data = video_data.groupby(["title"]).first().reset_index()

In [5]:
clean_vid_data.head(5)

Unnamed: 0,title,video_id,trending_date,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,!! THIS VIDEO IS NOTHING BUT PAIN !! | Getting...,PNn8sECd7io,18.04.01,Markiplier,20,2018-01-03T19:33:53.000Z,"getting over it|""markiplier""|""funny moments""|""...",835930,47058,1023,8250,https://i.ytimg.com/vi/PNn8sECd7io/default.jpg,False,False,False,Getting Over It continues with RAGE BEYOND ALL...
1,"#1 Fortnite World Rank - 2,323 Solo Wins!",DvPW66IFhMI,18.09.03,AlexRamiGaming,20,2018-03-09T07:15:52.000Z,"PS4 Battle Royale|""PS4 Pro Battle Royale""|""Bat...",212838,5199,542,11,https://i.ytimg.com/vi/DvPW66IFhMI/default.jpg,False,False,False,Discord For EVERYONE - https://discord.gg/nhud...
2,"#1 Fortnite World Rank - 2,330 Solo Wins!",EXEaMjFeiEk,18.10.03,AlexRamiGaming,20,2018-03-10T06:26:17.000Z,"PS4 Battle Royale|""PS4 Pro Battle Royale""|""Bat...",200764,5620,537,45,https://i.ytimg.com/vi/EXEaMjFeiEk/default.jpg,False,False,False,Discord For EVERYONE - https://discord.gg/nhud...
3,#1 MOST ANTICIPATED VIDEO (Timber Frame House ...,bYvQmusLaxw,17.20.12,Pure Living for Life,24,2017-12-20T02:49:11.000Z,"timber frame|""timber framing""|""timber frame ra...",79152,7761,159,1965,https://i.ytimg.com/vi/bYvQmusLaxw/default.jpg,False,False,False,Shelter Institute: http://bit.ly/2iwXj8B\nFull...
4,#1 WORLD RANKED 1463 SOLO WINS! - FORTNITE BAT...,xQ4Q5b2WwO8,18.18.01,AlexRamiGaming,20,2018-01-17T18:00:05.000Z,"PS4 Battle Royale|""PS4 Pro Battle Royale""|""Bat...",541482,15430,891,40,https://i.ytimg.com/vi/xQ4Q5b2WwO8/default_liv...,False,False,False,►Twitter @AlexRamiGaming\n\n►Tips & Donations\...


In [6]:
len(clean_vid_data)

30626

## Part 1: Category Classification Model

In [7]:
def create_category_map():
    category_map = {}
    data = {}
    with open('dataset/US_category_id.json', 'r') as outfile:
        data = json.load(outfile)
        for item in data["items"]:
            category_map[item["id"]] = item["snippet"]["title"]
    return category_map
category_map = create_category_map()

In [8]:
category_map

{'1': 'Film & Animation',
 '2': 'Autos & Vehicles',
 '10': 'Music',
 '15': 'Pets & Animals',
 '17': 'Sports',
 '18': 'Short Movies',
 '19': 'Travel & Events',
 '20': 'Gaming',
 '21': 'Videoblogging',
 '22': 'People & Blogs',
 '23': 'Comedy',
 '24': 'Entertainment',
 '25': 'News & Politics',
 '26': 'Howto & Style',
 '27': 'Education',
 '28': 'Science & Technology',
 '29': 'Nonprofits & Activism',
 '30': 'Movies',
 '31': 'Anime/Animation',
 '32': 'Action/Adventure',
 '33': 'Classics',
 '34': 'Comedy',
 '35': 'Documentary',
 '36': 'Drama',
 '37': 'Family',
 '38': 'Foreign',
 '39': 'Horror',
 '40': 'Sci-Fi/Fantasy',
 '41': 'Thriller',
 '42': 'Shorts',
 '43': 'Shows',
 '44': 'Trailers'}

In [9]:
import re

def preprocess_text_df(text):
    text = text.apply(lambda x: x.lower())
    text = text.str.replace(r'[^a-zA-Z ]', '') #text.str.replace(r'[^\w\s]', '')
    return text

def preprocess_text(text):
    text = text.lower()
    regex = re.compile('[^a-zA-Z ]')
    text = regex.sub('', text)
    return text

def preprocess_tags(text):
    try:
        text = text.apply(lambda x: x.lower())
        text = text.str.replace(r'[^a-zA-Z ]', ' ')
    except:
        text = text.lower()
        regex = re.compile('[^a-zA-Z ]')
        text = regex.sub(' ', text)
    return text

In [10]:
preprocess_text("Dua Lipa - IDGAF (Official Music Video)")

'dua lipa  idgaf official music video'

In [11]:
def predict_category(X_train, y_train, X_test, vectorizer, clf):
    X_train = vectorizer.fit_transform(X_train)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(vectorizer.transform(X_test))
    return y_pred

def predict_category_from_title(X_train, y_train, X_test, vectorizer, clf):
    X_train = vectorizer.fit_transform(X_train)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(vectorizer.transform(pd.Series(X_test)))
    return y_pred

In [12]:
clean_vid_data["title_clean"] = preprocess_text_df(clean_vid_data.title)
clean_vid_data["tags"] = preprocess_tags(clean_vid_data.tags)
X_train, X_test, y_train, y_test = train_test_split(clean_vid_data["title_clean"] + clean_vid_data["tags"], 
                                                    clean_vid_data['category_id'], 
                                                    test_size=0.1, 
                                                    shuffle=True
                                                    )
y_pred = predict_category(X_train, y_train, X_test, TfidfVectorizer(ngram_range=(1, 2)), SGDClassifier())
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.88      0.56      0.69       149
           2       0.92      0.72      0.81        32
          10       0.77      0.90      0.83       265
          15       0.83      0.77      0.80        31
          17       0.83      0.89      0.86       218
          19       0.82      0.53      0.64        17
          20       0.85      0.80      0.82        83
          22       0.82      0.50      0.62       337
          23       0.88      0.80      0.83       246
          24       0.77      0.89      0.83       940
          25       0.82      0.85      0.83       354
          26       0.82      0.89      0.85       199
          27       0.79      0.64      0.71        81
          28       0.73      0.74      0.73        97
          29       1.00      0.14      0.25         7
          43       0.62      0.71      0.67         7

   micro avg       0.80      0.80      0.80      3063
   macro avg       0.82   

In [13]:
title = preprocess_text("Dua Lipa - IDGAF (Official Music Video)")
predict_category_from_title(X_train, 
                            y_train, 
                            title, 
                            TfidfVectorizer(), 
                            SGDClassifier())

array([10])

In [14]:
metrics.accuracy_score(y_test, y_pred)

0.7998694090760692

In [15]:
def category_group(num):
    category_number = num    
    category_name = category_map[str(num)]
    rslt_df = clean_vid_data[clean_vid_data['category_id'] == category_number] 
    return rslt_df.reset_index()

ann

In [16]:
def find_tags(vidtitle, rslt_df) : 
    row = rslt_df[rslt_df['title'] == vidtitle]
    tags = row['tags']
    list_of_tags = []
    for it in tags:
        ls = it.split('|')
        for l in ls:
            if l != '[none]':
                list_of_tags.append(l)    
    return list_of_tags#[0] if len(list_of_tags) else list_of_tags

## Part 2: Same Category Video Rankings

annabelle

### 2.1 Ranking Based on Title

In [17]:
video = clean_vid_data[["title", "category_id"]]
video["title_clean"] = preprocess_text_df(video.title)
video.head()

Unnamed: 0,title,category_id,title_clean
0,!! THIS VIDEO IS NOTHING BUT PAIN !! | Getting...,20,this video is nothing but pain getting over...
1,"#1 Fortnite World Rank - 2,323 Solo Wins!",20,fortnite world rank solo wins
2,"#1 Fortnite World Rank - 2,330 Solo Wins!",20,fortnite world rank solo wins
3,#1 MOST ANTICIPATED VIDEO (Timber Frame House ...,24,most anticipated video timber frame house rai...
4,#1 WORLD RANKED 1463 SOLO WINS! - FORTNITE BAT...,20,world ranked solo wins fortnite battle roya...


In [18]:
#source: https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50
def get_jaccard_sim(a, str2): 
    #a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def get_jaccard_sim_list(list1, list2): 
    a = set(list1) 
    b = set(list2)
    c = a.intersection(b)
    if (len(a) + len(b) - len(c)) == 0:
        return 0
    return float(len(c)) / (len(a) + len(b) - len(c))

In [19]:
def find_similar_title(v):
    cleaned_title = preprocess_text(v)
    a = set(cleaned_title.split())
    
    category = video.loc[video.title.str.endswith(v)]["category_id"].values[0]
    video_category = video.loc[video['category_id']==category]
    
    score = []
    
    for t in video_category.title_clean:
        score += [get_jaccard_sim(a, t)]
    video_category["title_score"] = score
    
    video_category = video_category.loc[video_category['title_score'] < 1].sort_values(by=['title_score'], ascending=False)
    return video_category.reset_index().drop(columns=['index', 'title_clean'])[0:25]

#similar_video_titles = find_similar_title("#ProudToCreate: Pride 2018")
similar_video_titles = find_similar_title("Marshmello - FLY (Official Music Video)")

In [20]:
preprocess_text("Dua Lipa - IDGAF (Official Music Video)")

'dua lipa  idgaf official music video'

In [21]:
video["title_clean"]

0         this video is nothing but pain   getting over...
1                          fortnite world rank   solo wins
2                          fortnite world rank   solo wins
3         most anticipated video timber frame house rai...
4         world ranked  solo wins  fortnite battle roya...
5                                  world record  solo wins
6            world ranked   solo wins   new minigun update
7            world record  solo wins  fortnite live stream
8            world record  solo wins  fortnite live stream
9         most emotional moments in cricket history  cr...
10                      making a pcb using easyeda  review
11        how to go fast coaching famous youtuber mike ...
12                    feed the homeless  one list one life
13        did jesus exist jordan peterson and lindsay s...
14        does jerusalem belong to israel ted cruz gues...
15        crowder is back trump did what ben shapiro an...
16        elon musk i need your  razorfist and clint ho.

In [22]:
similar_video_titles.head(10)

Unnamed: 0,title,category_id,title_score
0,Marshmello - Blocks (Official Music Video),10,0.666667
1,Marshmello - You & Me (Official Music Video),10,0.571429
2,Marshmello - Love U (Official Music Video),10,0.571429
3,Marshmello - LoVe U (Official Music Video),10,0.571429
4,Marshmello - Take It Back (Official Music Video),10,0.5
5,Marshmello & Anne-Marie - FRIENDS (Music Video...,10,0.444444
6,Marshmello x Lil Peep - Spotlight (Official Mu...,10,0.444444
7,Ghost - Rats (Official Music Video),10,0.428571
8,Justice - Stop (Official Music Video),10,0.428571
9,PENTAGON(펜타곤) - '빛나리(Shine)' Official Music Video,10,0.428571


### 2.2 Ranking Based on Tags

In [23]:
tags = clean_vid_data["tags"]
tags[1]

'ps  battle royale  ps  pro battle royale   battle royale win   fornite duo   fortnite squads   solo   fortnite   umbrella   fortnite gameplay   fortnite ps    fortnite stream   fortnite game   fortnite tips   fortnite tricks   fortnight   fort night   fort nite   fortnite br   fortnite pubg   fornite battle royale   fortnight br   fortnight   fortnite br solo   fortnite solo   fortnite new update   fortnite new city   fortnite patch   fortnite    world rank '

In [24]:
def map_titles_to_tags(category):
    category_df = category_group(category)
    title_tags_dict = {}
    for i in range(len(category_df)):
        video_title = category_df["title"][i]
        video_tags = find_tags(video_title, category_df)
        title_tags_dict[video_title] = video_tags
    return title_tags_dict

In [25]:
# def find_category_tags(interested_category):
#     tags_df = pd.DataFrame()
#     title_tags_dict = {}
#     trimed_df = category_group(interested_category)
#     for i in range(len(trimed_df["tags"])):
        
#         video_title = trimed_df["title"][i]
#         ls = trimed_df["tags"][i]
#         individual_tags = ls.split("|")
#         for it in individual_tags[0:5]:
#             if it != '[none]':
#                 tags_df = tags_df.append({"Tag Name": it, "Video Title": video_title}, ignore_index=True)
#     return tags_df

In [26]:
def find_similar_tag_video(title):
    
    category = video.loc[video.title.str.endswith(title)]["category_id"].values[0]
    #category_df = video.loc[video['category_id']==category]
    
    category_df = category_group(category)
    #return category_df
    interested_tags = find_tags(title, category_df)
    category_df = category_df[["title"]]
#     print(category)
#     print(interested_tags)
    title_tag_mapping = map_titles_to_tags(category)
    score = []
    i = 0
    for title, tags in title_tag_mapping.items():
        score.append(get_jaccard_sim_list(interested_tags, tags))
    
    category_df["tag_score"] = score
    
    category_df = category_df.loc[category_df['tag_score'] > 0].loc[category_df['tag_score'] != 1].loc[category_df['tag_score'] != len(interested_tags)].sort_values(by = ['tag_score'], ascending=False)

    return category_df.reset_index().drop(columns=['index'])[0:25]

#similar_videos_tags = find_similar_tag_video("Calvin Harris, Dua Lipa - One Kiss (Lyric Video)")
similar_videos_tags = find_similar_tag_video("Dua Lipa - IDGAF (Official Music Video)")  
#similar_videos_tags = find_similar_tag_video("#ProudToCreate: Pride 2018")

In [27]:
similar_videos_tags.head(10)

Unnamed: 0,title,tag_score


In [28]:
category_df10 = category_group(10)
a0 = find_tags("Calvin Harris, Dua Lipa - One Kiss (Lyric Video)", category_df10)#"Dua Lipa - IDGAF (Official Music Video)", category_df10)
a0

['calvin harris  calvin harris one kiss   calvin harris dua lipa one kiss   dua lipa one kiss   one kiss   one kiss lyrics   calvin harris one kiss lyrics   calvin harris dua lipa one kiss live   calvin harris dua lipa one kiss lyrics   dua lipa one kiss lyrics   dua lipa one kiss live   calvin harris feels   calvin harris summer   calvin harris nuh ready   calvin harris thinking about you   calvin harris my way   calvin harris feel so close   calvin   harris ']

In [29]:
a1 = find_tags("Azealia Banks - Anna Wintour", category_df10)
a1

['azealia  banks   anna   wintour   eone   music   dance   alternative indie   electronic   club dance   pop ']

In [30]:
get_jaccard_sim_list(a0, a1)

0.0

In [31]:
a2 = find_tags("Dua Lipa - Golden Slumbers", category_df10)
a2

['dua lipa  golden slumbers   xmas songs   christmas music   john lewis advert   christmas   festive ']

In [32]:
get_jaccard_sim_list(a0, a2)

0.0

## Annabelle's part 1209

### Three methods to calulate the similarity between two titles
     get_jaccard_sim and get_cosine_sim measure the count of words that appear in two titles while get_gensim_sim calculates the similarity using semantic meaning from a pre-trained model





In [38]:
#source: https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50
def get_jaccard_sim(str1, str2): 
    a = set(str1.split()) 
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [39]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def get_cosine_sim(*strs): 
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)[0][1]
    
def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

In [42]:
#!pip install gensim
import gensim
import gensim.downloader as api
#word_vectors = api.load("glove-wiki-gigaword-100")
def get_gensim_sim(first, second):
    first = first.split()
    second = second.split()
    try:
        similarity = word_vectors.n_similarity(first.split(), second.split())
    except:
        new_first = []
        new_second = []
        
        for f in first:
            if f in word_vectors:
                new_first += [f]
        for s in second:
            if s in word_vectors:
                new_second += [s]
        similarity = word_vectors.n_similarity(new_first or ['none'], new_second or ['none'])
    return similarity





#### Outputs the top three titles in the same category

In [59]:
def find_similar_title(v, function, num):
    category = video.loc[video.title_clean.str.endswith(v)]["category_id"].values[0]
    video_category = video.loc[video['category_id']==category]

    score = []
    
    for t in video_category.title_clean:
        score += [function(v, t)]
    video_category["score"] = score
    
    video_category = video_category.loc[video_category['score']< 1].sort_values(by=['score'], ascending=False)
    #return score
    return video_category.title[0:num]
find_similar_title("iphone case", get_gensim_sim, 3)

28211    iPhone X – Selfies on iPhone X – Apple
19598                Retro iMac iPhone X Cases!
2682            Apple iPhone X - One Week Later
Name: title, dtype: object

## Testing function examples
#### https://www.youtube.com/channel/UCF0pVplsI8R5kcAqgtoRqoA

In [74]:
# https://www.youtube.com/watch?v=Ufye3xSjcqM
# Category: Travel & Events
# Tags: #Singapore #buffet #luxury
# Title: BEST LUXURY BUFFET in Singapore!? Colony Buffet Review at Ritz Carlton
title = preprocess_text("BEST LUXURY BUFFET in Singapore!? Colony Buffet Review at Ritz Carlton")
predict_category_from_title(X_train, 
                            y_train, 
                            title, 
                            TfidfVectorizer(), 
                            SGDClassifier())

#make_recommendations(title)

array([19])

In [73]:
# https://www.youtube.com/watch?v=sfv1QaRzJg8
# Category: Entertainment
# Tags: None
# Title: I Ordered Pizza And Tipped The House
title = preprocess_text("I Ordered Pizza And Tipped The House")
predict_category_from_title(X_train, 
                            y_train, 
                            title, 
                            TfidfVectorizer(), 
                            SGDClassifier())

#make_recommendations(title)

array([24])

In [62]:
# https://www.youtube.com/watch?v=ahZFCF--uRY
# Category: Entertainment
# Tags: #Ghostbusters #OfficialTrailer #Sony
# Title: GHOSTBUSTERS: AFTERLIFE - Official Trailer (HD)

title = preprocess_text('GHOSTBUSTERS: AFTERLIFE - Official Trailer (HD)')
predict_category_from_title(X_train, 
                            y_train, 
                            title, 
                            TfidfVectorizer(), 
                            SGDClassifier())

#make_recommendations(title)

array([24])

In [67]:
# https://www.youtube.com/watch?v=I99SmVy52jU
# Category: Entertainment
# Tags: None
# Title: Rich Christmas Vs. Broke Christmas
title = preprocess_text('Rich Christmas Vs. Broke Christmas')
predict_category_from_title(X_train, 
                            y_train, 
                            title, 
                            TfidfVectorizer(), 
                            SGDClassifier())

#make_recommendations(title)

array([24])

In [66]:
# https://www.youtube.com/watch?v=i01R1fA6gos
# Category: Sports
# Tags: #NFL #49ers #Saints
# Title: 49ers vs. Saints Week 14 Highlights | NFL 2019
title = preprocess_text('49ers vs. Saints Week 14 Highlights | NFL 2019')
predict_category_from_title(X_train, 
                            y_train, 
                            title, 
                            TfidfVectorizer(), 
                            SGDClassifier())

#make_recommendations(title)

array([17])

In [70]:
# https://www.youtube.com/watch?v=kx_uCdTRCAw
# Category: Howto & Style
# Tags: None
# Title: Doing LADY GAGA'S Makeup! | NikkieTutorials
title = preprocess_text("Doing LADY GAGA'S Makeup! | NikkieTutorials")
predict_category_from_title(X_train, 
                            y_train, 
                            title, 
                            TfidfVectorizer(), 
                            SGDClassifier())

#make_recommendations(title)

array([26])

In [78]:
find_similar_title("makeup", get_gensim_sim, 3)

25661    Updated Everyday Makeup Routine 💋
16594          My Everyday Makeup Routine!
25345           Trying $1 Makeup From Wish
Name: title, dtype: object

## Putting It All Together

In [86]:
def make_recommendations(title):
    
    top_titles = find_similar_title(title)
    top_tags = find_similar_tag_video(title)
    
    category = top_titles["category_id"][0]
    category_df = category_group(category)
    
    top_titles = top_titles.drop(columns=['category_id'])
    interested_tags = find_tags(title, category_df)
    cleaned_title = preprocess_text(title)
    title_set = set(cleaned_title.split())
    
    title_supplement_score = []
    for t in top_titles["title"]:
        tags = find_tags(t, category_df)
        title_supplement_score.append(get_jaccard_sim_list(interested_tags, tags))
    
    tags_supplement_score = []
    for t in top_tags["title"]:        
        tags_supplement_score.append(get_jaccard_sim(title_set, t))
    
    top_titles["tag_score"] = title_supplement_score
    top_tags["title_score"] = tags_supplement_score     
    
    combined_top = top_titles.append(top_tags, ignore_index = True)
    combined_top["Combined Score"] = np.array(combined_top["tag_score"]) +  np.array(combined_top["title_score"])
    
    combined_top = combined_top.sort_values(by = ['Combined Score'], ascending=False).reset_index().drop(columns=['index'])
    return combined_top

In [34]:
make_recommendations("Dua Lipa - IDGAF (Official Music Video)").head(10)

Unnamed: 0,tag_score,title,title_score,Combined Score
0,0.0,"Calvin Harris, Dua Lipa - One Kiss (Official V...",0.4,0.4
1,0.0,Marshmello - Blocks (Official Music Video),0.375,0.375
2,0.0,Justice - Stop (Official Music Video),0.375,0.375
3,0.0,Marshmello - Fly (Official Music Video),0.375,0.375
4,0.0,Cam - Diane (Official Music Video),0.375,0.375
5,0.0,Marshmello - FLY (Official Music Video),0.375,0.375
6,0.0,Veno - Dolce (Official Music Video),0.375,0.375
7,0.0,DDG - Arguments (Official Music Video),0.375,0.375
8,0.0,Ghost - Rats (Official Music Video),0.375,0.375
9,0.0,PENTAGON(펜타곤) - '빛나리(Shine)' Official Music Video,0.375,0.375


In [35]:
make_recommendations("#ProudToCreate: Pride 2018").head(10)

Unnamed: 0,tag_score,title,title_score,Combined Score
0,0.0,LGBT Pride | Brooklyn Nine-Nine,0.2,0.2
1,0.0,Dragon Ball Super Episode 122 English Subbed E...,0.071429,0.071429
2,0.0,Sridevi In Dubai | Marriage Video | శ్రీదేవి చ...,0.0,0.0
3,0.0,Spiderman dancing Take on me by A-Ha | DJ IVAN...,0.0,0.0
4,0.0,Stalker in the Swamp | Critical Role | Campaig...,0.0,0.0
5,0.0,Spoiler Alert! RuPaul's Drag Race S10 Ep7 John...,0.0,0.0
6,0.0,Spoiler Alert! RuPaul's Drag Race All Stars 3 ...,0.0,0.0
7,0.0,"Spoiler Alert! 🌹Bekah M., Will You Accept This...",0.0,0.0
8,0.0,Sridevi: Boney Kapoor CRIES badly as he hugged...,0.0,0.0
9,0.0,Sridevi's Dead Body To Arrive At Mumbai Airpor...,0.0,0.0


In [36]:
make_recommendations("#1 Fortnite World Rank - 2,330 Solo Wins!").head(10)

Unnamed: 0,tag_score,title,title_score,Combined Score
0,1.0,NEW UPDATE #1 WORLD RANKED FORTNITE SOLO PLAYE...,0.444444,1.444444
1,1.0,"#1 World Ranked - 1,700 Solo Wins - New Minig...",0.333333,1.333333
2,0.0,"🔴 #1 World Ranked | 2,532 Solo Wins | Fortnite...",0.5,0.5
3,0.0,"#1 WORLD RECORD 3,288 SOLO WINS",0.5,0.5
4,0.0,"#1 World Record 3,225 Solo Wins | Fortnite Liv...",0.5,0.5
5,0.0,"#1 World Record 3,359 Solo Wins | Fortnite Liv...",0.5,0.5
6,0.0,"FREE VBUCKS TODAY | FORTNITE WORLD RECORD 2,75...",0.4,0.4
7,0.0,#1 WORLD RANKED 1463 SOLO WINS! - FORTNITE BAT...,0.4,0.4
8,0.0,"FORTNITE WORLD RECORD 2,732 SOLO WINS - New Fo...",0.363636,0.363636
9,0.0,"SOLO ASSASSIN!! *TOP FORTNITE PLAYER* // 10,60...",0.272727,0.272727
