# Preliminary EDA on US Youtube Trending Videos

In [1]:
import re
import numpy as np
import pandas as pd
from pandas import DataFrame
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# ignoring warnings that pop up during execution
import warnings
warnings.filterwarnings("ignore")

In [2]:
# for libraries not in docker image

# installing libraries
import sys
!{sys.executable} -m pip install umap
!{sys.executable} -m pip install missingno
!{sys.executable} -m pip install rake_nltk
!{sys.executable} -m pip install wordcloud
!{sys.executable} -m pip install umap-learn
!{sys.executable} -m pip install datashader 
!{sys.executable} -m pip install bokeh
!{sys.executable} -m pip install holoviews
!{sys.executable} -m pip install colorcet
!{sys.executable} -m pip install pyLDAvis

# importing installed libraries
import umap
import missingno
from rake_nltk import Rake
from wordcloud import WordCloud, STOPWORDS

Collecting numba!=0.49.*,!=0.50.*,>=0.37.0
  Using cached numba-0.52.0-cp38-cp38-manylinux2014_x86_64.whl (3.2 MB)
Collecting llvmlite<0.36,>=0.35.0
  Using cached llvmlite-0.35.0-cp38-cp38-manylinux2010_x86_64.whl (25.3 MB)
Installing collected packages: llvmlite, numba
  Attempting uninstall: llvmlite
    Found existing installation: llvmlite 0.33.0
    Uninstalling llvmlite-0.33.0:
      Successfully uninstalled llvmlite-0.33.0
  Attempting uninstall: numba
    Found existing installation: numba 0.50.1
    Uninstalling numba-0.50.1:
      Successfully uninstalled numba-0.50.1
Successfully installed llvmlite-0.35.0 numba-0.52.0


In [3]:
# loading the dataset into a dataframe
df = pd.read_csv("dataset/USvideos.csv")

In [4]:
# dimensions of the dataframe
df.shape

(40949, 16)

In [5]:
# summary of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                40949 non-null  object
 1   trending_date           40949 non-null  object
 2   title                   40949 non-null  object
 3   channel_title           40949 non-null  object
 4   category_id             40949 non-null  int64 
 5   publish_time            40949 non-null  object
 6   tags                    40949 non-null  object
 7   views                   40949 non-null  int64 
 8   likes                   40949 non-null  int64 
 9   dislikes                40949 non-null  int64 
 10  comment_count           40949 non-null  int64 
 11  thumbnail_link          40949 non-null  object
 12  comments_disabled       40949 non-null  bool  
 13  ratings_disabled        40949 non-null  bool  
 14  video_error_or_removed  40949 non-null  bool  
 15  de

In [6]:
# checking for null values
df.isnull().values.any()

True

In [7]:
# count the number of NaNs each column has.
nans = pd.isnull(df).sum()
nans[nans > 0]

description    570
dtype: int64

In [8]:
# grouping columns by datatype
df.dtypes.value_counts()

object    8
int64     5
bool      3
dtype: int64

Description of numeric attributes

### Cleaning the description

In [9]:
# installing libraries
import sys
!{sys.executable} -m pip install nltk

# importing these libraries
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to /home/aditya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
df = pd.read_csv("dataset/USvideos.csv")

In [11]:
# drop rows with NA values
df = df.dropna()
df.reset_index(inplace=True)

In [12]:
# remove newlines 
df['description'] = df['description'].str.replace(r'\\t|\\n|\\r',' ')

In [13]:
# removing URL's
df['description'] = df['description'].str.replace(r'https?:\/\/\S+','')

In [14]:
# removing HTML tags
def remove_html_tags(text):
    return BeautifulSoup(text, "html.parser").text

df['description'] = df['description'].apply(remove_html_tags)

In [15]:
# remove start/end spaces & lower-casing the result
df['description'] = df['description'].str.strip().str.lower()

In [16]:
# removing punctuations
df['description'] = df['description'].str.replace('[^\w\s]','')

In [17]:
# stopwords removal
stop_words = stopwords.words('english')
stop_words.remove('no')
stop_words.remove('not')
df['description'] = df['description'].apply(lambda text: ' '.join([word for word in str(text).split() if word not in stop_words]))

In [18]:
# method for removing emoji's and emoticon's
# u"\U00002500-\U00002BEF" -> chinese char
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  
                               u"\U0001F300-\U0001F5FF"  
                               u"\U0001F680-\U0001F6FF"  
                               u"\U0001F1E0-\U0001F1FF"  
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

df['description'] = df['description'].apply(remove_emoji)

In [19]:
# removing numbers
df['description'] = df['description'].str.replace(r'\d','')

In [20]:
# checking if cleaning resulted in nan value
df['description'].isnull().values.any()

False

In [21]:
# Some Issue: 
#
# After cleaning there are some entries that have an empty description or a very small one
# Example:
# https://www.youtube.com/watch?v=GcbsIv3QdFs - only has links in its description
# Since we are removing all links, its description will be empty

df[df['description'].str.len() <= 3].head()

Unnamed: 0,index,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
2592,2644,5WUDfviiKRE,17.27.11,二贵摔跤 - tienghoa.net,Tina Nguyen,27,2011-03-01T04:14:08.000Z,hanyuqiao,16823,93,275,172,https://i.ytimg.com/vi/5WUDfviiKRE/default.jpg,False,False,False,
2808,2869,5WUDfviiKRE,17.28.11,二贵摔跤 - tienghoa.net,Tina Nguyen,27,2011-03-01T04:14:08.000Z,hanyuqiao,21342,107,312,201,https://i.ytimg.com/vi/5WUDfviiKRE/default.jpg,False,False,False,
3055,3122,5WUDfviiKRE,17.29.11,二贵摔跤 - tienghoa.net,Tina Nguyen,27,2011-03-01T04:14:08.000Z,hanyuqiao,21762,108,312,203,https://i.ytimg.com/vi/5WUDfviiKRE/default.jpg,False,False,False,
3286,3359,5WUDfviiKRE,17.30.11,二贵摔跤 - tienghoa.net,Tina Nguyen,27,2011-03-01T04:14:08.000Z,hanyuqiao,22535,108,312,203,https://i.ytimg.com/vi/5WUDfviiKRE/default.jpg,False,False,False,
3517,3595,5WUDfviiKRE,17.01.12,二贵摔跤 - tienghoa.net,Tina Nguyen,27,2011-03-01T04:14:08.000Z,hanyuqiao,22695,108,312,203,https://i.ytimg.com/vi/5WUDfviiKRE/default.jpg,False,False,False,


In [22]:
# Fix
df.loc[df['description'].str.len() < 3, ['description']] = 'none'

In [23]:
df[df['description'].str.len() == 4].head()
df_new = df.drop_duplicates(subset=['video_id'], keep = 'last')
df_new.reset_index(inplace=True)
df_new = df_new.drop(['level_0', 'index'], axis=1)
df_new_description = pd.DataFrame(df_new)

In [24]:
df_content_filter = df_new

## Content based filtering

In [25]:
# initializing the new column
df_new['Key_words'] = ""

for index, row in df_new.iterrows():
    plot = row['description']
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all puntuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()
    # assigning the key words to the new column for the corresponding video title
    df_new['Key_words'][index] = list(key_words_dict_scores.keys())
    
# dropping the description column
df_new.drop(columns = ['description'], inplace = True)

In [26]:
df_new = df_new.drop_duplicates(subset=['video_id'], keep = 'last')
df_new.reset_index(inplace=True)
df_title_channel_title = pd.DataFrame(df_new)

In [27]:
df_new.head()

Unnamed: 0,index,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,Key_words
0,0,9wRQljFNDW8,17.14.11,Dion Lewis' 103-Yd Kick Return TD vs. Denver! ...,NFL,17,2017-11-13T02:05:26.000Z,"NFL|""Football""|""offense""|""defense""|""afc""|""nfc""...",81377,655,25,177,https://i.ytimg.com/vi/9wRQljFNDW8/default.jpg,False,False,False,"[new, england, patriots, returner, dion, lewis..."
1,1,Om_zGhJLZ5U,17.14.11,TL;DW - Every DCEU Movie Before Justice League,Screen Junkies,1,2017-11-12T18:00:03.000Z,"screenjunkies|""screen junkies""|""sj news""|""hone...",288922,7515,792,2111,https://i.ytimg.com/vi/Om_zGhJLZ5U/default.jpg,False,False,False,"[justice, league, approaching, fast, rewatched..."
2,2,goP4Z5wyOlM,17.14.11,Iraq-Iran earthquake: Deadly tremor hits borde...,BBC News,25,2017-11-12T21:16:40.000Z,"bbc|""bbc news""|""news""|""iran""|""iran news""|""iraq...",34785,308,26,413,https://i.ytimg.com/vi/goP4Z5wyOlM/default.jpg,False,False,False,"[strong, magnitude, earthquake, rattled, north..."
3,3,8NHA23f7LvU,17.14.11,Jason Momoa Wows Hugh Grant With Some Dothraki...,The Graham Norton Show,24,2017-11-10T19:06:23.000Z,"Graham Norton|""Graham Norton Show Official""|""E...",1496225,16116,236,605,https://i.ytimg.com/vi/8NHA23f7LvU/default.jpg,False,False,False,"[think, sarah, millican, excited, subscribe, w..."
4,4,IE-xepGLVt8,17.14.11,Mayo Clinic's first face transplant patient me...,Mayo Clinic,28,2017-11-10T12:04:17.000Z,"Mayo Clinic|""Health Care (Issue)""|""Healthcare ...",237307,1896,74,260,https://i.ytimg.com/vi/IE-xepGLVt8/default.jpg,False,False,False,"[one, half, years, surgery, transformed, life,..."


In [28]:
df_new = df_new.drop(['index'], axis=1)

In [29]:
df_new = df_new.drop(['video_id','trending_date','category_id','publish_time','tags','views','likes','dislikes','comment_count','thumbnail_link','comments_disabled','ratings_disabled','video_error_or_removed'],axis=1)

In [30]:
df_new

Unnamed: 0,title,channel_title,Key_words
0,Dion Lewis' 103-Yd Kick Return TD vs. Denver! ...,NFL,"[new, england, patriots, returner, dion, lewis..."
1,TL;DW - Every DCEU Movie Before Justice League,Screen Junkies,"[justice, league, approaching, fast, rewatched..."
2,Iraq-Iran earthquake: Deadly tremor hits borde...,BBC News,"[strong, magnitude, earthquake, rattled, north..."
3,Jason Momoa Wows Hugh Grant With Some Dothraki...,The Graham Norton Show,"[think, sarah, millican, excited, subscribe, w..."
4,Mayo Clinic's first face transplant patient me...,Mayo Clinic,"[one, half, years, surgery, transformed, life,..."
...,...,...,...
6249,BTS Plays With Puppies While Answering Fan Que...,BuzzFeed Celeb,"[bts, pps, puppies, adorable, provided, vander..."
6250,The Cat Who Caught the Laser,AaronsAnimals,"[cat, caught, laser, aarons, animals]"
6251,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Brad Mondo,"[much, fun, transforming, safiyas, hair, video..."
6252,How Black Panther Should Have Ended,How It Should Have Ended,"[black, panther, endedwatch, hishes, hishe, th..."


In [31]:
# To create a bag of words by joining the words present in the title, channel_title and key_words field.
df_new['bag_of_words'] = ''
columns = df_new.columns
for index, row in df_new.iterrows():
    words = ''
    for col in columns:
        if col != 'channel_title':
            words = words + ''.join(row[col])+ ' '
        else:
            words = words + row[col]+ ' '
    df_new['bag_of_words'][index] = words

In [32]:
df_new

Unnamed: 0,title,channel_title,Key_words,bag_of_words
0,Dion Lewis' 103-Yd Kick Return TD vs. Denver! ...,NFL,"[new, england, patriots, returner, dion, lewis...",Dion Lewis' 103-Yd Kick Return TD vs. Denver! ...
1,TL;DW - Every DCEU Movie Before Justice League,Screen Junkies,"[justice, league, approaching, fast, rewatched...",TL;DW - Every DCEU Movie Before Justice League...
2,Iraq-Iran earthquake: Deadly tremor hits borde...,BBC News,"[strong, magnitude, earthquake, rattled, north...",Iraq-Iran earthquake: Deadly tremor hits borde...
3,Jason Momoa Wows Hugh Grant With Some Dothraki...,The Graham Norton Show,"[think, sarah, millican, excited, subscribe, w...",Jason Momoa Wows Hugh Grant With Some Dothraki...
4,Mayo Clinic's first face transplant patient me...,Mayo Clinic,"[one, half, years, surgery, transformed, life,...",Mayo Clinic's first face transplant patient me...
...,...,...,...,...
6249,BTS Plays With Puppies While Answering Fan Que...,BuzzFeed Celeb,"[bts, pps, puppies, adorable, provided, vander...",BTS Plays With Puppies While Answering Fan Que...
6250,The Cat Who Caught the Laser,AaronsAnimals,"[cat, caught, laser, aarons, animals]",The Cat Who Caught the Laser AaronsAnimals cat...
6251,I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...,Brad Mondo,"[much, fun, transforming, safiyas, hair, video...",I GAVE SAFIYA NYGAARD A PERFECT HAIR MAKEOVER ...
6252,How Black Panther Should Have Ended,How It Should Have Ended,"[black, panther, endedwatch, hishes, hishe, th...",How Black Panther Should Have Ended How It Sho...


In [33]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df_new['bag_of_words'])
# creating a Series for the video titles so they are associated to an ordered numerical
# we will use this list later to match the indices
indices = pd.Series(df_new['title'])
indices[:5]

0    Dion Lewis' 103-Yd Kick Return TD vs. Denver! ...
1       TL;DW - Every DCEU Movie Before Justice League
2    Iraq-Iran earthquake: Deadly tremor hits borde...
3    Jason Momoa Wows Hugh Grant With Some Dothraki...
4    Mayo Clinic's first face transplant patient me...
Name: title, dtype: object

In [34]:
# generating the cosine similarity matrix.
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.05129892],
       [0.        , 0.        , 0.        , ..., 0.        , 0.05129892,
        1.        ]])

In [35]:
# function that takes in video title as input and returns the top 10 recommended videos based on the similarity scores calculated.
def recommendations(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    # gettin the index of the video title that matches the title given as argument to this method.
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar videos
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    top_10_indexes_scores = list(score_series.iloc[1:11])
    
    # populating the list with the titles of the best 10 matching videos
    index = 0
    for i in top_10_indexes:
        recommended_movies.append(list(df_new['title'])[i]+": "+str(top_10_indexes_scores[index]))
        index+=1
        
    return recommended_movies

In [36]:
recommendations('Official Call of Duty®: Black Ops 4 — Multiplayer Reveal Trailer')

['Official Call of Duty®: Black Ops 4 Teaser: 0.8346223261119858',
 "Eagles' Home Radio Call of the Last Play of Super Bowl 52 | NFL Highlights: 0.31578947368421045",
 'Map of Biology: 0.30588764516074896',
 'LIFE OF THE PARTY - Official Trailer 1: 0.29019050004400465',
 'Welcome to the Official Class of 2018 Inductees: 0.2867696673382022',
 'The Week Of | Official Trailer [HD] | Netflix: 0.27668578554642986',
 "Rise of the TMNT Official Live Stream Character Art Reveal ft. Andre 'Black Nerd' & Kevin Eastman: 0.27036903521793754",
 'Troy: Fall Of A City | Official Trailer [HD] | Netflix: 0.26490647141300877',
 'SICARIO, Day of the Soldado - Official Trailer (HD): 0.26490647141300877',
 'Irelia: The Blade Dancer | Champion Trailer - League of Legends: 0.26315789473684204']

In [37]:
recommendations('Camila Cabello - Havana (Vertical Video) ft. Young Thug')

['Camila Cabello, Daddy Yankee - Havana (Remix - Audio): 0.4216370213557839',
 'Camila Cabello - Live from Youtube: 0.3651483716701108',
 'Camila Cabello - Real Friends (Audio): 0.3585685828003181',
 'Camila Cabello - Never Be The Same: 0.3508232077228117',
 'Camila Cabello - Never Be the Same: 0.33541019662496846',
 "Camila Cabello - Something's Gotta Give (Audio): 0.33541019662496846",
 'Made in Miami (Artist Spotlight Story) - Camila Cabello: 0.3265986323710904',
 'Camila Cabello - Never Be the Same (Audio): 0.31622776601683794',
 'Camila Cabello - Havana ft. Miranda Sings (Tana Mongeau Parody): 0.31622776601683794',
 'HAVANA - CAMILA CABELLO (English + Spanish Cover): 0.31622776601683794']

In [38]:
recommendations('How To Make a Giant Flaming Vortex Fountain')

['How to Make MAGIC SAND: 0.6390096504226939',
 'How To Make a Mega Metal Foundry: 0.6092717958449424',
 'How To Make an Ocarina of Time IRL: 0.5962847939999439',
 'Make a Glowing Announcement Board: 0.4811252243246882',
 'GLOW Slime: 0.4364357804719848',
 'How Does Smoke Behave in a Vacuum?: 0.4351941398892446',
 'What Are the Chemicals In Our Bread | How to Make Everything: 0.4124789556921528',
 'RAT TRAP Trebuchet: 0.408248290463863',
 'Humongous Turkey Lollipop: 0.408248290463863',
 'How to make the KELP SHAKE from Spongebob Squarepants!: 0.4003203845127179']

In [39]:
recommendations('TL;DW - Every DCEU Movie Before Justice League')

['Honest Trailers - Justice League: 0.4558423058385518',
 'Honest Trailers - Every Wes Anderson Movie: 0.40201512610368484',
 'Honest Trailers - Every Christopher Nolan Movie: 0.40201512610368484',
 'Justice League - Movie Review: 0.3418817293789138',
 'Justice League - Movie Review: 0.3418817293789138',
 'Honest Trailers - The Emoji Movie: 0.3198010745334156',
 'Screen Junkies 2017 Oscar Nominations: Our Academy Awards Picks: 0.30151134457776363',
 'Honest Trailers - mother!: 0.2461829819586655',
 'Honest Trailers - Jumanji: 0.2461829819586655',
 'Justice League Bad: 0.2461829819586655']

### LDA on Description

In [40]:
# function that takes in video title as input and returns the top 10 recommended videos based on the similarity scores calculated for the lda
def recommendations_lda(title, cosine_sim = cosine_sim):
    
    recommended_movies = []
    
    # gettin the index of the video title that matches the title given as argument to this method.
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar videos
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    top_10_indexes_scores = list(score_series.iloc[1:11])
    
    # populating the list with the titles of the best 10 matching videos
    index = 0
    for i in top_10_indexes:
        recommended_movies.append(list(df_new['title'])[i]+": "+str(top_10_indexes_scores[index]))
        index+=1
        
    return recommended_movies

#### LDA model training on description text using tfdif vecortizer and UMAP results visualization based on the category and description

In [41]:
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.simplefilter("ignore", DeprecationWarning)
 
# Helper function
def print_topics(model, count_vectorizer):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-11:-1]]))
        
# Initialise the count vectorizer with the English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Fit and transform the processed description
count_data_tfidf = tfidf_vectorizer.fit_transform(df_new_description['description'])        
# Number of Topics
number_topics = 16
# Create and fit the LDA model
lda = LDA(n_components=number_topics)
lda.fit(count_data_tfidf)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, tfidf_vectorizer)

Topics found via LDA:

Topic #0:
espn nfl youtube watch follow football nba mlb sportscenter scores

Topic #1:
guardian simons cat tshirt gq half itunes reddit interesting machine

Topic #2:
fbe feast fine react production credit brothers assistant coordinator cw

Topic #3:
tmz starr murrell spencer gilbert refinery geazy dan johns twice

Topic #4:
shots studios lele stocking anwar pons rudy jibawi hannah mancuso

Topic #5:
netflix music bravo members wwhl mythical entertainment follow watch million

Topic #6:
chappell warner production licensed buzzfeed images music idol buzzfeedvideo credits

Topic #7:
nail polish code broadcasts licensingstoryfulcom use lounge player bbc domino

Topic #8:
wwe mustsee wwecom month network crate waterjet match rest gordons

Topic #9:
refinery licensing licensingviralhogcom usage fort contact explosm century mcelfatrick fox

Topic #10:
nba conan yiay league refinery pass plays stories obrien highlights

Topic #11:
noggin life sins dude outer human gus 

In [42]:
cosine_sim_tfidf = cosine_similarity(count_data_tfidf, count_data_tfidf)

#### Recommendations after lda

In [43]:
recommendations_lda('Official Call of Duty®: Black Ops 4 — Multiplayer Reveal Trailer',cosine_sim = cosine_sim_tfidf)

['Official Call of Duty®: Black Ops 4 Teaser: 0.1622615784454197',
 'Kendrick Lamar, SZA - All The Stars (Audio): 0.1342173880526215',
 'Battlefield 5 Official Multiplayer Trailer: 0.10709087927840101',
 'Obama arrives for jury duty at Daley Center: 0.10356914808524821',
 "How Black Panther's Visual Effects Were Made | WIRED: 0.0912294465256556",
 'BLACK LIGHTNING - Series Premiere Review (Black Nerd): 0.08138965728908563',
 'The Weeknd, Kendrick Lamar - Pray For Me (Audio): 0.07773996159439474',
 "PLAYERUNKNOWN'S BATTLEGROUNDS - The Game Awards 2017 Gameplay Trailer: 0.0665184731819172",
 "Chris Stapleton - Tryin' To Untangle My Mind (Audio): 0.0647159006590976",
 "What New Yorkers Think Childish Gambino's “This Is America” Means | Genius News: 0.06080561830568116"]

In [44]:
recommendations_lda('TL;DW - Every DCEU Movie Before Justice League',cosine_sim = cosine_sim_tfidf)

['Screen Junkies 2017 Oscar Nominations: Our Academy Awards Picks: 0.3770390884099942',
 'Justice League - Movie Review: 0.2334326380969939',
 'Honest Trailers - Batman Forever: 0.2137371262086743',
 'Honest Trailers - Justice League: 0.19969919559453558',
 'Honest Trailers - The Santa Clause: 0.1899689432008999',
 'Justice League Could Lose WB Big Money - SJU: 0.18591306601347096',
 'Honest Trailers - The Oscars (2018): 0.17452607774472406',
 'Honest Trailers - Thor: Ragnarok: 0.16505314829804824',
 'Honest Trailers - The Room: 0.16312768223515942',
 'Honest Trailers - mother!: 0.16139923905898934']

In [45]:
count_data_tfidf

<6254x42966 sparse matrix of type '<class 'numpy.float64'>'
	with 338308 stored elements in Compressed Sparse Row format>

In [46]:
# perform 
# <pip uninstall umap-learn> (if not installed then skip this step)
# <pip install umap-learn>
# and then follow it
# <pip install umap>
import umap.umap_ as umap
embedding = umap.UMAP(metric='hellinger').fit(count_data_tfidf)
embedding_cv = umap.UMAP(metric='hellinger').fit(count_matrix)

In [47]:
embedding

UMAP(angular_rp_forest=True, metric='hellinger')

In [48]:
# to get the category_id's as the labels
category_ids = []
for idx, document in enumerate(df_new_description['category_id']):
        category_ids.append(document)

In [49]:
len(category_ids)

6254

In [None]:
hover_df = pd.DataFrame(category_ids, columns=['category_id'])

In [None]:
# umap plot on the processed descriptions after the application of fitting and transforming the descriptions for tfidf vectorizer.
import umap.plot
f = umap.plot.points(embedding,labels = hover_df['category_id'])

In [None]:
# umap plot on the processed descriptions after the application of fitting and transforming the descriptions for count vectorizer.
import umap.plot
f = umap.plot.points(embedding_cv,labels = hover_df['category_id'])

In [None]:
#Plotting umap connectivity for tfidf
umap.plot.connectivity(embedding, show_points=True)
#Plotting umap connectivity for Count vectorizer
umap.plot.connectivity(embedding_cv, show_points=True)

In [None]:
df_new_titles_channel = pd.DataFrame(df_title_channel_title)

In [None]:
df_new_titles_channel = df_new_titles_channel[['category_id','title','channel_title']]

In [None]:
#UMAP INTERACTIVE FOR TFIDF
p = umap.plot.interactive(embedding, labels=hover_df['category_id'], hover_data = df_new_titles_channel, point_size=2)
umap.plot.show(p)

In [None]:
#UMAP INTERACTIVE FOR COUNT VECTORIZER
p = umap.plot.interactive(embedding_cv, labels=hover_df['category_id'], hover_data = df_new_titles_channel, point_size=2)
umap.plot.show(p)

In [None]:
import pyLDAvis
from pyLDAvis import sklearn as sklearn_lda

In [None]:
pyLDAvis.enable_notebook()
sklearn_lda.prepare(lda, count_data_tfidf, tfidf_vectorizer)

In [None]:
sklearn_lda.prepare(lda, count_data_tfidf, tfidf_vectorizer,mds='tsne')