In [None]:
# from google.colab import drive
# drive.mount("/content/drive")

# Helper Functions


In [None]:
# To create a folder with a given path
def createDirectory(path):
  """Create the directory `path`, including any missing parent directories.

  Prints a status message instead of raising:
  - the directory already exists -> reported as such (not as a failure),
  - creation succeeds            -> success message,
  - any OSError during creation  -> failure message.

  Args:
    path: Directory path to create.
  """
  # Local import: this helper is defined in a cell that runs before the
  # notebook's `import os` cell.
  import os

  # An existing directory is the desired end state, not an error.
  if os.path.isdir(path):
      print ("Directory %s already exists" % path)
      return

  try:
      # makedirs (unlike mkdir) also creates missing intermediate directories.
      os.makedirs(path)
  except OSError:
      print ("Creation of the directory %s failed" % path)
  else:
      print ("Successfully created the directory %s " % path)

# Getting Video Links From Playlists

This code collects data from the YouTube Data API. Do not run it without OAuth client credentials (a `client_secret.json` file) and an input file containing one playlist ID per line.



In [None]:
import os
import json
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
import sys

def get_youtube_video_links(input_file, output_file="testLink.txt"):
    """Collect the watch URLs of every video in the playlists listed in `input_file`.

    Authenticates against the YouTube Data API v3 with the OAuth client
    secrets stored in ``client_secret.json``, pages through each playlist and
    appends one ``https://www.youtube.com/watch?v=<id>`` line per video to
    `output_file`.

    Args:
        input_file: Path of a text file with one playlist id per line.
            Blank lines are skipped.
        output_file: Path of the file the links are appended to.
    """
    scopes = ["https://www.googleapis.com/auth/youtube.readonly"]
    # Disable OAuthlib's HTTPS verification when running locally.
    # *DO NOT* leave this option enabled in production.
    os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

    api_service_name = "youtube"
    api_version = "v3"
    client_secrets_file = "client_secret.json"

    # Get credentials and create an API client
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
        client_secrets_file, scopes)
    # NOTE(review): run_console() is reported as removed in newer
    # google-auth-oauthlib releases - confirm against the installed version.
    credentials = flow.run_console()
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, credentials=credentials)

    link = "https://www.youtube.com/watch?v="

    with open(input_file) as f:
        for playlist in f:
            playlist_id = playlist.strip()
            if not playlist_id:
                # A blank line would otherwise trigger a request with an empty id.
                continue

            videoIds = []
            pageToken = None
            while True:
                params = {
                    "part": "snippet",
                    # The API documents 0-50 as the accepted range for maxResults.
                    "maxResults": 50,
                    "playlistId": playlist_id,
                }
                if pageToken:
                    params["pageToken"] = pageToken
                response = youtube.playlistItems().list(**params).execute()

                for item in response["items"]:
                    videoIds.append(item["snippet"]["resourceId"]["videoId"])
                    # Running count of collected videos (progress indicator).
                    print(len(videoIds))

                # The last page carries no nextPageToken.
                pageToken = response.get("nextPageToken")
                if not pageToken:
                    break

            if not videoIds:
                # Avoid appending a stray blank line for an empty playlist.
                continue

            videoLinks = [link + video_id for video_id in videoIds]
            with open(output_file, "a") as file:
                file.writelines('\n'.join(videoLinks) + '\n')

# Getting Video Transcripts

This code collects video transcripts from YouTube. Do not run it without a file of YouTube video links (one URL per line).

In [None]:
!pip install pytube
!pip install youtube_transcript_api



In [None]:
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from moviepy.editor import *
from bs4 import BeautifulSoup
import urllib.request
import requests
import re

In [None]:
# Reads the input file that contains the links for the videos and returns a list of URLs
def readFile(path):
  """Return every line of the text file at `path`, stripped of surrounding whitespace."""
  with open(path, "r") as handle:
    return [raw_line.strip() for raw_line in handle]

In [None]:
class YouTubeVideo:
  """A YouTube video addressed by URL, with its page title and transcript.

  Constructing an instance extracts the video id from the URL, downloads the
  watch page to read its title, fetches the transcript through
  YouTubeTranscriptApi and, when a transcript is available, writes it to
  ``<path>/<normalised title>_transcript.txt``.

  Attributes set by __init__:
    url: The URL passed in.
    id: Video id extracted from the URL.
    title: Title of the watch page.
    captionData: Value returned by YouTubeTranscriptApi.get_transcript, or
      None when fetching the transcript failed.
  """

  def __init__(self, url, path="transcripts"):
    """Fetch title and transcript for `url`; write the transcript under `path`."""
    self.url = url
    self.id = self.converYoutubeLinkToId()
    self.title = self.getVideoTitle()
    try:
      self.captionData = YouTubeTranscriptApi.get_transcript(self.id)
    except Exception as error:
      # Best effort: keep the object usable, but report the real reason
      # instead of assuming a rate limit was hit.
      self.captionData = None
      print("Could not fetch a transcript for video %s: %s" % (self.id, error))

    if self.captionData is not None:
      self.writeTranscriptFile(path)

  def converYoutubeLinkToId(self):
    """Return the video id contained in self.url.

    Handles ``watch?v=<id>`` URLs (also with further query parameters such as
    ``&list=...``) and ``youtu.be/<id>`` short links. Any other URL that
    contains "=" falls back to everything after the first "=".

    Raises:
      ValueError: If no video id can be found in the URL.
    """
    from urllib.parse import parse_qs, urlparse

    url = self.url.strip()
    parsed = urlparse(url)

    video_ids = parse_qs(parsed.query).get("v")
    if video_ids:
      return video_ids[0].strip()

    if parsed.netloc.lower().endswith("youtu.be"):
      short_id = parsed.path.strip("/")
      if short_id:
        return short_id

    if "=" in url:
      # Legacy behaviour for URLs that match neither known shape.
      return url.split("=", 1)[1].strip()

    raise ValueError("Could not find a video id in URL: %r" % (self.url,))

  def getVideoTitle(self):
    """Download the watch page and return its <title> without the site suffix.

    Falls back to the video id when the page has no <title> element.
    """
    source = requests.get(self.url, timeout=30).text
    soup = BeautifulSoup(source,'html.parser')
    if soup.title is None:
      return self.id
    Title = soup.title.text
    # Only drop the trailing " - YouTube" when it is really there; slicing a
    # fixed 10 characters would eat part of the title on any other page.
    suffix = " - YouTube"
    if Title.endswith(suffix):
      Title = Title[:-len(suffix)]
    return Title

  def reNameTitle(self):
    """Return the title lower-cased, with each run of non-word characters or underscores replaced by "_"."""
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    title = re.sub(r'[\W_]+', "_", self.title)
    return title.lower()

  def getVideoTranscript(self):
    """Return the transcript as one string, or None when no captions were fetched.

    Caption texts are joined pairwise: the 1st, 3rd, ... caption is followed
    by a space, the 2nd, 4th, ... by a newline.
    """
    if self.captionData is None: return None
    pieces = []
    for index, caption in enumerate(self.captionData):
      separator = "\n" if index % 2 else " "
      pieces.append(str(caption["text"]) + separator)
    return "".join(pieces)

  def writeTranscriptFile(self, path):
    """Write the transcript to ``<path>/<normalised title>_transcript.txt``.

    Does nothing except print a notice when there is no transcript.
    """
    transcript = self.getVideoTranscript()
    if transcript is None:
      print("No transcript available; nothing written")
      return
    createDirectory(path)
    # The progress message always carried the number 1; kept for identical output.
    print("{0} Writing to transcript file...".format(1))
    # Explicit encoding so non-ASCII captions do not depend on the platform default.
    with open(f"{path}/{self.reNameTitle()}_transcript.txt", "w", encoding="utf-8") as file:
      file.write(transcript)

# Data Preprocessing

In [None]:
!pip install pytube
!pip install youtube_transcript_api
!python3 -m pip install pyLDAvis
!python3 -m spacy download en
!pip install pandas==1.3.1

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 1.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import pyLDAvis
import pyLDAvis.gensim_models 

In [None]:
# Shared spaCy pipeline used by every cell below; requires the
# "en_core_web_sm" model to be installed (see the spacy download step above).
nlp = spacy.load("en_core_web_sm")

In [None]:
def lemmatizer(doc):
    """Pipeline component: rebuild `doc` from the lemmas of its tokens.

    Tokens whose lemma is the placeholder '-PRON-' (what pronouns such as
    "I" and "you" get lemmatized to) are dropped. The remaining lemmas are
    joined with single spaces and turned back into a document via
    nlp.make_doc.
    """
    kept_lemmas = []
    for token in doc:
        lemma = token.lemma_
        if lemma == '-PRON-':
            continue
        kept_lemmas.append(lemma)
    return nlp.make_doc(" ".join(kept_lemmas))
    
def remove_stopwords(doc):
    """Pipeline component: return the texts of all tokens that are neither stop words nor punctuation.

    The result is a plain list of strings (not a spaCy document), which is
    the shape Gensim expects.
    """
    kept_texts = []
    for token in doc:
        if token.is_stop == True or token.is_punct == True:
            continue
        kept_texts.append(token.text)
    return kept_texts

# Explicitly set the "is_stop" flag on the vocabulary entry of every word in
# spaCy's English STOP_WORDS list, so remove_stopwords can rely on token.is_stop.
for word in STOP_WORDS:
  lexeme = nlp.vocab[word]
  lexeme.is_stop = True

# Register the two custom components only once, so re-running this cell does
# not try to add them a second time.
if 'lemmatizer' not in nlp.pipe_names:
  # Order matters: 'lemmatizer' is inserted right after the 'ner' component and
  # 'stopwords' is appended last. Because remove_stopwords returns a list of
  # strings, nlp(text) yields a list of token texts from here on.
  nlp.add_pipe(lemmatizer,name='lemmatizer',after='ner')
  nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

# Training The LDA Model

This code will take a very long time to run. You can just upload the model manually and run the code where it says START HERE. 

In [None]:
def pass_through_pipeline(documents_from_files):
  """Run every raw document through the `nlp` pipeline and return the results as a list, in input order."""
  return [nlp(raw_text) for raw_text in documents_from_files]

In [None]:
def read_file_from_drive(path='/content/drive/My Drive/topics2/'):
  """Read every file directly inside `path` and return their contents.

  Args:
    path: Directory holding the transcript files. Defaults to the Drive
      folder used by this notebook.

  Returns:
    A list with the full text of each file, ordered by file name.
    Subdirectories are skipped.
  """
  docFromFile = []
  i = 1
  # Sorted so the document order does not depend on the file system.
  for file in sorted(os.listdir(path)):
    full_path = os.path.join(path, file)
    if not os.path.isfile(full_path):
      continue
    with open(full_path, 'r', encoding='utf-8') as f:
      print("reading file# ", file)
      print(i)
      i+=1
      docFromFile.append(f.read())
  # The list was previously built but never handed back to the caller.
  return docFromFile

In [None]:
def train_lda_model(num_topics=20,
                    model_path="/content/drive/My Drive/topic_files/20topics_lda.pickle"):
  """Train an LDA topic model on the transcripts from Drive and pickle it.

  Every document returned by read_file_from_drive() is passed through the
  `nlp` pipeline, a Gensim dictionary and bag-of-words corpus are built from
  the results, and an LdaModel is trained on that corpus.

  Args:
    num_topics: Number of topics to train.
    model_path: Where the pickled model is written. The default file name
      matches the 20-topic model that the loading cells refer to.

  Returns:
    The trained LdaModel.

  Raises:
    ValueError: If read_file_from_drive() yields no documents.
  """
  # Local import: the notebook imports pickle only in a later cell.
  import pickle

  # The helper must be *called*; iterating the function object itself fails.
  docFromFile = read_file_from_drive()
  if not docFromFile:
    raise ValueError("read_file_from_drive() returned no documents; nothing to train on")

  # Passes each document through the pipeline and collects the results.
  doc_list = []
  for j, doc in enumerate(docFromFile, start=1):
      print('NLP#', j)
      doc_list.append(nlp(doc))

  # Creates the dictionary, which is a mapping of word IDs to words.
  words = corpora.Dictionary(doc_list)

  # Turns each document into a bag of words.
  corpus = [words.doc2bow(doc) for doc in doc_list]
  # NOTE(review): the topic-filter cell indexes the model output as
  # new_lda[0][0], which looks like it expects per_word_topics=True - confirm.
  lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=words,num_topics=num_topics)

  #Storing the model
  with open(model_path, 'wb') as file:
    pickle.dump(lda_model, file)

  return lda_model

# START HERE

In [None]:
# Ask for a video URL. Constructing YouTubeVideo fetches the page title and the
# transcript and, when a transcript is available, writes it to a file under
# the default "transcripts" folder.
URL = input("Enter a YouTube video url to get the topics: ")
yt = YouTubeVideo(URL)

Enter a YouTube video url to get the topics: https://www.youtube.com/watch?v=-EZ_3Tq9a8c
Creation of the directory transcripts failed
1 Writing to transcript file...


In [None]:
# import pickle
# #Insert the words_20000.pickle file path in the open function
# with open("/content/drive/My Drive/topic_files/words_20000.pickle", 'rb') as file:
#   words = pickle.load(file)

# #Insert the corpus_20000.pickle file path in the open function
# with open("/content/drive/My Drive/topic_files/corpus_20000.pickle", 'rb') as file:
#   corpus = pickle.load(file)

# #Insert the 20topics_lda.pickle file path in the open function
# with open("/content/drive/My Drive/topic_files/20topics_lda.pickle", 'rb') as file:
#   lda_model = pickle.load(file)  

Upload the file SDA_P2_model.zip to run this code


In [None]:
!unzip SDA_P2_model.zip

Archive:  SDA_P2_model.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of SDA_P2_model.zip or
        SDA_P2_model.zip.zip, and cannot find SDA_P2_model.zip.ZIP, period.


In [None]:
import pickle
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load these archives if they come from a source you trust.

# Dictionary used below for doc2bow and id -> word lookups
# (presumably a gensim Dictionary - TODO confirm).
with open("words_20000.pickle", 'rb') as file:
  words = pickle.load(file)

# Bag-of-words corpus (presumably the one the model was trained on - TODO confirm).
with open("corpus_20000.pickle", 'rb') as file:
  corpus = pickle.load(file)

# The LDA model that the cells below update and query.
with open("20topics_lda.pickle", 'rb') as file:
  lda_model = pickle.load(file) 

Creating the documents for the new transcript

In [None]:
# Run the new transcript through the pipeline, then convert it to bag-of-words.
docs2 = [nlp(yt.getVideoTranscript())]
corpus2 = [words.doc2bow(tokens) for tokens in docs2]

In [None]:
# Update the model with the new transcript's bag-of-words corpus, then apply
# the model to that same corpus to get its topic distribution.
# NOTE(review): update() appears to modify lda_model in place, so re-running
# this cell would train on the same transcript again - confirm.
lda_model.update(corpus2)
new_lda = lda_model[corpus2]

In [None]:
# Keep only the (topic id, probability) pairs whose probability exceeds 0.1.
# new_lda[0][0] is presumably the topic list of the first (only) document.
topics_ids = [entry for entry in new_lda[0][0] if entry[1] > 0.1]
print(topics_ids)

[(0, 0.34132323), (5, 0.16276176), (12, 0.38946837)]


In [None]:
# For every topic kept in topics_ids (probability > 0.1), look up its 20 top
# terms and translate the term ids into words via the dictionary.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin for the rest of the notebook session.
topic_words = [
    [words[term[0]] for term in lda_model.get_topic_terms(topic_entry[0], topn=20)]
    for topic_entry in topics_ids
]
print(topic_words)

[['phone', 'use', 'plug', 'laptop', 'giveaway', 'music', 'tablet', 'link', 'tech', 'actually', 'podcast', 'usb', 'enter', 'new', 'cool', '10', 'thank', 'help', 'shirt', 'easy'], ['car', 'power', 'battery', 'drive', 'bit', 'need', 'aesthetic', 'left', 'new', 'massive', 'cool', 'matte', 'black', 'pretty', 'seat', 'watt', '60', 'solar', 'moment', 'nice'], ['play', 'game', 'xbox', 'video', 'earbud', 'gaming', 'actually', 'screen', 'tap', 'super', 'world', 'obviously', 'watch', 'hear', 'pretty', 'plug', 'match', 'volume', 'probably', 'graphic']]


In [None]:
# Print every topic of the model (-1 = all topics) with its index and its
# weighted top-word summary string.
for idx, topic in lda_model.print_topics(-1):
   print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.014*"phone" + 0.013*"use" + 0.011*"plug" + 0.011*"laptop" + 0.010*"giveaway" + 0.010*"music" + 0.010*"tablet" + 0.010*"link" + 0.009*"tech" + 0.007*"actually"
Topic: 1 
Words: 0.012*"hertz" + 0.012*"body" + 0.009*"test" + 0.008*"doctor" + 0.008*"use" + 0.008*"brain" + 0.008*"actually" + 0.007*"fat" + 0.007*"study" + 0.006*"eat"
Topic: 2 
Words: 0.055*"eat" + 0.027*"food" + 0.012*"chicken" + 0.011*"meat" + 0.008*"grow" + 0.008*"fish" + 0.007*"animal" + 0.007*"stuff" + 0.007*"taste" + 0.007*"feel"
Topic: 3 
Words: 0.029*"movie" + 0.012*"watch" + 0.012*"play" + 0.011*"song" + 0.010*"music" + 0.008*"love" + 0.007*"hear" + 0.006*"listen" + 0.006*"God" + 0.006*"cause"
Topic: 4 
Words: 0.015*"year" + 0.007*"human" + 0.006*"idea" + 0.006*"change" + 0.006*"aaron" + 0.006*"science" + 0.006*"point" + 0.006*"actually" + 0.005*"earth" + 0.005*"ago"
Topic: 5 
Words: 0.053*"car" + 0.017*"power" + 0.015*"battery" + 0.013*"drive" + 0.010*"bit" + 0.009*"need" + 0.009*"aesthetic" + 0.0

In [None]:
# Render the interactive pyLDAvis view inline in the notebook.
pyLDAvis.enable_notebook()
# NOTE(review): the visualisation is prepared from corpus2, which holds only
# the newly entered transcript - confirm this is intended rather than `corpus`.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus2, words)

# Last expression of the cell: displays the prepared visualisation.
vis