Mounting Google Drive to get the dataset

In [1]:
# Mount Google Drive at /content/drive; later cells read the stop-word list and
# the pickled topic-model files from paths under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Requirements

Install all needed libraries

In [2]:
# Third-party packages imported by the import cell: pytube, youtube_transcript_api
# and pyLDAvis, plus the English spaCy model.
# NOTE(review): only pandas is version-pinned here; the other installs pick up
# whatever version is current, so a fresh run may not match the saved outputs.
!pip install pytube
!pip install youtube_transcript_api
!python3 -m pip install pyLDAvis
!python3 -m spacy download en
!pip install pandas==1.3.1

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 4.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


Download the ffmpeg binary (used by moviepy) and import all libraries

In [3]:
! wget https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg.linux64
! mv ffmpeg.linux64 ~/.imageio/ffmpeg/ffmpeg-linux64-v3.3.1

--2021-12-02 01:07:52--  https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg.linux64
Resolving github.com (github.com)... 52.192.72.89
Connecting to github.com (github.com)|52.192.72.89|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/imageio/imageio-binaries/master/ffmpeg/ffmpeg.linux64 [following]
--2021-12-02 01:07:53--  https://raw.githubusercontent.com/imageio/imageio-binaries/master/ffmpeg/ffmpeg.linux64
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28549024 (27M) [application/octet-stream]
Saving to: ‘ffmpeg.linux64.1’


2021-12-02 01:07:53 (133 MB/s) - ‘ffmpeg.linux64.1’ saved [28549024/28549024]



In [4]:
from os import listdir
import os.path
import numpy as np
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import pyLDAvis
import pyLDAvis.gensim_models 
from tqdm import tqdm_notebook as tqdm
from pprint import pprint
import re
import sys
import urllib.request
import requests
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from moviepy.editor import *
from bs4 import BeautifulSoup  

  from collections import Iterable


# Data Collection 

YouTube class to hold all the video data

In [5]:
# Base directory that the helper functions prefix to folder names such as
# 'videos', 'audio' and 'json_scripts'.
PATH = "/content/"
# Key for the YouTube Data API v3; only referenced by the commented-out request
# in YoutubeStats.__init__.
API_KEY = None

In [6]:
# This class holds all the video data
class YoutubeStats:
    """Holds the URL, id, title and caption data of one YouTube video.

    Attributes:
        url: the video URL passed to the constructor.
        id: the video id passed to the constructor.
        title: page title scraped from ``url`` by ``getVideoTitle``.
        captionData: list of caption dicts returned by
            ``YouTubeTranscriptApi.get_transcript``, or ``None`` when the
            transcript could not be fetched.
        data: YouTube Data API response; ``None`` unless the commented-out
            request in ``__init__`` is enabled.
    """

    # Suffix expected at the end of the HTML <title> of a watch page
    # (10 characters, the amount the original code sliced off).
    _TITLE_SUFFIX = " - YouTube"

    def __init__(self, url, id):
        """Fetch the transcript and the title for the video at ``url``.

        Args:
            url: full URL of the video page.
            id: YouTube video id used to request the transcript.
        """
        # You can use the YouTube Data API v3 by uncommenting the following 3 lines
        # (this also needs `import json`) and setting API_KEY.
        #callUrl = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={id}&key={API_KEY}"
        #self.response = requests.get(callUrl)
        #self.data = json.loads(self.response.text)
        # Keep the attribute defined even when the API request is disabled so
        # printData() does not fail on a missing attribute.
        self.data = None
        try:
            self.captionData = YouTubeTranscriptApi.get_transcript(id)
        except Exception as error:
            # Best effort: a video without an available transcript is still usable
            # for the download helpers, but say why the captions are missing.
            print("Could not fetch a transcript for video {0}: {1}".format(id, error))
            self.captionData = None

        self.url = url
        self.id = id
        self.title = self.getVideoTitle()

    def printData(self):
        """Print the stored API response (``None`` when the API call is disabled)."""
        print(getattr(self, "data", None))

    def getURL(self):
        """Return the video URL."""
        return self.url

    def getID(self):
        """Return the video id."""
        return self.id

    @staticmethod
    def _stripTitleSuffix(title):
        """Remove the trailing " - YouTube" from ``title`` only when it is present."""
        suffix = YoutubeStats._TITLE_SUFFIX
        if title.endswith(suffix):
            return title[:-len(suffix)]
        return title

    # Returns the video title
    def getVideoTitle(self):
        """Scrape the page at ``self.url`` and return its title.

        Falls back to the video id when the page has no <title> element.
        """
        source = requests.get(self.url).text
        soup = BeautifulSoup(source, 'html.parser')
        if soup.title is None:
            return self.id
        return self._stripTitleSuffix(soup.title.text)

    # Returns the video description as a string
    def getVideoDescription(self):
        """Return the description from the API response.

        Raises:
            AttributeError: when no API data has been loaded (the request in
                ``__init__`` is commented out by default).
        """
        data = getattr(self, "data", None)
        if data is None:
            raise AttributeError(
                "No API data loaded; enable the YouTube Data API request in __init__")
        return data["items"][0]["snippet"]["description"]

    # Downloads the video from youtube
    def downloadVideo(self):
        """Download the first stream pytube lists into ``PATH + 'videos'``.

        The file is named after the sanitised title with an ``.mp4`` extension.
        NOTE(review): the original comment promised 360p, but ``streams.first()``
        applies no resolution filter - confirm which stream is wanted.
        """
        path = PATH + 'videos'
        title = reNameTitle(self.title) + ".mp4"
        print("title ........... ", title)
        YouTube(self.url).streams.first().download(output_path=path, filename=title)

    # Converts the downloaded mp4 to an audio file; `type` is the
    # extension of the output file, eg: mp3, wav
    def convertVideoToSound(self, type="wav"):
        """Write the audio track of the downloaded video to ``PATH + 'audio/'``.

        Args:
            type: extension of the audio file to write (default ``"wav"``).
        """
        VideoPath = PATH + 'videos/'
        path = PATH + 'audio/'
        title = reNameTitle(self.title)
        video = VideoFileClip(VideoPath + title + '.mp4')
        try:
            video.audio.write_audiofile(path + title + '.' + type)
        finally:
            # Release the reader held by the clip even if the export fails.
            video.close()

    # Returns the video transcript as a dictionary with times
    def getVideoTranscript(self):
        """Return the raw caption data, or ``None`` when no transcript was fetched."""
        return self.captionData

    # Return the video transcript as a string
    def getVideoTranscriptString(self):
        """Join the caption texts into one string.

        Captions are followed alternately by a space and a newline (first a
        space, then a newline, and so on). Returns an empty string when no
        transcript is available.
        """
        if not self.captionData:
            return ""
        pieces = []
        for index, caption in enumerate(self.captionData):
            separator = "\n" if index % 2 else " "
            pieces.append(str(caption["text"]) + separator)
        return "".join(pieces)

    def writeToJsonFile(self):
        """Dump the raw caption data to ``PATH + 'json_scripts'/<title>.json``."""
        # Imported locally because the notebook's import cell does not import json.
        import json
        path = PATH + 'json_scripts'
        os.makedirs(path, exist_ok=True)
        print("1 Writing to json file...")
        with open(path + '/' + f"{reNameTitle(self.title)}.json", "w") as file:
            json.dump(self.getVideoTranscript(), file)

    def writeTranscriptFile(self):
        """Write the transcript string to ``<title>_transcript.txt`` on Google Drive."""
        path = "/content/drive/MyDrive/nlp_videos/nlp_transcripts"
        os.makedirs(path, exist_ok=True)
        transcript = self.getVideoTranscriptString()
        print("1 Writing to transcript file...")
        # Captions can contain non-ASCII characters, so fix the encoding explicitly.
        with open(path + '/' + f"{reNameTitle(self.title)}_transcript.txt", "w",
                  encoding="utf-8") as file:
            file.write(transcript)


Supporting Functions

In [7]:
# This function creates a file for every video
# and names the file after the title of
# that video. It then writes the title
# and the transcript to the file
def writeToVideoFile(youtubeStats):
    """Write one ``<title>.txt`` file per video into ``PATH + 'video_scripts'``.

    Args:
        youtubeStats: iterable of YoutubeStats objects.
    """
    createDirectory('video_scripts')
    # Build the target path explicitly instead of relying on the return value
    # of createDirectory().
    path = PATH + 'video_scripts'
    separator = "\n--------------------------------------------------------------------\n"
    for count, stats in enumerate(youtubeStats, start=1):
        title = stats.title
        # Videos without captions have no transcript to join; write an empty one.
        if stats.getVideoTranscript():
            transcript = stats.getVideoTranscriptString()
        else:
            transcript = ""
        print("{0} Writing to file...".format(count))
        with open(path + '/' + f"{reNameTitle(title)}.txt", "w", encoding="utf-8") as file:
            file.write("Title\n\n" + title)
            file.write(separator)
            # The description section is omitted: it needs the YouTube Data API
            # response (stats.getVideoDescription()).
            file.write("\nTranscript\n\n" + transcript)
            file.write(separator)

In [8]:
# Download all the videos in the given list
def downloadAllVideos(youtubeStats):
    """Download every video in ``youtubeStats`` into the 'videos' directory.

    Args:
        youtubeStats: iterable of YoutubeStats objects.
    """
    createDirectory('videos')
    for count, stats in enumerate(youtubeStats, start=1):
        print("{0} Downloading...".format(count))
        # downloadVideo() takes no arguments; it reads the URL and title from
        # the object itself.
        stats.downloadVideo()

In [9]:
def convertAllVideosToSound(youtubeStats):
    """Convert every downloaded video in ``youtubeStats`` to a wav file.

    Args:
        youtubeStats: iterable of YoutubeStats objects whose videos were
            already downloaded.
    """
    createDirectory('audio')
    for stats in youtubeStats:
        # convertVideoToSound() only accepts the output type; the title is
        # read from the object itself.
        stats.convertVideoToSound(type='wav')


In [10]:
# Helper function to rename the file
def reNameTitle(title):
    """Turn a video title into a lowercase, file-name-friendly string.

    Every run of non-word characters and/or underscores is collapsed into a
    single underscore.

    Args:
        title: the title to sanitise.

    Returns:
        The sanitised, lower-cased title.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    title = re.sub(r'[\W_]+', "_", title)
    return title.lower()

In [11]:
# Helper function to create a directory
def createDirectory(name):
    """Create the directory ``PATH + name`` if needed and return its path.

    Args:
        name: directory name, appended to the module-level ``PATH``.

    Returns:
        The full path of the directory (returned even if creation failed, so
        callers can still report it).
    """
    path = PATH + name
    if os.path.isdir(path):
        # Re-running a cell is normal in a notebook; an existing directory is
        # not a failure.
        print ("Directory %s already exists" % path)
        return path
    try:
        os.makedirs(path)
    except OSError as error:
        print ("Creation of the directory %s failed: %s" % (path, error))
    else:
        print ("Successfully created the directory %s " % path)
    return path

In [12]:
def converYoutubeLinkToId(link):
  """Extract the video id from a YouTube link.

  Reads the ``v`` query parameter, so extra parameters such as ``&t=10s`` do
  not end up in the id. ``youtu.be/<id>`` short links are also understood.

  Args:
    link: the video URL.

  Returns:
    The video id as a string.

  Raises:
    IndexError: when no id can be found and the link contains no ``=``.
  """
  from urllib.parse import parse_qs, urlparse

  link = link.strip()
  parsed = urlparse(link)
  video_ids = parse_qs(parsed.query).get("v")
  if video_ids:
    return video_ids[0].strip()
  short_id = parsed.path.strip("/").split("/")[0]
  if parsed.netloc.lower().endswith("youtu.be") and short_id:
    return short_id
  # Fallback: text after the first "=", cut at the next parameter.
  return link.split("=", 1)[1].split("&", 1)[0].strip()

In [13]:
# Reads a list of URLs from a given file
# and parses them to video IDs.
# Returns a list of video IDs and a list of URLs
def readFile(path):
    """Read one YouTube URL per line from ``path``.

    Blank lines are skipped. The id is the text after the first ``=``, cut at
    the next ``&`` so extra query parameters are not included.

    Args:
        path: path of the text file containing the URLs.

    Returns:
        A tuple ``(videoIds, urls)`` of two lists in file order.

    Raises:
        IndexError: when a non-blank line contains no ``=``.
    """
    videoIds = []
    urls = []
    with open(path, "r") as file:
        for line in file:
            url = line.strip()
            if not url:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            videoIds.append(url.split("=", 1)[1].split("&", 1)[0].strip())
            urls.append(url)
    return videoIds, urls

# Data Processing

Data Preprocessing

In [14]:
# Load the "en_core_web_sm" spaCy pipeline once; the preprocessing functions
# below all use this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

Read my stop words from Google Drive

In [15]:
def get_stop_words_from_drive(path='/content/drive/My Drive/NLP Project/stop_words.txt'):
  """Read the custom stop words, one per line, from a text file.

  Args:
    path: location of the stop-word file; defaults to the project's file on
      Google Drive.

  Returns:
    List of stop words with surrounding whitespace removed; blank lines are
    skipped so the empty string is never registered as a stop word.
  """
  with open(path, "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    return [word for word in stripped_lines if word]

In [16]:
def lemmatizer(doc):
    """Replace every token by its lemma and rebuild a Doc from the result.

    Tokens whose lemma is '-PRON-' (the placeholder pronouns are lemmatized
    to) are left out. The remaining lemmas are joined with single spaces and
    re-tokenised with ``nlp.make_doc``.
    """
    lemmas = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            continue
        lemmas.append(token.lemma_)
    joined = u' '.join(lemmas)
    return nlp.make_doc(joined)

In [17]:
def remove_stopwords(doc):
    """Return the text of every token that is neither a stop word nor punctuation.

    Plain strings (``token.text``) are returned because Gensim needs lists of
    strings rather than spaCy tokens.
    """
    kept = []
    for token in doc:
        if token.is_stop == True or token.is_punct == True:
            continue
        kept.append(token.text)
    return kept

In [18]:
def pass_through_pipeline(documents_from_files):
  """Run every raw document through the shared ``nlp`` pipeline.

  Returns a list with one pipeline result per input document, in input order.
  """
  return [nlp(raw_text) for raw_text in documents_from_files]

In [19]:
def data_preprocessing():
  """Register the custom stop words and the extra pipeline components on ``nlp``.

  Safe to call more than once: the components are only added when the
  pipeline does not already contain them, so re-running the cell no longer
  fails on duplicate component names.
  """
  # My list of stop words
  stop_words = get_stop_words_from_drive()

  # Add my stop words to spaCy's stop words
  nlp.Defaults.stop_words.update(stop_words)

  # Set the "is_stop" flag for every stop word, covering both the imported
  # STOP_WORDS set and the defaults that were just extended.
  for word in set(STOP_WORDS) | set(nlp.Defaults.stop_words):
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

  # The add_pipe function appends our functions to the default pipeline.
  if 'lemmatizer' not in nlp.pipe_names:
    nlp.add_pipe(lemmatizer, name='lemmatizer', after='ner')
  if 'stopwords' not in nlp.pipe_names:
    nlp.add_pipe(remove_stopwords, name="stopwords", last=True)

# Results

In [20]:
# Register the custom stop words and the lemmatizer / stop-word components on
# the shared `nlp` pipeline before any transcript is processed.
data_preprocessing()

In [21]:
# Ask for a video, fetch its transcript, save it, and run it through the
# spaCy pipeline so it can be converted to a bag of words below.
videoURL = input("Enter the Youtube video link: ").strip()
videoId = converYoutubeLinkToId(videoURL)
youtubeStats = YoutubeStats(videoURL, videoId)
# YoutubeStats stores None when no transcript could be fetched; stop here with
# a clear message instead of failing later inside the transcript helpers.
if youtubeStats.getVideoTranscript() is None:
    raise ValueError("No transcript available for video id %r" % videoId)
transcript = youtubeStats.getVideoTranscriptString()
createDirectory('transcripts')
youtubeStats.writeTranscriptFile()
documents = pass_through_pipeline([transcript])

Enter the Youtube video link: https://www.youtube.com/watch?v=yZKhfZ25L4o&t
Creation of the directory /content/transcripts failed
1 Writing to transcript file...


In [22]:
import pickle
# Load three previously saved objects from Google Drive. Judging by the file
# names and how they are used below, these are presumably a gensim dictionary
# (words2), a bag-of-words corpus (corpus2) and a 50-topic LDA model
# (lda2_model2) - TODO confirm against the notebook that produced them.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load pickles that you created yourself.
with open("/content/drive/My Drive/topic_files/words_20000.pickle", 'rb') as file:
  words2 = pickle.load(file)

with open("/content/drive/My Drive/topic_files/corpus_20000.pickle", 'rb') as file:
  corpus2 = pickle.load(file)

with open("/content/drive/My Drive/topic_files/50topics_lda.pickle", 'rb') as file:
  lda2_model2 = pickle.load(file)  

In [23]:

# Creates a dictionary, which is a mapping of word IDs to words, from the new
# transcript. NOTE(review): `words` is not used by the cells shown here.
words = corpora.Dictionary(documents)

# Turns each document into a bag of words, using the loaded dictionary
# (words2) rather than `words`; the same dictionary is passed to pyLDAvis
# together with the loaded model below.
corpus = [words2.doc2bow(doc) for doc in documents]

# Disabled: training a new model instead of using the loaded one.
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,id2word=words2,num_topics=10,)



In [24]:
# Prepare the pyLDAvis view of the loaded LDA model for the transcript's
# bag-of-words corpus, using the loaded dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda2_model2, corpus, words2)

# Last expression of the cell, so the notebook displays it as the cell output.
vis