In [2]:
from pydub import AudioSegment # https://github.com/jiaaro/pydub
import gc
import speech_recognition as sr
from pydub.silence import split_on_silence
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import time

## 1. Preprocessing video and audio

### 1.1 convert video to audio (wav) files

#### 1.1.1 load video files

In [12]:
lec_3_vid = AudioSegment.from_file("data/video/20200120-102822-008.mp4")
lec_5_vid = AudioSegment.from_file("data/video/20200127-102817-008.mp4")

In [3]:
os.listdir("data/video")

['20200120-102822-008.mp4', '20200127-102817-008.mp4']

In [10]:
videos_dict = {f"vid_{i+1}" : os.listdir("data/video")[i] for i in range(len(os.listdir("data/video")))}

In [11]:
videos_dict

{'vid_1': '20200120-102822-008.mp4', 'vid_2': '20200127-102817-008.mp4'}

#### 1.1.2 Convert video to audio

In [13]:
lec_3_vid.export("data/audio/vid_1.wav", format="wav")
lec_5_vid.export("data/audio/vid_2.wav", format="wav")

<_io.BufferedRandom name='data/audio/vid_2.wav'>

In [14]:
del lec_3_vid, lec_5_vid
gc.collect();

### 1.2 Convert audio to text [1] [2]

#### 1.2.1 Chunk audio files

In [48]:
def audio_chunker(audio_files_dir, op_dir, chunk_size=5000):
    """
    Takes in list of audio files
    directories and prepares 
    audio segment to process into
    audio chunks. Output of chunk
    will be under subdirectory of
    video title under op_dir/vid_n
    where n is count of video.
    
    Parameters:
    ----------
    audio_files_dir : list
        list of audio files to
        create chunks of.
    
    op_dir : str
        output directory to 
        store chunks.
    
    chunk_size : int
        size of audio chunks in ms,
        default 5000 as Google API
        supports 5 seconds.
    
    Returns:
    --------
    None
    
    Example:
    --------
    >>> audio_files = ["data/audio/vid_1.wav", "data/audio/vid_2.wav"]
    >>> audio_chunker(audio_files, "data/audio_chunks/")
    """
    audio_segs = [AudioSegment.from_wav(audio_files[0]),
                  AudioSegment.from_wav(audio_files[1])]
    
    for count, audio_seg in enumerate(audio_segs):
        
        print(f"Prcoessing audio file {count+1}/{len(audio_segs)}...")
        len_audio = len(audio_seg) # length of audio segment
        n_chunks = len_audio//chunk_size # calculate chink size
        
        storage_dir = f"{op_dir}vid_{count+1}/"
        os.mkdir(storage_dir) # create storage dir
        
        # create chunks and export to directory
        for i in range(n_chunks):
            
            if i == 0:
                start = 0
                end = chunk_size
            else:
                start = end
                end = start + chunk_size

            chunk = audio_seg[start:end]
            
            chunk.export(f"{storage_dir}chunk_{i}.wav", format="wav")
    
    print("Processing complete!")

In [44]:
audio_files = ["data/audio/vid_1.wav", "data/audio/vid_2.wav"]
audio_chunker(audio_files, "data/audio_chunks/")

Prcoessing audio file 1/2...
Prcoessing audio file 2/2...
Processing complete!


#### 1.2.2 Speech recognition and saving to text file

In [18]:
import speech_recognition as sr

In [73]:
def audio_chunk_to_text(audio_chunks_dir, op_dir, max_text_len=10000):
    """
    Takes in audio chunks directory
    and performs speech recognition
    on each chunk and stores text
    to vid_n.txt where n is the 
    count of the video.
    
    Parameters:
    ------------
    audio_chunks_dir : str
        parent directory where audio 
        of where audio chunks for 
        video_n's are located.
    
    op_dir : str
        text storage directory,
        will be in the form video_n.txt,
        where n is the count of video.
        
    max_text_len : 
        will stop the speech recognition
        after if exceeds max_character length.
        
    Returns:
    ---------
    None
    """
    r = sr.Recognizer()
    audio_chunk_videos = os.listdir(audio_chunks_dir)
    for count, video in enumerate(audio_chunk_videos):
        audio_chunks = os.listdir(audio_chunks_dir + video)
        print(f"Processing video: {count+1}/{len(audio_chunk_videos)}")
        print("---"*12)
        output_text = ""
        step_counter = 0
        
        for i in range(len(audio_chunk_files)):
            if i%5 == 0:
                print(f"Step {step_counter+1}")
                print(f"Char length: {len(output_text)}")
                print(f"Processing chunk: {i+1}/{len(audio_chunks)}...")
                step_counter += 1
            
            aud_file = f"{audio_chunks_dir}{video}/chunk_{i}.wav"
            with sr.AudioFile(aud_file) as source:
                r.adjust_for_ambient_noise(source)
                audio = r.listen(source)
            try:
                text = r.recognize_google(audio)
                output_text += text + " "

            except Exception as e:
                pass

            if len(output_text) > max_text_len:
                break

            time.sleep(0.5)
            
        text_file = open(f"{op_dir}transcribed_{video}.txt", "w+")
        text_file.write(output_text)
        text_file.close()
        print(f"Process complete! {i+1}/{len(audio_chunks)} converted.")

In [74]:
audio_chunk_to_text("data/audio_chunks/", "data/text/")

Processing video: 1/2
------------------------------------
Step 1
Char length: 0
Processing chunk: 1/1018...
Step 2
Char length: 195
Processing chunk: 6/1018...
Step 3
Char length: 351
Processing chunk: 11/1018...
Step 4
Char length: 486
Processing chunk: 16/1018...
Step 5
Char length: 641
Processing chunk: 21/1018...
Step 6
Char length: 753
Processing chunk: 26/1018...
Step 7
Char length: 887
Processing chunk: 31/1018...
Step 8
Char length: 1052
Processing chunk: 36/1018...
Step 9
Char length: 1168
Processing chunk: 41/1018...
Step 10
Char length: 1351
Processing chunk: 46/1018...
Step 11
Char length: 1483
Processing chunk: 51/1018...
Step 12
Char length: 1683
Processing chunk: 56/1018...
Step 13
Char length: 1841
Processing chunk: 61/1018...
Step 14
Char length: 2023
Processing chunk: 66/1018...
Step 15
Char length: 2182
Processing chunk: 71/1018...
Step 16
Char length: 2381
Processing chunk: 76/1018...
Step 17
Char length: 2596
Processing chunk: 81/1018...
Step 18
Char length: 2750


## 2. Performing text similarity matching

### 2.1 Preprocess lecture notebooks

#### 2.1.1 Convert ipynb files to html for easier parsing

In [21]:
import glob

In [54]:
glob.glob("data/text/lec*")[0]

'data/text\\lecture3_floating-point.html'

In [20]:
_ = [os.system(f"jupyter nbconvert --to html {i}") for i in glob.glob("data/text/lec*")]

#### 2.1.2 Read in text using beautiful soup

In [22]:
from bs4 import BeautifulSoup

In [51]:
html_files = glob.glob("data/text/*.html")

In [75]:
def ipynb_html_text_parser(html_files_list):
    """
    Parses text from jupyter notebook,
    HTML files.
    
    Parameters:
    -----------
    html_files : list
        list of relative paths
        to the html files.
    
    Returns:
    --------
    text_dict : dictionary
        A dictionary of text
        where keys are lecture
        names and values are 
        respective texts.
    
    Example:
    ---------
    html_files = glob.glob("data/text/*.html")
    lecture_texts = ipynb_text_parser(html_files)
    """
    lec_list = []
    joined_text_list = []
    for file in html_files_list:
        # read in file using beautiful soup
        soup = BeautifulSoup(open(file), "html.parser")
        
        # get all the text and join
        all_text = soup.find_all("div", {"class" : "text_cell_render border-box-sizing rendered_html"})
        joined_text = " ".join([txt.text for txt in all_text])
        
        # append lecture name and joined
        # text to respective lists.
        lecture_name = file.split("\\")[1].split('.')[0]
        lec_list.append(lecture_name)
        joined_text_list.append(joined_text)
        
    
    text_dict = {"lec_name" : lec_list, "lec_text" : joined_text_list}
    
    
    return text_dict

In [76]:
lecture_texts = ipynb_html_text_parser(html_files)

#### 2.2 Preprocess text

#### 2.2.1 Preprocess lecture notes text

In [81]:
import re
import pandas

In [94]:
def string_scrub(string_input):
    string_input = str(string_input)
    string_input = re.sub(r'^https?:\/\/.*[\r\n]*', '', string_input, flags=re.MULTILINE)
    string_input = string_input.lower()
    string_input = string_input.replace('¶', ' ')
    string_input = string_input.replace('&', 'and')
    string_input = string_input.replace('/', 'or')
    string_input = string_input.replace('\t', ' ')
    string_input = string_input.replace('\n', ' ')
    string_input = string_input.translate(str.maketrans(' ', ' ', string.punctuation))

    
    return string_input

In [95]:
df_lec_notes = pd.DataFrame(lecture_texts)

In [96]:
df_lec_notes["lec_text"] = df_lec_notes["lec_text"].apply(lambda x: string_scrub(x))[0]

In [97]:
df_lec_notes

Unnamed: 0,lec_name,lec_text
0,lecture3_floating-point,dsci 572 lecture 3 how to survive in a world ...
1,lecture5_neural-networks,dsci 572 lecture 3 how to survive in a world ...


#### 2.2.2 Preprocess transcribed lecture notes

In [100]:
transcribed_notes = glob.glob("data/text/transc*")

In [101]:
transcribed_notes

['data/text\\transcribed_vid_1.txt', 'data/text\\transcribed_vid_2.txt']

In [108]:
transcribed_texts = []
vid_names = []
for notes in transcribed_notes:
    with open(notes, "r") as f:
        transcribed_text = f.read()
    vid_name = notes.split("\\")[1].split(".")[0].split("ed_")[1]
    transcribed_texts.append(transcribed_text)
    vid_names.append(vid_name)
transcribed_notes_dict = {"vid_name" : vid_names, "transc_text": transcribed_texts}

In [109]:
df_transcr_notes = pd.DataFrame(transcribed_notes_dict)
df_transcr_notes

Unnamed: 0,vid_name,transc_text
0,vid_1,let's get started are you good at the So today...
1,vid_2,all right let's go starting neural networks to...


## 2.2 Performing text similarity

In [None]:
p

In [70]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

## References
- [1] https://codeloop.org/python-how-to-convert-recorded-audio-to-text/
- [2] https://www.geeksforgeeks.org/audio-processing-using-pydub-and-google-speechrecognition-api/