# Extract raw transcript 

https://pypi.org/project/youtube-transcript-api/

In [1]:
from youtube_transcript_api import YouTubeTranscriptApi

In [2]:
# YouTube video ID whose transcript is fetched.
video_id = 'VIANLddo-ec'
# Fetch the transcript for that video; its structure is inspected in the cells below.
transcript_raw = YouTubeTranscriptApi.get_transcript(video_id)

The transcript is stored as a list of dictionaries

In [3]:
# Peek at the first three entries to see what a single record looks like.
transcript_raw[0:3]

[{'text': 'hello if you have found the ideas i',
  'start': 0.399,
  'duration': 3.92},
 {'text': 'discussed', 'start': 3.76, 'duration': 3.28},
 {'text': 'interesting and useful perhaps you might',
  'start': 4.319,
  'duration': 5.2}]

In [4]:
# Check the type of the top-level container.
type(transcript_raw)

list

In [5]:
# Check the type of a single transcript entry.
type(transcript_raw[0])

dict

In [6]:
import sys

# NOTE(review): per the Python documentation, sys.getsizeof() accounts only for
# the object passed in (here the list object itself), not for the dicts and
# strings it references, so this figure understates the transcript's total
# memory footprint.
transcript_raw_size = sys.getsizeof(transcript_raw)
print("Variable size in bytes: ", transcript_raw_size)

Variable size in bytes:  26744


# Extract transcript text

In [7]:
# Collect the 'text' field of every transcript entry, preserving order.
X = transcript_raw
transcript_text = [entry['text'] for entry in X]

In [8]:
# Display the extracted text lines.
transcript_text

['hello if you have found the ideas i',
 'discussed',
 'interesting and useful perhaps you might',
 'consider purchasing my recently released',
 'book beyond order 12 more rules for life',
 'available from penguin random house in',
 'print',
 'or audio format you could use the links',
 'we provide below or',
 'buy through amazon or at your local',
 'bookstore',
 'this new book beyond order provides what',
 'i hope is a productive and',
 'interesting walk through ideas that are',
 'both philosophically and',
 'sometimes spiritually meaningful as well',
 'as being',
 'immediately implementable and practical',
 'beyond order can be read and understood',
 'on its own',
 'but also builds on the concepts that i',
 'developed in my previous books',
 '12 rules for life and before that',
 'maps of meaning thanks for listening',
 'and enjoy the podcast',
 '[Music]',
 "hello everyone i'm pleased to have with",
 'me today dr',
 'marion l toopy who is the editor',
 'of humanprogress.org',
 'a senio

# Segment transcript text 

In [9]:
from itertools import groupby

# Lines equal to this marker act as segment boundaries.
marker = "[Music]"

# groupby() batches consecutive lines by whether they equal the marker;
# keep only the non-marker runs, each materialised as its own list.
text_split = [
    list(lines)
    for is_marker, lines in groupby(transcript_text, key=lambda line: line == marker)
    if not is_marker
]

In [10]:
# Show the second segment produced by the split.
text_split[1]

["hello everyone i'm pleased to have with",
 'me today dr',
 'marion l toopy who is the editor',
 'of humanprogress.org',
 'a senior fellow at the center for global',
 'liberty and prosperity',
 'and co-author of the simon project',
 'he specializes in globalization and the',
 'study of global well-being',
 'as well as the politics and economics of',
 'europe and southern africa',
 'his work has been published or featured',
 'in major print and non-print media',
 'outlets',
 'all throughout the english-speaking',
 'world dr toopy received his ba',
 'in international relations and classics',
 'from the university of the wits waters',
 'rand in johannesburg',
 'south africa and his phd in',
 'international relations from the',
 'university of saint andrews',
 'in great britain he is the co-author of',
 'a recent book',
 '10 global trends that every smart person',
 'needs to know',
 'and many other trends you will find',
 'interesting',
 "it's a beautiful book uh and so that's",
 'an acco

SOLUTION TAKEN FROM https://stackoverflow.com/a/14529615/11033215

# Merge split sentences

In [11]:
# Lines are joined with a single space; start with the first segment.
separator = " "
text_segment_list = text_split[0]

In [12]:
# Merge the segment's lines into one continuous string and display it.
text_segment = separator.join(text_segment_list)
text_segment

'hello if you have found the ideas i discussed interesting and useful perhaps you might consider purchasing my recently released book beyond order 12 more rules for life available from penguin random house in print or audio format you could use the links we provide below or buy through amazon or at your local bookstore this new book beyond order provides what i hope is a productive and interesting walk through ideas that are both philosophically and sometimes spiritually meaningful as well as being immediately implementable and practical beyond order can be read and understood on its own but also builds on the concepts that i developed in my previous books 12 rules for life and before that maps of meaning thanks for listening and enjoy the podcast'

# Save into a `.docx` file

```bash
pip install python-docx
```

In [13]:
from docx import Document

In [32]:
# Create a new Word document and add a level-1 heading carrying the episode title.
document = Document()
title = "Ten Global Trends Every Smart Person Should Know\n"
document.add_heading(title, level=1)

<docx.text.paragraph.Paragraph at 0x7fed0a00fa90>

In [33]:
# Write every transcript segment to the document as its own paragraph.
# Each segment is a list of short caption lines; they are joined with single
# spaces so the paragraph reads as continuous text.
separator = " "
for text_segment_list in text_split:
    text_segment = separator.join(text_segment_list)
    # No trailing newline is appended: each add_paragraph() call already
    # starts a separate paragraph in the document.
    document.add_paragraph(text_segment)

In [34]:
# Save the assembled document; the path is relative to the current working directory.
doc_filename = 'jbp_podcast.docx'
document.save(doc_filename)

SOLUTION TAKEN FROM https://stackoverflow.com/a/60840125/11033215

# Speaker labelling

## Download audio file using URL

In [35]:
from __future__ import unicode_literals
import youtube_dl

# Download options: take the best available audio stream and have the
# FFmpegExtractAudio post-processor convert it to MP3 (quality setting '192').
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
}

# Build the URL from the `video_id` defined in the transcript cell so the audio
# always belongs to the same video as the transcript, and fetch it over HTTPS.
url = 'https://www.youtube.com/watch?v=' + video_id

# The context manager releases the downloader's resources once the download
# (and post-processing) has finished.
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([url])

[youtube] VIANLddo-ec: Downloading webpage
[download] Destination: Ten Global Trends Every Smart Person Should Know _ Marian Tupy - Jordan B Peterson Podcast S4 E18-VIANLddo-ec.m4a
[download] 100% of 102.20MiB in 00:1800MiB/s ETA 00:00757
[ffmpeg] Correcting container in "Ten Global Trends Every Smart Person Should Know _ Marian Tupy - Jordan B Peterson Podcast S4 E18-VIANLddo-ec.m4a"
[ffmpeg] Destination: Ten Global Trends Every Smart Person Should Know _ Marian Tupy - Jordan B Peterson Podcast S4 E18-VIANLddo-ec.mp3
Deleting original file Ten Global Trends Every Smart Person Should Know _ Marian Tupy - Jordan B Peterson Podcast S4 E18-VIANLddo-ec.m4a (pass -k to keep)


SOURCE https://stackoverflow.com/a/27481870/11033215 

In [None]:
import IPython
# Name of the MP3 file reported as the ffmpeg destination in the download log above.
filename = "Ten Global Trends Every Smart Person Should Know _ Marian Tupy - Jordan B Peterson Podcast S4 E18-VIANLddo-ec.mp3"
# Embed an audio player for the file in the notebook output.
IPython.display.Audio(filename)

## Classify speakers

In [41]:
# Send the downloaded audio to Google Cloud Speech-to-Text with speaker
# diarization enabled and print the speaker tag assigned to each word.
from google.cloud import speech_v1p1beta1 as speech

client = speech.SpeechClient()

# Reuses the audio file name set in the playback cell.
speech_file = filename

# Read the whole audio file into memory as raw bytes.
with open(speech_file, "rb") as audio_file:
    content = audio_file.read()

audio = speech.RecognitionAudio(content=content)

# NOTE(review): `filename` ends in .mp3, yet the config below declares LINEAR16
# at 8000 Hz -- confirm that the encoding and sample rate match the actual audio.
# Diarization is requested with an expected speaker count of two.
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=8000,
    language_code="en-US",
    enable_speaker_diarization=True,
    diarization_speaker_count=2,
)

print("Waiting for operation to complete...")
# NOTE(review): this is the synchronous recognize() call with the audio sent
# inline; the input here is a full podcast episode -- verify against the API's
# size/duration limits whether long_running_recognize is required instead.
response = client.recognize(config=config, audio=audio)

# The transcript within each result is separate and sequential per result.
# However, the words list within an alternative includes all the words
# from all the results thus far. Thus, to get all the words with speaker
# tags, you only have to take the words list from the last result:
result = response.results[-1]

# Word-level details of the first alternative of that last result.
words_info = result.alternatives[0].words

# Printing out the output:
for word_info in words_info:
    print(
        u"word: '{}', speaker_tag: {}".format(word_info.word, word_info.speaker_tag)
    )

ModuleNotFoundError: No module named 'google'

SOURCE https://cloud.google.com/speech-to-text/docs/multiple-voices#speech_transcribe_diarization_beta-python

# Upload document to cloud storage

Install `rclone` to upload document to cloud

https://rclone.org/

ModuleNotFoundError: No module named 'rclone'

# TO-DOs

* Find out how to install `google.cloud` (the Speech-to-Text client imported above is published on PyPI as `google-cloud-speech`)

https://cloud.google.com/sdk/docs/install#deb 

https://github.com/googleapis/google-cloud-python 

# Appendix

Another method to segment transcript text

In [28]:
def split_at_markers(mylist, markers):
    """Yield consecutive slices of ``mylist`` cut at every element found in ``markers``.

    Each slice runs from the previous marker position (or the start of the
    sequence) up to and including the next marker position (or the end of the
    sequence). A marker therefore appears as the last element of one slice and
    again as the first element of the following slice. When no element is a
    marker, the whole sequence is yielded once.

    NOTE(review): unlike the ``groupby`` approach used earlier in the notebook,
    markers are kept (and duplicated across neighbouring slices) -- confirm
    this overlap is intended.

    Args:
        mylist: Sequence to split; must support ``len()`` and slicing.
        markers: Container tested with ``in`` to decide whether an element is
            a marker.

    Yields:
        Slices of ``mylist``, in order.
    """
    total = len(mylist)

    # Positions of all marker elements, in ascending order.
    cut_points = []
    for position, item in enumerate(mylist):
        if item in markers:
            cut_points.append(position)

    begin = 0
    for cut in cut_points:
        # Include the marker itself at the end of this slice...
        yield mylist[begin:cut + 1]
        # ...and start the next slice on that same marker.
        begin = cut

    # Trailing slice from the last marker (or the start) to the end.
    yield mylist[begin:total + 1]

## Text speaker classification

Potentially useful links:

* https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781787125193/9/ch09lvl1sec61/identifying-speakers-with-voice-recognition 

* https://towardsdatascience.com/how-to-build-a-neural-network-for-voice-classification-5e2810fe1efa 

* https://towardsdatascience.com/voice-classification-with-neural-networks-ff90f94358ec