In [None]:
!pip install pandas BeautifulSoup4 selenium elasticsearch
!apt-get update 
!apt install chromium-chromedriver

Collecting selenium
  Downloading selenium-4.1.3-py3-none-any.whl (968 kB)
[K     |████████████████████████████████| 968 kB 5.1 MB/s 
Collecting urllib3[secure,socks]~=1.26
  Downloading urllib3-1.26.9-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 54.2 MB/s 
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting trio~=0.17
  Downloading trio-0.20.0-py3-none-any.whl (359 kB)
[K     |████████████████████████████████| 359 kB 61.0 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting outcome
  Downloading outcome-1.1.0-py2.py3-none-any.whl (9.7 kB)
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.1.0-py3-none-any.whl (24 kB)
Collecting pyOpenSSL>=0.14
  Downloading pyOpenSSL-22.0.0-py2.py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.7 MB/s 
[?

In [None]:
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed
from bs4 import BeautifulSoup as bfs
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import InvalidArgumentException, WebDriverException

In [None]:
# Load the talks table that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer="data.csv")

In [None]:
# Preview the loaded frame (pandas shows a truncated view for long frames).
df

Unnamed: 0,title,author,date,views,likes,link,transcript
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...,Yuchi F’as English Good afternoon I come from ...
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,https://ted.com/talks/sydney_iaukea_the_dark_h...,It was January 16th 1895 Two men arrived at Li...
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,https://ted.com/talks/martin_reeves_how_play_c...,Have you ever trodden on a Lego brick in your ...
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...,Imagine a world in which China was an environm...
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,https://ted.com/talks/mahendra_singhi_cement_s...,
...,...,...,...,...,...,...,...
5435,The best stats you've ever seen,Hans Rosling,February 2006,15000000,458000,https://ted.com/talks/hans_rosling_the_best_st...,About 10 years ago I took on the task to teach...
5436,Do schools kill creativity?,Sir Ken Robinson,February 2006,72000000,2100000,https://ted.com/talks/sir_ken_robinson_do_scho...,Good morning How are you Audience Good Its bee...
5437,Greening the ghetto,Majora Carter,February 2006,2900000,88000,https://ted.com/talks/majora_carter_greening_t...,If youre here today and Im very happy that you...
5438,Simplicity sells,David Pogue,February 2006,2000000,60000,https://ted.com/talks/david_pogue_simplicity_s...,Music The Sound of Silence Simon amp Garfunkel...


In [None]:
# Headless Chrome configuration shared by every scraping call
browser_options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "--lang=en-US",   # making sure we get english transcript
):
    browser_options.add_argument(chrome_flag)

# the chromedriver binary needs to be executable
browser_service = Service('chromedriver')

In [None]:
def get_transcript(index, url):
    """Fetch the transcript text of one talk with a headless browser.

    Args:
        index: Row label of the talk; passed back unchanged so the caller can
            match the result to its row.
        url: Link to the talk page; '/transcript' is appended to it.

    Returns:
        A tuple ``(index, transcript)``. ``transcript`` is the space-joined text
        of all transcript spans, or ``None`` if the browser rejected the URL or
        no transcript spans were found in the page.

    Raises:
        WebDriverException: any driver error other than
            ``InvalidArgumentException`` propagates to the caller.
    """
    url += '/transcript'
    browser = webdriver.Chrome(options=browser_options, service=browser_service)

    try:
        browser.get(url)
    except InvalidArgumentException:
        return index, None    # return None if request error
    else:
        html = browser.page_source
    finally:
        # quit() ends the whole driver session; close() would only close the
        # window. Running it in `finally` also cleans up when an unexpected
        # driver error propagates.
        browser.quit()

    bfs_html = bfs(html, features="html.parser")
    # NOTE(review): this class string (including 'css-82uonn') looks generated by
    # the site and may change — confirm it still matches the transcript markup.
    transcript_bfs_list = bfs_html.find_all(
        'span',
        attrs={'class': 'cursor-pointer inline hover:bg-red-300 css-82uonn'},
    )

    if not transcript_bfs_list:
        return index, None    # return None if transcript was not found

    return index, ' '.join(span.text.strip() for span in transcript_bfs_list)

In [None]:
# Scrape every missing transcript in parallel; each task drives its own browser.
with ProcessPoolExecutor(max_workers=5) as executor:
    # getting rows that have transcript field set to Null
    missing_rows = df.loc[df['transcript'].isnull()]

    # Map each future back to its row index so a failed task can be reported by row.
    pending = {
        executor.submit(get_transcript, index, row['link']): index
        for index, row in missing_rows.iterrows()
    }

    for future in as_completed(pending):
        try:
            index, transcript = future.result()
        except WebDriverException as error:
            # Best effort: a chromedriver failure must not abort the run, but it
            # is reported instead of being dropped silently.
            print(f'Failed: {pending[future]},\t {type(error).__name__}')
            continue

        df.loc[index, 'transcript'] = transcript
        if transcript is not None:
            remaining = df['transcript'].isnull().sum()
            print(f'Completed: {index},\t Remaining: {remaining}')

In [None]:
# Display the frame again after the scraping cell has written fetched transcripts into it.
df

Unnamed: 0,title,author,date,views,likes,link,transcript
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...,Yuchi F’as English Good afternoon I come from ...
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,https://ted.com/talks/sydney_iaukea_the_dark_h...,It was January 16th 1895 Two men arrived at Li...
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,https://ted.com/talks/martin_reeves_how_play_c...,Have you ever trodden on a Lego brick in your ...
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...,Imagine a world in which China was an environm...
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,https://ted.com/talks/mahendra_singhi_cement_s...,
...,...,...,...,...,...,...,...
5435,The best stats you've ever seen,Hans Rosling,February 2006,15000000,458000,https://ted.com/talks/hans_rosling_the_best_st...,About 10 years ago I took on the task to teach...
5436,Do schools kill creativity?,Sir Ken Robinson,February 2006,72000000,2100000,https://ted.com/talks/sir_ken_robinson_do_scho...,Good morning How are you Audience Good Its bee...
5437,Greening the ghetto,Majora Carter,February 2006,2900000,88000,https://ted.com/talks/majora_carter_greening_t...,If youre here today and Im very happy that you...
5438,Simplicity sells,David Pogue,February 2006,2000000,60000,https://ted.com/talks/david_pogue_simplicity_s...,Music The Sound of Silence Simon amp Garfunkel...


In [None]:
# Write the frame to a new CSV file, leaving out the row index.
df.to_csv(path_or_buf="data_with_transcript.csv", index=False)