## Possible sources of data

- Scrape TED Talks

### Scraping TED Talks

We begin with the standard imports:

In [46]:
import glob
import os
import re
import time
import urllib
import urllib.error
import urllib.request
from itertools import chain

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

Function to look at a *single* TED talks listing page and grab all the TED talk links on it:

In [2]:
def get_names(path, alltalks):
    """Collect every talk link found on one TED page.

    Parameters
    ----------
    path : str
        URL of the page to download.
    alltalks : dict
        Accumulator mapping talk href -> 1. It is updated in place, so passing
        the same dict on every call de-duplicates links across pages.

    Returns
    -------
    dict
        The same `alltalks` object that was passed in.

    Raises
    ------
    urllib.error.HTTPError
        Propagated from `urlopen` when the server answers with an error status.
    """
    # Read the body inside a context manager so the connection is closed
    # right away instead of being left to the garbage collector.
    with urllib.request.urlopen(path) as response:
        html = response.read()
    soup = BeautifulSoup(html, "lxml")

    for anchor in soup.find_all("a", class_ = ""):
        # Anchors without an href would raise KeyError via `.attrs['href']`;
        # `.get` returns None for them and they are skipped.
        href = anchor.get("href")
        if href is not None and href.startswith("/talks/"):
            alltalks[href] = 1

    return alltalks

Function to scrape all TED talk links:

In [3]:
# Listing-page URL template; the `{}` placeholder receives the page number.
link = "https://www.ted.com/talks?language=zh-tw&page={}&sort=newest"
# Accumulator of talk hrefs (href -> 1) that get_names fills in place.
alltalks = dict()

In [5]:
def get_talks(alltalks, last_page=66, talks_per_page=36):
    """Crawl the TED listing pages and collect every talk link into `alltalks`.

    Parameters
    ----------
    alltalks : dict
        Accumulator handed to `get_names`, which updates it in place. Passing
        a partially filled dict resumes the crawl: the first page requested is
        ``len(alltalks) // talks_per_page``.
    last_page : int, optional
        Exclusive upper bound on the page number (default 66).
    talks_per_page : int, optional
        Number of talks assumed per listing page when estimating the page to
        resume from (default 36).

    Returns
    -------
    None
        Results are delivered through the mutated `alltalks`.

    Notes
    -----
    On an HTTP error the function waits 30 seconds and retries the page that
    failed. The retry is a loop rather than a recursive call, so a long run of
    errors cannot exhaust the call stack, and pages that were already fetched
    are not downloaded again.
    """
    page = len(alltalks) // talks_per_page
    while page < last_page:
        path = link.format(page)
        try:
            alltalks = get_names(path, alltalks)
        except urllib.error.HTTPError:
            print("TED got mad at you, waiting 30 seconds")
            time.sleep(30)
            continue  # `page` is unchanged, so the same page is retried
        print(path, len(alltalks))
        time.sleep(3)  # be polite between successful requests
        page += 1

# Run the crawl; get_names adds each href to the module-level `alltalks` dict in place.
get_talks(alltalks)

https://www.ted.com/talks?language=zh-tw&page=5&sort=newest 180
https://www.ted.com/talks?language=zh-tw&page=6&sort=newest 216
https://www.ted.com/talks?language=zh-tw&page=7&sort=newest 252
https://www.ted.com/talks?language=zh-tw&page=8&sort=newest 288
https://www.ted.com/talks?language=zh-tw&page=9&sort=newest 324
https://www.ted.com/talks?language=zh-tw&page=10&sort=newest 360
https://www.ted.com/talks?language=zh-tw&page=11&sort=newest 396
https://www.ted.com/talks?language=zh-tw&page=12&sort=newest 432
https://www.ted.com/talks?language=zh-tw&page=13&sort=newest 468
https://www.ted.com/talks?language=zh-tw&page=14&sort=newest 504
https://www.ted.com/talks?language=zh-tw&page=15&sort=newest 540
https://www.ted.com/talks?language=zh-tw&page=16&sort=newest 576
https://www.ted.com/talks?language=zh-tw&page=17&sort=newest 612
https://www.ted.com/talks?language=zh-tw&page=18&sort=newest 648
https://www.ted.com/talks?language=zh-tw&page=19&sort=newest 684
https://www.ted.com/talks?lang

Function to scrape Traditional Chinese and English for a single talk:

In [56]:
def _fetch_soup(url):
    """Download `url` and parse the response body with the lxml parser."""
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response.read(), "lxml")


def _transcript_text(url, lang):
    """Return the transcript paragraphs found at `url` joined into one string."""
    # English keeps a space where a line break was; Traditional Chinese has no
    # inter-word spaces, so line breaks are removed outright.
    newline_substitute = "" if lang == "zh-tw" else " "
    paragraphs = [
        tag.text.strip().replace("\t", "").replace("\n", newline_substitute)
        for tag in _fetch_soup(url).find_all("p", class_= "m-b:0")
    ]
    return " ".join(paragraphs)


def extract_talk(path, talk_name):
    """Scrape the English and Traditional Chinese transcripts of one talk.

    The page at `path` is scanned for ``<link>`` tags whose href contains
    ``?language=en`` or ``?language=zh-tw``. Each such URL is downloaded and
    its ``<p class="m-b:0">`` paragraphs are joined into a single string.

    Parameters
    ----------
    path : str
        URL of the talk's transcript page.
    talk_name : str
        Identifier stored in the ``Talk`` column and used as the output file
        stem.

    Returns
    -------
    None
        The result is written to ``<talk_name>.txt`` in the current working
        directory as a one-row, tab-separated, UTF-8 file with the columns
        ``Talk`` followed by whichever of ``en`` / ``zh-tw`` were found, in
        that order.

    Raises
    ------
    urllib.error.HTTPError
        Propagated from `urlopen` so callers can back off and retry.
    """
    langs = ["en", "zh-tw"]
    transcripts = {}

    for link_tag in _fetch_soup(path).find_all("link"):
        href = link_tag.get("href")
        if href is None:
            continue
        for lang in langs:
            # Only the first link per language is used, so a repeated link
            # does not trigger a second download or a duplicate column.
            if lang not in transcripts and "?language={}".format(lang) in href:
                # The paragraph loop lives in a helper with its own variable:
                # reusing the outer loop name here previously made the next
                # language check raise KeyError and silently end the scan.
                transcripts[lang] = _transcript_text(href, lang)

    row = {"Talk": [talk_name]}
    for lang in langs:
        if lang in transcripts:
            row[lang] = [transcripts[lang]]

    # Build the one-row frame in a single step instead of concatenating
    # per-language frames inside the loop.
    df = pd.DataFrame(row, columns=list(row))
    df.to_csv(talk_name + '.txt', index = False, sep='\t', encoding='utf-8')

One possible edge case (lacking perfectly aligned periods):

In [57]:
# IPython help lookup (`?name`) for urlopen, left over from debugging; it is not part of the scrape.
?urllib.request.urlopen

In [58]:
# Try the scraper on one talk; extract_talk writes "<talk_name>.txt" to the working directory.
extract_talk("https://www.ted.com/talks/shubhendu_sharma_an_engineers_vision_for_tiny_forests_everywhere/transcript", "shubhendu_sharma_an_engineers_vision_for_tiny_forests_everywhere")

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

In [63]:
# Strip the "?language=zh-tw" suffix from every collected href. Iterating
# `alltalks` yields its hrefs whether it is still the dict or already a list.
alltalks = [href.replace("?language=zh-tw", "") for href in alltalks]
# Peek at one entry to check the result.
alltalks[832]

'/talks/steve_ramirez_and_xu_liu_a_mouse_a_laser_beam_a_manipulated_memory'

Function to scrape Traditional Chinese and English for *all* TED talks:

I temporarily stopped at `alltalks[833]`.

In [61]:
def to_csv(alltalks, talknum):
    """Scrape every talk from index `talknum` onward, writing one file per talk.

    Parameters
    ----------
    alltalks : list of str
        Talk hrefs. Each is appended to ``https://www.ted.com`` to build the
        transcript URL, and its first 7 characters are dropped to form the
        output name passed to `extract_talk`.
    talknum : int
        Zero-based index of the first talk to process; pass a later index to
        resume an interrupted run.

    Returns
    -------
    None

    Notes
    -----
    On an HTTP error the function waits 30 seconds and retries the same talk.
    The retry is a loop rather than a recursive call, so a long run of errors
    cannot exhaust the call stack.
    """
    total = len(alltalks)
    while talknum < total:
        href = alltalks[talknum]
        try:
            extract_talk('https://www.ted.com' + href + '/transcript', href[7:])
        except urllib.error.HTTPError:
            print("TED got mad at you, waiting 30 seconds")
            time.sleep(30)
            continue  # `talknum` is unchanged, so the same talk is retried
        time.sleep(3)  # be polite between successful requests
        talknum += 1
        # Scale to 0-100 so the figure matches its "%" label.
        print("On talk number {}, {}% done".format(talknum, round(100 * talknum / total, 2)))

# Strip the "?language=zh-tw" suffix from every href (a no-op for hrefs that
# no longer carry it), then resume scraping at index 830.
alltalks = [href.replace("?language=zh-tw", "") for href in alltalks]
to_csv(alltalks, 830)

URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

## Reading in all txt files

In [94]:
# NOTE(review): this matches "*.csv", while extract_talk writes "<talk_name>.txt" — confirm which extension is intended.
# glob does not guarantee a sorted order, so index 2 may be a different file on another run or machine.
glob.glob("*.csv")[2]

'mehdi_ordikhani_seyedlar_what_happens_in_your_brain_when_you_pay_attention.csv'

In [22]:
# NOTE(review): both directories are absolute and machine-specific; `path` (the Windows one) is assigned but not used in this cell.
path = "C:\\Users\\liblabs-user\\Desktop\\Cal Poly Summer Research 2017"
spath = "/Users/sierra/Desktop/Cal Poly Summer Research 2017"
# Change the process working directory so the relative glob below resolves inside `spath`.
os.chdir(spath)
# Load the first tab-separated, UTF-8 file matched by "*.txt" and display it.
pd.read_csv(glob.glob("*.txt")[0], sep = "\t", encoding = "utf-8")

Unnamed: 0,Talk,en,zh-tw
0,abigail_marsh_why_some_people_are_more_altruis...,"There's a man out there, somewhere, who looks ...",一位男子站在那，長的有點神似演員伊卓瑞斯·艾巴，或者是艾巴20年前的樣子。除了他鋌而走險救了...
