##### What follows is a bunch of ways to get text from the web that you can write out to files. It includes guides for pulling some basic text from various places on the web. The main ones:

1. Corpora from NLTK
    a. Brown corpus
    b. Gutenberg corpus
2. Social Media
    a. Tweets from a specific user
    b. Transcripts from Youtube
3. Webpages

# 0. Universal Functions - Run First

In [None]:
import re

# i made this specifically for the original text file I was using, but it can catch a lot of weird issues with spaces
# note that it runs at the sentence level, rather than at the level of a full text
def punctuation_fix(sentence):
    """Tidy the spacing around punctuation in a space-joined sentence.

    Meant for text built by joining tokens with spaces (e.g. "I do n't know ,
    but ..."). It removes the space before , . ? ! ; : and ")" and after "(",
    keeps ": ..." readable, collapses runs of spaces, deletes newlines,
    rejoins split contractions ("I 'm" -> "I'm", "do n't" -> "don't"), adds a
    space after a ? or ! that is glued to a capital letter, and strips one
    trailing space.

    Args:
        sentence: the text to clean (a str; works best on a single sentence).

    Returns:
        The cleaned string. An empty string is returned unchanged.
    """
    # Plain (non-regex) substitutions; the order matters. " ." -> "." already
    # turns " ..." into "...", and ":..." -> ": ..." then restores the space
    # after a colon.
    for old, new in (
        (" ,", ","),
        (" .", "."),
        (":...", ": ..."),
        (" ?", "?"),
        (" !", "!"),
        (" ;", ";"),
        (" :", ":"),
    ):
        sentence = sentence.replace(old, new)

    # collapse any run of spaces to one (chained replaces missed runs of 3+)
    sentence = re.sub(r" {2,}", " ", sentence)

    for old, new in (("\n", ""), (" )", ")"), ("( ", "("), (";;", ";"), ("::", ":")):
        sentence = sentence.replace(old, new)

    # fix apostrophe errors for contractions, e.g. "I 'm" or "do n't":
    # first drop the space before 'x, then the space before x'x
    sentence = re.sub(r" (?='[a-z])", "", sentence)
    sentence = re.sub(r" (?=[a-z]'[a-z])", "", sentence)

    # add the missing space after a ? or ! that runs straight into a capital
    # letter, e.g. "Hi!It's" or "Yes?This"; only those spots are touched, so
    # other ? and ! marks in the sentence are left alone
    sentence = re.sub(r"([?!])(?=[A-Z])", r"\1 ", sentence)

    # strip a single trailing space (safe on an empty string)
    if sentence.endswith(" "):
        sentence = sentence[:-1]
    return sentence

# writes out a file given a name; 
# NOTE! if you want to extend a file rather than make it from scratch, be sure to include
# overwrite=False as a third input when you call the function; otherwise it'll write over your file
def file_writer(filename, fulltext, overwrite=True):
    """Write a string out to a UTF-8 text file.

    Args:
        filename: path of the file to write.
        fulltext: the string to write.
        overwrite: if truthy (the default), replace the file's contents with
            fulltext; if falsy, append fulltext plus a newline to the end of
            the file instead.
    """
    # Decide by truthiness so that every value of `overwrite` results in a
    # write (comparing against True/False skipped the write for anything else).
    if overwrite:
        mode, payload = 'w', fulltext
    else:
        mode, payload = 'a', fulltext + "\n"
    with open(filename, mode, encoding="utf-8") as f:
        f.write(payload)

# 0. File Output - Run Last
### Set up the various filenames and string names you want to use from the rest of the notebook in the first block, then run the second block to write out the files.

In [None]:
# folder where the output text files get saved
output_directory = "data/"

# one tuple per text file to write out, in the form (filename, string_name);
# add False (no quotes) as a third item to extend the file rather than overwrite it
output_files = [
    ("scifi.txt", scifi_fulltext),
    ("mystery.txt", mystery_fulltext),
    ("emma.txt", emma_fulltext, False),
    ("gigi_tweets.txt", gigi_tweets_fulltext),
    ("gigi_transcripts.txt", gigi_transcripts_fulltext),
    ("pathologic_script.txt", pathologic_script),
    ("pathologic_dia.txt", pathologic_dialogue),
]

In [None]:
import os

# make sure the output folder exists before trying to write into it
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

for output in output_files:
    # each entry is (filename, text) or (filename, text, overwrite); anything
    # else is reported instead of being skipped without a word
    if len(output) not in (2, 3):
        raise ValueError(
            "each output_files entry needs 2 or 3 items, got: " + repr(output[:1])
        )
    filename, fulltext, *overwrite_option = output
    # os.path.join copes with output_directory with or without a trailing slash
    file_writer(os.path.join(output_directory, filename), fulltext, *overwrite_option)

# 1. NLTK - Brown & Gutenberg
### Run next block first

In [None]:
# if needed, install NLTK first by running this in its own cell:
#     !pip install nltk

# run this; it will import the brown and gutenberg corpora tools, and build a dictionary we'll use for the brown corpus
import nltk
from nltk.corpus import brown
from nltk.corpus import gutenberg

# Download whichever corpus data is missing. The check is done explicitly
# (after nltk itself is imported) because both corpora are used below, and a
# missing corpus raises LookupError from nltk.data.find.
for corpus_name in ("brown", "gutenberg"):
    try:
        nltk.data.find("corpora/" + corpus_name)
    except LookupError:
        nltk.download(corpus_name)
    
brown_bycat = {}
gutenberg_byid = {}

# Brown: one long string per category, made of every sentence in that
# category (cleaned with punctuation_fix), each followed by a space
for category in brown.categories():
    cleaned_sentences = (
        punctuation_fix(" ".join(tokens)) + " "
        for tokens in brown.sents(categories=[category])
    )
    brown_bycat[category] = "".join(cleaned_sentences)

# Gutenberg: one string per file id, built from every word after the first
# "]" token in the file, then cleaned with punctuation_fix
for fileid in gutenberg.fileids():
    tokens = gutenberg.words(fileid)
    header_end = tokens.index("]")
    body_text = " ".join(tokens[header_end + 1:])
    gutenberg_byid[fileid] = punctuation_fix(body_text)

## 1.a Brown
brown_bycat is a dictionary we made above, and it maps a category name to a string.
The string will be the combined strings of every text in that category, so it'll be pretty long!

Categories to choose from: adventure, belles_lettres, editorial, fiction, government, hobbies, humor, learned, lore, mystery, news, religion, reviews, romance, science_fiction

In [None]:
# look up the combined text of two Brown categories in brown_bycat;
# swap in any other category name to get a different text
scifi_fulltext = brown_bycat["science_fiction"]
mystery_fulltext = brown_bycat["mystery"]

## 1.b Gutenberg
gutenberg_byid is a dictionary we made above, and it maps a file id to a string. The strings are full books, so be aware of length here too.

File ids to choose from: austen-emma.txt, austen-persuasion.txt, austen-sense.txt, bible-kjv.txt, blake-poems.txt, bryant-stories.txt, burgess-busterbrown.txt, carroll-alice.txt, chesterton-ball.txt, chesterton-brown.txt, chesterton-thursday.txt, edgeworth-parents.txt, melville-moby_dick.txt, milton-paradise.txt, shakespeare-caesar.txt, shakespeare-hamlet.txt, shakespeare-macbeth.txt, whitman-leaves.txt

In [None]:
# look up the text stored under the "austen-emma.txt" file id in gutenberg_byid
emma_fulltext = gutenberg_byid["austen-emma.txt"]

# 2. Social Media - Twitter & YouTube Transcripts

## 2.a Twitter
### Run next two blocks first, but note!!
#### (Note filepath that you'll probably need to update in the first block.)
I have this block set up to pull your Twitter access/authorization info from a file, which for me is located in my "data" folder. You might need to change this, depending on the location and name of your text file. If you don't know what those are or how to get them, this is a good resource:

    https://www.earthdatascience.org/courses/use-data-open-source-python/intro-to-apis/twitter-data-in-python/

(Basically, to use Twitter's API, you need to have a certain kind of account and get approval from them by filling out a form. If they approve you, which will (in my experience) happen quickly if you're filling it out as an academic, you'll get the info that is being pulled from the twitter_auth.txt file.)

Similarly, the file needs to be in this format:

    consumer_key='key'
    consumer_secret='key'
    access_token='key'
    access_token_secret='key'


Replace the word key in each instance with the correct key, which will be a string of digits, upper-case letters, and lower-case letters. Don't delete the quotation marks or anything else.

In [None]:
# if needed, install Tweepy first by running this in its own cell:
#     !pip install tweepy
import tweepy as tw

twitter_auth = "data/twitter_auth.txt"
with open(twitter_auth, 'r', encoding="utf-8") as f:
    # skip blank lines so a stray empty line can't shift which key is which
    keys = [line for line in f if line.strip()]

if len(keys) < 4:
    raise ValueError(
        twitter_auth + " needs four lines (consumer_key, consumer_secret, "
        "access_token, access_token_secret), but only "
        + str(len(keys)) + " non-blank lines were found"
    )

# grabs whatever sits between the single quotes on a line
pattern = r"(?<=')[-\w]*(?=')"
credentials = []
for position, line in enumerate(keys[:4], start=1):
    found = re.search(pattern, line)
    if found is None:
        # name the line by position only, so the key itself never gets printed
        raise ValueError(
            "non-blank line " + str(position) + " of " + twitter_auth
            + " has no key wrapped in single quotes"
        )
    credentials.append(found.group(0))

# the keys are expected in this order, one per line
consumer_key, consumer_secret, access_token, access_token_secret = credentials

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

In [None]:
# turns a username and number of tweets to fetch into a string, with tweets separated by spaces
# if you want each tweet on its own line, pass separator="\n" as a third input
def user_tweets(username, limit, separator=" "):
    """Fetch a user's recent tweets and return them as one cleaned-up string.

    URLs and @mentions are removed from each tweet, whitespace (including
    line breaks inside a tweet) is collapsed to single spaces, and each tweet
    is run through punctuation_fix before the tweets are joined.

    Args:
        username: the screen name of the account to fetch tweets from.
        limit: how many tweets to request (passed to the API as `count`).
        separator: the string placed between tweets in the result.

    Returns:
        The cleaned tweets joined by `separator`. Tweets that are empty after
        cleaning are left out.
    """
    # NOTE(review): a single user_timeline call may return fewer tweets than
    # `limit` (retweets are excluded here) - confirm against the Tweepy docs.
    tweets = api.user_timeline(
        screen_name=username,  # use the argument, not a notebook-level variable
        count=limit,
        include_rts=False,
        tweet_mode='extended',
    )

    url_rgx = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    # underscores are included so that handles like @some_user are removed whole
    at_rgx = r'@[a-zA-Z0-9_]*'

    tweets_f = list()
    for tweet in tweets:
        tweet_txt = tweet.full_text
        for rgx in (url_rgx, at_rgx):
            tweet_txt = re.sub(rgx, " ", tweet_txt)
        # turn line breaks and runs of whitespace into single spaces BEFORE
        # punctuation_fix runs, since punctuation_fix deletes newlines outright
        # and would glue the neighbouring words together
        tweet_txt = re.sub(r"\s+", " ", tweet_txt).strip()
        tweet_txt = punctuation_fix(tweet_txt)
        if tweet_txt:
            tweets_f.append(tweet_txt)
    return separator.join(tweets_f)

### Enter a username and number of tweets to fetch (max: 900). The result will be a string, so you can add it directly to the File Output section's list toward the top of this notebook. (Plus, if you didn't before, now you know who Gigi Gorgeous is.) You can call the fulltext variable whatever you want.

In [None]:
# the account to pull tweets from, and how many tweets to ask for
user = "TheGigiGorgeous"
status_count = 100
# user_tweets returns a single string, ready to be listed in the File Output section
gigi_tweets_fulltext = user_tweets(user, status_count)

## 2.b Youtube Transcripts
### Run next two blocks first, but note!!
#### (Note filepath that you'll probably need to update in the first block.)
The way this API works is that it relies on video IDs. The ID is visible from the URL:

    url: https://www.youtube.com/watch?v=LxOUh1qO8Ls
    id: LxOUh1qO8Ls
    (don't include & or any other extra stuff after the id)
    
To avoid extra work, I've made it so that if you save a txt file where each line in the file is a full Youtube link like the one above (*just* the link), then it'll automatically turn those links into the necessary shortened ids. That way, if you have a link extractor in your web browser (e.g. Link Gopher for Firefox), you can just open a playlist or channel's page of videos in your browser, run the link extractor, and copy+paste all the relevant links into a text file.

In [None]:
# if needed, install the transcript scraper first by running this in its own cell:
#     !pip install youtube_transcript_api

# run this; it will import a Youtube transcript scraper

from youtube_transcript_api import YouTubeTranscriptApi

videourls_path = "data/video_ids.txt"

with open(videourls_path, 'r') as f:
    # drop the newline at the end of each line and skip blank lines, so
    # neither ends up inside (or as) a video id
    videourls = [line.strip() for line in f if line.strip()]

videoids = list()
# the id follows "?v=" or "&v=" in a watch link, or "youtu.be/" in a short link
pattern = r"(?:[?&]v=|youtu\.be/)([\w-]+)"
for url in videourls:
    found = re.search(pattern, url)
    if found is not None:
        videoids.append(found.group(1))
    else:
        # no recognisable link on this line: use the line as given, in case
        # it is already a bare video id
        videoids.append(url)

In [None]:
# turns a list of video ids (see code and markdown blocks above) into a text string of those videos' eng transcripts
def video_transcripts(videoids_list):
    """Fetch the English transcripts of the given videos as one string.

    Each transcript has its caption lines joined with spaces, bracketed cues
    such as "[Music]" removed, lone lower-case "i" capitalised, and spacing
    tidied with punctuation_fix. Videos whose transcript cannot be fetched or
    processed are skipped with a printed notice.

    Args:
        videoids_list: an iterable of YouTube video ids.

    Returns:
        The cleaned transcripts, each followed by a single space.
    """
    transcripts = ""
    for videoid in videoids_list:
        try:
            transcript = YouTubeTranscriptApi.get_transcript(videoid, languages=['en'])
            # join once and clean once, instead of re-cleaning the whole
            # transcript again for every caption line
            transcript_str = " ".join(line['text'] for line in transcript)
            # remove each bracketed cue on its own; a greedy \[.*\] would also
            # delete any speech sitting between two cues
            transcript_str = re.sub(r"\[[^\]]*\]", "", transcript_str)
            # captions can contain line breaks; make them spaces before
            # punctuation_fix, which deletes newlines outright
            transcript_str = re.sub(r"\s+", " ", transcript_str).strip()
            transcript_str = transcript_str.replace(" i ", " I ").replace(" i'", " I'")
            transcript_str = punctuation_fix(transcript_str)
        except Exception as err:
            # best effort: one bad video shouldn't stop the rest, but say so
            print("skipping video " + repr(videoid) + ": " + str(err))
            continue
        if transcript_str:
            transcripts += transcript_str + " "
    return transcripts

### Since you're doing most of the decision-making when you make the .txt file, you're mostly just running this next block and copying the variable name for the file output section at the top. You can call the fulltext variable whatever you want.

In [None]:
# build one string from the transcripts of every id in videoids
gigi_transcripts_fulltext = video_transcripts(videoids)

# 3. Webpages
### Run next two blocks first
Note that this probably won't be useful for most websites since each is formatted differently, but the soup-ification process is all set up, so you can tinker with it as needed. The next two blocks will work for any website, since they just do the necessary prereq imports and define some functions, but note that any function besides urls_to_soups makes some basic assumptions about the structure of a page. You might need to modify them based on the structure of the page you're looking at and what you're trying to get. If you want to learn more, this might be useful: 

    https://www.analyticsvidhya.com/blog/2021/08/a-simple-introduction-to-web-scraping-with-beautiful-soup/

In [None]:
'''if needed:
!pip install beautifulsoup4 requests
'''

import requests
from bs4 import BeautifulSoup

In [None]:
def urls_to_soups(urls, timeout=30):
    """Download each url and parse it into a BeautifulSoup object.

    Args:
        urls: an iterable of page urls to request, in order.
        timeout: seconds to wait on each request before giving up, so one
            unresponsive server can't stall the notebook indefinitely.

    Returns:
        A list of soups, one per url, in the same order as `urls`.
    """
    soups = list()
    for url in urls:
        response = requests.get(url, timeout=timeout)
        soups.append(BeautifulSoup(response.content, "html.parser"))
    return soups

def urls_from_sitemap(sitemap_soups, root_urls, prefixes=("html_en", "html2_en"),
                      drop_indices=(91,), drop_last=3):
    """Collect full urls for the wanted links found on sitemap pages.

    The defaults are tuned to the one site this was written for; change the
    keyword arguments to fit the structure of the site you're working with.

    Args:
        sitemap_soups: soups of the sitemap pages to pull links from.
        root_urls: the root url (ending in "/") put in front of every link.
        prefixes: only links whose href starts with one of these are kept.
        drop_indices: positions in the collected list to remove afterwards
            (manual removal of unwanted links); positions past the end of the
            list are ignored.
        drop_last: how many links to cut from the end of the list after that.

    Returns:
        A list of url strings.
    """
    wanted = tuple(prefixes)
    hrefs_txt = list()
    for sitemap_soup in sitemap_soups:
        for ahref in sitemap_soup.find_all('a'):
            href_txt = ahref.get('href')
            # an <a> tag with no href attribute gives None; skip those
            if href_txt is not None and href_txt.startswith(wanted):
                # use the root passed in, not a notebook-level variable
                hrefs_txt.append(root_urls + href_txt)

    # Manual removal of unwanted links, done once on the full list. Highest
    # index first, so that removing one doesn't shift the others.
    for index in sorted(set(drop_indices), reverse=True):
        if 0 <= index < len(hrefs_txt):
            hrefs_txt.pop(index)
    if drop_last > 0:
        hrefs_txt = hrefs_txt[:-drop_last]
    return hrefs_txt

### ***Run the next block only once!***

It makes requests to the pages, and if you do it a bunch then you might annoy them or get rate limited (which means you're prevented from making more requests; it's also rude, imo). The result will be a list of soups, where each item is a soup corresponding to the urls pulled from the sitemap via the function.

You should also double check the rules of the site for web crawling. You can find these rules for most sites by taking the root url (everything up to and including the top-level domain, e.g. https://www.wikipedia.org or https://github.com) and adding "/robots.txt" (no quotation marks). 

For example:

     https://www.wikipedia.org/robots.txt
     https://github.com/robots.txt

In [None]:
'''
note:
    * sitemap_urls is a list even if there's only one item
    * the root url includes a / at the end; this is important!
    * you can populate the sitemap_urls list with whatever you need based on the structure of the site you're using
'''
# root_url is put in front of every link pulled from the sitemap pages;
# sitemap_urls lists the pages those links are pulled from
root_url = "https://pathologicdialogue.github.io/"
sitemap_urls = ["https://pathologicdialogue.github.io/"]

# note that sitemap_soups is a list that is the same length as sitemap_urls
# (this line makes one web request per sitemap url)
sitemap_soups = urls_to_soups(sitemap_urls)

# here we grab all the urls on the sitemap pages as per the function, then turn them into an equal-length list of soups
# (the last line makes one web request per subpage url)
subpage_urls = urls_from_sitemap(sitemap_soups, root_url)
subpage_soups = urls_to_soups(subpage_urls)

### Okay, sorry for yelling up there with the italics. You're good from here down.

In fact, you'll have to change and tinker with a lot of this stuff. This is the stuff I used to parse the very specific pages I pulled from the site above. It will almost assuredly not be directly useful for your needs, but it's here for two reasons:

1. I'm still using this notebook, y'know.
2. Maybe seeing how I got text from a pretty idiosyncratically formatted site can help you see some methods for parsing soup objects.

In [None]:
def _strip_markup(text):
    """Remove <, >, / and backslash characters and spell out the Rat Prophet tag."""
    for unwanted in ("<", ">", "/", "\\"):
        text = text.replace(unwanted, "")
    return text.replace("RatProphet_speech_1", "Rat Prophet")


def soups_to_sentences(soups, urls):
    """Pull the lines of text out of each page soup.

    Two page layouts are handled, chosen by what the page's url contains:
      * "html_": the text is in <p> tags inside the direct child divs of each
        div with class "ui-content". Everything up to the first "." of a
        paragraph is dropped, and a "\\n" entry is added after each child div.
      * "html2_": the text is in a script, inside an array written as
        textarr=[...]. If several scripts on a page have one, the last wins.
    Pages matching neither layout contribute nothing.

    Args:
        soups: the parsed pages.
        urls: the url of each page, in the same order as `soups`.

    Returns:
        A list of strings, in page order.
    """
    textarr_rgx = re.compile(r'textarr=\[.*\]')
    content_rgx = re.compile(r'(?<=\[)(.*?)(?=[\"\']\])')
    separator_rgx = re.compile('''['"], ['"]''')

    sentences = list()
    for idx, soup in enumerate(soups):
        if "html_" in urls[idx]:
            for maindiv in soup.find_all('div', {"class": "ui-content"}):
                for div in maindiv.find_all('div', recursive=False):
                    for para in div.find_all('p'):
                        # keep only what follows the first "." of the paragraph
                        para_f = para.text.split(".", 1)[-1]
                        sentences.append(_strip_markup(para_f))
                    sentences.append("\n")
        elif "html2_" in urls[idx]:
            # start empty for every page, so a page without a textarr script
            # adds nothing (rather than failing, or re-adding the lines of
            # the page before it)
            cont_arr = []
            for script in soup.find_all('script'):
                m_arr = textarr_rgx.search(str(script))
                if m_arr is None:
                    continue
                m_cont = content_rgx.search(m_arr.group(0))
                if m_cont is None:
                    continue
                match_cont = _strip_markup(m_cont.group(0))
                # the capture begins at the first item's opening quote mark;
                # drop it so the first line doesn't start with a stray quote
                if match_cont[:1] in ("'", '"'):
                    match_cont = match_cont[1:]
                cont_arr = separator_rgx.split(match_cont)
            sentences.extend(cont_arr)
    return sentences


def sentences_to_fulltexts(sentences):
    """Build the full script and the dialogue-only text from a list of lines.

    Args:
        sentences: a list of strings; None entries are skipped.

    Returns:
        A (script, dialogue) tuple. script is every line followed by a
        newline. dialogue is, for each line containing ": ", the part after
        the first ": " followed by a space; lines without ": " add nothing
        to it.
    """
    script_parts = []
    dialogue_parts = []
    for line in sentences:
        if line is None:
            continue
        script_parts.append(line + "\n")
        # split off the speaker name, i.e. whatever comes before the first ": "
        _speaker, found, statement = line.partition(": ")
        if found:
            dialogue_parts.append(statement + " ")
    return "".join(script_parts), "".join(dialogue_parts)

In [None]:
# turn the subpage soups into a list of lines, then into two strings:
# the full script, and the dialogue with the text before each ": " removed
pathologic_script_sentences = soups_to_sentences(subpage_soups, subpage_urls)
pathologic_script, pathologic_dialogue = sentences_to_fulltexts(pathologic_script_sentences)