## Building a Text-Based Data Set by Web Scraping

### Imports

In [1]:
import sqlite3 #sqlite package
import requests #requests package to get the pages

from bs4 import BeautifulSoup #beautiful soup to process/parse the pages
from bs4.element import Comment

import random #to use randomization when we're pulling a fraction of the pages
 
import datetime #to work with data and time

### Reads in list of websites from the anasoundcloud_test.txt file

In [2]:
#reads in list of A New Angle Soundcloud links as `sites`
#one link per line; blank lines are dropped so an empty string is never
#handed to requests.get() later on
with open("anasoundcloud_test.txt",'r') as infile :
    sites = [line.strip() for line in infile if line.strip()]

In [3]:
#prints all links in `sites`
print(sites)

#checks HTTP response status codes -> 200 = good
#the timeout stops the cell from hanging forever if the server never answers
r = requests.get(sites[0], timeout=30)
r.status_code

['https://soundcloud.com/anewangle/lara-birkes-sees-a-sustainable-montana']


200

### Extracts visible text from each page

In [29]:
#helper for the next cell, which stores the text in a dictionary that has the url as the key and the value is the text.
from bs4.element import Comment

def tag_visible(element):
    """Return True when a parsed text node should be kept as visible text.

    A node is rejected when the name of its parent tag is one of the names
    listed in `skipped_parents`, or when the node itself is a `Comment`.
    Everything else is kept.
    """
    skipped_parents = ('style', 'script', 'title', 'meta', '[document]')
    under_skipped_parent = element.parent.name in skipped_parents
    #`or` short-circuits, so the Comment check only runs when the parent check passes
    return not (under_skipped_parent or isinstance(element, Comment))

In [30]:
#stores the text in a dictionary that has the url as the key and the value is the text.
anasc_text = dict()

for link in sites :
    try :
        #timeout so one unresponsive page cannot stall the whole loop
        r = requests.get(link, timeout=30)
    except requests.RequestException as err :
        #skip this link: falling through would re-use the response left over
        #from an earlier request and file that page's text under this url
        print("request failed for", link, "-", err)
        continue

    if r.status_code != 200 :
        #anything other than 200 is reported and left out of the dictionary
        print("skipping", link, "- HTTP status", r.status_code)
        continue

    soup = BeautifulSoup(r.text, 'html.parser')
    #find_all(string=True) returns every text node in the parsed page
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    anasc_text[link] = " ".join(t.strip() for t in visible_texts)

In [31]:
#displays the dictionary filled in above (url -> joined visible text of that page)
anasc_text

{'https://soundcloud.com/anewangle/lara-birkes-sees-a-sustainable-montana': "                                                        SoundCloud           JavaScript is disabled  You need to enable JavaScript to use SoundCloud   Show me how to enable it       Lara Birkes sees a sustainable Montana by A New Angle published on 2019-11-18T15:36:47Z    University of Montana alumna Lara Birkes is a sustainability rock star. She recently returned to Montana after serving in leadership roles with the World Trade Center and Hewlett Packard Enterprise. Her fascinating new startup, Eqogo, is attempting to create a sustainability score for all consumer products. Lara also works with local entrepreneurs on nature based solutions to persistent environmental challenges.   Genre  Business    Download Lara Birkes sees a sustainable Montana   Users who like Lara Birkes sees a sustainable Montana  Users who reposted Lara Birkes sees a sustainable Montana  Playlists containing Lara Birkes sees a sustainab

### Writes all visible text from each page to a local text file

In [None]:
#writes one line per scraped page: the url, a tab, then that page's visible text
#utf-8 is set explicitly so non-ASCII characters in the scraped text do not
#depend on the platform's default encoding
with open("anewangle_soundcloud.txt",'w', encoding="utf-8") as ofile :
    for link, text in anasc_text.items() :
        ofile.write("\t".join([str(link), str(text)]) + "\n")