# Let's scrape Vocal Challenge

In [None]:
# import necessary tools for scraping
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service as ChromeService 
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#import time 
#import random

In [None]:
# Configure a headless Chrome session and start the driver used by the
# scraping cells below.
options = Options()
# run Chrome without a visible window
options.add_argument('--headless=new')
# block image loading
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
# webdriver_manager installs a chromedriver binary and returns its path
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

In [None]:
# Site root (story links on the listing pages are appended to it) and the
# three accumulators filled by the scraping loop: story titles, absolute
# story links, and author names.
root = 'https://vocal.media'
stories_titles = []
stories_h_ref = []
stories_authors = []

In [None]:
# Let's start scraping
# Walk the paginated submissions list of the challenge. For every story tile
# on a page, store its title and absolute link; also store the author names
# found on the page. The loop ends at the first page without story tiles.
page_number = 0
timeout = 10  # seconds WebDriverWait waits for the first story tile
while True:
    page_number += 1
    challenge_entries = f'{root}/challenges/epistolary/submissions?page={page_number}'
    driver.get(challenge_entries)
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'css-1g9btk4-SiteLink-PostTile'))
        WebDriverWait(driver, timeout).until(element_present)
    except TimeoutException:
        # Not fatal: whatever did load is still parsed below, and a page
        # with no story tiles is what terminates the loop.
        print("Timed out waiting for page to load")
    page_soup = BeautifulSoup(driver.page_source, "html.parser")
    stories = page_soup.find_all("a", href=True, class_="css-1g9btk4-SiteLink-PostTile")
    if not stories:
        break
    authors = page_soup.find_all("a", href=True, class_="css-1oz1xzv-SiteLink-PostTile")
    if len(authors) != len(stories):
        # Titles/links and authors are stored in parallel lists, so a
        # mismatch here means the lists can no longer be combined row by row.
        print(f'Warning: page {page_number} has {len(stories)} stories but {len(authors)} authors')
    for story in stories:
        title = story.find("h3", class_="post-name css-1alfqzo-Text")
        stories_h_ref.append(f"{root}{story['href']}")
        # A tile without the expected <h3> gets an empty title instead of
        # raising AttributeError, which keeps titles and links the same length.
        stories_titles.append(title.text.strip() if title is not None else '')
    for author in authors:
        stories_authors.append(author.text.strip())
    print(f'{page_number} done...')

In [None]:
# Combine the three scraped lists into one table: one row per story with its
# author, title and link. The lists must all have the same length.
import pandas as pd
scraped_columns = {
    'Author': stories_authors,
    'Story': stories_titles,
    'Link': stories_h_ref,
}
stories_dataframe = pd.DataFrame(data=scraped_columns)
stories_dataframe

In [None]:
# download all the stories
# Visit every link in stories_dataframe and store the story text (its
# non-empty <p> paragraphs joined by blank lines) in stories_text_storage.
# Exactly one entry is appended per row, in row order, so the list stays
# aligned with stories_dataframe even when a page fails to load.
stories_text_storage = list()
counter = 0
# WebDriverWait takes seconds. NOTE(review): 1000 s per page is a very long
# wait - confirm this was not meant to be milliseconds.
timeout = 1000

for story_url, story_title in zip(stories_dataframe['Link'], stories_dataframe['Story']):
    driver.get(story_url)
    try:
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'css-1mu5bpv-TextContent-PostPage'))
        WebDriverWait(driver, timeout).until(element_present)
    except TimeoutException:
        print("Timed out waiting for page to load")
    story_soup = BeautifulSoup(driver.page_source, "html.parser")
    html_post_text = story_soup.find("div", class_="css-1mu5bpv-TextContent-PostPage")
    if html_post_text is None:
        # The text container is missing (e.g. after a timeout): keep an empty
        # text for this story instead of crashing on None.find_all.
        paragraphs = []
    else:
        paragraphs = [paragraph.text.strip() for paragraph in html_post_text.find_all('p')]
    # skip paragraphs that are empty after stripping
    full_text = '\n\n'.join(paragraph for paragraph in paragraphs if paragraph)
    stories_text_storage.append(full_text)
    counter += 1
    if html_post_text is None:
        print(f'no text found for story {counter}, {story_title}')
    else:
        print(f'done with story {counter}, {story_title}')

In [None]:
# Put the downloaded texts into a one-column table ('Full text') and attach
# it to the right of stories_dataframe; pd.concat matches rows by index label.
full_stories_df = pd.DataFrame(data=stories_text_storage, columns=['Full text'])
final_df_with_stories = pd.concat(
    objs=[stories_dataframe, full_stories_df],
    axis=1,
)
final_df_with_stories

In [None]:
# Count the stories whose text begins with "Dear" or "dear".
# str.match only matches at the start of each string.
starts_with_dear = final_df_with_stories['Full text'].str.match("Dear|dear")
starts_with_dear.sum()

In [None]:
# Count the stories whose text contains "depression" in any of the three
# spellings listed in the pattern (capitalised, lower case, upper case).
mentions_depression = final_df_with_stories['Full text'].str.contains("Depression|depression|DEPRESSION")
mentions_depression.sum()

In [None]:
# Add a '# of stories' column holding, on every row, the number of stories
# its author submitted; then order the table by that count and by author
# name, both descending.
stories_per_author = stories_dataframe.groupby('Author')['Author'].transform('count')
stories_dataframe['# of stories'] = stories_per_author
stories_dataframe = stories_dataframe.sort_values(
    by=['# of stories', "Author"],
    ascending=[False, False],
)
stories_dataframe

In [None]:
# Create a table with one row per author and the number of stories they
# published for the challenge: keep each author's first row and drop the
# per-story columns. Row order follows stories_dataframe.
one_row_per_author = stories_dataframe.drop_duplicates('Author')
authors_productivity_df = one_row_per_author.drop(columns=['Story', 'Link'])
authors_productivity_df

In [None]:
# Find stories of the authors you're interested in
# List the author names to keep. The list has its own name so that it does not
# overwrite `options`, the Selenium Options object used to create the driver.
authors_of_interest = ['Gina C.']
is_of_interest = stories_dataframe['Author'].isin(authors_of_interest)
# stories_dataframe has no 'Full text' column, so take it from
# final_df_with_stories. Both tables carry the same index labels per story
# (sorting stories_dataframe keeps the labels), so joining on the index
# attaches each story's own text.
authors_of_interest_df = (
    stories_dataframe[is_of_interest]
    .join(final_df_with_stories['Full text'])
    .reset_index()
)
authors_of_interest_df

In [None]:
# Print the text of one story from authors_of_interest_df.
# read_story_No is the 1-based row number in that table; change it to read
# another story. The table must contain a 'Full text' column.
read_story_No = 1
story_row = read_story_No - 1
print(authors_of_interest_df['Full text'][story_row])

In [None]:
# Save a story as a Microsoft Word (.docx) document: the title becomes a
# level-1 heading, followed by an empty paragraph and the full text.
# download_story_No is the 1-based row position of the story in
# final_df_with_stories (the table holding both 'Story' and 'Full text').
download_story_No = 2
from docx import Document

# .iloc selects by position, so this works whatever the index labels are
story_to_save = final_df_with_stories.iloc[download_story_No - 1]

document = Document()
document.add_heading(story_to_save['Story'], level=1)
document.add_paragraph()
document.add_paragraph(story_to_save['Full text'])

document.save('Downloaded_story.docx')