## Acquire

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import strftime

In [2]:
# Fetch the Codeup blog landing page and parse it for the exploratory cells below.
response = requests.get(
    "https://codeup.com/blog/",
    headers={"user-agent": "Codeup DS"},
    timeout=30,  # requests waits indefinitely by default; don't let a stalled server hang the kernel
)
response.raise_for_status()  # stop on a 4xx/5xx instead of parsing an error page
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser is
# installed (which can differ between machines) and emits a warning about it.
soup = BeautifulSoup(response.text, "html.parser")

In [3]:
# List every element with class "more-link" on the landing page; each is a
# "read more" anchor whose href points at a full blog post.
soup.select('.more-link')

[<a class="more-link" href="https://codeup.com/dallas-newsletter/codeup-dallas-open-house/">read more</a>,
 <a class="more-link" href="https://codeup.com/codeup-news/codeups-placement-team-continues-setting-records/">read more</a>,
 <a class="more-link" href="https://codeup.com/it-training/it-certifications-101/">read more</a>,
 <a class="more-link" href="https://codeup.com/cybersecurity/a-rise-in-cyber-attacks-means-opportunities-for-veterans-in-san-antonio/">read more</a>,
 <a class="more-link" href="https://codeup.com/codeup-news/use-your-gi-bill-benefits-to-land-a-job-in-tech/">read more</a>,
 <a class="more-link" href="https://codeup.com/tips-for-prospective-students/which-program-is-right-for-me-cyber-security-or-systems-engineering/">read more</a>,
 <a class="more-link" href="https://codeup.com/it-training/what-the-heck-is-system-engineering/">read more</a>,
 <a class="more-link" href="https://codeup.com/alumni-stories/from-speech-pathology-to-business-intelligence/">read more</

In [4]:
# Collect the destination URL of each "read more" anchor, in page order.
links = [anchor.attrs['href'] for anchor in soup.select('.more-link')]
links

['https://codeup.com/dallas-newsletter/codeup-dallas-open-house/',
 'https://codeup.com/codeup-news/codeups-placement-team-continues-setting-records/',
 'https://codeup.com/it-training/it-certifications-101/',
 'https://codeup.com/cybersecurity/a-rise-in-cyber-attacks-means-opportunities-for-veterans-in-san-antonio/',
 'https://codeup.com/codeup-news/use-your-gi-bill-benefits-to-land-a-job-in-tech/',
 'https://codeup.com/tips-for-prospective-students/which-program-is-right-for-me-cyber-security-or-systems-engineering/',
 'https://codeup.com/it-training/what-the-heck-is-system-engineering/',
 'https://codeup.com/alumni-stories/from-speech-pathology-to-business-intelligence/',
 'https://codeup.com/behind-the-billboards/boris-behind-the-billboards/',
 'https://codeup.com/codeup-news/is-codeup-the-best-bootcamp-in-san-antonio-or-the-world/',
 'https://codeup.com/codeup-news/codeup-launches-first-podcast-hire-tech/',
 'https://codeup.com/tips-for-prospective-students/why-should-i-become-a-s

In [5]:
# Fetch a single blog post so the per-field selectors can be worked out below.
url = 'https://codeup.com/dallas-newsletter/codeup-dallas-open-house/'
response = requests.get(
    url,
    headers={"user-agent": "Codeup DS"},
    timeout=30,  # requests waits indefinitely by default; don't let a stalled server hang the kernel
)
response.raise_for_status()  # stop on a 4xx/5xx instead of parsing an error page
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser is
# installed (which can differ between machines) and emits a warning about it.
soup = BeautifulSoup(response.text, "html.parser")

In [6]:
# content: text of the first .entry-content element, with leading/trailing
# whitespace stripped
soup.select_one('.entry-content').text.strip()

'Come join us for the re-opening of our Dallas Campus with some drinks and snacks at Codeup! Curious about what our campus looks like? Click here to register for free\nAbout this event\nCome join us for the re-opening of our Dallas Campus with some drinks and snacks at Codeup!\nCurious about what our campus looks like? Interested in our Web Development Career Accelerator? Keen to chat with an instructor or financial aid rep?\nAt our Open House, we are here to answer all your questions!\nMeet a Codeup instructor, who can help explain what’s taught in our classes and answer questions.\nUnderstand how to join one of our upcoming cohort ( Dec. 6th).\nDon’t miss this opportunity to learn more about how you can start the new year transitioning into a new, exciting career in tech. We’re here to answer any questions you may have about Codeup and your future.\nTake the first step of your new career today and create your tomorrow!'

In [7]:
# title: text of the first .entry-title element on the post page
title = soup.select_one('.entry-title').text
title

'Codeup Dallas Open House'

In [8]:
# published date: text of the first .published element, kept as the raw string
# shown on the page (no date parsing is done here)
published = soup.select_one('.published').text
published

'Nov 30, 2021'

In [9]:
def get_front_page_links():
    """
    Fetch the Codeup blog landing page and return the URLs of the posts it links to.

    Returns:
        list[str]: the ``href`` of every ``.more-link`` anchor on the page, in
        document order. The list is empty if the page has no such anchors.

    Raises:
        requests.HTTPError: if the landing page answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    response = requests.get(
        "https://codeup.com/blog/",
        headers={"user-agent": "Codeup DS"},
        timeout=30,  # requests waits indefinitely by default
    )
    # Fail loudly on an error page rather than silently returning no links.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    return [link.attrs["href"] for link in soup.select(".more-link")]

def parse_codeup_blog_article(url):
    """
    Given a blog article url, extract the relevant information and return it as a dictionary.

    Args:
        url: address of a single blog post page.

    Returns:
        dict with three string values:
            "title"     -- text of the first ``.entry-title`` element
            "published" -- text of the first ``.published`` element (raw, unparsed)
            "content"   -- text of the first ``.entry-content`` element, stripped
                           of leading/trailing whitespace

    Raises:
        requests.HTTPError: if the page answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
        AttributeError: if any of the three elements is missing from the page
            (``select_one`` returns ``None`` in that case).
    """
    response = requests.get(
        url,
        headers={"user-agent": "Codeup DS"},
        timeout=30,  # requests waits indefinitely by default
    )
    # Fail on an error status instead of trying to scrape an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    return {
        "title": soup.select_one(".entry-title").text,
        "published": soup.select_one(".published").text,
        "content": soup.select_one(".entry-content").text.strip(),
    }


def get_blog_articles():
    """Scrape every post linked from the Codeup blog landing page into a dataframe.

    Each row is the dictionary returned by ``parse_codeup_blog_article`` for one
    of the URLs returned by ``get_front_page_links``, in the same order.
    """
    records = []
    for article_url in get_front_page_links():
        records.append(parse_codeup_blog_article(article_url))
    return pd.DataFrame(records)


In [10]:
# Scrape every post linked from the landing page and display the resulting dataframe.
get_blog_articles()


Unnamed: 0,title,published,content
0,Codeup Dallas Open House,"Nov 30, 2021",Come join us for the re-opening of our Dallas ...
1,Codeup’s Placement Team Continues Setting Records,"Nov 19, 2021",Our Placement Team is simply defined as a grou...
2,"IT Certifications 101: Why They Matter, and Wh...","Nov 18, 2021","AWS, Google, Azure, Red Hat, CompTIA…these are..."
3,A rise in cyber attacks means opportunities fo...,"Nov 17, 2021","In the last few months, the US has experienced..."
4,Use your GI Bill® benefits to Land a Job in Tech,"Nov 4, 2021","As the end of military service gets closer, ma..."
5,Which program is right for me: Cyber Security ...,"Oct 28, 2021",What IT Career should I choose?\nIf you’re thi...
6,What the Heck is System Engineering?,"Oct 21, 2021",Codeup offers a 13-week training program: Syst...
7,From Speech Pathology to Business Intelligence,"Oct 18, 2021","By: Alicia Gonzalez\nBefore Codeup, I was a ho..."
8,Boris – Behind the Billboards,"Oct 3, 2021",
9,Is Codeup the Best Bootcamp in San Antonio…or ...,"Sep 16, 2021",Looking for the best data science bootcamp in ...


In [11]:
# Write a fresh scrape of the blog posts to a JSON file stamped with today's date.
# Note: this calls get_blog_articles() again, so every page is re-downloaded.

today = strftime('%Y-%m-%d')
blog_json_path = f'codeup_blog_{today}.json'
get_blog_articles().to_json(blog_json_path)

In [12]:
# Fetch the inshorts business page so the news-card selectors can be worked out below.
url = 'https://www.inshorts.com/en/read/business'
response = requests.get(
    url,
    headers={'user-agent': 'Codeup DS'},
    timeout=30,  # requests waits indefinitely by default; don't let a stalled server hang the kernel
)
response.raise_for_status()  # stop on a 4xx/5xx instead of parsing an error page
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser is
# installed (which can differ between machines) and emits a warning about it.
soup = BeautifulSoup(response.text, 'html.parser')

In [13]:
# Select every element with class "news-card" (one per article on the page)
# and check how many the page returned.
cards = soup.select('.news-card')
len(cards)

25

In [14]:
# Keep the first card as a sample for working out the per-field lookups below.
card = cards[0]

In [15]:
# headline: text of the first <span> whose itemprop attribute is "headline"
headline = card.find('span', itemprop = 'headline').text
headline

'RBI cancels licence of Maha-based Independence Co-operative Bank'

In [16]:
# author: text of the first <span> with class "author"
card.find('span', class_ = 'author').text

'Shalini Ojha'

In [17]:
# content: text of the first <div> whose itemprop attribute is "articleBody"
card.find('div', itemprop = 'articleBody').text

"RBI has cancelled licence of Maharashtra-based Independence Co-operative Bank, citing inadequate capital. It will cease to carry on banking operations from the close of business on February 3. In the present situation, the bank won't be able to pay its depositors in full, RBI said. It added that the bank didn't comply with multiple sections of Banking Regulation Act, 1949. "

In [18]:
# date
# NOTE(review): the keyword here is `clas`, not `class_`. BeautifulSoup treats an
# unrecognized keyword argument as a filter on an HTML attribute of that name, so
# this looks for <span clas="date">. The recorded output below shows the lookup
# does return the date, so the page markup apparently carries that misspelled
# attribute -- confirm against the live HTML before "correcting" it.
card.find('span', clas ='date').text

'03 Feb 2022,Thursday'

In [19]:
def parse_news_card(card):
    '''Given a news card object, returns a dictionary of the relevant information.

    Args:
        card: an element exposing ``find(name, **attrs)`` whose results have a
            ``.text`` attribute (here, one ``.news-card`` element).

    Returns:
        dict with keys 'title', 'author', 'content' and 'date' (in that order),
        each holding the text of the matching child element.

    Raises:
        AttributeError: if any of the four lookups finds nothing, because
            ``find`` then returns ``None``.
    '''
    return {
        'title': card.find('span', itemprop='headline').text,
        'author': card.find('span', class_='author').text,
        'content': card.find('div', itemprop='articleBody').text,
        # NOTE(review): `clas` (not `class_`) is kept on purpose. It filters on
        # an attribute literally named "clas", and the notebook's recorded
        # output shows this lookup returning the date -- confirm against the
        # live HTML before changing it.
        'date': card.find('span', clas='date').text,
    }


def parse_inshorts_page(url):
    '''Given a url, returns a dataframe where each row is a news article from the url.
    Infers the category from the last section of the url.

    Args:
        url: address of a category page, e.g. one ending in ``/business``. A
            trailing slash is ignored when inferring the category.

    Returns:
        DataFrame with one row per ``.news-card`` element on the page: the
        fields produced by ``parse_news_card`` plus a constant ``category``
        column.

    Raises:
        requests.HTTPError: if the page answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within the timeout.
    '''
    # Strip any trailing slash first so ".../business/" yields "business"
    # rather than an empty category.
    category = url.rstrip('/').split('/')[-1]
    response = requests.get(
        url,
        headers={'user-agent': 'Codeup DS'},
        timeout=30,  # requests waits indefinitely by default
    )
    # Fail on an error status instead of returning an empty frame from an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.select('.news-card')
    df = pd.DataFrame([parse_news_card(card) for card in cards])
    df['category'] = category
    return df

def get_inshorts_articles():
    '''
    Returns a dataframe of news articles from the business, sports, technology, and entertainment sections of
    inshorts.

    The per-category frames from ``parse_inshorts_page`` are stacked in the
    order listed below and the result is given a fresh 0..n-1 index.
    '''
    url = 'https://inshorts.com/en/read/'
    categories = ['business', 'sports', 'technology', 'entertainment']
    # Build all per-category frames, then concatenate once. Growing a frame with
    # concat inside the loop re-copies the accumulated rows on every iteration,
    # and seeding it with an empty DataFrame triggers a pandas FutureWarning on
    # recent versions.
    frames = [parse_inshorts_page(url + cat) for cat in categories]
    return pd.concat(frames, ignore_index=True)

In [20]:
# Scrape all four category pages and display the combined dataframe.
get_inshorts_articles()

Unnamed: 0,title,author,content,date,category
0,RBI cancels licence of Maha-based Independence...,Shalini Ojha,RBI has cancelled licence of Maharashtra-based...,"03 Feb 2022,Thursday",business
1,Boost to EVs a big step: Windmill Capital,Roshan Gupta,"Increased use of EVs in public transport, spec...","03 Feb 2022,Thursday",business
2,Facebook parent Meta's $230-billion wipeout bi...,Pragya Swastik,Facebook's parent Meta's shares plunged 27% an...,"03 Feb 2022,Thursday",business
3,Mark Zuckerberg loses $31 bn in one of the big...,Pragya Swastik,Meta CEO Mark Zuckerberg's wealth dropped by $...,"03 Feb 2022,Thursday",business
4,"Tesla co-worker used N-word, threw a hot tool ...",Kiran Khatri,A former Tesla worker has filed a lawsuit agai...,"03 Feb 2022,Thursday",business
...,...,...,...,...,...
95,I don't work thinking I'm so many films old: D...,Kriti Kambiri,"Actress Deepika Padukone, who made her Bollywo...","03 Feb 2022,Thursday",entertainment
96,Court directs Honey Singh to give voice sample...,Kriti Kambiri,Rapper Yo Yo Honey Singh has been directed by ...,"03 Feb 2022,Thursday",entertainment
97,Not my style: Deepika on being asked if she ga...,Mahima Kharbanda,When asked if she had given any special dating...,"03 Feb 2022,Thursday",entertainment
98,"Farhan, Shibani to host their wedding party at...",Mahima Kharbanda,Actor Farhan Akhtar and Shibani Dandekar will ...,"03 Feb 2022,Thursday",entertainment


In [21]:
# Write a fresh scrape of the inshorts articles to a JSON file stamped with today's date.
# Note: this calls get_inshorts_articles() again, so every category page is re-downloaded.

today = strftime('%Y-%m-%d')
inshorts_json_path = f'inshorts-{today}.json'
get_inshorts_articles().to_json(inshorts_json_path)