# News Scraping: APNews Health
##### Antonella Sciortino & Amil Arthur  

In [24]:
#Import Libraries 
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import csv

### Obtain the list of news articles from the cover page

In [25]:
# Cover page of the AP News health feed; the scrape starts from this address
url = 'https://apnews.com/apf-Health'

In [26]:
# Retrieve list of news

# Request the cover page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the news feed.
r1 = requests.get(url, timeout=30)
r1.raise_for_status()

# We'll save in coverpage the cover page content
coverpage = r1.content

# Soup creation. The parser is named explicitly so the resulting tree does not
# depend on which optional parsers happen to be installed, and so bs4 does not
# warn that no parser was specified.
soup = BeautifulSoup(coverpage, 'html.parser')

# News identification: a multi-class string given to class_ is compared against
# the exact value of the class attribute, so only cards with precisely this
# class string are returned.
# NOTE(review): the numeric suffixes look build-generated; this selector may
# need updating whenever the site is redeployed - confirm against live markup.
coverpage_news = soup.find_all(class_='FeedCard Component-wireStory-0-2-94 card-0-2-95')

# Number of cards found (displayed as the cell output)
len(coverpage_news)

41

Now we have a list in which every element is a news article:

In [19]:
# Display the raw markup of one card (index 5 appears to be an arbitrary sample)
# to see which tags hold the link, headline and timestamp.
coverpage_news[5]

<div class="FeedCard Component-wireStory-0-2-94 card-0-2-95" data-key="feed-card-wire-story-with-image" data-tb-region-item="true"><div class="CardHeadline headline-0-2-97"><a class="Component-headline-0-2-106" data-key="card-headline" href="/80f119f63a644f15c25c0ce7a8e38735"><h1 class="Component-h1-0-2-107">Cambridge University scraps in-person lectures for 2020-2021</h1></a><div class="Component-signature-0-2-108"><span class="Component-bylines-0-2-109 Component-bylines-0-2-102">By JILL LAWLESS</span><span class="Timestamp Component-root-0-2-111 Component-timestamp-0-2-110" data-key="timestamp" data-source="2020-05-20T13:01:51Z" title="2020-05-20 13:01:51 - Wed May 20 2020 13:01:51 GMT+0000 (Coordinated Universal Time)">May 20, 2020 GMT</span></div></div><a class="image-0-2-98" href="/80f119f63a644f15c25c0ce7a8e38735"><div data-key="media-placeholder"></div></a><a class="firstWords-0-2-99" data-key="story-link" href="/80f119f63a644f15c25c0ce7a8e38735"><div class="content text-0-2-100

### Let's extract the text from the articles

In [20]:
# How many cards from the top of the cover page list will be scraped
number_of_articles = int(5)

In [21]:
# Lists for content, links, titles and dates.
# They are kept index-aligned: every article appends exactly one entry to each.
news_contents = []
list_links = []
list_titles = []
list_dates = []

# Slicing (instead of indexing by position) caps the loop at the number of
# cards actually found, so asking for more articles than exist cannot raise.
for card in coverpage_news[:number_of_articles]:

    # The first anchor of the card carries both the relative link and the headline
    headline_anchor = card.find('a')

    # Getting the link of the article
    link = 'https://apnews.com' + headline_anchor['href']
    list_links.append(link)

    # Getting the title
    title = headline_anchor.get_text()
    list_titles.append(title)

    # Get the article date (text of the card's timestamp span)
    dates = card.find('span', class_='Timestamp Component-root-0-2-111 Component-timestamp-0-2-110').get_text()
    list_dates.append(dates)

    # Reading the content (it is divided in paragraphs).
    # timeout: do not hang on a stalled connection.
    # raise_for_status: fail loudly instead of storing an error page as content.
    article = requests.get(link, timeout=30)
    article.raise_for_status()
    soup_article = BeautifulSoup(article.content, 'html.parser')
    body = soup_article.find_all('div', class_='Article')
    # An article page without the expected container yields no paragraphs
    # rather than an IndexError, so the four lists stay aligned.
    x = body[0].find_all('p') if body else []

    # Unifying the paragraphs. The text is rebuilt from scratch for every
    # article, so an article with no paragraphs is stored as "" and can never
    # inherit the previous article's text.
    final_article = " ".join(p.get_text() for p in x)
    news_contents.append(final_article)

Save the scraped AP news data for later use as:
* a dataframe object 
* a pickle object 
* a csv

In [22]:
# df_APNews: one row per scraped article, columns in the order listed below
df_APNews = pd.DataFrame(dict(zip(
    ('Article Date', 'Article Title', 'Article Link', 'Article Content'),
    (list_dates, list_titles, list_links, news_contents),
)))
     

In [23]:
# Display the assembled table to check the scraped fields
df_APNews

Unnamed: 0,Article Date,Article Title,Article Link,Article Content
0,"May 20, 2020 GMT",Turkey’s pandemic strategy hinges on hazmat-su...,https://apnews.com/ace79485b6813e30b952f0d9b78...,ISTANBUL (AP) — They see themselves as public ...
1,"May 20, 2020 GMT",Nations reopen yet struggle to define ‘a new n...,https://apnews.com/6303d40722e4254611cf35fa8b7...,ROME (AP) — As nations around the world loosen...
2,"May 20, 2020 GMT",The Latest: Netherlands extends support packag...,https://apnews.com/26c55f5bdc8b894aeb9081fa12b...,The Latest on the coronavirus pandemic. The ne...
3,"May 19, 2020 GMT",Trump allies lining up doctors to prescribe ra...,https://apnews.com/4ee1a3a8d631b454f645b2a8d95...,WASHINGTON (AP) — Republican political operati...
4,"May 20, 2020 GMT",Uncertain future rattles Italy’s famed restaur...,https://apnews.com/8ff3843251874135c4038b9f9d6...,"ROME (AP) — Italy’s restaurants and pizzerias,..."


In [15]:
# Persist the DataFrame as a pickle file in the current working directory
df_APNews.to_pickle(path="./APNews.pkl")

In [17]:
# Export the same DataFrame as a CSV file next to the pickle
df_APNews.to_csv(path_or_buf="./APNews.csv")