In [None]:
#Required Modules
import getpass
import os
import time
import urllib
import urllib.request
from collections import defaultdict

import pandas as pd
import requests
from bs4 import BeautifulSoup
from google.colab import files

In [None]:
def collect_articles(user_key, query, language = "en", page = 1, timeout = 30):

  """Returns JSON Dictionary of Articles returned
  by call to API given user-defined query.

  Args:
    user_key: RapidAPI key, sent in the 'x-rapidapi-key' header.
    query: Search expression, sent as the 'q' parameter.
    language: Language code, sent as the 'lang' parameter.
    page: Page of results to request, sent as the 'page' parameter.
    timeout: Seconds to wait for the API before giving up, so a stalled
      connection cannot hang the notebook indefinitely.

  Returns:
    The decoded JSON body of the response.

  Raises:
    requests.HTTPError: If the API answers with a 4xx/5xx status.
    requests.Timeout: If the API does not answer within `timeout` seconds.
  """

  url = "https://free-news.p.rapidapi.com/v1/search"
  querystring = {"q": query, "lang": language, "page" : page}
  headers = {
    'x-rapidapi-host': "free-news.p.rapidapi.com",
    'x-rapidapi-key': user_key
    }
  response = requests.get(url, headers=headers, params=querystring, timeout=timeout)
  # Fail here with the HTTP status rather than handing an error payload to
  # callers that expect keys such as 'articles' or 'total_pages'.
  response.raise_for_status()
  return response.json()

In [None]:
## Sanity Check - Get Response from API
## The RapidAPI key is a secret: it is read from the RAPIDAPI_KEY environment
## variable, or prompted for, instead of being hard-coded in the notebook.
## NOTE(review): the key that used to be written here should be revoked.
user_key = os.environ.get("RAPIDAPI_KEY") or getpass.getpass("RapidAPI key: ")
query = 'shooting AND Philadelphia'
response = collect_articles(user_key, query)
## Show only the response metadata; the article list itself is too long to print.
print({field: value for field, value in response.items() if field != 'articles'})

{'status': 'ok', 'total_hits': 389, 'page': 1, 'total_pages': 16, 'page_size': 25, 'articles': [{'title': 'Charlotte puts home win streak on the line against Philadelphia', 'author': 'The Associated Press', 'published_date': '2021-12-06 08:03:08', 'published_date_precision': 'full', 'link': 'https://apnews.com/article/nba-sports-charlotte-philadelphia-76ers-lamelo-ball-9701166fbc0d4e3b8342cce5fe33f478', 'clean_url': 'apnews.com', 'summary': 'Philadelphia 76ers (12-11, ninth in the Eastern Conference) vs. Charlotte Hornets (14-11, sixth in the Eastern Conference)Charlotte, North Carolina; Monday, 7 p.m. ESTFANDUEL SPORTSBOOK LINE: Hornets -5.5BOTTOM LINE: Charlotte hosts Philadelphia aiming to extend its five-game home winning streak.The Hornets have gone 10-6 against Eastern Conference opponents. Charlotte ranks third in the NBA with 26.3 assists per game led by LaMelo Ball averaging 8.3.The 76ers are 7-7 in Eastern Conference play. Philadelphia ranks seventh in the Eastern Conference 

In [None]:
def extract_article_info(info, response):

  """ Returns dictionary of desired article info from
   1 page of API call response.

  Args:
    info: Iterable of article field names to extract (e.g. 'link', 'title').
    response: Dictionary for one page of results; its 'articles' entry is
      read as a list of per-article dictionaries.

  Returns:
    A defaultdict(list) mapping each requested field name to a list with one
    entry per article. An article that lacks a field contributes None, so all
    lists keep the same length. If the response has no 'articles' entry,
    every requested field maps to an empty list.
  """

  # A page with no 'articles' entry (or a None value) is treated as empty.
  articles = response.get('articles') or []
  info_dict = defaultdict(list)
  for header in info:
    # .get() keeps the columns aligned when one article is missing a field.
    info_dict[header].extend(article.get(header) for article in articles)

  return info_dict

In [None]:
## Sanity Check - Use collect_articles and extract_article_info to extract
## article info for each page of articles returned by API call
FIELDS = ['link', 'title', 'summary', 'published_date']
REQUEST_DELAY_SECONDS = 2  # pause between API calls (presumably for rate limiting - confirm)

num_pages = response['total_pages']
extracted_info = defaultdict(list)
for page_number in range(1, num_pages + 1):
  if page_number == 1:
    # Page 1 was already fetched into `response` above; do not request it again.
    page_response = response
  else:
    time.sleep(REQUEST_DELAY_SECONDS)
    page_response = collect_articles(user_key, query, page = page_number)
  info_from_page = extract_article_info(FIELDS, page_response)
  for key, values in info_from_page.items():
    extracted_info[key].extend(values)

print(len(extracted_info['link']))

389


In [None]:
def create_dataframe(info_dict, unwanted_topics = None, column_name = ''):

  """Returns dataframe with desired article information, allows user
  to remove articles that contain keywords in a user-defined column.

  Args:
    info_dict: Mapping of column name to list of values, passed to
      pd.DataFrame.
    unwanted_topics: Keywords to filter on. A single string is treated as one
      keyword. None or an empty collection disables filtering.
    column_name: Column whose values are searched for the keywords. Only
      used when there is at least one keyword.

  Returns:
    A new DataFrame, indexed 0..n-1, without the rows whose `column_name`
    value contains any of the keywords. Values that cannot be searched
    (e.g. None or NaN) never match, so those rows are kept.

  Raises:
    KeyError: If keywords are given but `column_name` is not a column.
  """

  df = pd.DataFrame(info_dict)

  if unwanted_topics is None:
    topics = []
  elif isinstance(unwanted_topics, str):
    # Iterating a bare string would filter on each character separately.
    topics = [unwanted_topics]
  else:
    topics = list(unwanted_topics)

  if topics:
    if column_name not in df.columns:
      raise KeyError(f"column {column_name!r} not found; cannot filter unwanted topics")

    def mentions_unwanted_topic(value):
      for topic in topics:
        try:
          if topic in value:
            return True
        except TypeError:
          # value does not support `in` (None, NaN, ...): treat as no match.
          continue
      return False

    keep_row = [not mentions_unwanted_topic(value) for value in df[column_name]]
    df = df[keep_row]

  return df.reset_index(drop = True)

In [None]:
## Sanity Check - Create Dataframe of Article Information and
## remove articles that contain "76ers" in their summary
unwanted_topic = ["76ers"]
column_name = "summary"
df = create_dataframe(extracted_info, unwanted_topic, column_name)
## head must be called; print(df.head) only shows the bound method.
print(df.head())

NameError: ignored

In [None]:
def scrape_keywords(df, timeout = 10):

  """Creates soup for each link in dataframe, then extracts keywords
  from each article.

  Each URL in df['link'] is downloaded and parsed with BeautifulSoup. The
  comma-separated `content` of the first <meta name="keywords"> tag is split
  into a list of whitespace-stripped, non-empty keywords.

  Scraping is best-effort: a page that cannot be fetched or parsed (some
  websites do not seem to allow scraping), or that has no keywords tag,
  gets an empty list.

  Args:
    df: DataFrame with a 'link' column of article URLs. Modified in place.
    timeout: Seconds to wait for each page before giving up on it.

  Returns:
    The same DataFrame, with a new "keywords" column of lists.
  """

  master_keywords_list = []
  # Iterate over the values rather than df['link'][i]: that is a lookup by
  # label, which fails whenever the index is not exactly 0..n-1.
  for link in df['link']:
    keywords = []  # reset per article so no row reuses a previous row's keywords
    try:
      with urllib.request.urlopen(link, timeout = timeout) as page:
        html_text = page.read()
      soup = BeautifulSoup(html_text, 'html.parser')
      meta_tag = soup.find('meta', attrs = {'name': 'keywords'})
      if meta_tag is not None:
        # Read the attribute itself; splitting the tag text on quotes only
        # works when `content` happens to be the first attribute.
        content = meta_tag.get('content') or ''
        keywords = [word.strip() for word in content.split(',') if word.strip()]
    except Exception:
      keywords = []
    master_keywords_list.append(keywords)

  df["keywords"] = master_keywords_list
  return df

In [None]:
## Sanity check: scrape the keywords of every article into a new "keywords"
## column of df, then preview the first rows.
df = scrape_keywords(df)
preview_rows = 10
print(df.head(preview_rows))

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


                                                link  ...                                           keywords
0  https://philadelphia.cbslocal.com/2021/11/29/k...  ...  [north philadelphia,  shooting,  deadly,  gun ...
1  https://philadelphia.cbslocal.com/2021/12/01/l...  ...  [latif williams,  samuel collington,  philadel...
2  https://www.chron.com/news/article/Philadelphi...  ...                                                 []
3  https://www.sfgate.com/news/article/Philadelph...  ...                                                 []
4  https://patch.com/pennsylvania/philadelphia/14...  ...                                                 []
5  https://philadelphia.cbslocal.com/2021/12/04/p...  ...  [philadelphia shooting,  philadelphia gun viol...
6  https://philadelphia.cbslocal.com/2021/12/05/p...  ...  [philadelphia shooting,  kensington shooting, ...
7  https://philadelphia.cbslocal.com/2021/12/04/p...  ...  [philadelphia shooting,  philly shooting,  law...
8  https://apnews.c

In [None]:
def scrape_external_links(df, timeout = 10):

  """Creates soup for each link in df and extracts the
  links that link to other news articles.

  Each URL in df['link'] is downloaded and parsed with BeautifulSoup, and the
  href of every <a class="bump-view"> anchor is collected into a set.
  NOTE(review): 'bump-view' presumably marks related-article links on the
  scraped sites - confirm against the target pages.

  Scraping is best-effort: a page that cannot be fetched or parsed (some
  websites do not seem to allow scraping) gets an empty set.

  Args:
    df: DataFrame with a 'link' column of article URLs. Modified in place.
    timeout: Seconds to wait for each page before giving up on it.

  Returns:
    The same DataFrame, with a new "external_links" column of sets.
  """

  master_links_list = []
  # Iterate over the values rather than df['link'][i]: that is a lookup by
  # label, which fails whenever the index is not exactly 0..n-1.
  for link in df['link']:
    external_links = set()
    try:
      with urllib.request.urlopen(link, timeout = timeout) as page:
        html_text = page.read()
      soup = BeautifulSoup(html_text, 'html.parser')
      # href=True skips anchors without an href, so one malformed anchor
      # cannot discard every link found on the page.
      for anchor in soup.find_all('a', {'class' : 'bump-view'}, href = True):
        external_links.add(anchor['href'])
    except Exception:
      # Same type as the success case, so the column never mixes sets and lists.
      external_links = set()
    master_links_list.append(external_links)

  df["external_links"] = master_links_list
  return df


In [None]:
## Sanity check: scrape the external links of every article into a new
## "external_links" column of df and display the result.
df_with_links = scrape_external_links(df)
df_with_links

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Unnamed: 0,link,title,summary,published_date,keywords,external_links
0,https://philadelphia.cbslocal.com/2021/11/29/k...,16-Year-Old Shot 8 Times In Deadly North Phila...,PHILADELPHIA (CBS) — A 16-year-old boy is dead...,2021-11-29 09:15:38,"[north philadelphia, shooting, deadly, gun ...",{https://philadelphia.cbslocal.com/2021/12/04/...
1,https://philadelphia.cbslocal.com/2021/12/01/l...,"What We Know About 17-Year-Old Latif Williams,...",PHILADELPHIA (CBS) — Philadelphia police are s...,2021-12-01 11:43:44,"[latif williams, samuel collington, philadel...",{https://philadelphia.cbslocal.com/2021/12/04/...
2,https://www.chron.com/news/article/Philadelphi...,"Philadelphia shooting leaves 2 men dead, anoth...",PHILADELPHIA (AP) — A shooting in a Philadelph...,2021-12-05 16:59:12,[],{}
3,https://www.sfgate.com/news/article/Philadelph...,"Philadelphia shooting leaves 2 men dead, anoth...",PHILADELPHIA (AP) — A shooting in a Philadelph...,2021-12-05 16:59:12,[],{}
4,https://patch.com/pennsylvania/philadelphia/14...,14-Year-Old Boy Dies After Being Shot 18 Times...,The boy was on his way home from school when h...,2021-11-30 17:42:11,[],{}
...,...,...,...,...,...,...
315,https://www.espn.com/nfl/story/_/id/32796117/n...,"NFL Week 13 takeaways: What we learned, big qu...",Week 13of the 2021 NFL seasonbegan with the Co...,2021-12-05 21:39:33,[Arizona rolled to a win with Kyler Murray and...,{}
316,http://www.espn.com/page2/s/greenberg/040414.html,Greenberg: Tamed Tiger,After he finished 22nd last weekend in his que...,2021-12-06 09:00:00,[],{}
317,http://www.espn.com/page2/s/wiley/021018.html,Wiley: Hunting for answers in the land of a ma...,"Under the gun. Athletes, coaches and other peo...",2021-12-01 05:45:00,[],{}
318,https://www.cbsnews.com/news/60-minutes-morley...,Morley Safer: A Reporter's Life,Editor's Note: 60 Minutes correspondent Morley...,2021-12-02 21:15:00,"[steve kroft, retirement, Tom Brokaw, jeff ...",{}


In [None]:
def colab_to_excel(df, file_name = "data.xlsx"):

  """Writes df to an Excel file called file_name, then hands that file
  to google.colab's files.download. Always returns 1."""

  df.to_excel(excel_writer = file_name)
  files.download(file_name)
  status = 1
  return status

def colab_to_csv(df, file_name = "data.csv"):

  """Writes df to a CSV file called file_name, then hands that file
  to google.colab's files.download. Always returns 1."""

  df.to_csv(path_or_buf = file_name)
  files.download(file_name)
  status = 1
  return status

In [None]:
## Sanity check: export the scraped dataframe from Colab, first as an Excel
## spreadsheet and then as a CSV file.
colab_to_excel(df, file_name = "scraped_articles_with_keywords.xlsx")
colab_to_csv(df, file_name = "scraped_articles_with_keywords.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

1