### Scraping Denver CBS local news
The scraper collects articles from the local news, health, and politics sections of the Denver CBS local news site.

In [None]:
import requests                # to request the webpage
from bs4 import BeautifulSoup  # to make soup and pull data out of HTML
import urllib.robotparser      # to check the legitimacy to scrap the web
import json                    # to save the output as json file
import pandas as pd            # to  see saved data as dataframe 
from datetime import datetime  # to get the current datetime
#import IPython                 # to display the webpage

### Check permission to scrape the webpage

In [None]:
robotpars = urllib.robotparser.RobotFileParser()  # instantiate the RobotFileParser

# robots.txt is only served from the root of the host. Pointing the parser at a
# nested path (e.g. /category/news/robots.txt) yields a "not found" response,
# which the parser treats as "everything is allowed", so the check below would
# print True no matter what the site's real rules say.
robotpars.set_url("https://denver.cbslocal.com/robots.txt")
robotpars.read()  # download and parse robots.txt

# True means the generic user agent "*" is allowed to fetch the news section
print("Can we fetch cbslocal website?",
      robotpars.can_fetch("*", "https://denver.cbslocal.com/category/news/"))


Can we fetch cbslocal website? True


In [None]:
def getSoup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    url: address of the page to fetch.
    timeout: seconds to wait for the server before requests gives up and
             raises a timeout error. Without it requests waits indefinitely,
             so a single stalled connection would hang the whole crawl.

    Returns the BeautifulSoup object built from the response text with
    Python's built-in "html.parser".
    """
    # request the webpage and get the text
    pagetext = requests.get(url, timeout=timeout).text
    # make a soup using the html parser for the content of the web page
    soup = BeautifulSoup(pagetext, "html.parser")
    return soup

In [None]:
def getCategoryLink(soup):
    """Return the links of the news categories found on a page.

    soup: the soup of the web page the category links are extracted from.

    The heading link of the "blogroll-item template-catalog" section (the
    local-news section) comes first when that section exists, followed by the
    heading link of every "blogroll-item template-headline" section (Health
    and Politics). A section without an <h4><a href="..."> heading is skipped,
    so a layout change yields a shorter list rather than an exception.
    """
    # raised when a section has no <h4>, the <h4> has no <a>, or the <a> has no href
    missing_link_errors = (AttributeError, TypeError, KeyError)

    catalog = soup.find("section", attrs={"class": "blogroll-item template-catalog"})
    headlines = soup.find_all("section", attrs={"class": "blogroll-item template-headline"})

    # the catalog section is optional; the original crashed when it was absent
    candidates = [] if catalog is None else [catalog]
    candidates.extend(headlines)

    sec_list = []  # category links, in page order
    for section in candidates:
        try:
            sec_list.append(section.h4.a["href"])
        except missing_link_errors:
            continue

    return sec_list
  
def getNewsLinkUnderCategory(ctgry_link):
    """Return the article links listed on a category page.

    ctgry_link: URL of the category page.

    Two page layouts are handled: the local-news page lists articles inside
    <div class="embed-item embed-list blogroll-item"> elements, while the
    Health and Politics pages list them inside a
    <section class="blogroll-item template-catalog">.

    Anchors without an href are skipped (a None entry would make the later
    page request fail), and a link is returned only once, in first-seen order,
    so the same article is not scraped twice.
    """
    ctgry_soup = getSoup(ctgry_link)  # make soup for the category page
    anchors = []

    # links under local news
    for blog in ctgry_soup.find_all("div", attrs={"class": "embed-item embed-list blogroll-item"}):
        anchors.extend(blog.find_all("a"))

    # links under Health and Politics
    catalog = ctgry_soup.find("section", attrs={"class": "blogroll-item template-catalog"})
    if catalog is not None:
        anchors.extend(catalog.find_all("a"))

    news_list = []
    seen = set()
    for anchor in anchors:
        href = anchor.get("href")
        if href and href not in seen:
            seen.add(href)
            news_list.append(href)

    return news_list


In [None]:
# Quick check of the helpers: parse the news landing page and list the
# category links found on it (the cell output shows the returned list).
soup= getSoup("https://denver.cbslocal.com/category/news/")
getCategoryLink(soup)

['https://denver.cbslocal.com/category/news/local/',
 'http://denver.cbslocal.com/category/news/health/',
 'http://denver.cbslocal.com/category/news/politics/']

### Collect the data

In [None]:
def getNewsInfo(news_link):
    ''' Returns a dictionary with information about one news article.

        news_link: the link from which the information is collected.

        Keys of the returned dictionary:
          Source            - always "CBS_Denver"
          Url               - news_link, unchanged
          PublishedDateTime - text of <time class="post-date">, or "NA"
          Headline          - text of <h1 class="title">, or "NA"
          Content           - text of every <p> inside
                              <div class="main-story-wrapper"> joined with
                              single spaces, or "NA" when the wrapper is missing
    '''
    news_soup = getSoup(news_link)  # make a soup

    def text_of(tag_name, css_class):
        """Whitespace-stripped text of the first matching tag, or "NA"."""
        tag = news_soup.find(tag_name, attrs={"class": css_class})
        if tag is None:
            return "NA"
        # get_text is used instead of .string because .string is None whenever
        # the tag contains nested markup, which silently lost the value
        return tag.get_text(separator=" ", strip=True) or "NA"

    # Get Headline
    #==============================
    headline = text_of("h1", "title")

    # Get Published Date and Time
    #=================================
    published = text_of("time", "post-date")

    # Get Content
    #===================================
    story = news_soup.find("div", attrs={"class": "main-story-wrapper"})
    if story is None:
        content = "NA"
    else:
        # text of each paragraph, with non-breaking spaces normalised
        paragraphs = [
            p.get_text(separator=" ", strip=True).replace("\xa0", " ")
            for p in story.find_all("p")
        ]
        content = " ".join(paragraphs)  # concatenate the paragraphs into a single string

    # collect all the data as a dictionary
    return {
        "Source": "CBS_Denver",  # the same for all links obtained from Denver CBS local
        "Url": news_link,
        "PublishedDateTime": published,
        "Headline": headline,
        "Content": content,
    }

In [None]:
url = "http://denver.cbslocal.com/category/news/"  # the page we are going to scrape
category = getCategoryLink(getSoup(url))  # get list of category links
all_data = []  # one dictionary per successfully scraped article

# collect data from the categories
for ctgry in category:
    links = getNewsLinkUnderCategory(ctgry)
    for link in links:
        try:
            all_data.append(getNewsInfo(link))
        except requests.RequestException as err:
            # one unreachable article must not abort the crawl and throw away
            # everything collected so far; report it and move on
            print("Skipping", link, "-", err)

data = pd.DataFrame(all_data)  # make a dataframe

In [None]:
# (rows, columns) of the scraped table, then a preview of its first rows
print(data.shape)
data.head()

(42, 5)


Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content
0,CBS_Denver,https://denver.cbslocal.com/2022/04/28/these-w...,"April 28, 2022 at 1:21 pm",Film Created At Cañon City Prison Humanizes Li...,(CBS4) – A new film goes inside Colorado’s old...
1,CBS_Denver,https://denver.cbslocal.com/2022/04/28/severe-...,"April 28, 2022 at 12:24 pm",Severe Drought In Colorado Jumps 15% In One We...,DENVER (CBS4) – Drought in Colorado has been w...
2,CBS_Denver,https://denver.cbslocal.com/2022/04/28/paul-jo...,"April 28, 2022 at 12:22 pm",Jury Awards Nearly $9M To Families Who Accused...,(CBS4) – A jury has awarded nearly $9 million ...
3,CBS_Denver,https://denver.cbslocal.com/2022/04/28/trahavo...,"April 28, 2022 at 11:30 am","Trahavonie Smith Arrested, Charged With Killin...",(CBS4) – Denver police have arrested a man sus...
4,CBS_Denver,https://denver.cbslocal.com/2022/04/28/basebal...,"April 28, 2022 at 10:36 am",Highly Publicized Efforts To Repair Kennedy Hi...,DENVER (CBS4) – A pitch to renovate the baseba...


### Add more features

In [None]:
# run this command on the first execution of the notebook in a fresh runtime
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 18.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
import spacy
from spacy.lang.en import English
import glob
import os
!pip install googletrans
from googletrans import Translator

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.7 MB/s 
Collecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 30.6 MB/s 
[?25hCollecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 871 kB/s 
Collecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 3.0 MB/s 
[?25hCollecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.1 MB/s 
[?25hCollecting hyperframe<6,>=5.2.0
  Downloading hyperframe-5.2.0-py2

In [None]:
def language_detect(data):
    """Best-effort detection of the language of a piece of text.

    data: the text to inspect.

    Returns the `lang` attribute of the result of googletrans'
    Translator.detect, or 'Unknown' when detection fails for any reason.

    NOTE(review): the output saved in this notebook shows 'Unknown' for every
    row, which suggests detection was failing for all inputs - worth checking
    the installed googletrans version before trusting this column.
    """
    trans = Translator()
    try:
        return trans.detect(data).lang
    except Exception:
        # Exception rather than a bare except, so that interrupting the kernel
        # during a long run is not swallowed and reported as 'Unknown'
        return 'Unknown'

def language_translate(data):
    """Best-effort translation of a piece of text into English.

    data: the text to translate.

    Returns the `text` attribute of the result of googletrans'
    Translator.translate(data, dest='en'), or 'Translation Failed' when the
    translation fails for any reason.

    NOTE(review): the output saved in this notebook shows 'Translation Failed'
    for every row, which suggests the translator was failing for all inputs -
    worth checking the installed googletrans version.
    """
    trans = Translator()
    try:
        return trans.translate(data, dest='en').text
    except Exception:
        # Exception rather than a bare except, so that interrupting the kernel
        # during a long run is not swallowed and reported as a failed translation
        return 'Translation Failed'

def key_search(data, key):
    """Tell whether `key` occurs in `data`, ignoring case.

    data: the text that is searched.
    key: the keyword looked for.

    Returns 1 when the lower-cased key is a substring of the lower-cased
    data, 0 otherwise.
    """
    haystack = data.lower()
    needle = key.lower()
    return 1 if needle in haystack else 0

def keyword_scores(data, keys):
    """Count, per category, how many of its keywords occur in a text.

    data: the text that is searched.
    keys: mapping of category name -> list of keywords for that category.
          None or an empty mapping gives an empty result.

    Returns a dictionary category name -> number of that category's keywords
    found in `data` (each keyword counts at most once, see key_search).
    A category whose keywords cannot be searched (for example a non-string
    keyword or non-string data) scores 0. The failure is handled per category,
    so one bad category no longer removes the categories after it.
    """
    res = {}
    for key in (keys or {}):
        try:
            res[key] = sum(key_search(data, term) for term in keys[key])
        except (AttributeError, TypeError):
            res[key] = 0

    return res

In [None]:
import io
import json
import os
import urllib

import requests

# Raw URL of the keyword CSV on GitHub (one column per keyword category).
csv_url = 'https://raw.githubusercontent.com/AR-github-AWS/testrepo/main/Keywords%20for%20Data%20Science.csv'

# SECURITY: access tokens must not be committed with the notebook. If the
# repository is private, export the raw-file token as GITHUB_RAW_TOKEN before
# starting the kernel; it is then sent as the `token` query parameter.
github_token = os.environ.get('GITHUB_RAW_TOKEN')
request_params = {'token': github_token} if github_token else None

github_session = requests.Session()
response = github_session.get(csv_url, params=request_params, timeout=30)
# Fail loudly on an expired token or missing file: without this check the body
# of the error page is parsed as if it were the CSV and every keyword score
# silently becomes 0.
response.raise_for_status()
download = response.content

# on_bad_lines='skip' replaces the deprecated error_bad_lines=False (pandas >= 1.3)
downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), on_bad_lines='skip')
df_keywords = downloaded_csv

# One keyword list per column. Columns of different lengths are padded with NaN
# by read_csv; the padding is dropped here rather than forward-filled, so no
# keyword is counted more than once when scoring.
keywords = {
    column: df_keywords[column].dropna().astype(str).tolist()
    for column in df_keywords.columns
}



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
def GetFacetCols(dataframe, ColumnName):
    """Add keyword, language and named-entity columns to `dataframe`.

    dataframe: pandas DataFrame holding the text to analyse. It is modified
               in place and also returned.
    ColumnName: name of the column that contains the text.

    Each value of the column is converted to str and run through spaCy's
    "en_core_web_sm" pipeline. The distinct entity texts, in order of first
    appearance, are stored per row in:
        GeoFacet (GPE), OrgFacet (ORG), PeopleFacet (PERSON),
        MoneyFacet (MONEY), DateFacet (DATE)
    Three more columns are computed from the raw text:
        'keyword score'      - total keyword hits, from keyword_scores and the
                               notebook-level `keywords` mapping
        'LanguageFacet'      - result of language_detect
        'Translated Content' - result of language_translate
    """
    # spaCy entity label -> output column, in the order the columns are added
    facet_columns = {
        "GPE": "GeoFacet",
        "ORG": "OrgFacet",
        "PERSON": "PeopleFacet",
        "MONEY": "MoneyFacet",
        "DATE": "DateFacet",
    }

    nlp = spacy.load("en_core_web_sm")
    facets = {label: [] for label in facet_columns}  # one list of lists per label

    for row in dataframe[ColumnName]:
        found = {label: [] for label in facet_columns}
        for ent in nlp(str(row)).ents:  # loop over the entities
            bucket = found.get(ent.label_)
            # labels that are not tracked have no bucket; repeats are kept once
            if bucket is not None and ent.text not in bucket:
                bucket.append(ent.text)
        for label, entity_texts in found.items():
            facets[label].append(entity_texts)

    contents = dataframe[ColumnName]
    dataframe['keyword score'] = [sum(keyword_scores(text, keywords).values()) for text in contents]
    dataframe['LanguageFacet'] = [language_detect(text) for text in contents]
    dataframe['Translated Content'] = [language_translate(text) for text in contents]
    for label, column in facet_columns.items():
        dataframe[column] = facets[label]

    return dataframe

In [None]:
# GetFacetCols assigns its new columns directly on the frame it is given,
# so `data` is enriched in place and the return value is not needed here.
GetFacetCols(data, 'Content')
# adding blank placeholder columns (a single space) to the data frame
data["Threat"] = " "
data["Useful"] = " "
data["Comment"] = " "

### Data is here

In [None]:
# show the enriched table (scraped fields plus the facet and placeholder columns)
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content,keyword score,LanguageFacet,Translated Content,GeoFacet,OrgFacet,PeopleFacet,MoneyFacet,DateFacet,Threat,Useful,Comment
0,CBS_Denver,https://denver.cbslocal.com/2022/04/28/these-w...,"April 28, 2022 at 1:21 pm",Film Created At Cañon City Prison Humanizes Li...,(CBS4) – A new film goes inside Colorado’s old...,0,Unknown,Translation Failed,"[Colorado, Cañon City]","[CBS4, the Territorial Correctional Facility, ...","[Eric Davis, Dean Williams, Walls, Williams, C...",[],"[last year, 150th year, today, 34 years, early...",,,
1,CBS_Denver,https://denver.cbslocal.com/2022/04/28/severe-...,"April 28, 2022 at 12:24 pm",Severe Drought In Colorado Jumps 15% In One We...,DENVER (CBS4) – Drought in Colorado has been w...,0,Unknown,Translation Failed,"[Colorado, Denver, Broomfield, Thornton, North...","[CBS, Greeley, Highland Ranch, Arvada]","[Aurora, Jefferson]",[],"[recent weeks, this April, This past week, Apr...",,,
2,CBS_Denver,https://denver.cbslocal.com/2022/04/28/paul-jo...,"April 28, 2022 at 12:22 pm",Jury Awards Nearly $9M To Families Who Accused...,(CBS4) – A jury has awarded nearly $9 million ...,0,Unknown,Translation Failed,[],"[CBS4, Grand Junction, CBS) Jones]","[Paul Jones, Jones, Floyd Elliott]","[nearly $9 million, $8.7 million]","[the 1980s, 2019]",,,
3,CBS_Denver,https://denver.cbslocal.com/2022/04/28/trahavo...,"April 28, 2022 at 11:30 am","Trahavonie Smith Arrested, Charged With Killin...",(CBS4) – Denver police have arrested a man sus...,0,Unknown,Translation Failed,[Denver],[Denver Police],"[Trahavonie Smith, Smith]",[],"[West 13th, Wednesday]",,,
4,CBS_Denver,https://denver.cbslocal.com/2022/04/28/basebal...,"April 28, 2022 at 10:36 am",Highly Publicized Efforts To Repair Kennedy Hi...,DENVER (CBS4) – A pitch to renovate the baseba...,0,Unknown,Translation Failed,"[Denver, the Colorado Rockies]","[Major League Baseball, CBS, Denver Public Sch...","[Ron Gallegos, John F. Kennedy High School, Ke...","[$5 million, thousands of dollars]","[About a year ago, 2021, about 3 years, 11-14 ...",,,
5,CBS_Denver,https://denver.cbslocal.com/2022/04/28/making-...,"April 28, 2022 at 10:30 am",Overcoming Inflation A Primary Focus For Color...,DENVER (CBS4) – Inflation is not only hurting ...,0,Unknown,Translation Failed,"[Colorado, Denver, Johnston, U.S.]","[BRL Group, CBS, CBS4, Kapitus, Fed, IRS, Kapi...","[Lilliana Luna, Luna, Kelly Werthmann, Ben Joh...","[about $1,000, 2,300]","[summer, every single day, about every day, 20...",,,
6,CBS_Denver,https://denver.cbslocal.com/2022/04/28/young-m...,"April 28, 2022 at 10:28 am",Young Colorado Motel Owner Pushes Through Pand...,"SALIDA, Colo. (CBS4) – Anita Kudasik had grown...",0,Unknown,Translation Failed,"[SALIDA, Colo., Salida]","[CBS, Metropolitan State University of Denver,...","[Anita Kudasik, Kudasik, Kudaski]",[],"[several years, 2018, the year, the past 10 ye...",,,
7,CBS_Denver,https://denver.cbslocal.com/2022/04/28/addicti...,"April 28, 2022 at 9:59 am",Denver Health Preparing For Capacity Changes I...,(CBS4) – Every week Denver Health’s Center for...,0,Unknown,Translation Failed,[],[Denver Health’s Center for Addiction Medicine...,"[Josh Blum, Blum, Brook Bender]",[],[months long],,,
8,CBS_Denver,https://denver.cbslocal.com/2022/04/28/larimer...,"April 28, 2022 at 8:57 am",Larimer Sheriff: Toddler Home With Family Afte...,"WELLINGTON, Colo. (CBS4) — A young boy is back...",0,Unknown,Translation Failed,"[WELLINGTON, Colo.]",[the Larimer County Sheriff Facebook post],[LCSO],[],[Thursday],,,
9,CBS_Denver,https://denver.cbslocal.com/2022/04/28/6th-ave...,"April 28, 2022 at 8:35 am",Crash Backs Up Traffic On 6th Avenue West of I...,(CBS4) — A crash on 6th Avenue west of Interst...,0,Unknown,Translation Failed,[],"[CBS4, CBS, CDOT, Federal Boulevard]",[Sheridan Boulevard],[],[Thursday],,,


In [None]:
######## Click the blue magic-wand icon somewhere around the table above; it is probably right above the hashtags at the beginning of this code cell.

### Future use please ignore
#### Save the data

In [None]:
# storing at "output" dir
#date = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
#file_name = "CBSLocal_Denver_" +date+ ".csv"
#data.to_csv(file_name, index = False)

# Storing a copy for analysts in the "analysts" dir
#data.to_csv( "/dbfs/mnt/analysts/" + file_name, index = False)

In [None]:
# Appendix: not needed to generate the data above
#def getCategoryLinkfrmMainPg(soup):
   # """Returns a list containing the links of news categories
    #   soup: is the soup of a webpage from which the links of the categories extracted.
    # """
    #sec_list= [] # placeholder for section link
    
    #cbsn_link= soup.find("div", attrs={"class":"logo-container"}).a["href"] # get the link for cbsn-denver
    #sec_list.append(cbsn_link)
    
    #sections= soup.find("section", attrs={"id": "column_block_33"}) # sections parent 
    #try:
      #for section in sections.children: # loop over the children sections
     #   link= section.h4.a # get the links under h4 from each section 
    #    if link: # if exist 
   #       sec_list.append(link["href"])
  #  except:
      #pass
    
    #return sec_list
   

In [None]:
#getCategoryLinkfrmMainPg(res)