### Scraping Denver CBS local news
The scraper collects data from the local news, health and politics sections of Denver CBS local news.

In [26]:
import requests                # to request the webpage
from bs4 import BeautifulSoup  # to make soup and pull data out of HTML
import urllib.robotparser      # to check the legitimacy to scrap the web
import json                    # to save the output as json file
import pandas as pd            # to  see saved data as dataframe 
from datetime import datetime  # to get the current datetime
#import IPython                 # to display the webpage

### Check permission to scrape the webpage

In [27]:
robotpars = urllib.robotparser.RobotFileParser()  # instantiate the RobotFileParser

# robots.txt is served from the root of the host. A nested path such as
# /category/news/robots.txt is not a robots file: RobotFileParser.read() treats
# a 4xx answer (other than 401/403) as "everything is allowed", so the check
# below would print True regardless of the site's real rules.
robotpars.set_url("https://denver.cbslocal.com/robots.txt")
robotpars.read()  # download and parse the robots.txt

# can_fetch() returns True when the given user agent ("*" = any) may fetch the url
print("Can we fetch cbslocal website?",
      robotpars.can_fetch("*", "https://denver.cbslocal.com/category/news/"))


Can we fetch cbslocal website? True


In [28]:
def getSoup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    url: address of the page to request.
    timeout: seconds to wait for the server before giving up. It is passed to
             requests.get; without it a stalled connection blocks forever.

    Returns the BeautifulSoup object built from the response text with the
    built-in "html.parser".
    Exceptions raised by requests.get (connection errors, timeouts, malformed
    urls) are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")

In [29]:
def getCategoryLink(soup):
    """Return the list of news-category links found on a page.

    soup: parsed page (BeautifulSoup object) from which the category links
          are extracted.

    The link of the first "blogroll-item template-catalog" section comes
    first, followed by the links of every "blogroll-item template-headline"
    section, in page order. Sections that are missing, or that have no
    <h4><a href=...> heading link, are skipped.
    """

    def heading_href(section):
        """Return the href of the section's <h4><a> link, or None if absent."""
        try:
            return section.h4.a["href"]
        except (AttributeError, TypeError, KeyError):
            # AttributeError: section or its <h4> is missing (None)
            # TypeError:      the <h4> holds no <a>
            # KeyError:       the <a> has no href attribute
            return None

    # local-news section first, then the remaining category sections
    candidates = [soup.find("section", attrs={"class": "blogroll-item template-catalog"})]
    candidates.extend(soup.find_all("section", attrs={"class": "blogroll-item template-headline"}))

    return [href for href in map(heading_href, candidates) if href is not None]
  
def getNewsLinkUnderCategory(ctgry_link):
    """Return the list of news links found on a category page.

    ctgry_link: url of the category page.

    Links are collected from every "embed-item embed-list blogroll-item" div
    first, then from the first "blogroll-item template-catalog" section, in
    page order. Anchors without an href are skipped so that the result only
    holds values that can be requested later.
    """
    ctgry_soup = getSoup(ctgry_link)  # make soup for the category page

    # containers that hold article links; find_all simply returns an empty
    # result when nothing matches, so no exception handling is needed here
    containers = list(ctgry_soup.find_all("div", attrs={"class": "embed-item embed-list blogroll-item"}))

    # find() returns None when the page has no catalog section
    catalog = ctgry_soup.find("section", attrs={"class": "blogroll-item template-catalog"})
    if catalog is not None:
        containers.append(catalog)

    news_list = []
    for container in containers:
        for anchor in container.find_all("a"):
            href = anchor.get("href")
            if href:  # ignore <a> tags that carry no (or an empty) href
                news_list.append(href)

    return news_list


In [30]:
# Quick check of the helpers: parse the news landing page ...
soup= getSoup("https://denver.cbslocal.com/category/news/")
# ... and display the category links extracted from it (shown as the cell output)
getCategoryLink(soup)

['https://denver.cbslocal.com/category/news/local/',
 'http://denver.cbslocal.com/category/news/health/',
 'http://denver.cbslocal.com/category/news/politics/']

### Collect the data

In [31]:
def _tag_text(tag):
    """Return the text of a parsed tag as a plain str, or "NA" when there is none."""
    if tag is None:
        return "NA"
    text = tag.string
    if text is None:
        # .string is None when the tag wraps more than one child node
        # (e.g. nested markup); fall back to the combined text of the tag
        text = tag.get_text(" ", strip=True)
    return str(text) or "NA"


def getNewsInfo(news_link):
    '''Return a dictionary with information about one news article.

    news_link: the link from which the information is collected.

    The dictionary has the keys "Source", "Url", "PublishedDateTime",
    "Headline" and "Content". A field that cannot be found on the page is set
    to "NA". Errors raised by getSoup while downloading the page are not
    caught here.
    '''
    Source = "CBS_Denver"  # the same for all links obtained from Denver CBS local

    news_soup = getSoup(news_link)  # make a soup

    # Headline and publication date/time
    Headline = _tag_text(news_soup.find("h1", attrs={"class": "title"}))
    PublishedDateTime = _tag_text(news_soup.find("time", attrs={"class": "post-date"}))

    # Content: the paragraphs of the story wrapper joined into a single string
    story = news_soup.find("div", attrs={"class": "main-story-wrapper"})
    if story is None:
        Content = "NA"
    else:
        paragraphs = [
            # non-breaking spaces are replaced with plain spaces
            p.get_text(separator=" ", strip=True).replace("\xa0", " ")
            for p in story.find_all("p")
        ]
        Content = " ".join(paragraphs) or "NA"

    # collect all the data as a dictionary
    return {"Source": Source, 'Url': news_link, 'PublishedDateTime': PublishedDateTime,
            'Headline': Headline, 'Content': Content}

In [32]:
# NOTE(review): other cells of this notebook request this same page over
# https://; confirm whether plain http:// is intentional here.
url= "http://denver.cbslocal.com/category/news/" # the page we are going to scrape
category= getCategoryLink(getSoup(url))  # list of category links found on that page
all_data=[] # collects one dictionary per news article

# collect data from the categories: every news link under every category
# is downloaded and parsed by getNewsInfo (one request per link)
for ctgry in category:  
    links= getNewsLinkUnderCategory(ctgry)
    for link in links:
        all_data.append(getNewsInfo(link))

data= pd.DataFrame(all_data) # one row per article, one column per dictionary key

In [33]:
# (rows, columns) of the scraped data, followed by its first five rows
print(data.shape)
data.head()

(42, 5)


Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content
0,CBS_Denver,https://denver.cbslocal.com/2022/04/25/driest-...,"April 25, 2022 at 1:18 pm",Denver On Track To Have Driest April In Record...,DENVER (CBS4) – Despite a few light rain showe...
1,CBS_Denver,https://denver.cbslocal.com/2022/04/25/water-m...,"April 25, 2022 at 12:55 pm",Crews Continue With Repairs On Pipe That Burst...,DENVER (CBS4) – Crews continued to work on rep...
2,CBS_Denver,https://denver.cbslocal.com/2022/04/25/univers...,"April 25, 2022 at 12:55 pm",Gov. Jared Polis Signs Universal Pre-K Bill In...,DENVER (CBS4) – Gov. Jared Polis signed a bill...
3,CBS_Denver,https://denver.cbslocal.com/2022/04/25/total-l...,"April 25, 2022 at 12:35 pm",Total Lunar Eclipse Will Bring Full Super Flow...,DENVER (CBS4) – If the weather cooperates on M...
4,CBS_Denver,https://denver.cbslocal.com/2022/04/25/dennis-...,"April 25, 2022 at 12:22 pm","Dennis Gallagher, Colorado Political Figure, D...",(CBS4) — Dedicated Denver politician Dennis Ga...


### Add more features

In [34]:
# Run this command the first time the notebook is executed in a new environment:
# it downloads the spaCy model "en_core_web_sm" that is loaded later with spacy.load.
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [35]:
import spacy
from spacy.lang.en import English
import glob
import os
!pip install googletrans
from googletrans import Translator



In [36]:
def language_detect(data, translator=None):
    """Detect the language of a text with googletrans.

    data: the text to analyse.
    translator: optional translator object to reuse (anything with a
                detect(text) method); a new Translator is created when omitted.

    Returns the `lang` attribute of the detection result, or 'Unknown' when
    detection fails for any reason.
    """
    trans = Translator() if translator is None else translator
    try:
        return trans.detect(data).lang
    except Exception:
        # Best effort: a failed lookup must not stop the whole run. Catching
        # Exception (not a bare except) still lets KeyboardInterrupt through,
        # so the cell can be interrupted.
        return 'Unknown'

def language_translate(data, translator=None):
    """Translate a text to English with googletrans.

    data: the text to translate.
    translator: optional translator object to reuse (anything with a
                translate(text, dest=...) method); a new Translator is created
                when omitted.

    Returns the `text` attribute of the translation result, or
    'Translation Failed' when the translation fails for any reason.
    """
    trans = Translator() if translator is None else translator
    try:
        return trans.translate(data, dest='en').text
    except Exception:
        # Best effort: a failed lookup must not stop the whole run. Catching
        # Exception (not a bare except) still lets KeyboardInterrupt through,
        # so the cell can be interrupted.
        return 'Translation Failed'

def key_search(data, key):
    """Return 1 when `key` occurs in `data` (case-insensitive), otherwise 0.

    data: the text to search in.
    key: the keyword to look for.
    """
    return 0 if data.lower().find(key.lower()) < 0 else 1


def keyword_scores(data, keys):
    """Count, per category, how many of its keywords occur in a text.

    data: the text to search in.
    keys: dictionary mapping a category name to a list of keywords.

    Returns a dictionary {category: number of its keywords found in data}.
    A category whose keywords cannot be searched (non-string keyword or text)
    scores 0; the remaining categories are still evaluated. An empty or
    missing `keys` gives an empty dictionary.
    """
    res = {}
    if not keys:
        return res

    for category, terms in keys.items():
        try:
            res[category] = sum(key_search(data, term) for term in terms)
        except (AttributeError, TypeError):
            # AttributeError: text or keyword is not a string (no .lower())
            # TypeError:      the keyword list of this category is not iterable
            res[category] = 0

    return res

In [37]:
import json, requests, urllib, io
import os

github_session = requests.Session()

# Never embed an access token in the notebook: it leaks with every copy of the
# file, and tokens appended to raw.githubusercontent.com urls expire. If the
# repository needs authentication, provide a token through the GITHUB_TOKEN
# environment variable instead.
github_token = os.environ.get("GITHUB_TOKEN")
if github_token:
    github_session.headers["Authorization"] = "token " + github_token

# raw url of the keyword csv on github
csv_url = 'https://raw.githubusercontent.com/AR-github-AWS/testrepo/main/Keywords%20for%20Data%20Science.csv'

response = github_session.get(csv_url, timeout=30)
# fail loudly on 4xx/5xx instead of parsing an error page as if it were the csv
# (which silently yields an empty keyword list and a score of 0 on every row)
response.raise_for_status()
download = response.content

# on_bad_lines='skip' replaces error_bad_lines=False, which is deprecated
# since pandas 1.3 and removed in pandas 2.0
downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), on_bad_lines='skip')
df_keywords = downloaded_csv

# One keyword list per column, without the empty cells of shorter columns.
# The previous astype('str').replace('nan') call had no replacement value; in
# the pandas versions that accept it, it fills each gap with the preceding
# keyword, so that keyword was counted several times in the keyword score.
keywords = {
    column: df_keywords[column].dropna().astype('str').tolist()
    for column in df_keywords.columns
}



  exec(code_obj, self.user_global_ns, self.user_ns)


In [38]:
def GetFacetCols(dataframe,ColumnName):
    """Add keyword, language and named-entity columns to a dataframe.

    dataframe: the dataframe to enrich; it is modified in place.
    ColumnName: name of the column that holds the text to analyse.

    Columns added, in this order:
      'keyword score'      - sum of keyword_scores() for the text, using the
                             module-level `keywords` dictionary
      'LanguageFacet'      - result of language_detect() for the text
      'Translated Content' - result of language_translate() for the text
      'GeoFacet', 'OrgFacet', 'PeopleFacet', 'MoneyFacet', 'DateFacet'
                           - per row, the distinct spaCy entity texts labelled
                             GPE, ORG, PERSON, MONEY and DATE, in order of
                             first appearance

    Returns the same dataframe object.
    """
    nlp = spacy.load("en_core_web_sm")

    # spaCy entity label -> name of the dataframe column it feeds
    label_to_column = {
        "GPE": "GeoFacet",
        "ORG": "OrgFacet",
        "PERSON": "PeopleFacet",
        "MONEY": "MoneyFacet",
        "DATE": "DateFacet",
    }
    facets = {column: [] for column in label_to_column.values()}

    for row in dataframe[ColumnName]:
        content = nlp(str(row))

        # dict keys keep the first-seen order and drop repeated entity texts
        found = {label: {} for label in label_to_column}
        for ent in content.ents:  # loop over the entities
            if ent.label_ in found:
                found[ent.label_][ent.text] = None

        for label, column in label_to_column.items():
            facets[column].append(list(found[label]))

    dataframe['keyword score'] = list(map( lambda x: sum(keyword_scores(x, keywords).values()), dataframe[ColumnName]))
    dataframe['LanguageFacet'] = list( map( language_detect, dataframe[ColumnName]))
    dataframe['Translated Content'] = list( map( language_translate, dataframe[ColumnName]))
    for column in label_to_column.values():
        dataframe[column] = facets[column]

    return dataframe

In [39]:
# enrich `data` in place with the keyword, language and entity columns
GetFacetCols(data, 'Content')
# placeholder columns, left blank so they can be filled in later
data["Threat"], data["Useful"], data["Comment"] = " ", " ", " "

### Data is here

In [40]:
# display the enriched dataframe as the cell output
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content,keyword score,LanguageFacet,Translated Content,GeoFacet,OrgFacet,PeopleFacet,MoneyFacet,DateFacet,Threat,Useful,Comment
0,CBS_Denver,https://denver.cbslocal.com/2022/04/25/driest-...,"April 25, 2022 at 1:18 pm",Denver On Track To Have Driest April In Record...,DENVER (CBS4) – Despite a few light rain showe...,0,Unknown,Translation Failed,"[Denver, Colorado]",[],[],[],"[late Sunday, this month, the 10th, April, 196...",,,
1,CBS_Denver,https://denver.cbslocal.com/2022/04/25/water-m...,"April 25, 2022 at 12:55 pm",Crews Continue With Repairs On Pipe That Burst...,DENVER (CBS4) – Crews continued to work on rep...,0,Unknown,Translation Failed,[],"[CBS, Denver Water, @DenverWater, Regis Univer...",[Perry],[],"[the weekend, Monday, Sunday, April 25, 2022, ...",,,
2,CBS_Denver,https://denver.cbslocal.com/2022/04/25/univers...,"April 25, 2022 at 12:55 pm",Gov. Jared Polis Signs Universal Pre-K Bill In...,DENVER (CBS4) – Gov. Jared Polis signed a bill...,0,Unknown,Translation Failed,"[Coloradans, Colorado]","[CBS, Department of Early Childhood]","[Jared Polis, Pre-K]","[more than $4,000]","[Monday, next year, 2023]",,,
3,CBS_Denver,https://denver.cbslocal.com/2022/04/25/total-l...,"April 25, 2022 at 12:35 pm",Total Lunar Eclipse Will Bring Full Super Flow...,DENVER (CBS4) – If the weather cooperates on M...,0,Unknown,Translation Failed,"[Colorado, Denver]",[The Full Super Flower Blood Moon],[],[],"[May 15, this year, last May, later this year,...",,,
4,CBS_Denver,https://denver.cbslocal.com/2022/04/25/dennis-...,"April 25, 2022 at 12:22 pm","Dennis Gallagher, Colorado Political Figure, D...",(CBS4) — Dedicated Denver politician Dennis Ga...,0,Unknown,Translation Failed,"[Denver, Colorado]","[the State Senate, Denver City Council, Gallag...","[Dennis Gallagher, Gallagher, Amanda Sandoval]",[],"[the age of 82, Friday, 2015]",,,
5,CBS_Denver,https://denver.cbslocal.com/2022/04/25/colfax-...,"April 25, 2022 at 12:11 pm",Colfax Avenue Closed Between Victor & Ursula I...,"AURORA, Colo. (CBS4) – All lanes of Colfax Ave...",0,Unknown,Translation Failed,"[Colo., Aurora, the City of Aurora]","[CBS, @AuroraGov, Aurora Police Dept (@AuroraP...","[Victor, Ursula, Colfax]",[],"[April 25, 2022]",,,
6,CBS_Denver,https://denver.cbslocal.com/2022/04/25/adams-c...,"April 25, 2022 at 12:10 pm",1 Man Killed In Shooting On Elbert Street In A...,"ADAMS COUNTY, Colo. (CBS4) — One man was kille...",0,Unknown,Translation Failed,"[ADAMS COUNTY, Colo., Adams County, Adams Coun...","[CBS, Adams Sheriff's]",[Elbert St.],[],"[Monday, April 25, 2022]",,,
7,CBS_Denver,https://denver.cbslocal.com/2022/04/25/denver-...,"April 25, 2022 at 12:07 pm",Denver Zoo Closed Until Noon Tuesday For Emerg...,DENVER (CBS4) – The Denver Zoo will be closed ...,0,Unknown,Translation Failed,[],"[The Denver Zoo, Denver Zoo, Zoo]",[],[],"[Tuesday, April 26, Tuesday, April 26, April 2...",,,
8,CBS_Denver,https://denver.cbslocal.com/2022/04/25/superio...,"April 25, 2022 at 11:22 am",Superior Residents Debate Whether Indoor Sprin...,"BOULDER COUNTY, Colo. (CBS4) – Residents of Su...",0,Unknown,Translation Failed,"[BOULDER COUNTY, Colo., Louisville]","[CBS), Trustee Neal Shah, CBS]","[Carrie Hilton, Hilton]",[],[Monday],,,
9,CBS_Denver,https://denver.cbslocal.com/2022/04/25/water-m...,"April 25, 2022 at 10:38 am",‘Metal Tends To Corrode’: Aging Pipelines Like...,DENVER (CBS4) – A 100-year-old pipe bursting i...,0,Unknown,Translation Failed,"[Denver, Berkley]","[CBS, Denver Water, Thompson, Metro State Univ...","[Stacy Bramer, Travis Thompson, Tom Cech, Cech]",[],"[Sunday, the 1920s, this time of year, Last we...",,,


In [41]:
######## Click the blue magic wand button somewhere around the table above; it is probably right above the hashtags at the beginning of this code cell

### Future use please ignore
#### Save the data

In [42]:
# storing at "output" dir
#date = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
#file_name = "CBSLocal_Denver_" +date+ ".csv"
#data.to_csv(file_name, index = False)

# Storing a copy for analysts in the "analysts" dir
#data.to_csv( "/dbfs/mnt/analysts/" + file_name, index = False)

In [43]:
# appendix, not needed to generate the above data
#def getCategoryLinkfrmMainPg(soup):
   # """Returns a list containing the links of news categories
    #   soup: is the soup of a webpage from which the links of the categories extracted.
    # """
    #sec_list= [] # placeholder for section link
    
    #cbsn_link= soup.find("div", attrs={"class":"logo-container"}).a["href"] # get the link for cbsn-denver
    #sec_list.append(cbsn_link)
    
    #sections= soup.find("section", attrs={"id": "column_block_33"}) # sections parent 
    #try:
      #for section in sections.children: # loop over the children sections
     #   link= section.h4.a # get the links under h4 from each section 
    #    if link: # if exist 
   #       sec_list.append(link["href"])
  #  except:
      #pass
    
    #return sec_list
   

In [44]:
#getCategoryLinkfrmMainPg(res)