### Scraping Crime under KRDO news
The scrapper collectes data from KRDO crime news(https://krdo.com/news/crime/)

In [1]:
import requests                # to request the webpage
from bs4 import BeautifulSoup  # to make soup and pull data out of HTML
import urllib.robotparser      # to check the legitimacy to scrap the web
import json                    # to save the output as json file
import pandas as pd            # to  see saved data as dataframe 
from datetime import datetime  # to get the current datetime
#import IPython                 # to display the webpage

In [2]:
#from newspaper import Article
#url= "https://krdo.com/news/2021/09/06/pueblo-police-man-arrested-in-connection-to-shooting/"
#article= Article(url)
#article.download()
#article.parse()

### Check permision to scrap the webpage

In [3]:
robotpars = urllib.robotparser.RobotFileParser()  #instantiate the RobotFileParser

#set the robots.txt url of cbs local news 
robotpars.set_url("https://krdo.com/robots.txt")
robotpars.read() # Reads the robots.txt 

# to check if useragent can fetch the url, true means fetching is possible. 
print("Can we fetch the website?", \
      robotpars.can_fetch("*", "https://krdo.com/")) 


Can we fetch the website? True


In [4]:
def getSoup(url):
    """make a soup for a webpage given its url""" 

    # request the webpage and get the text
    pagetext= requests.get(url).text
    #make a soup and use html parser for the content of the web page 
    soup= BeautifulSoup(pagetext, "html.parser")
    return soup
  
def getNewsLink(soup):
  """Returns a list containing the links of news article.
       soup: is the soup of a webpage from which the links will be extracted.
     """
  news_list=[] # place holder to collect all the links of the news article
  
  try:
    articles = soup.find_all("h3", attrs={"class": "story__title hdg hdg--4"})
    for artc in articles:
      link= artc.a["href"] # get the link for each news article
      news_list.append(link) 
  except:
    pass
  return news_list 

In [5]:
url= 'https://krdo.com/news/crime/'
soup= getSoup(url)
getNewsLink(soup)

['https://krdo.com/news/2022/04/25/death-investigation-on-pueblos-east-side/',
 'https://krdo.com/news/crime/pueblo-county-crime/2022/04/23/overnight-crash-in-pueblo-kills-two-kids-injures-others/',
 'https://krdo.com/news/crime/pueblo-county-crime/2022/04/23/man-recovering-from-gunshot-wound-to-the-head-following-shooting-in-pueblo/',
 'https://krdo.com/news/local-news/2022/04/22/on-friday-afternoon-an-el-paso-county-jury-found-xinan-xia-guilty-of-felony-pimping-a-single-count-of-keeping-a-place-of-prostitution-and-pandering-for-prostitution/',
 'https://krdo.com/news/2022/04/22/body-found-in-hanover-sheriffs-office-treating-it-as-possible-homicide/',
 'https://krdo.com/news/2022/04/22/court-martial-for-usafa-cadet-charged-with-sexual-assault-providing-alcohol-to-a-minor/',
 'https://krdo.com/news/2022/04/22/pueblo-police-search-for-suspect-in-fatal-crash/',
 'https://krdo.com/news/2022/04/21/la-junta-police-17-year-old-arrested-accused-of-shooting-15-year-old/']

### Collect the data

In [6]:
def getNewsInfo(news_link):
    ''' Returns a dictionary containing infomation about the news such as headline, content and published data.
        news_link: the link from which the information is collected'''
    
    Url = news_link  
    Source = "Article_KRDO"  # the same for all links obtained from the main source 
   
    news_soup= getSoup(news_link) # make a soup 
    
    # Get Headline 
    #==============================
    try:  # try if title can be found  
        Headline= news_soup.find("h1",attrs={"class":"hdg hdg--3"}).get_text() # get the title .append(title)
    except:
        Headline="NA"    
    
   #Get Published Date and Time
    #=================================
    try:
        PublishedDateTime= news_soup.find("span",attrs={"class":"meta__date-time-updated"}).string  # get the date      
    except:
        PublishedDateTime= "NA"
 
          
    # Get Content 
    #===================================
    
    text=[] #placeholder to collect contents from multiple paragraphs
    try:   
        cont= news_soup.find("div",attrs={"class":"entry__content"})
        for c in cont.find_all("p"): # loop over each paragraph
            #get the text in each paragraph and append them
            text.append(c.get_text(separator=" ", strip=True).replace("\xa0", " ")) 

        Content=" ".join(text) # concatnate the paragraphs to make a single string 

    except:
        Content="NA"
           
    # collect  all the data as dictionay 
    data= {"Source": Source, 'Url': Url, 'PublishedDateTime': PublishedDateTime, 'Headline': Headline, 'Content': Content}
        
        # gather all companies info 
    return data

In [7]:
url= "https://krdo.com/news/crime/" # the page we are going to scrap
soup=getSoup(url)
news_url= getNewsLink(soup) #get list of category links
all_data=[] # place holder to collect all the data 


for link in news_url:
    all_data.append(getNewsInfo(link))

data= pd.DataFrame(all_data) # make a dataframe

In [8]:
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content
0,Article_KRDO,https://krdo.com/news/2022/04/25/death-investi...,,Death investigation on Pueblo’s east side,"PUEBLO, Colo. (KRDO) -- Several Pueblo Police ..."
1,Article_KRDO,https://krdo.com/news/crime/pueblo-county-crim...,"April 24, 2022 8:01 PM","Overnight crash in Pueblo kills two kids, inju...","PUEBLO, Colo. (KRDO) -- Pueblo Police report t..."
2,Article_KRDO,https://krdo.com/news/crime/pueblo-county-crim...,,Man recovering from gunshot wound to the head ...,"PUEBLO, Colo. (KRDO) -- Pueblo Police posted o..."
3,Article_KRDO,https://krdo.com/news/local-news/2022/04/22/on...,"April 22, 2022 5:59 PM",Colorado Springs illicit spa owner found guilt...,"COLORADO SPRINGS, Colo. (KRDO) -- On Friday af..."
4,Article_KRDO,https://krdo.com/news/2022/04/22/body-found-in...,"April 22, 2022 5:55 PM","Body found in Hanover, sheriff’s office treati...","EL PASO COUNTY, Colo. (KRDO) -- The El Paso Co..."
5,Article_KRDO,https://krdo.com/news/2022/04/22/court-martial...,"April 22, 2022 12:27 PM",Court martial for USAFA cadet charged with sex...,"U.S. AIR FORCE ACADEMY, Colo. (KRDO) -- A cour..."
6,Article_KRDO,https://krdo.com/news/2022/04/22/pueblo-police...,"April 22, 2022 11:52 AM",Pueblo Police search for suspect in fatal crash,"PUEBLO, Colo. (KRDO) -- The Pueblo Police Depa..."
7,Article_KRDO,https://krdo.com/news/2022/04/21/la-junta-poli...,"April 21, 2022 4:50 PM","La Junta Police: 17-year-old arrested, accused...","LA JUNTA, Colo. (KRDO) -- A teen is in custody..."


### Add more features
(Code taken from Manoji and modified a littel bit)

In [9]:
# run this command for every first excecution of the notebook
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [10]:
import spacy
from spacy.lang.en import English
import glob
import os

!pip install googletrans #if you get the missing translate module take out the # before !pip and run it again
from googletrans import Translator

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.8 MB/s 
[?25hCollecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 792 kB/s 
Collecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 7.1 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 2.5 MB/s 
[?25hCollecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.2 MB/s 
[?25hCollecting hpack<4,>=3.0
  Downloading hpack-3.0.0-py2.py3-no

In [11]:
def language_detect(data):
    trans = Translator()
    #print(trans.translate(data).text)
    try:
      a = trans.detect(data).lang
      #b= trans.translate(data, dest='en').text
    except:
      a= 'Unknown'
      #b= 'Translation Failed'

    return a

def language_translate(data):
    trans = Translator()
    #print(trans.translate(data).text)
    try:
      #a = trans.detect(data).lang
      b= trans.translate(data, dest='en').text
    except:
      #a= 'Unknown'
      b= 'Translation Failed'
    
    return b

def key_search(data, key):
  #print(data)
    if data.lower().find(key.lower()) <0:
      return 0
    else:
      return 1

def keyword_scores(data, keys):
    res = {}
    try:
      for key in keys:
        l = keys[key]
        #print(l)
        res[key] = sum(list( map( lambda x: key_search(data, x), l)))
    except:
      res[key] = 0

    return res

In [12]:
import json, requests, urllib, io

#user='my_github_username'
#pao='my_pao'


github_session = requests.Session()
#github_session.auth = (user, pao)

# providing raw url to download csv from github
csv_url = 'https://raw.githubusercontent.com/AR-github-AWS/testrepo/main/Keywords%20for%20Data%20Science.csv?token=GHSAT0AAAAAABTJTADX3DUWZIYFQ3PFQF34YSZYMOA'

download = github_session.get(csv_url).content
downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), error_bad_lines=False)
df_keywords = downloaded_csv
df_keywords = df_keywords.astype('str').replace('nan')
keywords = df_keywords.to_dict('LIST')



  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
def GetFacetCols(dataframe,ColumnName):
    nlp = spacy.load("en_core_web_sm")
    OrgFacet=[]
    PersonFacet=[]
    LocationFacet=[]
    MoneyFacet = []
    LanguageFacet = []
    DateFacet = []
    #for i in range(len(dataframe)):
    for row in dataframe[ColumnName]:
        
        #data=dataframe[ColumnName][i]
        content = nlp(str(row))
        Org=[]
        Person=[]
        Location=[]
        Money = []
        Language = []
        Date = []
        
        try:
            
            for ent in content.ents:  # loop over the entities 
                if(ent.label_ == "ORG"):
                    if ent.text not in Org:
                        Org.append(ent.text)
                if(ent.label_ == "PERSON"):
                    if ent.text not in Person:
                        Person.append(ent.text)
                if(ent.label_ == "GPE"):
                    if ent.text not in Location:
                        Location.append(ent.text)
                if(ent.label_ == "MONEY"):
                    if ent.text not in Money:
                        Money.append(ent.text)
                if(ent.label_ == "LANGUAGE"):
                    if ent.text not in Language:
                        Language.append(ent.text)
                if(ent.label_ == "DATE"):
                    if ent.text not in Date:
                        Date.append(ent.text)
                        
        except:
            pass
        
        finally:
            OrgFacet.append(Org)
            PersonFacet.append(Person)
            LocationFacet.append(Location)
            MoneyFacet.append(Money)
            LanguageFacet.append(Language)
            DateFacet.append(Date)
            
                        
        
    dataframe['keyword score'] = list(map( lambda x: sum(keyword_scores(x, keywords).values()), dataframe[ColumnName]))
    dataframe['LanguageFacet'] = list( map( language_detect, dataframe[ColumnName]))
    dataframe['Translated Content'] = list( map( language_translate, dataframe[ColumnName]))
    dataframe['GeoFacet']=LocationFacet
    dataframe['OrgFacet']=OrgFacet
    dataframe['PeopleFacet']=PersonFacet
    dataframe['MoneyFacet'] = MoneyFacet
    dataframe['DateFacet'] = DateFacet
    
    return dataframe

In [14]:
GetFacetCols(data, 'Content')
# adding empty columns to the data frame
data["Threat"] = " "
data["Useful"] = " "
data["Comment"] = " "

###Data is here

In [15]:
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content,keyword score,LanguageFacet,Translated Content,GeoFacet,OrgFacet,PeopleFacet,MoneyFacet,DateFacet,Threat,Useful,Comment
0,Article_KRDO,https://krdo.com/news/2022/04/25/death-investi...,,Death investigation on Pueblo’s east side,"PUEBLO, Colo. (KRDO) -- Several Pueblo Police ...",0,Unknown,Translation Failed,[Colo.],[KRDO],[],[],[],,,
1,Article_KRDO,https://krdo.com/news/crime/pueblo-county-crim...,"April 24, 2022 8:01 PM","Overnight crash in Pueblo kills two kids, inju...","PUEBLO, Colo. (KRDO) -- Pueblo Police report t...",0,Unknown,Translation Failed,"[Colo., Midtown]","[KRDO, The Pueblo County Coroner]","[Brianna Gallegos, Michael Gerling]",[],"[the age of 18, Sunday, 17, Monday]",,,
2,Article_KRDO,https://krdo.com/news/crime/pueblo-county-crim...,,Man recovering from gunshot wound to the head ...,"PUEBLO, Colo. (KRDO) -- Pueblo Police posted o...",0,Unknown,Translation Failed,[Colo.],[KRDO],[],[],"[Saturday, Monday]",,,
3,Article_KRDO,https://krdo.com/news/local-news/2022/04/22/on...,"April 22, 2022 5:59 PM",Colorado Springs illicit spa owner found guilt...,"COLORADO SPRINGS, Colo. (KRDO) -- On Friday af...",0,Unknown,Translation Failed,"[COLORADO SPRINGS, Colo., El Paso County, Colo...","[KRDO, Rose Day Spa, the Colorado Springs Poli...","[Xinan Xia, Xia, Rose Spa, Michael Harris, Kat...","[13Investigates, an additional $50]","[56, December 2020, July 2020, Friday, Tuesday...",,,
4,Article_KRDO,https://krdo.com/news/2022/04/22/body-found-in...,"April 22, 2022 5:55 PM","Body found in Hanover, sheriff’s office treati...","EL PASO COUNTY, Colo. (KRDO) -- The El Paso Co...",0,Unknown,Translation Failed,"[PASO COUNTY, Colo., The El Paso County Sherif...","[KRDO, Longhorn Point]",[],[],[Thursday],,,
5,Article_KRDO,https://krdo.com/news/2022/04/22/court-martial...,"April 22, 2022 12:27 PM",Court martial for USAFA cadet charged with sex...,"U.S. AIR FORCE ACADEMY, Colo. (KRDO) -- A cour...",0,Unknown,Translation Failed,"[Colo., Cadet]","[U.S. AIR FORCE ACADEMY, KRDO, U.S. Air Force ...",[Dekota Douglas],[],"[Monday, April 25, April 25-29]",,,
6,Article_KRDO,https://krdo.com/news/2022/04/22/pueblo-police...,"April 22, 2022 11:52 AM",Pueblo Police search for suspect in fatal crash,"PUEBLO, Colo. (KRDO) -- The Pueblo Police Depa...",0,Unknown,Translation Failed,[Colo.],"[KRDO, The Pueblo Police Department, PPD]","[Bonforte Blvd, David Vogel, Joshua Alvarado]",[],"[Friday, April 1]",,,
7,Article_KRDO,https://krdo.com/news/2022/04/21/la-junta-poli...,"April 21, 2022 4:50 PM","La Junta Police: 17-year-old arrested, accused...","LA JUNTA, Colo. (KRDO) -- A teen is in custody...",0,Unknown,Translation Failed,"[LA JUNTA, Colo., La Junta]","[KRDO, the La Junta Police Department, the Ark...",[],[],[Thursday],,,


## FUTURE USE IGNORE 
#### Save the data

In [16]:
# storing at "output" dir
# take out the # on the 3 lines below to save to drive folders
#date = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
#file_name = "Article_KRDO_" +date+ ".csv"
#data.to_csv(file_name, index = False)




# The below code is old but a good idea for how we would store the scrape with omniscient
# Storing a copy for analysts in the "analysts" dir
#data.to_csv( "/dbfs/mnt/analysts/" + file_name, index = False)

In [None]:
#!pip install boto3

In [19]:
#import boto3

In [20]:
#s3 = boto3.resource(
 #   service_name='s3',
  #  region_name='us-east-2',
   # aws_access_key_id='AKIAUOZ7CQ6Y2T76W3PT',
   # aws_secret_access_key='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'
#)

In [21]:
#for bucket in s3.buckets.all():
 # print(bucket.name)