### Scraping crime news from WTOP
The scraper collects data from the WTOP crime news page (https://wtop.com/local/crime/).

In [1]:
import requests                # to request the webpage
from bs4 import BeautifulSoup  # to make soup and pull data out of HTML
import urllib.robotparser      # to check the legitimacy to scrap the web
import json                    # to save the output as json file
import pandas as pd            # to  see saved data as dataframe 
from datetime import datetime  # to get the current datetime
#import IPython                 # to display the webpage

### Permission to scrape

In [2]:
# Ask WTOP's robots.txt whether a generic crawler is allowed to fetch the site.
robotpars = urllib.robotparser.RobotFileParser()  # instantiate the RobotFileParser

# point the parser at the robots.txt of wtop.com
robotpars.set_url("https://wtop.com/robots.txt")
robotpars.read() # fetches the robots.txt from the url set above and feeds it to the parser

# can_fetch(useragent, url): True means the "*" (any) user agent may fetch the url.
# NOTE(review): only the site root is checked, not the /local/crime/ page that the
# cells below scrape, and the answer is only printed -- none of the cells visible in
# this notebook is gated on it (the saved output of this cell is False).
print("Can we fetch the website?", \
      robotpars.can_fetch("*", "https://wtop.com/")) 


Can we fetch the website? False


### Get the links

In [3]:
def getSoup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    url:     address of the page to download.
    timeout: seconds to wait for the server before giving up (default 30).
             requests.get() has no timeout of its own, so without this a
             stalled connection would block the notebook indefinitely.

    Returns the BeautifulSoup object built from the response text using the
    built-in "html.parser". The HTTP status code is not checked, so an error
    page is parsed like any other page.

    Raises requests.exceptions.RequestException (including Timeout) when the
    request itself fails.
    """
    # request the webpage and get the decoded text
    pagetext = requests.get(url, timeout=timeout).text
    # make a soup and use the html parser for the content of the web page
    soup = BeautifulSoup(pagetext, "html.parser")
    return soup
  
def getNewsLink(soup):
    """Return the list of news-article links found on a listing page.

    soup: the soup of the web page from which the links are extracted.

    Every <h3 class="post__template-title"> element is inspected and the
    href of its first <a> child is collected, in page order. Headings that
    have no <a> child, or whose <a> has no href, are skipped individually so
    that one malformed heading does not discard the links that follow it.

    Returns an empty list when `soup` cannot be searched (e.g. it is None).
    """
    try:
        articles = soup.find_all("h3", attrs={"class": "post__template-title"})
    except AttributeError:
        # not a searchable soup object -> nothing to extract
        return []

    news_list = []  # collects the links of the news articles
    for artc in articles:
        anchor = artc.a  # first <a> inside the heading, None when absent
        link = anchor.get("href") if anchor is not None else None
        if link:
            news_list.append(link)
    return news_list



In [4]:
# Quick check: fetch the WTOP crime listing page and show the first three article links.
soup =getSoup("https://wtop.com/local/crime/")
getNewsLink(soup)[:3] 

['https://wtop.com/dc/2022/04/police-looking-for-motives-into-the-van-ness-sniper-shooting/',
 'https://wtop.com/dc/2022/04/dc-woman-shot-by-police-was-wearing-bulletproof-vest-body-worn-camera-but-wasnt-special-officer/',
 'https://wtop.com/dc/2022/04/van-ness-sniper-shooting-keeps-1-school-closed/']

### Collect the data

In [5]:
def getNewsInfo(news_link):
    '''Scrape one news article page and return its details as a dictionary.

    news_link: the link of the article page the information is collected from.

    Returns a dict with the keys "Source", "Url", "PublishedDateTime",
    "Headline" and "Content". "Source" is always "Article_WTOP" and "Url" is
    `news_link`. Each of the other three values is the string "NA" when the
    corresponding element is not found on the page.

    Errors raised while downloading the page (see getSoup) are not caught here.
    '''
    source = "Article_WTOP"  # the same for all links obtained from the main source

    news_soup = getSoup(news_link)  # make a soup of the article page

    # Get Headline
    #==============================
    title_tag = news_soup.find("h1", attrs={"class": "page__single--title schema-title"})
    headline = title_tag.get_text() if title_tag is not None else "NA"

    # Get Published Date and Time
    #=================================
    date_tag = news_soup.find("p", attrs={"class": "article-post__date"})
    # .string is None (no exception) when the tag does not hold exactly one
    # string, so that case has to fall back to "NA" explicitly.
    date_text = date_tag.string if date_tag is not None else None
    # str() turns the NavigableString into a plain string so the stored value
    # does not keep a reference to the whole parse tree.
    published = str(date_text) if date_text is not None else "NA"

    # Get Content
    #===================================
    body = news_soup.find("div", attrs={"id": "Entry-content"})
    if body is None:
        content = "NA"
    else:
        # text of every paragraph, with non-breaking spaces normalised
        paragraphs = [
            par.get_text(separator=" ", strip=True).replace("\xa0", " ")
            for par in body.find_all("p")
        ]
        content = " ".join(paragraphs)  # concatenate the paragraphs into a single string

    # collect all the data as a dictionary
    return {
        "Source": source,
        'Url': news_link,
        'PublishedDateTime': published,
        'Headline': headline,
        'Content': content,
    }

In [6]:
url= "https://wtop.com/local/crime/" # the listing page we are going to scrape
soup=getSoup(url)
news_url= getNewsLink(soup) # list of article links found on the listing page
all_data=[] # place holder to collect one record (dict) per article
for link in news_url:
    all_data.append(getNewsInfo(link))  # getNewsInfo downloads the article page itself
data= pd.DataFrame(all_data) # make a dataframe, one row per article

In [7]:
# Preview the first rows of the scraped articles.
data.head()

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content
0,Article_WTOP,https://wtop.com/dc/2022/04/police-looking-for...,"April 25, 2022, 12:46 PM",Police look for motives into the Van Ness snip...,Students at the Edmund Burke School saw classe...
1,Article_WTOP,https://wtop.com/dc/2022/04/dc-woman-shot-by-p...,"April 25, 2022, 12:04 PM",DC woman shot by police was wearing bulletproo...,The D.C. woman shot and killed by police early...
2,Article_WTOP,https://wtop.com/dc/2022/04/van-ness-sniper-sh...,"April 25, 2022, 7:56 AM",Van Ness sniper shooting keeps 1 school closed,Classes are canceled Monday for students at th...
3,Article_WTOP,https://wtop.com/prince-georges-county/2022/04...,"April 24, 2022, 10:29 PM",2 pedestrians with life-threatening injuries i...,Two pedestrians sustained life-threatening inj...
4,Article_WTOP,https://wtop.com/dc/2022/04/police-named-armed...,"April 24, 2022, 3:35 PM","Police name armed woman shot, killed by office...",District police have identified the armed woma...


### Add more features

In [8]:
# Run this command the first time the notebook is executed in a fresh environment:
# it downloads the spaCy model that spacy.load("en_core_web_sm") needs further down.
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 8.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [9]:
import spacy
from spacy.lang.en import English
import glob
import os
!pip install googletrans
from googletrans import Translator

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.8 MB/s 
Collecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 15.4 MB/s 
[?25hCollecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.4 MB/s 
[?25hCollecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 3.2 MB/s 
[?25hCollecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.2 MB/s 
[?25hCollecting hpack<4,>=3.0
  Downloading hpack-3.0.0-py2.py3-n

In [10]:
def language_detect(data, translator=None):
    """Detect the language of a text, best effort.

    data:       the text to inspect.
    translator: optional object exposing detect(text) whose result has a
                `.lang` attribute. When omitted a new googletrans Translator
                is created for this call. Passing one lets a caller reuse a
                single client for many rows.

    Returns the `.lang` value of the detection result, or the string
    'Unknown' when the translator cannot be created or the detection fails.
    """
    try:
        trans = translator if translator is not None else Translator()
        return trans.detect(data).lang
    except Exception:
        # Deliberate best-effort fallback. `Exception` (not a bare except) so
        # that interrupting the kernel still stops a long run.
        # NOTE(review): the saved output of this notebook shows 'Unknown' for
        # every article, i.e. every call ended up here -- worth investigating.
        return 'Unknown'

def language_translate(data, translator=None):
    """Translate a text to English, best effort.

    data:       the text to translate.
    translator: optional object exposing translate(text, dest=...) whose
                result has a `.text` attribute. When omitted a new googletrans
                Translator is created for this call. Passing one lets a caller
                reuse a single client for many rows.

    Returns the `.text` value of the translation result (requested with
    dest='en'), or the string 'Translation Failed' when the translator cannot
    be created or the translation fails.
    """
    try:
        trans = translator if translator is not None else Translator()
        return trans.translate(data, dest='en').text
    except Exception:
        # Deliberate best-effort fallback. `Exception` (not a bare except) so
        # that interrupting the kernel still stops a long run.
        # NOTE(review): the saved output of this notebook shows
        # 'Translation Failed' for every article -- worth investigating.
        return 'Translation Failed'

def key_search(data, key):
    """Tell whether `key` occurs in `data`, ignoring case.

    This is a plain substring test, not a whole-word match.
    Returns 1 when the key is found and 0 otherwise.
    """
    haystack = data.lower()
    needle = key.lower()
    return 1 if needle in haystack else 0

def keyword_scores(data, keys):
    """Count, per category, how many of its keywords occur in a text.

    data: the text to search.
    keys: mapping of category name -> iterable of keywords. None is treated
          as "no categories".

    Returns a dict with one entry per category in `keys`; the value is the
    number of that category's keywords found in `data` (see key_search).
    A category that cannot be scored -- its keyword list is not iterable, or
    `data`/a keyword is not a string -- gets 0, and the remaining categories
    are still scored.
    """
    res = {}
    if keys is None:
        return res

    for key in keys:
        # Handled per category: a failure here must not stop the scoring of
        # the categories that follow.
        try:
            terms = keys[key]
            res[key] = sum(key_search(data, term) for term in terms)
        except (AttributeError, TypeError):
            res[key] = 0

    return res

In [11]:
import json, requests, urllib, io
import os

# Raw url of the keyword csv on GitHub.
# Judging by how `keywords` is consumed by keyword_scores, each column is one
# keyword category and each cell one keyword -- TODO confirm against the file.
csv_url = 'https://raw.githubusercontent.com/AR-github-AWS/testrepo/main/Keywords%20for%20Data%20Science.csv'

# The access token is read from the environment instead of being committed
# inside the url. The token that used to be hard-coded here is visible in
# the notebook history and should be treated as compromised.
# Leave GITHUB_RAW_TOKEN unset when the file is publicly readable.
github_token = os.environ.get("GITHUB_RAW_TOKEN")

github_session = requests.Session()

response = github_session.get(
    csv_url,
    params={"token": github_token} if github_token else None,
    timeout=30,
)
# Fail loudly on 4xx/5xx: otherwise the body of an error page is parsed as a
# csv below and every keyword score silently becomes 0.
response.raise_for_status()
download = response.content

# on_bad_lines='skip' is the replacement for the deprecated
# error_bad_lines=False (needs pandas >= 1.3).
downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), on_bad_lines='skip')
df_keywords = downloaded_csv

# One list of keyword strings per column. Missing cells (columns of unequal
# length) are dropped. The previous astype('str').replace('nan') passed no
# replacement value, which in older pandas forward-fills the matched cells --
# repeating the last keyword so it is counted several times -- and is
# deprecated in newer pandas.
keywords = {
    category: df_keywords[category].dropna().astype('str').tolist()
    for category in df_keywords.columns
}



  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
def GetFacetCols(dataframe,ColumnName):
    """Add keyword, language and named-entity columns to a DataFrame.

    dataframe:  the DataFrame to enrich. It is modified in place.
    ColumnName: name of the column holding the text to analyse; every value
                is converted with str() before entity extraction.

    Columns added, in this order:
      'keyword score'      - sum of keyword_scores(text, keywords) over all
                             categories; reads the module-level `keywords`.
      'LanguageFacet'      - language_detect(text).
      'Translated Content' - language_translate(text).
      'GeoFacet', 'OrgFacet', 'PeopleFacet', 'MoneyFacet', 'DateFacet'
                           - per row, the distinct entity texts, in order of
                             first appearance, that the spaCy
                             "en_core_web_sm" model labels GPE, ORG, PERSON,
                             MONEY and DATE respectively.

    Returns the same `dataframe` object.
    """
    nlp = spacy.load("en_core_web_sm")

    # spaCy entity label -> output column. The insertion order is the order
    # in which the facet columns are added to the DataFrame.
    # 'LanguageFacet' is filled by language_detect below; spaCy LANGUAGE
    # entities were never stored in the output, so they are not collected.
    column_for_label = {
        "GPE": "GeoFacet",
        "ORG": "OrgFacet",
        "PERSON": "PeopleFacet",
        "MONEY": "MoneyFacet",
        "DATE": "DateFacet",
    }
    facets = {column: [] for column in column_for_label.values()}

    for row in dataframe[ColumnName]:
        # one insertion-ordered dict per column: keeps first-seen order and
        # drops repeated entity texts
        found = {column: {} for column in facets}
        for ent in nlp(str(row)).ents:  # loop over the entities
            column = column_for_label.get(ent.label_)
            if column is not None:
                found[column].setdefault(ent.text, None)
        for column, seen in found.items():
            facets[column].append(list(seen))

    texts = dataframe[ColumnName]
    dataframe['keyword score'] = [sum(keyword_scores(text, keywords).values()) for text in texts]
    dataframe['LanguageFacet'] = [language_detect(text) for text in texts]
    dataframe['Translated Content'] = [language_translate(text) for text in texts]
    for column, values in facets.items():
        dataframe[column] = values

    return dataframe

In [13]:
# GetFacetCols adds its columns to `data` in place; the returned frame is not used here.
GetFacetCols(data, 'Content')
# adding placeholder columns to the data frame (each cell holds a single space,
# not an empty string) -- presumably to be filled in later by a reviewer; TODO confirm
data["Threat"] = " "
data["Useful"] = " "
data["Comment"] = " "

### Data is here

In [14]:
# Display the enriched DataFrame (scraped fields plus the columns added above).
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content,keyword score,LanguageFacet,Translated Content,GeoFacet,OrgFacet,PeopleFacet,MoneyFacet,DateFacet,Threat,Useful,Comment
0,Article_WTOP,https://wtop.com/dc/2022/04/police-looking-for...,"April 25, 2022, 12:46 PM",Police look for motives into the Van Ness snip...,Students at the Edmund Burke School saw classe...,0,Unknown,Translation Failed,"[D.C., Fairfax County]","[the Edmund Burke School, WTOP, Contee]","[Robert Contee, Raymond Spencer, Spencer, Van ...",[],"[Monday, January, Friday]",,,
1,Article_WTOP,https://wtop.com/dc/2022/04/dc-woman-shot-by-p...,"April 25, 2022, 12:04 PM",DC woman shot by police was wearing bulletproo...,The D.C. woman shot and killed by police early...,0,Unknown,Translation Failed,[D.C.],"[Contee, the Force Investigation Team, Interna...","[Robert Contee, Erica Graham, Graham, Luke Luk...",[],"[Monday, 42, Saturday]",,,
2,Article_WTOP,https://wtop.com/dc/2022/04/van-ness-sniper-sh...,"April 25, 2022, 7:56 AM",Van Ness sniper shooting keeps 1 school closed,Classes are canceled Monday for students at th...,0,Unknown,Translation Failed,"[D.C., Fairfax County]","[the Edmund Burke School, The Washington Post,...","[Van Ness, Raymond Spencer, Robert Contee, Burke]",[],"[Monday, Friday, April 22, 2022]",,,
3,Article_WTOP,https://wtop.com/prince-georges-county/2022/04...,"April 24, 2022, 10:29 PM",2 pedestrians with life-threatening injuries i...,Two pedestrians sustained life-threatening inj...,0,Unknown,Translation Failed,"[Prince George’s County, Maryland, Beltsville]","[EMS, WTOP]",[],[],"[Sunday, 11100]",,,
4,Article_WTOP,https://wtop.com/dc/2022/04/police-named-armed...,"April 24, 2022, 3:35 PM","Police name armed woman shot, killed by office...",District police have identified the armed woma...,0,Unknown,Translation Failed,"[Northwest D.C., D.C.]",[Police Robert Contee],"[Erica Graham, Graham]",[],[Saturday],,,
5,Article_WTOP,https://wtop.com/maryland/2022/04/md-deputies-...,"April 24, 2022, 1:06 PM",Md. deputies fatally shoot man described as ar...,"BEL AIR, Md. (AP) — Police in Maryland say the...",0,Unknown,Translation Failed,"[BEL AIR, Md., Maryland, Harford County]","[AP, The Associated Press]","[Jeffrey Gahler, Gahler]",[],[Saturday],,,
6,Article_WTOP,https://wtop.com/local/2022/04/20-yr-old-man-d...,"April 23, 2022, 10:56 PM",20-year-old man dead after fatal shooting in G...,"Police in Montgomery County, Maryland, say the...",0,Unknown,Translation Failed,"[Montgomery County, Maryland, Germantown, Mont...","[kin, the Major Crimes Division]",[],[],"[19500, 8477, Friday]",,,
7,Article_WTOP,https://wtop.com/dc/2022/04/man-stabbed-on-met...,"April 23, 2022, 10:16 PM",Man stabbed on Metrobus in Southeast DC,A man who was stabbed on a D.C. Metrobus Satur...,0,Unknown,Translation Failed,[],[WTOP],"[Martin Luther King Jr, Metro]",[],[Saturday],,,
8,Article_WTOP,https://wtop.com/montgomery-county/2022/04/pol...,"April 23, 2022, 9:27 PM",Police ID victim of fatal shooting in Takoma Park,"Police in Takoma Park, Maryland, say they have...",0,Unknown,Translation Failed,"[Takoma Park, Maryland]",[Takoma Park Police],[Ahmadou Bamba Gueye],[],"[early Friday, Friday]",,,
9,Article_WTOP,https://wtop.com/dc/2022/04/secret-service-ide...,"April 23, 2022, 3:31 PM","Secret Service identifies intruder shot, kille...",The U.S. Secret Service has identified the int...,0,Unknown,Translation Failed,"[Peru, Northwest D.C., Germantown, Maryland, D...","[The U.S. Secret Service, the Secret Service, ...","[Gordon Casey, Casey]",[],"[Wednesday, Saturday, that day]",,,


### Ignore

In [15]:
# storing at "output" dir
#date = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
#file_name = "Article_WTOP_" +date+ ".csv"
#data.to_csv( "/dbfs/mnt/dboutput/" + file_name, index = False)
#data.to_csv(file_name, index = False)
# Storing a copy for analysts in the "analysts" dir
#data.to_csv( "/dbfs/mnt/analysts/" + file_name, index = False)