### Scraping crime news from WTOP
The scraper collects data from WTOP crime news (https://wtop.com/local/crime/)

In [None]:
import requests                # to request the webpage
from bs4 import BeautifulSoup  # to make soup and pull data out of HTML
import urllib.robotparser      # to check the legitimacy to scrap the web
import json                    # to save the output as json file
import pandas as pd            # to  see saved data as dataframe 
from datetime import datetime  # to get the current datetime
#import IPython                 # to display the webpage

### Permission to scrape

In [None]:
# Point the parser at WTOP's robots.txt (the constructor stores the URL).
robotpars = urllib.robotparser.RobotFileParser("https://wtop.com/robots.txt")
robotpars.read()  # download and parse the rules

# True means the "*" (any) user agent is allowed to fetch the given URL.
print(
    "Can we fetch the website?",
    robotpars.can_fetch("*", "https://wtop.com/"),
)


Can we fetch the website? False


### Get the links

In [None]:
def getSoup(url):
    """Download the page at `url` and parse it with BeautifulSoup.

    url: address of the web page to fetch.
    Returns the BeautifulSoup object built from the response text with the
    "html.parser" parser.
    """
    response = requests.get(url)
    return BeautifulSoup(response.text, "html.parser")
  
def getNewsLink(soup):
    """Return the list of article links found on a listing page.

    soup: the soup of the web page from which the links are extracted.

    Every <h3 class="post__template-title"> heading is inspected; the "href"
    of its first <a> is collected. Headings without a link are skipped one by
    one, so a single malformed heading no longer discards the links that
    follow it. An object that cannot be searched (e.g. None) yields [].
    """
    news_list = []  # collects the links of the news articles

    try:
        headings = soup.find_all("h3", attrs={"class": "post__template-title"})
    except AttributeError:
        # Not a searchable soup: keep the best-effort contract and return nothing.
        return news_list

    for heading in headings:
        anchor = heading.a
        if anchor is None:
            continue  # heading without an <a> element
        link = anchor.get("href")
        if link:
            news_list.append(link)
    return news_list



In [None]:
# Quick check: fetch the crime listing page and show the first three article links.
soup =getSoup("https://wtop.com/local/crime/")
getNewsLink(soup)[:3] 

['https://wtop.com/prince-william-county/2022/05/reward-offered-photos-released-in-shooting-of-teen-girl-at-prince-william-co-carnival/',
 'https://wtop.com/dc/2022/05/dc-bar-and-restaurant-loses-more-than-200k-in-cyber-crime/',
 'https://wtop.com/virginia/2022/05/va-case-focusing-debate-on-constitutionality-of-geofence-warrant-ends-with-guilty-plea/']

### Collect the data

In [None]:
def getNewsInfo(news_link):
    """Scrape one article page and return its details as a dictionary.

    news_link: the link from which the information is collected.

    Returns a dict with the keys "Source", "Url", "PublishedDateTime",
    "Headline" and "Content". A field whose element is not found on the page
    is set to the string "NA". Errors raised while downloading the page are
    not caught here.
    """
    Source = "Article_WTOP"  # the same for all links obtained from the main source

    news_soup = getSoup(news_link)  # make a soup

    # Get Headline
    # ==============================
    title_tag = news_soup.find("h1", attrs={"class": "page__single--title schema-title"})
    Headline = title_tag.get_text() if title_tag is not None else "NA"

    # Get Published Date and Time
    # ==============================
    date_tag = news_soup.find("p", attrs={"class": "article-post__date"})
    # `.string` can be None even when the tag exists (for example when the
    # tag has several children), so both cases fall back to "NA".
    date_text = date_tag.string if date_tag is not None else None
    # str() detaches the text from the parse tree so the soup is not kept
    # alive by the stored value.
    PublishedDateTime = str(date_text) if date_text is not None else "NA"

    # Get Content
    # ==============================
    body = news_soup.find("div", attrs={"id": "Entry-content"})
    if body is None:
        Content = "NA"
    else:
        # One cleaned string per paragraph, joined into a single string.
        paragraphs = [
            para.get_text(separator=" ", strip=True).replace("\xa0", " ")
            for para in body.find_all("p")
        ]
        Content = " ".join(paragraphs)

    # collect all the data as a dictionary
    return {
        "Source": Source,
        'Url': news_link,
        'PublishedDateTime': PublishedDateTime,
        'Headline': Headline,
        'Content': Content,
    }

In [None]:
url= "https://wtop.com/local/crime/" # the listing page we are going to scrape
soup=getSoup(url)
news_url= getNewsLink(soup) # list of article links found on the listing page
all_data=[] # place holder to collect one dictionary per article
for link in news_url:
    all_data.append(getNewsInfo(link))
data= pd.DataFrame(all_data) # make a dataframe, one row per article

In [None]:
# Preview the first rows of the scraped articles
data.head()

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content
0,Article_WTOP,https://wtop.com/prince-william-county/2022/05...,"May 10, 2022, 1:48 PM","Reward offered, photos released in shooting of...",A reward has been offered in the shooting of a...
1,Article_WTOP,https://wtop.com/dc/2022/05/dc-bar-and-restaur...,"May 10, 2022, 11:43 AM",DC bar and restaurant loses more than $200K in...,A bar and restaurant in D.C. is still struggli...
2,Article_WTOP,https://wtop.com/virginia/2022/05/va-case-focu...,"May 10, 2022, 10:18 AM",Va. case focusing debate on constitutionality ...,After more than two years of litigation in sta...
3,Article_WTOP,https://wtop.com/maryland/2022/05/maryland-upd...,"May 10, 2022, 8:13 AM","Maryland updates stalking law, includes digita...",Maryland has updated its stalking law to inclu...
4,Article_WTOP,https://wtop.com/dc/2022/05/dc-police-shoot-at...,"May 9, 2022, 11:06 PM",DC police shoot at 1 after routine traffic sto...,A man is in custody following a routine traffi...


### Add more features

In [None]:
# Run this command the first time the notebook is executed in a fresh environment
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.6 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
import spacy
from spacy.lang.en import English
import glob
import os
!pip install googletrans
from googletrans import Translator

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 1.7 MB/s 
[?25hCollecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 10.5 MB/s 
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 764 kB/s 
[?25hCollecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.6 MB/s 
[?25hCollecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 2.9 MB/s 
[?25hCollecting hyperframe<6,>=5.2.0
  Downloading hyperframe-5.2

In [None]:
def language_detect(data):
    """Detect the language of `data` with googletrans.

    data: the text to inspect.

    Returns the `.lang` value of `Translator().detect(data)`, or the string
    'Unknown' when detection fails.
    """
    trans = Translator()
    try:
        return trans.detect(data).lang
    except Exception:
        # Best effort: any failure of the detection call maps to the sentinel.
        # Catching `Exception` instead of using a bare `except` lets
        # KeyboardInterrupt/SystemExit through, so a long row-by-row run can
        # still be interrupted.
        return 'Unknown'

def language_translate(data):
    """Translate `data` to English with googletrans.

    data: the text to translate.

    Returns the `.text` value of `Translator().translate(data, dest='en')`,
    or the string 'Translation Failed' when the translation fails.
    """
    trans = Translator()
    try:
        return trans.translate(data, dest='en').text
    except Exception:
        # Best effort: any failure of the translation call maps to the sentinel.
        # Catching `Exception` instead of using a bare `except` lets
        # KeyboardInterrupt/SystemExit through, so a long row-by-row run can
        # still be interrupted.
        return 'Translation Failed'

def key_search(data, key):
    """Return 1 when `key` occurs anywhere in `data`, ignoring case, else 0.

    data: the text that is searched.
    key: the keyword looked for (plain substring match).
    """
    haystack = data.lower()
    needle = key.lower()
    return 1 if needle in haystack else 0

def keyword_scores(data, keys):
    """Count, per category, how many of its keywords occur in `data`.

    data: the text that is searched.
    keys: mapping of category name -> list of keywords for that category.

    Returns a dict mapping every category in `keys` to the number of its
    keywords found by key_search. A category that cannot be scored (for
    example a non-string keyword or non-string `data`) gets 0 and does not
    stop the remaining categories from being scored. `keys=None` gives {}.
    """
    res = {}
    if keys is None:
        return res

    for key in keys:
        try:
            res[key] = sum(key_search(data, term) for term in keys[key])
        except (AttributeError, TypeError):
            # AttributeError: data or a keyword has no .lower();
            # TypeError: the category's keyword collection is not usable.
            res[key] = 0

    return res

In [None]:
import json, requests, urllib, io

#user='my_github_username'
#pao='my_pao'


github_session = requests.Session()
#github_session.auth = (user, pao)

# providing raw url to download csv from github
# NOTE(review): this URL embeds an access token. Tokens should not be committed
# in a notebook; consider supplying it from the environment instead.
csv_url = 'https://raw.githubusercontent.com/AR-github-AWS/testrepo/main/Keywords%20for%20Data%20Science.csv?token=GHSAT0AAAAAABTJTADX3DUWZIYFQ3PFQF34YSZYMOA'

response = github_session.get(csv_url)
# Fail loudly on an HTTP error (e.g. an expired token) instead of parsing the
# error page as if it were the keyword CSV.
response.raise_for_status()
download = response.content

# on_bad_lines='skip' is the replacement for the deprecated error_bad_lines=False.
downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), on_bad_lines='skip')
df_keywords = downloaded_csv

# One list of keyword strings per CSV column. Blank cells are dropped rather
# than padded, so a keyword in a short column is never counted more than once.
keywords = {
    column: df_keywords[column].dropna().astype('str').tolist()
    for column in df_keywords.columns
}



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
def GetFacetCols(dataframe,ColumnName):
    """Add keyword, language and named-entity facet columns to `dataframe`.

    dataframe: the data frame to enrich; it is modified in place.
    ColumnName: name of the column holding the text to analyse.

    Columns written, in this order:
      'keyword score'      - sum of keyword_scores(text, keywords) over all
                             categories (`keywords` is the notebook-level dict)
      'LanguageFacet'      - result of language_detect(text)
      'Translated Content' - result of language_translate(text)
      'GeoFacet', 'OrgFacet', 'PeopleFacet', 'MoneyFacet', 'DateFacet'
                           - unique entity texts, in order of first appearance,
                             for the spaCy labels GPE, ORG, PERSON, MONEY, DATE

    Returns the same `dataframe` object.
    """
    nlp = spacy.load("en_core_web_sm")

    # spaCy entity label -> output column (insertion order is the column order).
    label_to_column = {
        "GPE": "GeoFacet",
        "ORG": "OrgFacet",
        "PERSON": "PeopleFacet",
        "MONEY": "MoneyFacet",
        "DATE": "DateFacet",
    }
    facets = {column: [] for column in label_to_column.values()}

    for row in dataframe[ColumnName]:
        content = nlp(str(row))
        found = {label: [] for label in label_to_column}

        for ent in content.ents:  # loop over the entities
            texts = found.get(ent.label_)
            # Labels outside the table are ignored; duplicates are kept once.
            if texts is not None and ent.text not in texts:
                texts.append(ent.text)

        for label, column in label_to_column.items():
            facets[column].append(found[label])

    dataframe['keyword score'] = [
        sum(keyword_scores(text, keywords).values()) for text in dataframe[ColumnName]
    ]
    dataframe['LanguageFacet'] = [language_detect(text) for text in dataframe[ColumnName]]
    dataframe['Translated Content'] = [language_translate(text) for text in dataframe[ColumnName]]
    for column, values in facets.items():
        dataframe[column] = values

    return dataframe

In [None]:
# GetFacetCols writes its new columns into `data` itself, so the return value is not needed here
GetFacetCols(data, 'Content')
# adding placeholder columns (each cell holds a single space) to the data frame
data["Threat"] = " "
data["Useful"] = " "
data["Comment"] = " "

### Data is here

In [None]:
# Display the enriched data frame
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content,keyword score,LanguageFacet,Translated Content,GeoFacet,OrgFacet,PeopleFacet,MoneyFacet,DateFacet,Threat,Useful,Comment
0,Article_WTOP,https://wtop.com/prince-william-county/2022/05...,"May 10, 2022, 1:48 PM","Reward offered, photos released in shooting of...",A reward has been offered in the shooting of a...,0,Unknown,Translation Failed,"[Prince William County, Virginia, Woodbridge, ...","[Gar-Field High School, the Bureau of Alcohol,...",[],"[6,000]","[last month, Tuesday, April 15]",,,
1,Article_WTOP,https://wtop.com/dc/2022/05/dc-bar-and-restaur...,"May 10, 2022, 11:43 AM",DC bar and restaurant loses more than $200K in...,A bar and restaurant in D.C. is still struggli...,0,Unknown,Translation Failed,[D.C.],"[BB&T, SunTrust, WTOP, FBI]","[Johnny Pistolas, Adams Morgan, Jonathan Askar...","[nearly $500,000, more than $200,000]","[months, Dec. 31]",,,
2,Article_WTOP,https://wtop.com/virginia/2022/05/va-case-focu...,"May 10, 2022, 10:18 AM",Va. case focusing debate on constitutionality ...,After more than two years of litigation in sta...,0,Unknown,Translation Failed,"[Virginia, Chesterfield County, Richmond]","[Okello Chatrie, Google, Lauck, Court]","[Chatrie, Hannah Lauck, Lauck]",[],"[more than two years, Monday, Aug. 2, March, t...",,,
3,Article_WTOP,https://wtop.com/maryland/2022/05/maryland-upd...,"May 10, 2022, 8:13 AM","Maryland updates stalking law, includes digita...",Maryland has updated its stalking law to inclu...,0,Unknown,Translation Failed,"[Maryland, Anne Arundel County]",[House],"[Sandy Bartlett, Larry Hogan]","[5,000]","[Oct. 1, 2022, April 21, 2022, five years]",,,
4,Article_WTOP,https://wtop.com/dc/2022/05/dc-police-shoot-at...,"May 9, 2022, 11:06 PM",DC police shoot at 1 after routine traffic sto...,A man is in custody following a routine traffi...,0,Unknown,Translation Failed,"[Northeast D.C., D.C.]",[Contee],[Robert Contee],[],"[1700, Tuesday]",,,
5,Article_WTOP,https://wtop.com/loudoun-county/2022/05/va-ag-...,"May 9, 2022, 12:16 AM",Va. AG office denies LGBTQ student records sub...,Attorney General Jason Miyares has denied clai...,0,Unknown,Translation Failed,"[Loudoun County LGBTQ, Loudoun County, Virginia]","[FERPA, WTOP, Stone Bridge High School, Broad ...","[Jason Miyares, Equality Loudoun, Miyares, Spo...",[],"[Saturday, May 4, May 10, May of 2021, October...",,,
6,Article_WTOP,https://wtop.com/dc/2022/05/man-injured-in-nor...,"May 8, 2022, 11:30 PM",Man injured in Northwest DC crash,D.C. police say that a man is in the hospital ...,0,Unknown,Translation Failed,"[D.C., Northwest]",[Chevy],[],[],[],,,
7,Article_WTOP,https://wtop.com/fairfax-county/2022/05/northe...,"May 8, 2022, 6:29 PM","Northern Va. gang members convicted of murder,...",Three members of Fairfax County’s “Reccless Ti...,0,Unknown,Translation Failed,"[Fairfax County, Richmond, California, Centrev...","[Reccless Tiger, Reccless Tigers, the Reccless...","[Brandon White, White, Peter Le, Young Yoo, Jo...",[thousands of pounds],"[2019, January 2019, two months, 2011, Sept. 9...",,,
8,Article_WTOP,https://wtop.com/prince-william-county/2022/05...,"May 8, 2022, 4:41 PM",Dumfries man killed in early morning Gainesvil...,A young man was shot and killed early Sunday m...,0,Unknown,Translation Failed,"[Gainesville, Virginia, Prince William County]",[Prince William County Police],"[Michael Arthur, Police, Arthur]",[],[],,,
9,Article_WTOP,https://wtop.com/crime/2022/05/silver-spring-d...,"May 8, 2022, 10:36 AM",Silver Spring dentist pleads guilty to mail an...,"A dentist from Silver Spring, Maryland, pleade...",0,Unknown,Translation Failed,"[Silver Spring, Maryland, D.C.]","[the Department of Justice, Medicaid, Buford, ...",[Edward T. Buford III],[more than $1.2 million],"[Wednesday, 70, Between January 2013, May 2018...",,,


### Ignore

In [None]:
# storing at "output" dir
#date = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
#file_name = "Article_WTOP_" +date+ ".csv"
#data.to_csv( "/dbfs/mnt/dboutput/" + file_name, index = False)
#data.to_csv(file_name, index = False)
# Storing a copy for analysts in the "analysts" dir
#data.to_csv( "/dbfs/mnt/analysts/" + file_name, index = False)