### Scraping WUSA
This notebook scrapes the crime section of WUSA (https://www.wusa9.com/section/crime).

In [1]:
import requests                # to request the webpage
from bs4 import BeautifulSoup  # to make soup and pull data out of HTML
import urllib.robotparser      # to check the legitimacy to scrap the web
import json                    # to save the output as json file
import pandas as pd            # to  see saved data as dataframe 
from datetime import datetime  # to get the current datetime
!pip install newspaper3k
from newspaper import Article   # to get news information 

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[?25l[K     |█▌                              | 10 kB 16.1 MB/s eta 0:00:01[K     |███                             | 20 kB 6.2 MB/s eta 0:00:01[K     |████▋                           | 30 kB 4.4 MB/s eta 0:00:01[K     |██████▏                         | 40 kB 4.2 MB/s eta 0:00:01[K     |███████▊                        | 51 kB 3.5 MB/s eta 0:00:01[K     |█████████▎                      | 61 kB 4.1 MB/s eta 0:00:01[K     |██████████▉                     | 71 kB 4.3 MB/s eta 0:00:01[K     |████████████▍                   | 81 kB 4.0 MB/s eta 0:00:01[K     |██████████████                  | 92 kB 4.5 MB/s eta 0:00:01[K     |███████████████▌                | 102 kB 4.7 MB/s eta 0:00:01[K     |█████████████████               | 112 kB 4.7 MB/s eta 0:00:01[K     |██████████████████▋             | 122 kB 4.7 MB/s eta 0:00:01[K     |████████████████████▏           | 133 kB 4.7 MB/s eta 0:00:

In [2]:
# Try newspaper3k's Article on a single WUSA video story: build the Article
# from the url, then download the page and parse it.
# NOTE(review): `article` is not referenced by any later cell visible here, and
# `url` is reassigned further down — confirm this cell is still needed.
url= "https://www.wusa9.com/video/news/crime/new-video-mail-theft-in-kensington/65-a25f2ff3-2418-4741-a372-6cf546cb75ff"
article= Article(url)
article.download()
article.parse()




### Permission to scrape

In [3]:
robotpars = urllib.robotparser.RobotFileParser()  # instantiate the RobotFileParser

# set the robots.txt url of WUSA
robotpars.set_url("https://www.wusa9.com/robots.txt")
robotpars.read() # fetch and parse the robots.txt

# check whether any user agent ("*") may fetch the site's root url; True means fetching is allowed.
# NOTE(review): only the root url is checked here, not the /section/crime page
# that the later cells scrape — confirm that this is sufficient.
print("Can we fetch the website?", \
      robotpars.can_fetch("*", "https://www.wusa9.com/")) 

Can we fetch the website? True


In [4]:
def getSoup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    url: address of the page to request.
    timeout: seconds to wait for the server; without a timeout
             requests.get can block indefinitely on an unresponsive host.

    Returns the BeautifulSoup object built from the response text with
    the "html.parser" parser. Network errors raised by requests
    (including timeouts) are propagated to the caller.
    """
    # request the webpage and get the text
    pagetext = requests.get(url, timeout=timeout).text
    # make a soup and use html parser for the content of the web page
    soup = BeautifulSoup(pagetext, "html.parser")
    return soup
  
def _first_link_href(element):
    """Return the href of the first <a> under element, or None when there is none."""
    anchor = getattr(element, "a", None)
    if anchor is None:
        return None
    return anchor.get("href")


def getNewsLink(soup):
    """Returns a list containing the links of news articles.

    soup: the soup of a webpage from which the links will be extracted.

    The first entry is the spotlight story (the first "story__meta" div),
    followed by one entry per <h4> in the "story-list__list" list. Each
    href found is appended to the site root to form the returned link.
    Elements that are missing, or that carry no link, are skipped, so the
    result may be an empty list.
    """
    source_url = "https://www.wusa9.com"
    news_list = []  # collects the links of the news articles

    # link extension of the first (spotlight) news item; skipped when absent
    # instead of failing on an unbound variable
    spotlight_href = _first_link_href(soup.find("div", attrs={"class": "story__meta"}))
    if spotlight_href:
        news_list.append(source_url + spotlight_href)

    # links of the listed news items
    story_list = soup.find("ul", attrs={"class": "story-list__list"})
    if story_list is not None:
        for header in story_list.find_all("h4"):
            extension_link = _first_link_href(header)
            # a header without a link must not discard the remaining stories
            if extension_link:
                news_list.append(source_url + extension_link)

    return news_list
  

In [5]:
# Fetch the crime section page and display the news links extracted from it.
url= "https://www.wusa9.com/section/crime"
soup= getSoup(url)
getNewsLink(soup)

['https://www.wusa9.com/article/news/local/dc/northwest-dc-quadruple-shooting-suspect-raymond-spencer-motive-unclear/65-7b937ab7-ce21-4740-84a1-a328559cfba5',
 'https://www.wusa9.com/video/news/crime/this-is-serious-business-and-people-are-scared-chief-contee-begs-for-end-to-violence-in-dc/65-cab80aaa-f4e3-45d5-866c-7d021381c109',
 'https://www.wusa9.com/article/news/crime/uva-lacrosse-player-killed-yeardley-love-murder-2010-george-huguely-civil-trial-wrongful-death-lawsuit/65-ea7073a8-06c9-4c2f-89ee-7368a39f39f6',
 'https://www.wusa9.com/article/news/national/capitol-riots/capitol-riot-defendant-wants-to-ask-jury-candidates-about-hitler-nazis-timothy-hale-cusanelli-january-6-trump-colts-neck-army-reservist/65-04039710-d1fa-4438-b991-0a846f7ad1b0',
 'https://www.wusa9.com/video/news/local/dc/latest-updates-on-dc-gunmens-sniper-nest-apartment/65-541594de-b72b-444b-8888-48fdd9e721a1',
 'https://www.wusa9.com/article/news/national/capitol-riots/jan-6-defendants-face-juries/507-698a9244-dd

### Collect the data

In [6]:

def _first_match(soup, tag, class_names, extract):
    """Return extract(element) for the first class name that yields a value, else None.

    The class names are tried in the given order; a class is skipped when no
    element carries it or when its content does not have the expected shape.
    """
    for class_name in class_names:
        element = soup.find(tag, attrs={"class": class_name})
        if element is None:
            continue
        try:
            return extract(element)
        except (AttributeError, IndexError, TypeError):
            continue
    return None


def getNewsInfo(news_link):
    ''' Returns a dictionary containing information about the news item.

        news_link: the link from which the information is collected.

        The returned dictionary has the keys "Source", "Url",
        "PublishedDateTime", "Headline" and "Content". For the headline, the
        published date and the summary, the article class is tried first
        and the video class second. "Headline" and "PublishedDateTime" are
        "NA" when neither is found. "Content" is the summary followed by
        the paragraph texts, joined with single spaces; it is an empty
        string when nothing is found.'''
    news_soup = getSoup(news_link)  # make a soup

    # Get Headline
    #==============================
    headline = _first_match(
        news_soup, "h1", ("article__headline", "video__headline"),
        lambda element: element.get_text())

    # Get Published Date and Time
    #=================================
    # the date is the last child of the "published" div
    published = _first_match(
        news_soup, "div", ("article__published", "video__published"),
        lambda element: element.contents[-1].strip())

    # Get Content
    #===================================
    summary = _first_match(
        news_soup, "div", ("article__summary", "video__summary"),
        lambda element: element.get_text())

    pieces = [summary or ""]  # collects the summary and the paragraph texts
    sections = news_soup.find_all(
        "div", attrs={"class": "article__section article__section_type_text utility__text"})
    for section in sections:  # loop over to get each paragraph
        try:
            # only the text directly under the "p" tag, not text inside its children
            paragraph = section.p.find(text=True, recursive=False)
            pieces.append(paragraph.strip())
        except AttributeError:
            # section without a <p>, or a <p> without direct text: skip it
            continue

    # drop empty pieces so a missing summary does not leave a leading space
    content = " ".join(piece for piece in pieces if piece)

    # collect all the data as a dictionary
    return {
        "Source": "Article_WUSA",  # the same for all links obtained from the main source
        'Url': news_link,
        'PublishedDateTime': published if published is not None else "NA",
        'Headline': headline if headline is not None else "NA",
        'Content': content,
    }

In [7]:
# Scrape every news item linked from the crime section and collect the
# results, one row per link, in a dataframe.
url= "https://www.wusa9.com/section/crime" # the page we are going to scrape
soup=getSoup(url)
news_url= getNewsLink(soup) # get the list of news article links
all_data=[] # placeholder to collect the dictionary returned for each link


for link in news_url:
    all_data.append(getNewsInfo(link))

data= pd.DataFrame(all_data) # make a dataframe

### Add more features
(Code taken from Manoji and modified a little bit)

In [8]:
# run this command the first time the notebook is executed
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [9]:
import spacy
from spacy.lang.en import English
import glob
import os
!pip install googletrans
from googletrans import Translator

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.1 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 10.5 MB/s 
[?25hCollecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 448 kB/s 
[?25hCollecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.6 MB/s 
[?25hCollecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 3.0 MB/s 
[?25hCollecting hyperframe<6,>=5.2.0
  Downloading hyperfra

In [10]:
def language_detect(data):
    """Return the language that the Translator detects for `data`.

    data: the text to inspect.

    Returns the `lang` attribute of the detection result, or 'Unknown'
    when detection fails. Only ordinary errors are treated as a failed
    detection; KeyboardInterrupt and SystemExit propagate so that a long
    run can still be interrupted.
    """
    trans = Translator()
    try:
        return trans.detect(data).lang
    except Exception:
        # best effort: a failed lookup must not stop the whole run
        return 'Unknown'

def language_translate(data):
    """Return `data` translated to English by the Translator.

    data: the text to translate.

    Returns the `text` attribute of the translation result (requested
    with dest='en'), or 'Translation Failed' when the translation fails.
    Only ordinary errors are treated as a failed translation;
    KeyboardInterrupt and SystemExit propagate so that a long run can
    still be interrupted.
    """
    trans = Translator()
    try:
        return trans.translate(data, dest='en').text
    except Exception:
        # best effort: a failed lookup must not stop the whole run
        return 'Translation Failed'

def key_search(data, key):
    """Report whether `key` occurs in `data`, ignoring case.

    Returns 1 when the lower-cased key is a substring of the lower-cased
    data, otherwise 0.
    """
    haystack = data.lower()
    needle = key.lower()
    return 1 if needle in haystack else 0

def keyword_scores(data, keys):
    """Count, per category, how many of its keywords occur in `data`.

    data: the text to search (matching is done by key_search).
    keys: mapping of category name -> list of keywords; None or an empty
          mapping gives an empty result.

    Returns a dictionary with one entry per category holding the number
    of that category's keywords found in `data`. A category that cannot
    be scored (for example because `data` is not a string) gets 0, and
    the remaining categories are still scored.
    """
    res = {}
    if not keys:
        return res

    for category in keys:
        try:
            res[category] = sum(key_search(data, term) for term in keys[category])
        except (AttributeError, TypeError):
            # handled per category so one failure does not drop the later categories
            res[category] = 0

    return res

In [11]:
import json, requests, urllib, io
import os

#user='my_github_username'
#pao='my_pao'


github_session = requests.Session()
#github_session.auth = (user, pao)

# raw url of the keyword csv on github; the access token is deliberately not
# stored in the notebook and is read from the KEYWORDS_CSV_TOKEN environment
# variable instead (leave it unset when the file is publicly readable)
csv_url = 'https://raw.githubusercontent.com/AR-github-AWS/testrepo/main/Keywords%20for%20Data%20Science.csv'
csv_token = os.environ.get("KEYWORDS_CSV_TOKEN")

response = github_session.get(
    csv_url,
    params={"token": csv_token} if csv_token else None,
    timeout=30,
)
# fail loudly on 401/404 instead of parsing the error page as if it were the csv
response.raise_for_status()

download = response.content
# on_bad_lines='skip' replaces the removed error_bad_lines=False
downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), on_bad_lines='skip')
df_keywords = downloaded_csv

# one keyword list per column; the cells that are missing because the columns
# have different lengths are dropped, so no keyword is counted more than once
# and the literal string 'nan' never becomes a keyword
keywords = {
    column: df_keywords[column].dropna().astype(str).tolist()
    for column in df_keywords.columns
}



  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
def GetFacetCols(dataframe, ColumnName):
    """Add keyword, language and named-entity columns to `dataframe`.

    dataframe: the frame to enrich; it is modified in place.
    ColumnName: name of the column holding the text to analyse.

    Columns added, in this order:
      'keyword score'      - sum of keyword_scores(text, keywords) over all categories
      'LanguageFacet'      - result of language_detect(text)
      'Translated Content' - result of language_translate(text)
      'GeoFacet', 'OrgFacet', 'PeopleFacet', 'MoneyFacet', 'DateFacet'
                           - per row, the distinct spaCy entity texts labelled
                             GPE, ORG, PERSON, MONEY and DATE, in order of
                             first appearance

    Uses the module-level `keywords` dictionary. Returns the same
    dataframe object that was passed in.
    """
    nlp = spacy.load("en_core_web_sm")

    # spaCy entity label -> name of the column that receives it
    label_to_column = {
        "GPE": "GeoFacet",
        "ORG": "OrgFacet",
        "PERSON": "PeopleFacet",
        "MONEY": "MoneyFacet",
        "DATE": "DateFacet",
    }
    facets = {column: [] for column in label_to_column.values()}

    for row in dataframe[ColumnName]:
        # a dict per label keeps the first-seen order and drops repeated texts
        found = {label: {} for label in label_to_column}
        for ent in nlp(str(row)).ents:  # loop over the entities
            bucket = found.get(ent.label_)
            if bucket is not None:
                bucket.setdefault(ent.text, None)
        for label, column in label_to_column.items():
            facets[column].append(list(found[label]))

    texts = dataframe[ColumnName]
    dataframe['keyword score'] = [sum(keyword_scores(text, keywords).values()) for text in texts]
    dataframe['LanguageFacet'] = [language_detect(text) for text in texts]
    dataframe['Translated Content'] = [language_translate(text) for text in texts]
    for column in label_to_column.values():
        dataframe[column] = facets[column]

    return dataframe

In [13]:
# Enrich `data` with the facet columns; the return value is not used here, so
# this relies on GetFacetCols adding the columns to the frame it is given.
GetFacetCols(data, 'Content')
# add three more columns to the data frame, each filled with a single space
data["Threat"] = " "
data["Useful"] = " "
data["Comment"] = " "

### Data here

In [14]:
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content,keyword score,LanguageFacet,Translated Content,GeoFacet,OrgFacet,PeopleFacet,MoneyFacet,DateFacet,Threat,Useful,Comment
0,Article_WUSA,https://www.wusa9.com/article/news/local/dc/no...,"10:39 AM EDT April 25, 2022",'An evil act' | Police have not yet determined...,Police Chief Robert Contee said investigators ...,0,Unknown,Translation Failed,"[Northwest DC, WASHINGTON, Fairfax, Virginia, ...","[Metropolitan Police Department, quadruple Con...","[Robert Contee, Raymond Spencer, Robert Contee...",[],"[23, Friday, Monday, the weekend, the past month]",,,
1,Article_WUSA,https://www.wusa9.com/video/news/crime/this-is...,"4:03 PM EDT April 25, 2022",'This is serious business and people are scare...,Mayor Bowser and Chief Contee jointly announce...,0,Unknown,Translation Failed,[],"[Chief Contee, the Violent Crime Impact Team]",[Bowser],[],[],,,
2,Article_WUSA,https://www.wusa9.com/article/news/crime/uva-l...,"12:06 PM EDT April 25, 2022",More than a decade after UVA lacrosse player Y...,Jury selection is to start in Charlottesville ...,0,Unknown,Translation Failed,"[Va., North Carolina]","[Charlottesville Circuit Court, UVA]","[George Huguely, Huguely, Matthew Green, Green]","[$29.5 million, $1 million]","[Monday, 2010, Nearly 12 years, weeks, the day...",,,
3,Article_WUSA,https://www.wusa9.com/article/news/national/ca...,"12:25 PM EDT April 25, 2022",'He can be a racist but not guilty of insurrec...,The allegedly avowed white supremacist and ant...,0,Unknown,Translation Failed,"[WASHINGTON, New Jersey, U.S., D.C.]","[anti-Semite, U.S. Army, Hale-Cusanelli, Naval...","[Adolf Hitler, Jonathan Crisp, Crisp, Timothy ...",[],"[next month, Friday, Jan. 6, less than two wee...",,,
4,Article_WUSA,https://www.wusa9.com/video/news/local/dc/late...,"6:02 AM EDT April 25, 2022",Latest updates on DC gunmen's 'sniper nest' ap...,Police say the shooter fired bullets indiscrim...,0,Unknown,Translation Failed,[],[],[],[],[the day],,,
5,Article_WUSA,https://www.wusa9.com/article/news/national/ca...,"6:00 AM EDT April 25, 2022",Jurors reject array of defenses at Capitol rio...,Video evidence and self-incriminating behavior...,0,Unknown,Translation Failed,"[Ohio, Virginia, Texas, U.S., Washington, Flor...","[NYPD, Capitol, U.S. Marine Corps, Georgetown ...","[Webster, Donald Trump, Amit Mehta, Mehta, Mar...",[],"[Jan. 6, 2021, that day, 56, Jan. 6, last June...",,,
6,Article_WUSA,https://www.wusa9.com/article/news/crime/washi...,"7:10 AM EDT April 22, 2022",Surveillance video shows part of 13-year-old's...,Police said they've arrested the boy for a tot...,0,Unknown,Translation Failed,"[WASHINGTON, Northeast D.C., Maryland, Kingman...","[DMV, D.C. Police, MPD, Chiefs, Contee, Chief ...","[Forest Krueger, Morgan Kane, Kane, Muriel Bow...",[],"[one day, Friday, Wednesday, April 27, April 1...",,,
7,Article_WUSA,https://www.wusa9.com/article/news/local/maryl...,"10:19 PM EDT April 24, 2022",Takoma Park police arrest Silver Spring man in...,Abraham Douglas of Silver Spring faces charges...,0,Unknown,Translation Failed,"[PARK, Md., The City of Takoma Park Police Dep...","[Silver Spring, the Montgomery County Central ...","[Abraham Douglas, Douglas]",[],[Friday],,,
8,Article_WUSA,https://www.wusa9.com/article/news/local/dc/va...,"9:13 PM EDT April 24, 2022",'Hold your children close' | Private school ne...,"A man with a ""sniper-style"" setup fired more t...",0,Unknown,Translation Failed,"[WASHINGTON, Northwest D.C., D.C.]","[Edmund Burke School, The Edmund Burke School,...","[Damian Jones, Edmund Burke, Van Ness, Robert ...",[],"[sixth through 12th, Friday, Saturday, the day...",,,
9,Article_WUSA,https://www.wusa9.com/article/news/crime/dc-po...,"9:16 PM EDT April 21, 2022",Woman with legs bound by rope jumps from 8th-f...,DC Police said a man who called himself the wo...,0,Unknown,Translation Failed,"[WASHINGTON, Northwest]","[Metropolitan Police Department, EMS, MPD, Bed...",[Duncan Bedlion],[],"[Thursday, Friday, earlier in the day]",,,


### Ignore

In [15]:
# storing at "output" dir
#date = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
#file_name = "Article_WUSA_" +date+ ".csv"
#data.to_csv(file_name, index = False)

# Storing a copy for analysts in the "analysts" dir
#data.to_csv( "/dbfs/mnt/analysts/" + file_name, index = False)