### Scraping WUSA
This notebook scrapes the crime section of WUSA9 (https://www.wusa9.com/section/crime).

In [None]:
import requests                # to request the webpage
from bs4 import BeautifulSoup  # to make soup and pull data out of HTML
import urllib.robotparser      # to check the legitimacy to scrap the web
import json                    # to save the output as json file
import pandas as pd            # to  see saved data as dataframe 
from datetime import datetime  # to get the current datetime
!pip install newspaper3k
from newspaper import Article   # to get news information

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[K     |████████████████████████████████| 211 kB 4.9 MB/s 
[?25hCollecting tinysegmenter==0.3
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
Collecting tldextract>=2.0.1
  Downloading tldextract-3.3.0-py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 544 kB/s 
[?25hCollecting feedfinder2>=0.0.4
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
Collecting jieba3k>=0.35.1
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 36.5 MB/s 
Collecting feedparser>=5.2.1
  Downloading feedparser-6.0.8-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 5.6 MB/s 
Collecting cssselect>=0.9.2
  Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)
Collecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Collecting requests-file>=1.4
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Building wheels for collec

In [None]:
# Try newspaper3k on a single WUSA video page: build an Article for the URL,
# download its HTML, then parse it.
# NOTE(review): `article` is not referenced again in the cells visible here —
# confirm whether this cell is still needed.
url= "https://www.wusa9.com/video/news/crime/new-video-mail-theft-in-kensington/65-a25f2ff3-2418-4741-a372-6cf546cb75ff"
article= Article(url)
article.download()  # fetch the page
article.parse()     # must run after download()




### Permission to scrape

In [None]:
# Build a robots.txt parser that points at WUSA's robots.txt, then fetch the file.
robotpars = urllib.robotparser.RobotFileParser("https://www.wusa9.com/robots.txt")
robotpars.read()

# can_fetch("*", url) answers whether a generic user agent may request `url`;
# True means fetching is permitted.
# NOTE(review): only the site root is checked here, while the cells below fetch
# /section/crime and individual article/video pages — consider checking those too.
print("Can we fetch the website?", robotpars.can_fetch("*", "https://www.wusa9.com/"))

Can we fetch the website? True


In [None]:
def getSoup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    url: address of the page to request.
    timeout: seconds to wait for the server before giving up. Without a
        timeout, requests.get() can block indefinitely on an unresponsive
        host, which would stall the whole scraping loop.

    Returns a BeautifulSoup object built with Python's built-in "html.parser".
    Raises requests.exceptions.RequestException (e.g. Timeout) when the page
    cannot be retrieved. The HTTP status code is not checked, so an error page
    is parsed like any other page.
    """
    # request the webpage and keep the decoded body text
    pagetext = requests.get(url, timeout=timeout).text
    # parse the HTML with the standard-library parser
    return BeautifulSoup(pagetext, "html.parser")
  
def getNewsLink(soup):
    """Return the list of news links found on a WUSA section page.

    soup: the parsed section page (as returned by getSoup).

    The spotlight story (first <div class="story__meta">) comes first, followed
    by every <h4> entry of the <ul class="story-list__list"> list. Elements
    that are missing, or that carry no link, are skipped, so the result may be
    an empty list. Site-relative hrefs are resolved against
    https://www.wusa9.com.
    """
    # Local import keeps this cell self-contained; urljoin also copes with
    # hrefs that are already absolute or that lack a leading slash.
    from urllib.parse import urljoin

    source_url = "https://www.wusa9.com"
    news_list = []  # collects the absolute links, in page order

    def _add_link(tag):
        # `tag` is None when the lookup found nothing; `tag.a` is None when the
        # element holds no <a>; get("href") is None when <a> has no href.
        anchor = tag.a if tag is not None else None
        href = anchor.get("href") if anchor is not None else None
        if href:
            news_list.append(urljoin(source_url, href))

    # spotlight story at the top of the page
    _add_link(soup.find("div", attrs={"class": "story__meta"}))

    # remaining stories in the list below it
    story_list = soup.find("ul", attrs={"class": "story-list__list"})
    if story_list is not None:
        for header in story_list.find_all("h4"):
            _add_link(header)

    return news_list
  

In [None]:
# Fetch the crime section front page and preview the story links found on it.
url= "https://www.wusa9.com/section/crime"
soup= getSoup(url)
getNewsLink(soup)  # bare last expression: the notebook displays the returned list

['https://www.wusa9.com/article/news/crime/18th-street-gang-members-to-plead-guilty-in-confrontation-with-ms-13-at-petworth-metro-cleaver-gun/65-84863ed9-17c4-4c02-9fde-442f6be8f8a0',
 'https://www.wusa9.com/article/news/crime/police-release-surveillance-video-of-northeast-assault-suspect/65-7434fff0-2510-4608-bc98-4d1de27afd30',
 'https://www.wusa9.com/video/news/crime/suspect-sought-in-an-assault-at-convenience-store/65-fbddd487-8bdd-440b-aa09-24f8b204fcfe',
 'https://www.wusa9.com/video/news/local/dc/hear-me-out-its-a-sad-situation-daycare-in-district-heights-closes-due-to-surrounding-crime/65-de334bce-62b1-4cde-ad87-bea1068471f7',
 'https://www.wusa9.com/article/news/local/virginia/virginia-18-year-old-killed-in-gainesville-sunday/65-43c8c4cf-00e3-4bad-9302-42c708cd132c',
 'https://www.wusa9.com/article/news/local/dc/3-northwest-dc-stabbings-police-investigating/65-2605db98-ba6d-4a13-b404-2d4e9f20a8b1',
 'https://www.wusa9.com/article/news/local/virginia/employee-at-spa-takes-fathe

### Collect the data

In [None]:

def _first_match(news_soup, tag, class_names, extract):
    """Apply `extract` to the first `tag` element matching one of `class_names`.

    The classes are tried in order. A class whose element is missing, or whose
    extraction fails, is skipped. Returns None when no class yields a value.
    """
    for class_name in class_names:
        try:
            return extract(news_soup.find(tag, attrs={"class": class_name}))
        except (AttributeError, IndexError, TypeError):
            # AttributeError: find() returned None (no such element).
            # IndexError: the element has no children (.contents is empty).
            # TypeError: presumably the last child is not a plain string, so
            # `.strip` is not callable.
            continue
    return None


def getNewsInfo(news_link):
    '''Return a dictionary describing one WUSA story.

    news_link: URL of the article or video page to scrape.

    Returned keys:
      "Source"            -- always "Article_WUSA"
      "Url"               -- news_link, unchanged
      "PublishedDateTime" -- last text node of the article/video "published"
                             block, or "NA" when neither is found
      "Headline"          -- article headline, else video headline, else "NA"
      "Content"           -- summary followed by the text paragraphs, joined
                             with single spaces (may be an empty string)

    For every field the article markup is tried first and the video markup is
    the fallback. Exceptions raised while downloading the page (getSoup) are
    not caught here.
    '''
    news_soup = getSoup(news_link)  # make a soup

    def _all_text(node):
        return node.get_text()

    def _last_text(node):
        # the timestamp is the last child of the "published" block
        return node.contents[-1].strip()

    # Headline / published date / summary: article class first, video class second
    headline = _first_match(
        news_soup, "h1", ("article__headline", "video__headline"), _all_text)
    published = _first_match(
        news_soup, "div", ("article__published", "video__published"), _last_text)
    summary = _first_match(
        news_soup, "div", ("article__summary", "video__summary"), _all_text)

    # Content: the summary (or "") followed by one entry per text section
    text = ["" if summary is None else summary]
    sections = news_soup.find_all(
        "div",
        attrs={"class": "article__section article__section_type_text utility__text"})
    for section in sections:
        try:
            # first text node directly under <p>, not text nested in its children
            paragraph = section.p.find(text=True, recursive=False)
            text.append(paragraph.strip())
        except (AttributeError, TypeError):
            # section without a <p>, or a <p> with no direct text: skip it
            continue

    # collect all the data as a dictionary
    return {
        "Source": "Article_WUSA",  # the same for every link from this site
        'Url': news_link,
        'PublishedDateTime': "NA" if published is None else published,
        'Headline': "NA" if headline is None else headline,
        'Content': " ".join(text),
    }

In [None]:
# Section page whose stories we want to collect.
url = "https://www.wusa9.com/section/crime"
soup = getSoup(url)

# Links to the individual stories listed on the section page.
news_url = getNewsLink(soup)

# One record (dictionary) per story, in the same order as the links.
all_data = list(map(getNewsInfo, news_url))

# Tabulate the records: one row per story, one column per dictionary key.
data = pd.DataFrame(all_data)

### Add more features
(Code taken from Manoji and slightly modified)

In [None]:
# Run this once per fresh runtime: it downloads spaCy's small English model so
# that spacy.load("en_core_web_sm") can find it later in the notebook.
! python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
import spacy
from spacy.lang.en import English
import glob
import os
!pip install googletrans
from googletrans import Translator

Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.0 MB/s 
Collecting hstspreload
  Downloading hstspreload-2021.12.1-py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 12.0 MB/s 
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.1 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.2.0-py3-none-any.whl (10 kB)
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 3.0 MB/s 
[?25hCollecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.3 MB/s 
[?25hCollecting hyperframe<6,>=5.2.0
  Downloading hyperframe-5.2.0-py2

In [None]:
def language_detect(data):
    """Detect the language of a piece of text with googletrans.

    data: the text to inspect.

    Returns the `.lang` attribute of the Translator.detect() result, or the
    string 'Unknown' when detection raises.
    """
    trans = Translator()
    try:
        return trans.detect(data).lang
    except Exception:
        # Best effort: one failed lookup must not stop the whole run.
        # `except Exception` (not a bare except) so Ctrl-C still interrupts.
        # NOTE(review): the saved output shows 'Unknown' for every row, which
        # suggests detect() fails on every call — check the installed
        # googletrans version.
        return 'Unknown'

def language_translate(data):
    """Translate a piece of text to English with googletrans.

    data: the text to translate.

    Returns the `.text` attribute of the Translator.translate(..., dest='en')
    result, or the string 'Translation Failed' when translation raises.
    """
    trans = Translator()
    try:
        return trans.translate(data, dest='en').text
    except Exception:
        # Best effort: one failed request must not stop the whole run.
        # `except Exception` (not a bare except) so Ctrl-C still interrupts.
        # NOTE(review): the saved output shows 'Translation Failed' for every
        # row, which suggests translate() fails on every call — check the
        # installed googletrans version.
        return 'Translation Failed'

def key_search(data, key):
    """Return 1 if `key` occurs in `data`, ignoring case, else 0.

    data: the text to search (a string).
    key: the keyword to look for (a string); matched as a plain substring.
    """
    haystack = data.lower()
    needle = key.lower()
    return int(needle in haystack)


def keyword_scores(data, keys):
    """Count, per category, how many of its keywords occur in `data`.

    data: the text to score.
    keys: mapping of category name -> list of keywords. An empty mapping or
        None gives an empty result.

    Returns a dict with one entry per category; the value is the number of
    that category's keywords found in `data` (case-insensitive substring
    match). A category that cannot be scored — for example because `data` is
    not a string — gets 0, and the remaining categories are still processed.
    """
    res = {}
    if not keys:
        return res

    for category in keys:
        try:
            res[category] = sum(key_search(data, term) for term in keys[category])
        except (AttributeError, TypeError):
            # AttributeError: data or a keyword is not a string (e.g. NaN);
            # TypeError: the category's value is not iterable.
            res[category] = 0

    return res

In [None]:
import json, requests, urllib, io

#user='my_github_username'
#pao='my_pao'


github_session = requests.Session()
#github_session.auth = (user, pao)

# Raw GitHub URL of the keyword CSV.
# SECURITY NOTE(review): this URL embeds an access token (`?token=...`). Tokens
# should not be committed to a notebook; if it expires the download returns an
# error page instead of the CSV. Move it to an environment variable or use the
# session auth above.
csv_url = 'https://raw.githubusercontent.com/AR-github-AWS/testrepo/main/Keywords%20for%20Data%20Science.csv?token=GHSAT0AAAAAABTJTADX3DUWZIYFQ3PFQF34YSZYMOA'

response = github_session.get(csv_url, timeout=30)
# Stop here on 4xx/5xx. Otherwise the error body (e.g. "404: Not Found") is
# parsed as a CSV, the keyword table is empty, and every keyword score is 0.
response.raise_for_status()
download = response.content

# on_bad_lines replaces the deprecated error_bad_lines=False; 'warn' skips
# malformed lines but reports them.
downloaded_csv = pd.read_csv(io.StringIO(download.decode('utf-8')), on_bad_lines='warn')
df_keywords = downloaded_csv

# One list of keyword strings per CSV column. Missing cells are dropped:
# astype('str').replace('nan') without a replacement value forward-filled the
# previous keyword, so repeated keywords were counted more than once.
keywords = {
    column: df_keywords[column].dropna().astype('str').tolist()
    for column in df_keywords.columns
}



  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
def GetFacetCols(dataframe, ColumnName):
    """Add keyword, language and named-entity columns to `dataframe`.

    dataframe: the pandas DataFrame to enrich; it is modified in place.
    ColumnName: name of the column holding the text to analyse.

    Columns added, in this order:
      'keyword score'      -- sum of keyword_scores(text, keywords) over all
                              categories (uses the notebook-level `keywords`)
      'LanguageFacet'      -- language_detect(text)
      'Translated Content' -- language_translate(text)
      'GeoFacet', 'OrgFacet', 'PeopleFacet', 'MoneyFacet', 'DateFacet'
                           -- per row, the distinct spaCy entity texts labelled
                              GPE, ORG, PERSON, MONEY and DATE respectively,
                              in order of first appearance

    Returns the same DataFrame object that was passed in.
    """
    nlp = spacy.load("en_core_web_sm")

    # spaCy entity label -> output column (dict order is the column order)
    label_to_column = {
        "GPE": "GeoFacet",
        "ORG": "OrgFacet",
        "PERSON": "PeopleFacet",
        "MONEY": "MoneyFacet",
        "DATE": "DateFacet",
    }
    facets = {column: [] for column in label_to_column.values()}

    for row in dataframe[ColumnName]:
        # fresh per-row buckets, so every row contributes exactly one list per facet
        found = {label: [] for label in label_to_column}
        for ent in nlp(str(row)).ents:
            bucket = found.get(ent.label_)  # None for labels we do not track
            if bucket is not None and ent.text not in bucket:
                bucket.append(ent.text)
        for label, column in label_to_column.items():
            facets[column].append(found[label])

    texts = dataframe[ColumnName]
    dataframe['keyword score'] = [sum(keyword_scores(text, keywords).values()) for text in texts]
    dataframe['LanguageFacet'] = [language_detect(text) for text in texts]
    dataframe['Translated Content'] = [language_translate(text) for text in texts]
    for column in label_to_column.values():
        dataframe[column] = facets[column]

    return dataframe

In [None]:
# GetFacetCols adds the keyword / language / entity columns to `data` in place,
# so its return value is not needed here.
GetFacetCols(data, 'Content')
# Add placeholder columns (each cell holds a single space, not an empty string)
# — presumably for analysts to fill in by hand; confirm with the consumers.
data["Threat"] = " "
data["Useful"] = " "
data["Comment"] = " "

### Data here

In [None]:
# Show the enriched table (a bare last expression is rendered by the notebook).
data

Unnamed: 0,Source,Url,PublishedDateTime,Headline,Content,keyword score,LanguageFacet,Translated Content,GeoFacet,OrgFacet,PeopleFacet,MoneyFacet,DateFacet,Threat,Useful,Comment
0,Article_WUSA,https://www.wusa9.com/article/news/crime/18th-...,"12:11 PM EDT May 10, 2022",18th Street gang members to plead guilty in co...,The Justice Department has linked the D.C. cha...,0,Unknown,Translation Failed,"[D.C., WASHINGTON, California, U.S., Los Angel...","[The Justice Department, Christian Figueroa-Gu...","[Christopher Molina-Garcia, Molina-Garcia, Emm...",[],"[2019, Sept. 17, 2019, Sept. 27, 2019, more th...",,,
1,Article_WUSA,https://www.wusa9.com/article/news/crime/polic...,"10:41 AM EDT May 10, 2022",Authorities seek suspect seen on video brutall...,Police are asking for help finding a man and w...,0,Unknown,Translation Failed,[WASHINGTON],[Metropolitan Police Department],[],[],[50411],,,
2,Article_WUSA,https://www.wusa9.com/video/news/crime/suspect...,"10:00 AM EDT May 10, 2022",Video shows suspect brutally assaulting indivi...,Police need the public's assistance in identif...,0,Unknown,Translation Failed,[],[],[],[],[],,,
3,Article_WUSA,https://www.wusa9.com/video/news/local/dc/hear...,"6:58 AM EDT May 10, 2022",Hear Me Out: 'It's a sad situation' | Daycare ...,"No parent should have to go through this, Tony...",0,Unknown,Translation Failed,[],[],[Tony Perkins],[],[],,,
4,Article_WUSA,https://www.wusa9.com/article/news/local/virgi...,"5:47 AM EDT May 10, 2022",18-year-old killed in Gainesville Sunday,An acquaintance brought the man with multiple ...,0,Unknown,Translation Failed,"[Va., Gainesville]",[Dumfries],"[Michael Arthur, Arthur]",[],"[Sunday, May 8]",,,
5,Article_WUSA,https://www.wusa9.com/article/news/local/dc/3-...,"4:28 AM EDT May 10, 2022",DC Police investigating after pair Northwest D...,Police are still working to determine what led...,0,Unknown,Translation Failed,"[WASHINGTON, District, Newton Place]","[Metropolitan Police Department, L St. Northwe...",[stab],[],"[late Monday, Tuesday, 1300]",,,
6,Article_WUSA,https://www.wusa9.com/article/news/local/virgi...,"10:57 PM EDT May 9, 2022",Ashburn woman takes man's infant son at Loudou...,An employee at Lansdowne Resort was charged wi...,0,Unknown,Translation Failed,"[LOUDOUN COUNTY, Va., Loudoun County, Leesburg...","[the Loudoun County Sheriff's Office, the Loud...",[Sandra J. Brown],[],"[Saturday, May 7]",,,
7,Article_WUSA,https://www.wusa9.com/article/news/crime/fairf...,"8:37 PM EDT May 9, 2022",Fairfax County Police search for missing 12-ye...,Officers have listed Zaydie as an endangered j...,0,Unknown,Translation Failed,"[FAIRFAX COUNTY, Va.]",[],[Zaydie],[],[],,,
8,Article_WUSA,https://www.wusa9.com/article/news/local/dc/po...,"12:39 PM EDT May 8, 2022",Court documents offer further details in DC f...,"Sedrick Miller, 42, was shot 11 times while wa...",0,Unknown,Translation Failed,"[WASHINGTON, Southeast D.C., Northeast D.C., m...","[The Metropolitan Police Department, Metropoli...","[Sedrick Miller, Jarrell David Harris, Miller,...",[],"[42, March, Two months, 27, March 4 around 8:3...",,,
9,Article_WUSA,https://www.wusa9.com/video/features/originals...,"8:01 AM EDT May 3, 2022",'I'm tired of losing people I love' | DC youth...,"At 12 years old, Rashad Bates has already lost...",0,Unknown,Translation Failed,[],[],[Rashad Bates],[],[12 years old],,,


### Ignore

In [None]:
# storing at "output" dir
#date = datetime.now().strftime("%Y-%m-%d--%H-%M-%S")
#file_name = "Article_WUSA_" +date+ ".csv"
#data.to_csv(file_name, index = False)

# Storing a copy for analysts in the "analysts" dir
#data.to_csv( "/dbfs/mnt/analysts/" + file_name, index = False)