In [1]:
import time
import numpy as np
import re
import pandas as pd
from collections import OrderedDict

!pip install selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common import exceptions
from selenium.webdriver.common.proxy import Proxy, ProxyType

# Display option: None removes the column-width limit, so long text cells
# (such as the scraped report lines) are shown in full instead of truncated.
pd.set_option('display.max_colwidth', None)




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
#Compile webdriver
# Module-level headless Chrome session, configured with the same arguments
# that compile_webdriver() passes.
# NOTE(review): `browser` is rebound in the "Execute" section without this
# session being shut down first -- confirm this cell is still needed.

options = webdriver.ChromeOptions()
for chrome_flag in ("--headless", '--disable-dev-shm-usage', "--no-sandbox", "--disable-cookies"):
    options.add_argument(chrome_flag)

browser = webdriver.Chrome(options=options)

# Scraping

## Functions

In [2]:
def compile_webdriver():
    """Create and return a headless Chrome WebDriver.

    The driver is started with the ``--headless``, ``--disable-dev-shm-usage``,
    ``--no-sandbox`` and ``--disable-cookies`` arguments, added in that order.

    Returns
    -------
    The object returned by ``webdriver.Chrome``. The caller is responsible
    for shutting it down.
    """
    chrome_flags = (
        "--headless",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--disable-cookies",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

In [3]:
def get_hrefs(browser, index):
    """Return the crime-report article URLs linked from one newsroom listing page.

    Loads the Arlington newsroom listing filtered to the "Daily Crime Report"
    category at the given page index, collects the ``href`` of every element
    that has one, and keeps only those that look like crime-report articles.

    Parameters
    ----------
    browser : selenium WebDriver
        An open driver; it is navigated to the listing page and left there.
    index : int
        Value substituted into the ``pageindex=`` part of the listing URL.

    Returns
    -------
    list of str
        Matching URLs in the order they were found (duplicates are kept).
    """
    site = f'https://www.arlingtonva.us/About-Arlington/Newsroom?dlv_ARL%20CL%20Public%20News%20Listing%20without%20Image=(dd_OC%20News%20Categories=Daily%20Crime%20Report)(pageindex={index})'
    browser.get(site)

    ref_list = browser.find_elements(By.XPATH, '//*[@href]')
    raw_hrefs = [elem.get_attribute('href') for elem in ref_list]

    # Raw string: '\d' inside a plain literal is an invalid escape sequence
    # (a warning today, slated to become an error in a future Python).
    pattern = re.compile(
        r'https://www.arlingtonva.us/About-Arlington/Newsroom/(?:Articles/)?(?:\d{4}/)?Crime-Report-.*'
    )

    # get_attribute() may hand back None; skip those instead of letting
    # re.match raise TypeError.
    href_list = [url for url in raw_hrefs if url and pattern.match(url)]

    return href_list

In [4]:
def _split_report_text(page_text):
    """Split a page's text into ``(report_date, crime_list)``.

    Blank lines are dropped and the rest are stripped. The first remaining
    line is the report title. ``crime_list`` starts at the 'REPORTS' line
    (or, failing that, the 'REPORT' line) when one exists; otherwise it is
    everything after the title. Returns ``None`` when no non-blank line exists.
    """
    full_list = [line.strip() for line in page_text.split('\n') if line.strip()]
    if not full_list:
        return None

    report_date = full_list[0]
    for header in ('REPORTS', 'REPORT'):
        if header in full_list:
            return report_date, full_list[full_list.index(header):]

    return report_date, full_list[1:]


def get_data(link_list, start=None, end=None):
    """Scrape the report text behind ``link_list[start:end]``.

    A fresh driver is created with ``compile_webdriver()`` for the call and is
    always shut down before returning, even if a page raises.

    Parameters
    ----------
    link_list : list of str
        Article URLs to visit.
    start, end : int or None
        Slice bounds applied to ``link_list``; ``None`` means "from the
        beginning" / "to the end".

    Returns
    -------
    (published_date, data_list) : tuple of lists
        ``published_date[k]`` is the first non-blank line of page ``k`` and
        ``data_list[k]`` is that page's list of report lines. Pages that yield
        no text at all are skipped (a message is printed) rather than raising
        ``IndexError``.

    Raises
    ------
    NoSuchElementException
        If neither CSS selector matches on a page.
    """
    published_date = []
    data_list = []

    #Start browser
    browser = compile_webdriver()

    try:
        for link in link_list[start:end]:

            #Visit site
            browser.get(link)

            #Get all relevant information in single string; fall back to the
            #second selector when the first one is not present on the page
            try:
                crime = browser.find_element(By.CSS_SELECTOR, 'div.col-m-8:nth-child(1)').text
            except NoSuchElementException:
                crime = browser.find_element(By.CSS_SELECTOR, '#main-content > div:nth-child(1) > div:nth-child(1)').text

            #Brief cleaning for storage
            parsed = _split_report_text(crime)
            if parsed is None:
                print(f'Skipping {link}: no text found on page')
                continue
            report_date, crime_list = parsed

            #Append to data lists and track
            print(f'Appending {len(crime_list)} records from {report_date}...')
            published_date.append(report_date)
            data_list.append(crime_list)
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window and can leave the driver process running.
        browser.quit()

    return published_date, data_list

In [5]:
def segment_list(input, divisor):
    """Compute slice boundaries that cut ``input`` into ``divisor`` consecutive chunks.

    Chunk sizes differ by at most one: the first ``len(input) % divisor``
    chunks receive one extra element.

    Parameters
    ----------
    input : sized collection
        Only its length is used.
    divisor : int
        Number of chunks.

    Returns
    -------
    list of int
        ``divisor + 1`` cumulative offsets starting at 0 and ending at
        ``len(input)``; consecutive pairs are the ``[start, end)`` bounds of
        each chunk.
    """
    total = len(input)
    chunk_size, leftover = divmod(total, divisor)

    # Leading 0 is the first boundary; the rest are per-chunk sizes.
    sizes = [0] + [chunk_size + 1 if position < leftover else chunk_size
                   for position in range(divisor)]

    assert sum(sizes) == total

    return np.cumsum(sizes).tolist()

## Execute

In [6]:
#Start clock
start_time = time.time()

#One list of article urls per listing page
all_hrefs = []

#Compile webdriver
browser = compile_webdriver()

try:
    #Get info for first page and determine max number of pages
    href_list = get_hrefs(browser, 1)
    all_hrefs.append(href_list)
    length = len(href_list)
    print(f'Appending {length} urls to list for page 1...')

    #The browser is still on page 1 after get_hrefs, so its pagination
    #links ("#page-N") can be read from the current document
    ref_list = browser.find_elements(By.XPATH, '//*[@href]')
    raw_hrefs = [elem.get_attribute('href') for elem in ref_list]

    #Get the max page index; fall back to a single page when the listing
    #shows no pagination links (max() of an empty list would raise)
    pattern = r'#page-(\d+)'
    page_numbers = [int(number)
                    for href in raw_hrefs if href
                    for number in re.findall(pattern, href)]
    max_pi = max(page_numbers, default=1)

    #Now iterate through rest of pages using the max_pi.
    #range() excludes its stop value, so use max_pi + 1 to include the last page.
    for i in range(2, max_pi + 1):
        href_list = get_hrefs(browser, i)

        all_hrefs.append(href_list)
        length = len(href_list)
        print(f'Appending {length} urls to list for page {i}...')
finally:
    #Always end the driver session, even if a page load fails
    browser.quit()

#Flatten href list
ah_flat = [url for page_hrefs in all_hrefs for url in page_hrefs]

#End stopclock and return length of runtime
end_time = time.time()
minutes = (end_time - start_time) / 60

print(f'Collecting time = {round(minutes, 2)} minutes')

Appending 10 urls to list for page 1...
Appending 10 urls to list for page 2...
Appending 10 urls to list for page 3...
Appending 10 urls to list for page 4...
Appending 10 urls to list for page 5...
Appending 10 urls to list for page 6...
Appending 10 urls to list for page 7...
Appending 10 urls to list for page 8...
Appending 10 urls to list for page 9...
Appending 10 urls to list for page 10...
Appending 10 urls to list for page 11...
Appending 10 urls to list for page 12...
Appending 10 urls to list for page 13...
Appending 10 urls to list for page 14...
Appending 10 urls to list for page 15...
Appending 10 urls to list for page 16...
Appending 10 urls to list for page 17...
Appending 10 urls to list for page 18...
Appending 10 urls to list for page 19...
Appending 10 urls to list for page 20...
Appending 10 urls to list for page 21...
Appending 10 urls to list for page 22...
Appending 10 urls to list for page 23...
Appending 10 urls to list for page 24...
Appending 10 urls to list

In [108]:
# Inspect the collected article URLs from position 1000 to the end of the list.
ah_flat[1000:]

['https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-23-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-22-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-21-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-17-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-16-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-15-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-14-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-13-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-10-2020',
 'https://www.arlingtonva.us/About-Arlington/Newsroom/Articles/2020/Crime-Report-January-9-2020',
 'https://w

In [8]:
#Start stopclock
start_time = time.time()

#Final storage: one entry per scraping round
pb_list, dt_list = [], []

#Boundaries that cut ah_flat into 3 consecutive slices; each slice is scraped
#by its own get_data() call (and therefore its own browser) to avoid time-out
index_list = segment_list(ah_flat, 3)

#Walk over consecutive boundary pairs, one scraping round per pair
for round_number, (lower, upper) in enumerate(zip(index_list[:-1], index_list[1:]), start=1):
    published_date, data_list = get_data(ah_flat, lower, upper)

    pb_list.append(published_date)
    dt_list.append(data_list)

    print(f'\nFinished Round {round_number}')

#Stop clock, format and print runtime
end_time = time.time()
minutes = (end_time - start_time) / 60

print(f'Time to append data = {round(minutes, 2)} minutes')

Appending 55 records from Crime Report: February 21, 2024...
Appending 79 records from Crime Report: February 20, 2024...
Appending 14 records from Crime Report: February 16, 2024...
Appending 33 records from Crime Report: February 15, 2024...
Appending 41 records from Crime Report: February 14, 2024...
Appending 48 records from Crime Report: February 13, 2024...
Appending 52 records from Crime Report: February 12, 2024...
Appending 33 records from Crime Report: February 9, 2024...
Appending 12 records from Crime Report: February 8, 2024...
Appending 47 records from Crime Report: February 7, 2024...
Appending 42 records from Crime Report: February 6, 2024...
Appending 64 records from Crime Report: February 5, 2024...
Appending 34 records from Crime Report: February 2, 2024...
Appending 20 records from Crime Report: February 1, 2024...
Appending 45 records from Crime Report: January 31, 2024...
Appending 29 records from Crime Report: January 30, 2024...
Appending 70 records from Crime R

In [None]:
# Flatten the per-round results: one title and one list of lines per report
pb = [title for round_titles in pb_list for title in round_titles]
dt = [lines for round_lines in dt_list for lines in round_lines]

# Remove the "Crime Report: " prefix so only the date text is left
clean_pb = [title.replace('Crime Report: ', '').strip() for title in pb]
raw_df = pd.DataFrame({'report_date': clean_pb, 'raw_data': dt})

# One row per scraped line, renumbered from 0
df1 = raw_df.explode('raw_data').reset_index(drop=True)

df1['report_date'] = pd.to_datetime(df1['report_date'])

# Category = leading run of at least three capital letters / whitespace
leading_caps = df1['raw_data'].str.extract(r'^([A-Z\s]{3,})', expand=False)
df1['category'] = leading_caps.str.strip()

# Export is currently disabled; the "Cleaning" section reads raw_table.csv
# df1.to_csv('raw_table.csv')

## Cleaning

In [45]:
#Read in table after outputting
# The 'Unnamed: 0' column is discarded straight after loading
# (presumably the index written by to_csv -- confirm).
df1 = pd.read_csv('raw_table.csv').drop(columns='Unnamed: 0')

In [47]:
#Destruction of Property clean
# Rows from the 2022-10-28 and 2022-10-24 reports whose category is missing
# or a single space.
is_target_report = (df1['report_date'] == '2022-10-28') | (df1['report_date'] == '2022-10-24')
has_blank_category = df1['category'].isna() | (df1['category'] == ' ')

# [1:] skips the first matching row.
# NOTE(review): presumably a non-data line in that report -- confirm.
dop_oct_raw = df1[is_target_report & has_blank_category][1:]

dop_oct = dop_oct_raw.copy()
dop_oct['category'] = 'DESTRUCTION OF PROPERTY'
dop_oct['report_type'] = 'REPORTS'
# The row after each entry is used as its address, the one after that as the notes
dop_oct['address'] = dop_oct['raw_data'].shift(-1)
dop_oct['notes'] = 'Vehicles involved: ' + dop_oct['raw_data'].shift(-2)

# Keep only the rows containing "dddd-" (raw strings: '\d' in a plain literal
# is an invalid escape sequence). .copy() so the assignments below act on an
# independent frame rather than a slice.
dop_oct = dop_oct[dop_oct['raw_data'].str.contains(r'\d{4}-.*')].copy()

# First "dddd-dddd" run is read as year-monthday
dop_oct['incident_date'] = pd.to_datetime(
    dop_oct['raw_data'].str.extract(r'(\d{4}-\d{4})', expand=False),
    format='%Y-%m%d',
)

dop_oct['incident_id'] = dop_oct['raw_data']

# Everything handled above is removed from the main frame
df2 = df1.drop(index=dop_oct_raw.index)

In [48]:
#Clean Report Type

#Label each row with the section header most recently seen above it.
#Rows before any header default to 'REPORTS'; a header row itself already
#carries the label of the section it opens. The label is carried across the
#whole column without being reset.
current_type = 'REPORTS'

# List to store (item, label) tuples; the seed entry is sliced off afterwards
result_list = [('first_item', 'REPORTS')]

# Iterate over the original list
for item in df2['raw_data']:

    if item in ('REPORTS', 'REPORT'):
        current_type = 'REPORTS'
    elif item in ('ONLINE REPORTS', 'ONLINE REPORT', 'ONLINE RPEORTS'):
        current_type = 'ONLINE REPORTS'
    elif item == 'STOLEN VEHICLES':
        current_type = 'STOLEN VEHICLES'

    # Append tuple to result list
    result_list.append((item, current_type))

report_type = result_list[1:]
df2['report_type'] = [label for _, label in report_type]

In [49]:
#Stolen Vehicle Clean
# Raw strings throughout: '\d' and '\/' in plain literals are invalid escape
# sequences. The pattern text itself is unchanged.
sv_date_pattern = r'\d{1,2}\/\d{1,2}\/\d{1,2}'
caps_prefix_pattern = r'^[A-Z]{3,}'

#Isolate all stolen vehicle instances: rows labelled STOLEN VEHICLES, rows
#with no category, and rows whose category is 'VA'
sv = df2[(df2['report_type'].isin(['STOLEN VEHICLES'])) | (df2['category'].isna()) | (df2['category'].isin(['VA']))]

#Account for all sv issues: drop rows that start with 3+ capital letters.
#.copy() so the column assignments below act on an independent frame.
sv_fix = sv[~sv['raw_data'].str.contains(caps_prefix_pattern)].copy()

sv_fix['category'] = 'STOLEN VEHICLES'
sv_fix['report_type'] = 'STOLEN VEHICLES'
# The following row is provisionally treated as this row's address
sv_fix['raw_address'] = sv_fix['raw_data'].shift(-1)

#Lagging to concat addresses, cleaning, etc..
#Keep rows that contain a d/d/d date or start with 2+ capital letters
sv_final = sv_fix[(sv_fix['raw_data'].str.contains(sv_date_pattern)) | (sv_fix['raw_data'].str.contains(r'^[A-Z]{2,}'))].copy()

# If the "address" row is itself a dated entry, there is no address
sv_final['address'] = np.where(sv_final['raw_address'].str.contains(sv_date_pattern), np.nan, sv_final['raw_address'])
sv_final['notes'] = sv_final['raw_data'].str.extract(r'\d{1,2}/\d{1,2}/\d{1,2},(.*)', expand=False)

#Incident date
sv_final['incident_date'] = sv_final['raw_data'].str.extract(r'(\d{1,2}\/\d{1,2}\/\d{1,2})', expand=False)
# incident_id keeps the date text exactly as extracted (before corrections)
sv_final['incident_id'] = sv_final['incident_date']

# Hand corrections for two specific date strings; every other value passes
# through unchanged
date_corrections = {'03/03/26': '03/03/16', '2/29/19': '09/29/19'}
sv_final['incident_date'] = sv_final['incident_date'].map(
    lambda value: date_corrections.get(value, value)
)
sv_final['incident_date'] = pd.to_datetime(sv_final['incident_date'], format='%m/%d/%y')

#Final df to concat in future
sv_final = sv_final.drop(columns='raw_address')

#Drop dirty sv data from original df
df3 = df2.drop(index=sv.index)


#Fix mislabelled Stolen Vehicles: rows in sv that start with 3+ capital
#letters, other than the 'STOLEN VEHICLES' header itself, go back to the
#main frame labelled as REPORTS
other = sv[sv['raw_data'].str.contains(caps_prefix_pattern)].copy()

no_rt = other[~other['raw_data'].isin(['STOLEN VEHICLES'])].copy()
no_rt['report_type'] = 'REPORTS'

df4 = pd.concat([df3, no_rt])

In [51]:
# Extract address (not perfect)

# Collapse double spaces before pattern matching
df4['raw_data'] = df4['raw_data'].str.replace('  ', ' ')

# Optional "dddd-" prefix, then a 6-9 digit id, a comma, and the remaining text.
# Raw string ('\d' in a plain literal is an invalid escape sequence), and the
# extraction runs once instead of once per output column.
id_and_text = df4['raw_data'].str.extract(r'(?:\d{4}-)?(\d{6,9}),(.*)')
df4['incident_id'] = id_and_text[0]
df4['one'] = id_and_text[1]

def process_address(column, pattern):
    """Return the first capture group of ``pattern`` found in ``column``.

    Parameters
    ----------
    column : str
        Text to search.
    pattern : compiled regular expression
        Must define at least one capture group.

    Returns
    -------
    str
        Group 1 of the first match, or ``column`` unchanged when the pattern
        does not occur in it.
    """
    found = pattern.search(column)
    return column if found is None else found.group(1)

# Rows with no extracted text get the 'NONE' sentinel. Assign the result back:
# an in-place fillna on a column selection is chained assignment, which warns
# in recent pandas and does not update df4 under Copy-on-Write.
df4['one'] = df4['one'].fillna('NONE')
df4['notes'] = df4['one'].str.extract(r'\.(?: At| Or| Between)(.*)', expand=False)

at = re.compile(r'(.+?)\. At')
on = re.compile(r'(.+?)\. On')
off = re.compile(r'(.+?)\. Officers')
between = re.compile(r'(.+?)\. Between')

# Successively cut the text at the first ". At", ". On", ". Officers" and
# ". Between"; each step reads the previous step's column, and the last
# result is the address.
trim_steps = [('one', 'at', at), ('at', 'on', on), ('on', 'off', off), ('off', 'address', between)]
for source_col, target_col, marker in trim_steps:
    df4[target_col] = [process_address(value, marker).strip() for value in df4[source_col]]

# Drop the sentinel rows; .copy() so the new column is set on an independent
# frame instead of a slice of df4.
df5 = df4[~df4['address'].isin(['NONE'])].copy()
df5['incident_date'] = np.nan

In [60]:
#Order columns of interest and concat all cleaned dfs

final_cols = [
    'report_date',
    'incident_id',
    'raw_data',
    'category',
    'report_type',
    'address',
    'notes',
    'incident_date',
]

# Same column order for each cleaned frame, stacked in this order:
# general reports, stolen vehicles, destruction of property
concat_dfs = [frame[final_cols] for frame in (df5, sv_final, dop_oct)]

final_df = pd.concat(concat_dfs)

# Output

In [61]:
#Output df
# Write the combined table to arlington_crime.csv in the working directory.
# No index argument is passed, so to_csv's default (index written as the
# first column) applies.
final_df.to_csv('arlington_crime.csv')