# I. Import Needed Libraries

Selenium: controls a web browser to submit searches and locate web elements by XPath

re: regular expression for pattern matching

requests: makes HTTP requests

BeautifulSoup: parses HTML

pandas: data analysis library to manage dataframes


In [285]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import re, requests
from bs4 import BeautifulSoup
import pandas as pd

# II. Scrape from the GovLoop website

In [121]:
# Download the GovLoop "best government conferences 2018" article and parse it.
url = r'https://www.govloop.com/community/blog/best-government-conferences-2018/'
# Without a timeout, requests waits indefinitely on a stalled server.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

In [138]:
# Conference names: the <strong> text of every <p> that contains a link, taken from the
# page's paragraphs after dropping the first 6 and the last 27. Newlines are removed and
# non-breaking spaces become ordinary spaces.
conferences = [
    re.sub(r'\xa0', ' ', re.sub(r'\n', '', para.find('strong').text))
    for para in soup.find_all('p')[6:-27]
    if para.find('a') is not None
]
conferences

['Association of Government Accountants (AGA) Financial Systems Summit (FSS)',
 'National Council for Science and the Environment (NCSE) National Conference and Global Forum',
 'New Partners For Smart Growth Conference',
 'AcademyHealth National Health Policy Conference (NHPC)',
 'Federal Networks Conference',
 'Public Sector CIO Academy: California',
 'Association of Government Accountants (AGA) National Leadership Training (NLT)',
 'Climate Leadership Conference',
 'International Wireless Communications Expo (IWCE)',
 'American Society for Public Administration (ASPA) Annual Conference',
 'South by Southwest (SXSW)',
 'Federal Managers Association (FMA) National Convention and Management Training Seminar',
 'Esri Federal GIS Conference',
 "Digital Government Institute's (DGI) Discovery, Records & Information Management Conference",
 'Smart Cities Connect Conference and Expo',
 'Transforming Local Government Conference',
 'National Community Reinvestment Coalition (NCRC) Annual Confer

In [140]:
# Info lines: the <em> text of the same linked paragraphs used for the conference names.
# Newlines are removed and non-breaking spaces become ordinary spaces.
info = [
    re.sub(r'\xa0', ' ', re.sub(r'\n', '', para.find('em').text))
    for para in soup.find_all('p')[6:-27]
    if para.find('a') is not None
]
info

['January 17, 2018 | Washington, D.C. or virtual | #FSS2018',
 'January 23 – 24, 2018 | Washington, D.C. | #NCSE2018',
 'February 1 – 3, 2018 | San Francisco, CA | #npsg',
 'February 5 – 6, 2018 | Washington, D.C. | #NHPC18',
 'February 12 – 13, 2018 | Washington, D.C.',
 'February 27 – 28, 2018 | Sacramento, CA | #govtechlive',
 'February 27 – 28, 2018 | Washington, D.C. or virtual',
 'February 28 – March 2, 2018 | Denver, CO | #TheCLC',
 'March 5 – 9, 2018 | Orlando, FL | #IWCE2018',
 'March 9 – 13, 2018 | Denver, CO | #ASPA2018',
 'March 9 – 18, 2018 | Austin, TX | #SXSW',
 'March 11 – 14, 2018 | Alexandria, VA',
 'March 20 – 21, 2018 | Washington, D.C. | #FedGIS',
 'March 22, 2018 | Washington, D.C.',
 'March 26 – 29, 2018 | Kansas City, MO | #smartcitiesconnect',
 'April 3 – 6, 2018 | Tacoma, WA | #TLG2018',
 'April 9 – 11, 2018 | Washington, D.C. | #JustEconomy',
 'April 11 – 13, 2018 | New Orleans, LA | #18NTC',
 'April 10 – 13, 2018 | San Antonio, TX | #AIIM18',
 'April 15 – 19

In [157]:
# Split every "date | location | #hashtag" info line once and trim the padding around
# each field. The hashtag is optional, so a line may have only two fields.
info_fields = [[field.strip() for field in item.split('|')] for item in info]

date = [fields[0] for fields in info_fields]
# A two-field line ("date | location") still carries a location, so the second field is
# only skipped when it is missing or is really the hashtag.
location = [fields[1] if len(fields) > 1 and not fields[1].startswith('#') else None
            for fields in info_fields]
# The hashtag, when present, is always the last field.
hashtag = [fields[-1] if fields[-1].startswith('#') else None for fields in info_fields]

In [158]:
def _paragraph_description(para):
    """Return the description text that follows the <em> info line inside *para*.

    The paragraph text and the <em> text are normalised the same way (newlines
    removed, non-breaking spaces turned into spaces), then everything after the
    info line is kept. Splitting on the last '|' of the whole paragraph instead
    would prepend the location to the description whenever the info line has no
    hashtag.

    If the paragraph has no usable <em> text, fall back to the last-'|' heuristic
    and drop the first '#hashtag' token from what remains.
    """
    def _flatten(text):
        return re.sub(r'\xa0', ' ', re.sub(r'\n', '', text))

    full_text = _flatten(para.text)
    info_tag = para.find('em')
    info_line = _flatten(info_tag.text) if info_tag is not None else ''

    if info_line and info_line in full_text:
        return full_text.partition(info_line)[2].strip()

    tail = full_text.split('|')[-1].strip()
    return re.sub(r'#\w*(.*)', r'\1', tail).strip()


# One description per linked paragraph, using the same paragraph slice and link filter as
# the conference-name and info lists so the results stay aligned.
descrip = [_paragraph_description(item) for item in soup.find_all('p')[6:-27]
           if item.find('a') is not None]

In [159]:
# Sanity check: all five GovLoop lists must have the same length before building the table.
len({len(column) for column in (conferences, date, location, hashtag, descrip)}) == 1

True

In [160]:
# Assemble the GovLoop results into one table, one column per scraped list.
df = pd.DataFrame(dict(
    Conferences=conferences,
    Date=date,
    Location=location,
    Hashtag=hashtag,
    Description=descrip,
))

In [162]:
# Output folder and workbook name (Windows path, joined with a backslash below).
dest, filename = r'C:\Users\Thaunga\Scripts\05. BD\Conferences', r'2018Conferences.xlsx'
# Write the GovLoop table to <dest>\<filename>.
df.to_excel(excel_writer=r'{}\{}'.format(dest, filename))

# III. Scrape from GovEvents

In [292]:
# Accumulators for every GovEvents listing across all requested result pages.
titles = []
descriptions = []
dates = []
organizers = []
locations = []

# Patterns applied to each flattened "date" block; compiled once, outside the page loop.
_date_re = re.compile(r'(.*?)\s\s+')
_organizer_re = re.compile(r'Organizer:(.*?)Location')
_location_re = re.compile(r'Location:(.*)')


def _first_group(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or None if no match."""
    match = pattern.search(text)
    return match.group(1).strip() if match is not None else None


# NOTE(review): pages are requested as page=0..9; whether page=0 and page=1 return the
# same listings cannot be told from here. Confirm against the site.
for i in range(10):
    # Per-page names carry a "page_" prefix so this loop does not overwrite the
    # `soup`, `date` and `location` variables created in the GovLoop section.
    page_url = r'https://www.govevents.com/listings.php?q=conference&agency=18&page={}'.format(i)
    # A timeout keeps one stalled request from hanging the whole scrape.
    page_response = requests.get(page_url, timeout=30)
    # Stop on 4xx/5xx rather than parsing an error page.
    page_response.raise_for_status()
    page_soup = BeautifulSoup(page_response.text, 'lxml')

    # get title of conference: the `title` attribute of every <a class="pt-mbli">
    titles += [anchor.get('title')
               for anchor in page_soup.find_all('a', class_='pt-mbli')
               if anchor.get('title') is not None]

    # get description of conferences
    descriptions += [block.text
                     for block in page_soup.find_all('div', class_='short-description')]

    # get date section info (includes date, organizer, location) as one flat line each
    page_date_info = [re.sub(r'(\r|\n)', '', block.text).strip()
                      for block in page_soup.find_all('div', class_='date')]

    # get dates: the text before the first run of two or more whitespace characters.
    # None when no such run exists, matching how organizers and locations are handled.
    dates += [_first_group(_date_re, item) for item in page_date_info]

    # get organizers: the text between "Organizer:" and "Location"
    organizers += [_first_group(_organizer_re, item) for item in page_date_info]

    # get location: everything after "Location:"
    locations += [_first_group(_location_re, item) for item in page_date_info]

    

In [293]:
# Sanity check: all five GovEvents lists must have the same length before building the table.
len({len(column) for column in (titles, descriptions, dates, organizers, locations)}) == 1

True

In [295]:
# Assemble the GovEvents results into one table by pairing each column name with its list.
df2 = pd.DataFrame(dict(zip(
    ('Conferences', 'Date', 'Description', 'Location', 'Organizers'),
    (titles, dates, descriptions, locations, organizers),
)))

In [297]:
# Preview the first five GovEvents rows.
df2.head(n=5)

Unnamed: 0,Conferences,Date,Description,Location,Organizers
0,Continuity of Operations (COOP) and Emergency ...,"February 6-7, 2018",With the latest series of natural disasters in...,"Washington, DC","Potomac Forum, Ltd"
1,HIPAA Compliance Officer Training | HIPAA Prof...,"February 8-9, 2018",Overview: This lesson will be addressing how p...,"Washington, DC",GlobalCompliancePanel
2,Global Waste Management Symposium 2018,"February 11-14, 2018",Global Waste Management Symposium (GWMS) is No...,"Indian Wells, CA",Informa
3,6th Automated ISR & Battle Management Symposium,"February 13-14, 2018",The 6th Annual Automated ISR and Battle Manage...,"Alexandria, VA",Defense Strategies Institute
4,Lifting the Curtain with Reverse Industry Day,"February 21, 2018",The American Council for Technology and Indust...,"Washington, DC",ACT-IAC


# IV. Concatenate data scraped from different websites

In [299]:
# Stack the GovLoop rows on top of the GovEvents rows and renumber the index from 0.
final_df = pd.concat(objs=[df, df2], ignore_index=True)

In [300]:
# Write the combined table to <dest>\<filename>. The format string is raw so the backslash
# separator is literal and not read as the invalid escape sequence "\{".
final_df.to_excel(r'{}\{}'.format(dest, filename))