# First I will scrape from just one page to see how that works

The page I will scrape from has URL: https://www.guide2research.com/conference/asp-dac-2022-asia-and-south-pacific-design-automation-conference

In [23]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import json 
import re

In [83]:
url = "https://www.guide2research.com/conference/aivr-2021-ieee-international-conference-on-artificial-intelligence-and-virtual-reality"
r = requests.get(url, headers={'user-agent': 'Yayi Feng (yf7qq@virginia.edu) for Data Science 6001 at University of Virginia'})
r

<Response [200]>

In [84]:
aspconf = BeautifulSoup(r.text)

In [85]:
aspconf.find_all('p')

[<p>Research in Virtual Reality (VR) is concerned with computing technologies that allow humans to see, hear, talk, think, learn, and solve problems in virtual and augmented environments. Research in Artificial Intelligence (AI) addresses technologies that allow computing machines to mimic these same human abilities. Although these two fields evolved separately, they share an interest in human senses, skills, and knowledge production. Thus, bringing them together will enable us to create more natural and realistic virtual worlds and develop better, more effective applications. Ultimately, this will lead to a future in which humans and humans, humans and machines, and machines and machines are interacting naturally in virtual worlds, with use cases and benefits we are only just beginning to imagine.<br/>
 IEEE International Conference on Artificial Intelligence and Virtual Reality (AIVR) is a unique event, addressing researchers and industries from all areas of AI as well as Virtual, Au

## Extract feature: conference name

In [86]:
nametag = aspconf.title
name = nametag.text.split(' «')[0]
name

'AIVR 2021 : IEEE International Conference on Artificial Intelligence and Virtual Reality'

## Extract feature: submission deadline

In [87]:
submissiontag = aspconf.find("td", text=re.compile('Submission Deadline'))
submissiontag = submissiontag.find_next()
submit = submissiontag.text.replace("\n","")
submit

'Thursday 15 Jul 2021'

## Extract feature: conference date

In [88]:
datetag = aspconf.find("td", text=re.compile('Conference Date'))
datetag = datetag.find_next()
date = datetag.text.replace("\n","")
date = date.replace("\t","")
date

'Nov 15, 2021 - Nov 17, 2021'

## Extract feature: conference location

In [89]:
loctag = aspconf.find("td", text=re.compile('Conference Address'))
loctag = loctag.find_next()
loc = loctag.text.strip()
loc

'Taichung, Taiwan'

## Extract feature: conference and submission link

In [90]:
linktag = aspconf.find("td", text=re.compile('Conference & Submission Link'))
linktag = linktag.find_next()
link = linktag.text.strip()
link

'http://www.ieee-aivr.org/'

In [91]:
table = aspconf.find_all('table')
df = pd.read_html(str(table))[0]
df

Unnamed: 0,0,1,2
0,Submission Deadline,Thursday 15 Jul 2021,Proceedings indexed by :
1,Conference Dates,"Nov 15, 2021 - Nov 17, 2021",Proceedings indexed by :
2,Conference Address,"Taichung, Taiwan",Proceedings indexed by :
3,Conference & Submission Link,http://www.ieee-aivr.org/,http://www.ieee-aivr.org/


In [125]:
table = aspconf.find_all('table')
df1 = pd.read_html(str(table[1]))[1]
df1.index = df1.iloc[0:,0]
df1 = df1.drop(columns=0)
df1 = df1.to_dict()
df1

{1: {'Impact Score': 0.82,
  '#Contributing Top Scientists': 8.0,
  '#Papers published by Top Scientists': 10.0,
  'Google Scholar H5-index': 4.0}}

In [124]:
table = aspconf.find_all('table')
df2 = pd.read_html(str(table))[2]
df2.index = df2.iloc[0:,0]
df2 = df2.drop(columns=0).to_dict()
df2

{1: {'Impact Score': 0.82,
  '#Contributing Top Scientists': 8.0,
  '#Papers published by Top Scientists': 10.0,
  'Google Scholar H5-index': 4.0}}

In [62]:
subjecttags = aspconf.find_all("i")
subjects = [x.a for x in subjecttags if x.a is not None]
subs = [i.text for i in subjects[2:]]
subs

['Hardware, Robotics & Electronics',
 'Signal Processing',
 'Software Engineering & Programming']

## Bringing all to one dictionary

In [112]:
datadict = {'conference name': name,
           'submission date' : submit,
           'conference date': date,
           'conference location': loc,
           'conference and submission link': link,
           'impact score': df1[1].get('impact score'),
           'contributing top scientists': df1[1].get('#Contributing Top Scientists'),
           'papers published by top scientists': df1[1].get('#Papers published by Top Scientists'),
           'Google scholar H5-index': df1[1].get('Google Scholar H5-index'),
           'Number of editions': df1[1].get('Number of Editions'),
           'subjects': subs}
datadict

{'conference name': 'AIVR 2021 : IEEE International Conference on Artificial Intelligence and Virtual Reality',
 'submission date': 'Thursday 15 Jul 2021',
 'conference date': 'Nov 15, 2021 - Nov 17, 2021',
 'conference location': 'Taichung, Taiwan',
 'conference and submission link': 'http://www.ieee-aivr.org/',
 'impact score': None,
 'contributing top scientists': 8.0,
 'papers published by top scientists': 10.0,
 'Google scholar H5-index': 4.0,
 'Number of editions': None,
 'subjects': ['Hardware, Robotics & Electronics',
  'Signal Processing',
  'Software Engineering & Programming']}

## Put all the code to scrape one page into a function:
input: URL
Output: dictionary with the data we want

In [137]:
def scrape_one_conference(url):
    
    print('Now scraping from' + url)
    
    r = requests.get(url, headers={'user-agent': 'Yayi Feng (yf7qq@virginia.edu) for Data Science 6001 at University of Virginia'})
    aspconf = BeautifulSoup(r.text)
    
    nametag = aspconf.title
    name = nametag.text.split(' «')[0]
    
    submissiontag = aspconf.find("td", text=re.compile('Submission Deadline'))
    submissiontag = submissiontag.find_next()
    submit = submissiontag.text.replace("\n","")
    
    datetag = aspconf.find("td", text=re.compile('Conference Date'))
    datetag = datetag.find_next()
    date = datetag.text.replace("\n","")
    date = date.replace("\t","")
    
    loctag = aspconf.find("td", text=re.compile('Conference Address'))
    loctag = loctag.find_next()
    loc = loctag.text.strip()
    
        
    linktag = aspconf.find("td", text=re.compile('Conference & Submission Link'))
    linktag = linktag.find_next()
    link = linktag.text.strip()
    
    table = aspconf.find_all('table')
    df2 = pd.read_html(str(table))[2]
    df2.index = df2.iloc[0:,0]
    df2 = df2.drop(columns=0).to_dict()
    
    subjecttags = aspconf.find_all("i")
    subjects = [x.a for x in subjecttags if x.a is not None]
    subs = [i.text for i in subjects[2:]]
    
    datadict = {'conference name': name,
           'submission date' : submit,
           'conference date': date,
           'conference location': loc,
           'conference and submission link': link,
           'impact score': df1[1].get('impact score'),
           'contributing top scientists': df1[1].get('#Contributing Top Scientists'),
           'papers published by top scientists': df1[1].get('#Papers published by Top Scientists'),
           'Google scholar H5-index': df1[1].get('Google Scholar H5-index'),
           'Number of editions': df1[1].get('Number of Editions'),
           'subjects': subs}
    return datadict

In [127]:
scrape_one_conference('https://www.guide2research.com/conference/sds-2021-international-conference-on-software-defined-systems')

{'conference name': 'SDS 2021 : International Conference on Software Defined Systems',
 'submission date': 'Thursday 15 Jul 2021',
 'conference date': 'Dec 6, 2021 - Dec 9, 2021',
 'conference location': 'Valencia, Spain',
 'conference and submission link': 'http://emergingtechnet.org/SDS2021/',
 'impact score': None,
 'contributing top scientists': 8.0,
 'papers published by top scientists': 10.0,
 'Google scholar H5-index': 4.0,
 'Number of editions': None,
 'subjects': ['Databases & Information Systems',
  'Software Engineering & Programming']}

# Next build the Spider:
First, collect the URLs of the pages we want to scrape

In [128]:
page1 = 'https://www.guide2research.com/conferences/page-1'
r = requests.get(page1, headers={'user-agent': 'Yayi Feng (yf7qq@virginia.edu) for Data Science 6001 at University of Virginia'})
page1html = BeautifulSoup(r.text)

In [129]:
confurls = page1html.find_all('h4')
confurls = [i.a['href'] for i in confurls]

In [130]:
scrape_one_conference(confurls[7])

{'conference name': 'ICERI 2021 : 14th International Conference of Education, Research and Innovation',
 'submission date': 'Thursday 15 Jul 2021',
 'conference date': 'Nov 8, 2021 - Nov 10, 2021',
 'conference location': 'Seville, Spain',
 'conference and submission link': 'http://iated.org/iceri',
 'impact score': None,
 'contributing top scientists': 8.0,
 'papers published by top scientists': 10.0,
 'Google scholar H5-index': 4.0,
 'Number of editions': None,
 'subjects': []}

In [132]:
conflist = [scrape_one_conference(i) for i in confurls]
pd.DataFrame.from_records(conflist).head()

Unnamed: 0,conference name,submission date,conference date,conference location,conference and submission link,impact score,contributing top scientists,papers published by top scientists,Google scholar H5-index,Number of editions,subjects
0,SDS 2021 : International Conference on Softwar...,Thursday 15 Jul 2021,"Dec 6, 2021 - Dec 9, 2021","Valencia, Spain",http://emergingtechnet.org/SDS2021/,,8.0,10.0,4.0,,"[Databases & Information Systems, Software Eng..."
1,AIVR 2021 : IEEE International Conference on A...,Thursday 15 Jul 2021,"Nov 15, 2021 - Nov 17, 2021","Taichung, Taiwan",http://www.ieee-aivr.org/,,8.0,10.0,4.0,,"[Graphics & Computer-Aided Design, Human Compu..."
2,5GWF 2021 : IEEE 5G World Forum,Thursday 15 Jul 2021,"Oct 13, 2021 - Oct 15, 2021","Montreal, Canada",https://ieee-wf-5g.org/,,8.0,10.0,4.0,,"[Networks and Communications, Signal Processing]"
3,APSIPA 2021 : Asia-Pacific Signal and Informat...,Thursday 15 Jul 2021,"Dec 14, 2021 - Dec 17, 2021","Tokyo, Japan",https://www.apsipa2021.org/,,8.0,10.0,4.0,,"[Databases & Information Systems, Signal Proce..."
4,ICSPCS 2021 : International Conference on Sign...,Thursday 15 Jul 2021,"Dec 13, 2021 - Dec 15, 2021","Online, Online",https://www.dspcs-witsp.com/icspcs_2021,,8.0,10.0,4.0,,"[Networks and Communications, Signal Processing]"


In [133]:
# collect urls from every page until runs out of pages

p = 1
newurls = 1
urls = []

In [135]:
while newurls > 0:
    print('Collecting URLs from page' + str(p))
    url = "https://www.guide2research.com/conferences/page-" + str(p)
    headers = {"user-agent": "yf7qq"}
    r = requests.get(url, headers=headers)
    
    pagehtml = BeautifulSoup(r.text, 'html.parser')
    
    confurls = pagehtml.find_all("h4")
    confurls=[i.a['href'] for i in confurls]
    
    urls = urls + confurls
    newurls = len(confurls)
    
    p += 1

Collecting URLs from page1
Collecting URLs from page2
Collecting URLs from page3
Collecting URLs from page4
Collecting URLs from page5
Collecting URLs from page6


In [138]:
[scrape_one_conference(i) for i in urls]

Now scraping fromhttps://www.guide2research.com/conference/sds-2021-international-conference-on-software-defined-systems
Now scraping fromhttps://www.guide2research.com/conference/aivr-2021-ieee-international-conference-on-artificial-intelligence-and-virtual-reality
Now scraping fromhttps://www.guide2research.com/conference/5gwf-2021-ieee-5g-world-forum
Now scraping fromhttps://www.guide2research.com/conference/apsipa-2021-asia-pacific-signal-and-information-processing-association-annual-summit-and-conference
Now scraping fromhttps://www.guide2research.com/conference/icspcs-2021-international-conference-on-signal-processing-and-communication-systems
Now scraping fromhttps://www.guide2research.com/conference/asicon-2021-ieee-international-conference-on-asic
Now scraping fromhttps://www.guide2research.com/conference/fmec-2021-international-conference-on-fog-and-mobile-edge-computing
Now scraping fromhttps://www.guide2research.com/conference/iceri-2021-14th-international-conference-of-edu

KeyboardInterrupt: 