<a href="https://colab.research.google.com/github/akbarriki/Scraping-Conferences/blob/main/Scraping_Conferences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialisation

In [54]:
import requests, time
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, date
from tqdm import tqdm

# Root of the site being scraped (kept with its trailing slash).
base_url = 'https://conferenceindex.org/'
# Listing URL prefix; the page number is appended to it when a page is requested.
dest_url = base_url + 'conferences?page='

def scraping(url=dest_url, page=1):
  """Scrape one listing page of conferences.

  Args:
    url: Listing URL prefix; ``str(page)`` is appended to it.
    page: Number of the listing page to fetch.

  Returns:
    A tuple ``(records, maxpage)``.
    ``records`` is a list of dicts with the keys ``conf_date`` (formatted
    ``dd-mm-YYYY``), ``conf_link``, ``conf_name``, ``conf_city`` and
    ``conf_country``; it is empty when the response status is not 200.
    ``maxpage`` is the page count read from the pagination links when
    ``page == 1`` and the request succeeded, otherwise ``None``.

  Raises:
    requests.exceptions.Timeout: if the server does not answer in time.
    ValueError: if a date or a "city, country" text does not have the
      expected shape.
  """
  records = []
  maxpage = None
  # A timeout keeps a stalled connection from hanging the notebook forever.
  resp = requests.get(url + str(page), timeout=30)

  if resp.status_code == 200:
    soup = BeautifulSoup(resp.content, 'html.parser')

    # Decide from the `page` argument, not from a notebook-level global, so the
    # function works no matter what the caller's variables are called.
    if page == 1:
      # get maximum number of pages to scrape: the second-to-last pagination link
      # (the last one is presumably the "next" arrow - TODO confirm against the markup)
      maxpage = int(soup.find_all("a", {"class": "page-link"})[-2].get_text().strip())

    # get month and year conferences will be held
    # NOTE(review): only the first card header is read; if a page can hold more
    # than one month, later entries would get the wrong month - confirm.
    header = soup.find("div", {"class": "card-header"}).get_text().strip()
    month, year = [entry.strip() for entry in header.split(",")]

    for li in soup.find("ul", {"class": "list-unstyled"}).find_all("li"):
      dct = {}
      anchor = li.find("a")

      # the last token of the text that opens the <li> is the day of the month
      # (named `day` so the imported `datetime.date` is not shadowed)
      day = li.next_element.strip().split()[-1]

      # get the exact conference date
      conf_date = datetime.strptime(day + '-' + month + '-' + year, "%d-%B-%Y")
      dct['conf_date'] = conf_date.strftime("%d-%m-%Y")

      # get the conference dedicated page at conferenceindex.org
      dct['conf_link'] = anchor['href']

      # get the conference name
      dct['conf_name'] = anchor.get_text().strip()

      # get the city and country each conference will be held: the text right
      # after the link, with hyphens removed, split on the comma
      location = anchor.next_sibling.replace("-", "").strip()
      dct['conf_city'], dct['conf_country'] = [entry.strip() for entry in location.split(",")]
      records.append(dct)
  return records, maxpage

### Scraping

In [58]:
conferences = [] # all conferences will be stored in this list
current_page = 1 # page currently being scraped
page_limit = 100 # scrape at most this many pages
max_page = page_limit # last page to scrape; lowered after page 1 if the site reports fewer pages
factor = 10 # the number of pages at which the scraping notification text appears. By default, the scraping notification appears at every 10 scraped pages

while current_page <= max_page:
  if current_page == 1 or current_page % factor == 1:
    # never announce a page beyond the last one that will be scraped
    last_factor_page = min((current_page + factor) - 1, max_page)
    print(f"Scraping Page {current_page} to {last_factor_page} ... ", end="")

  records, detected_max = scraping(page=current_page)

  if current_page == 1 and detected_max:
    # honour the page count reported by the site instead of discarding it, so
    # pages that do not exist are not requested; page_limit stays the ceiling
    max_page = min(detected_max, page_limit)

  if records:
    conferences.extend(records)

  # close the progress line at every `factor` pages and also on the final page,
  # which may not be a multiple of `factor`
  if current_page % factor == 0 or current_page == max_page:
    print("done")
  current_page += 1

print()
print(f"Scraping complete. {len(conferences)} conferences has been collected.")

Scraping Page 1 to 10 ... done
Scraping Page 11 to 20 ... done
Scraping Page 21 to 30 ... done
Scraping Page 31 to 40 ... done
Scraping Page 41 to 50 ... done
Scraping Page 51 to 60 ... done
Scraping Page 61 to 70 ... done
Scraping Page 71 to 80 ... done
Scraping Page 81 to 90 ... done
Scraping Page 91 to 100 ... done

Scraping complete. 49632 conferences has been collected.


### Sanity Check

In [61]:
# One row per scraped conference; columns come from the record dict keys.
df = pd.DataFrame(conferences)
# Show up to 10 random rows; capping at len(df) avoids the ValueError that
# sample() raises when asked for more rows than the frame holds.
df.sample(min(10, len(df)))

Unnamed: 0,conf_date,conf_link,conf_name,conf_city,conf_country
36761,06-03-2023,https://conferenceindex.org/event/internationa...,International Conference on Pedagogy and Psych...,Rome,Italy
1056,06-02-2023,https://conferenceindex.org/event/internationa...,"International Conference on Mycology, Fungi an...",Kuala Lumpur,Malaysia
22244,20-02-2023,https://conferenceindex.org/event/internationa...,International Conference on Aeronautics and Ae...,Rome,Italy
18613,18-02-2023,https://conferenceindex.org/event/internationa...,International Conference on Computer Security ...,Rome,Italy
48200,22-03-2023,https://conferenceindex.org/event/internationa...,International Conference on Traffic Safety Stu...,Istanbul,Turkey
30478,25-02-2023,https://conferenceindex.org/event/internationa...,International Conference on Information Visual...,Sydney,Australia
6068,11-02-2023,https://conferenceindex.org/event/internationa...,International Conference on Labor Economics an...,Barcelona,Spain
32569,04-03-2023,https://conferenceindex.org/event/internationa...,International Conference on Environmental Poll...,Rio de Janeiro,Brazil
24119,22-02-2023,https://conferenceindex.org/event/internationa...,International Conference on Behaviour and Beha...,Paris,France
22057,20-02-2023,https://conferenceindex.org/event/internationa...,International Conference on Dementia and Alzhe...,Buenos Aires,Argentina


### Export to CSV

In [63]:
# Write the collected conferences to disk without the index column.
# Fields are pipe-separated (conference names in the sample above contain
# commas, which is presumably why "|" was chosen - confirm with consumers).
df.to_csv(
  "conferences.csv",
  sep="|",
  index=False,
)