# Web scraping

## Import libraries

In [None]:
from pathlib import Path

import pandas as pd
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from functions.data_etl.scrapy.datacenters_com.spiders.datacenters_com import DataCenterSpider
from functions.data_etl.web_scraping import fetch_datacenter_com_master_list


In [2]:
# Output locations: every CSV of this notebook lives under one folder
DATA_PATH_ROOT = Path("data/outputs/0_webscraping/")

# Passed to the master-list fetch and read back in the merge step
BASIC_INFO_CSV_PATH = DATA_PATH_ROOT.joinpath("datacenters_com_basic_info.csv")
# Target of the Scrapy feed export (per-datacenter details)
DETAILED_INFO_CSV_PATH = DATA_PATH_ROOT.joinpath("datacenters_com_details.csv")
# Final dataset produced by merging the two files above
FULL_INFO_CSV_PATH = DATA_PATH_ROOT.joinpath("datacenters_com.csv")

In [None]:
# Fetch basic datacenter info.
# The helper is handed the basic-info CSV path; presumably it writes the
# master list to that file (the merge step below re-reads it from disk) —
# confirm against functions.data_etl.web_scraping.
basic_info = fetch_datacenter_com_master_list(BASIC_INFO_CSV_PATH)

In [None]:
# Load the Scrapy project settings and direct the feed export to the
# detailed-info CSV (absolute path, CSV format, replacing any previous file)
scrapy_settings = get_project_settings()
scrapy_settings.set(
    "FEEDS",
    {
        str(DETAILED_INFO_CSV_PATH.resolve()): {
            "format": "csv",
            "overwrite": True,
        },
    },
)

# Schedule the datacenter spider on a fresh crawler process and run it;
# start() blocks until the crawl has finished.
# NOTE(review): re-running this cell in the same kernel usually fails because
# the Twisted reactor cannot be restarted — restart the kernel first.
scrapy_process = CrawlerProcess(scrapy_settings)
scrapy_process.crawl(DataCenterSpider)
scrapy_process.start()

In [5]:
# Reload both scraped tables from disk so this cell can run on its own
basic_info = pd.read_csv(BASIC_INFO_CSV_PATH)
detailed_info = pd.read_csv(DETAILED_INFO_CSV_PATH)

# Join the two sources on the shared "url" column (merge's default inner join
# keeps only URLs present in both), then discard the join key itself
full_info = pd.merge(basic_info, detailed_info, on="url").drop(columns="url")

# Write the combined dataset without the DataFrame index
full_info.to_csv(FULL_INFO_CSV_PATH, index=False)