# Scraping Wikipedia Pages To Collect Airport Information

1. Scrape basic airport information such as Name, Location, IATA code, IACO code.
2. Going to the links provided for each airport and scraping the latitude and longitude information
3. After data for airports with IATA codes beginning with a letter has been collected, we store it into the CSV file. This avoids a large amount of data accumulating in the memory.
4. Ideally, we can also do this 5 or 6 letters at a time.

In [1]:
# Installing the required packages
!pip install -q -r requirements.txt

In [2]:
# To be executed only if working with Google Collab
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Importing required packages, libraries and functions
from pprint import pprint
from bs4 import BeautifulSoup
import urllib
import requests
import sys
import re
import numpy as np
from string import ascii_lowercase, ascii_uppercase
import lxml
import lxml.html as lh
import pandas as pd
from unicodedata import normalize
import robotexclusionrulesparser
from lat_lon_parser import parse as l_parse
import re 
import csv
import time
import os
from pathlib import Path

In [4]:
# Initializing the base URL for Wikipedia
# And the URL string for alphabetical scraping of airports based on IATA codes
wiki_start = "https://en.wikipedia.org"
alpha_template = "https://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:_{}"

In [5]:
# Web scraping restrictions on Wikipedia
resp = requests.get("{}/robots.txt".format(wiki_start))
robots = resp.text
rp = robotexclusionrulesparser.RobotFileParserLookalike()

## parse the robots file
rp.parse(robots)

# If the response code == 429 we hit the rate limit. 
RATE_LIMIT = 429

In [6]:
# Html & nav helper functions

def add_airport(one_row):
    stupid_dict = {
        'Link': wiki_start + one_row[2].find('a').get('href'),
        'Name': one_row[2].text,
        'IATA': one_row[0].text,
        'ICAO': one_row[1].text, 'Location': one_row[3].text,
        'Latitude': None,
        'Longitude': None
    }
    return stupid_dict                             
                            
def safe_get_html(address):
    r = requests.get(address)
        
    return r.status_code, BeautifulSoup(r.text, 'html.parser')

def get_coor_str(html_soup):
    lat_str = html_soup.find('span','latitude').text
    long_str = html_soup.find('span','longitude').text

    return lat_str, long_str
    
def get_lat_long(html_soup):
    lat_str = html_soup.find('span','latitude').text if html_soup.find('span','latitude') else "0"
    long_str = html_soup.find('span','longitude').text if html_soup.find('span','longitude') else "0"
    lat_decimal = l_parse(lat_str)
    long_decimal = l_parse(long_str)
    return lat_decimal, long_decimal

In [7]:
# Method to collect info for airports with IATA code beginning with i
def collect_data(i):
    start = time.time()
    url = alpha_template.format("_"+i)
    response = requests.get(url)
    print(url, response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')

    table_data = soup.find('body').find('table').find('tbody')
    table_rows = table_data.find_all("tr")
    _ = table_rows.pop(0)

    # there are separators every few rows, to denote the first 2 characters of the airport iata code, ie -AA- , -AB- ect
    just_airports = [x for x in table_rows if(len(x.find_all('td'))>0)]
    air_list = [port.find_all('td') for port in just_airports]

    air_page_list  = []
    for one_row in air_list:
        air_page_list.append(add_airport(one_row))

    # now we loop over the list and navigate to the target page to collect the lat/long
    problem_pages = []
    for air_page in air_page_list:
        if "Airport" in air_page["Link"]:
            if air_page["Link"].find('redlink=1') == -1:
                pg_status, testSoup = safe_get_html(air_page["Link"])
                if testSoup:
                    air_page["Latitude"], air_page["Longitude"] = get_lat_long(testSoup)
                else:
                    # Setting aside airports with problematic links
                    problem_pages.append((air_page, pg_status))
                    continue
            else:
                # Setting aside airports with problematic links
                problem_pages.append((air_page, 404))
        else:
            # Setting aside airports with problematic links
            problem_pages.append((air_page, 404))
    print(f"Finished Scraping {len(air_page_list)} Pages, {len(problem_pages)} Pages Unreachable.")
    print(round(time.time() - start, 2), "s to process {}".format(i))
    return air_page_list, problem_pages

In [8]:
# For plugging in missing values from an external dataset
external_dataset_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
airport_df = pd.read_csv(external_dataset_url, header=None)
airport_df.drop(columns=[0], inplace=True)

In [9]:
# Driver Code For Controlled Web Scraping
csv_file = "airports.csv"
if os.path.isfile(csv_file):
    os.remove(csv_file)
Path(csv_file).touch()

# Looping over all alphabets
for i in ascii_uppercase:

    # Calling method to scraping the data
    airports, _ = collect_data(i)
    
    # Plugging in missing data
    count = 0
    for airport in airports:
        if airport["Latitude"] is None or airport["Longitude"] is None:
            iata = airport["IATA"]
            lat = airport_df[airport_df[4] == iata][6]
            lon = airport_df[airport_df[4] == iata][7]
            if len(np.array(lat)) != 0 and len(np.array(lon)) != 0:
                count += 1
                airport["Latitude"] = np.array(lat)[0]
                airport["Longitude"] = np.array(lon)[0]
    print("{} Missing Values Filled".format(count))
    # Writing the data to a file after every letter
    csv_columns = ["Link","Name","IATA","ICAO","Location","Latitude","Longitude"]
    dict_data = airports
    try:
        with open(csv_file, mode ='a', encoding="utf-8") as csvfile:
            print("Writing Collected Data To File...")
            print("Size of Data Written to File - {} bytes.\n".format(sys.getsizeof(dict_data)))
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns,lineterminator='\n') #delimiter = ',', lineterminator='\n'
            if os.stat(csv_file).st_size == 0: #check size of the file, if file already exists do not write headers when appending each IATA code of airport data 
                writer.writeheader()
            for data in dict_data:
                writer.writerow(data)
    except IOError:
        print("I/O error")

https://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:__A 200
Finished Scraping 527 Pages, 98 Pages Unreachable.
76.5 s to process A
24 Missing Values Filled
Writing Collected Data To File...
Size of Data Written to File - 4840 bytes.

https://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:__B 200
Finished Scraping 616 Pages, 130 Pages Unreachable.
144.29 s to process B
33 Missing Values Filled
Writing Collected Data To File...
Size of Data Written to File - 5488 bytes.

https://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:__C 200
Finished Scraping 545 Pages, 102 Pages Unreachable.
139.2 s to process C
22 Missing Values Filled
Writing Collected Data To File...
Size of Data Written to File - 4840 bytes.

https://en.wikipedia.org/wiki/List_of_airports_by_IATA_code:__D 200
Finished Scraping 302 Pages, 79 Pages Unreachable.
68.1 s to process D
19 Missing Values Filled
Writing Collected Data To File...
Size of Data Written to File - 2528 bytes.

https://en.wikipedia.org/wi

In [10]:
# Uncomment and run this code to download the csv file
# To your computer if you are working on Google Collab

# from google.colab import files
# files.download(csv_file)

In [2]:
# Some Optional Code To Load Data From
# Our Output File
import re 
import pandas as pd 
airportdata = pd.read_csv("airports.csv")
airportdata.isnull().sum()

Link            0
Name            0
IATA            0
ICAO         1347
Location        0
Latitude     1507
Longitude    1507
dtype: int64