# Import the airlines ICAO dataset from wikipedia

To import data from a web page we need two main libraries:
- urllib, which allows us to request the HTML source code
- BeautifulSoup, which is a useful HTML parser.

In [None]:
from bs4 import BeautifulSoup
import numpy as np
from datetime import datetime
import time
import urllib.request

#### Request the html code and pass it to Beautiful soup

In [None]:
# Download the Wikipedia page. The context manager closes the connection
# even when reading the response raises.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_airline_codes") as fp:
    mybytes = fp.read()

html = mybytes.decode("utf8")
# Name the parser explicitly: without it BeautifulSoup emits a warning and
# picks whichever parser happens to be installed, so the resulting tree can
# differ between machines. "html.parser" ships with Python itself.
soup = BeautifulSoup(html, "html.parser")

### Read data
Find the table element in the HTML and then find all the rows (tr) inside it.
The data are organized in this way:
- the first row is the header, so we keep it in a separate list to use later
- all the remaining rows are data.

In [None]:
# Scrape the first <table> of the page into `header` (column titles) and
# `data` (one row per airline, five columns each).
table = soup.find("table", recursive=True)
if table is None:
    raise ValueError("No <table> element found in the downloaded page")
# Not used in the cells shown here; kept so the name stays defined.
caption = table.find("caption", recursive=True)

# Order of the cells in every row:
#   IATA, ICAO, Airline, Call sign, Country/Region, Comments
# The comments column is never used, so only the first five cells are kept.

tbody = table.find("tbody", recursive=True)
if tbody is None:
    # Some parsers/pages expose the rows directly under <table>.
    tbody = table
trs = tbody.find_all("tr", recursive=True)
print("Found "+str(len(trs))+" rows in the table")

header = []
records = []
for position, tr in enumerate(trs):
    if position == 0:
        # The first row holds the column titles.
        header = [
            th.getText().replace("\n", "")
            for th in tr.find_all("th", recursive=True)
        ]
        continue
    tds = tr.find_all("td", recursive=True)
    if len(tds) < 5:
        # A row with fewer than five cells marks the end of the usable data.
        break
    # Commas become semicolons so the values cannot break the CSV export.
    records.append(
        [td.getText().replace("\n", "").replace(",", ";") for td in tds[:5]]
    )

# Build the array once instead of growing it row by row (each np.append
# copies the whole array, which made the original loop quadratic).
rows = len(records)
data = np.array(records, dtype=str).reshape(rows, 5) if records else np.empty(0)
header = header[0:5]

### Other Codes
After the table there are many other codes related to airline companies, so we also look at them.<br>
Since all of this information is inside paragraphs, we use a textual pattern to retrieve it. The pattern is:<br>
|-<br>
| IATA CODE<br>
| ICAO CODE<br>
| Airline<br>
| Callsign<br>
| Country/Region<br>
| Comment

In [None]:
# Collect the extra airline records that appear as wiki-table markup
# ("|-" separated entries with "|" separated fields) inside <p> elements.
p = soup.find_all("p")
extra = []
for paragraph in p:
    text = paragraph.getText()
    if "|" not in text:
        continue
    for entry in text.split("|-"):
        # Commas are replaced by spaces so the values cannot break the CSV.
        fields = entry.replace("\n", "").replace(",", " ").split("|")
        # A complete entry splits into the text before the first "|" plus
        # six columns; keep the five columns before the trailing comment.
        if len(fields) == 7:
            extra.append(fields[1:6])

# Append everything in one step instead of reallocating `data` per record.
if extra:
    rows += len(extra)
    data = np.append(data, extra).reshape(rows, 5)

Export the data into a CSV file

In [None]:
# Write the collected rows to disk as comma-separated text.
head = ','.join(header)
# encoding='utf-8': savetxt defaults to latin1 when given a file name, which
# raises UnicodeEncodeError for names containing characters outside Latin-1.
# A single '%s' format is applied to every column.
# NOTE: savetxt prefixes the header line with '# ' (its default `comments`).
np.savetxt('airtrafficDB/data/airline.csv', data, header=head, fmt='%s', delimiter=',', encoding='utf-8')