# Web scraping using pandas

pandas provides a `read_html` function, which is convenient for parsing tables from web pages. As an example, we will use the Wikipedia page https://en.wikipedia.org/wiki/List_of_ICAO_aircraft_type_designators#cite_note-ICAOcode-2 to get the IATA and ICAO codes for aircraft.

In [1]:
import pandas as pd

In [2]:
# Download the page and parse every HTML table it contains.
out = pd.read_html(
    io="https://en.wikipedia.org/wiki/List_of_ICAO_aircraft_type_designators#cite_note-ICAOcode-2"
)

This returns a list of all tables found in the page:

In [5]:
# Show which container type read_html handed back.
print(type(out))

<class 'list'>


Each item in this list is a dataframe:

In [6]:
# Preview the first rows of the first table in the list.
out[0].head()

Unnamed: 0,0,1,2
0,ICAOCode[2],IATACode,Model
1,A124,A4F,Antonov AN-124 Ruslan
2,A140,A40,Antonov AN-140
3,A148,A81,Antonov An-148
4,A158,A58,Antonov An-158


We can see that the first row holds the column labels; we can fix this easily:

In [7]:
# The row labelled 0 only repeats the column titles: discard it, then give
# the three columns explicit names.
df = out[0].drop(index=0)
df.columns = ["ICAO", "IATA", "Fullname"]

In [8]:
# Preview the first rows of df.
df.head()

Unnamed: 0,ICAO,IATA,Fullname
1,A124,A4F,Antonov AN-124 Ruslan
2,A140,A40,Antonov AN-140
3,A148,A81,Antonov An-148
4,A158,A58,Antonov An-158
5,A19N,32D,Airbus A319neo


We can clean the dataset by removing duplicates (if any):

In [9]:
# Keep the first occurrence of every fully identical row.
df = df.drop_duplicates()

Split manufacturer from model:

In [10]:
# Full names that do not follow the "<manufacturer> <model>" layout, mapped to
# their hand-picked (manufacturer, model) pair.
_SPECIAL_NAMES = {
    "Launch - Boat": ("Launch", "Boat"),
    "Road Feeder Service - Cargo Truck": ("Road Feeder Service", "Cargo Truck"),
    "Concorde": ("Aerospatiale", "Concorde"),
    "EV-97 EuroStar / EV-97 Harmony": (
        "Evektor-Aerotechnik",
        "EV-97 EuroStar / EV-97 Harmony",
    ),
    "Evektor SportStar": ("Evektor-Aerotechnik", "SportStar"),
    "Gulfstream Aerospace G-159 Gulfstream I": ("Grumman", "G-159 Gulfstream I"),
}

# A fourth word from this set means the manufacturer spans four words.
_FOURTH_WORD_MARKERS = frozenset({"Siddeley)"})

# A third word from this set means the manufacturer spans three words.
_THIRD_WORD_MARKERS = frozenset({
    "Factories", "Aviation)", "IPTN", "Mystere)", "Canada", "Industries",
    "(BAC)",
})

# A second word from this set means the manufacturer spans two words.
_SECOND_WORD_MARKERS = frozenset({
    "Industrie", "(Aero)", "(Aerospatiale)", "Aerospace", "(BAC)",
    "Britten-Norman", "Dornier", "Havilland", "Siddeley", "Douglas", "(MBB)",
    "Helicopters", "(Nord)", "(Swearingen)", "Yunshuji", "Ultralight",
    "Rocket",
})

# Manufacturers whose name is exactly the first word of the full name.
_ONE_WORD_MANUFACTURERS = frozenset({
    "Fokker", "BAe", "Airbus", "Boeing", "Antonov", "Agusta", "Ayres", "Avro",
    "Aerospatiale/Alenia", "Beechcraft", "Bell", "Canadair", "Cessna",
    "Convair", "Curtiss", "Douglas", "Embraer", "Eurocopter", "Fairchild",
    "Grumman", "Helio", "Ilyushin", "Junkers", "Lockheed", "LET", "Gates",
    "MIL", "Mitsubishi", "Piper", "Pilatus", "Partenavia", "Saab", "Sikorsky",
    "Shorts", "Aerospatiale/BAC", "Sukhoi", "Tupolev", "Vickers", "Yakovlev",
    "NAMC", "Beechcfrat", "Aerospatiale", "ICON", "Aeroprakt", "Robin",
    "Bombardier", "Beriev", "COMAC", "CASA/IPTN", "Tecnam", "Dassault",
    "Gulfstream", "Honda", "Learjet", "Mil", "Pipistrel", "Reims-Cessna",
    "Socata",
})


def _split_fullname(fullname):
    """Return the ``(manufacturer, model)`` pair for one full aircraft name."""
    # A missing cell (NaN/None) cannot be tokenised: pass it through unchanged
    # instead of failing on ``.split()``.
    if not isinstance(fullname, str):
        return fullname, fullname

    if fullname in _SPECIAL_NAMES:
        return _SPECIAL_NAMES[fullname]

    fields = fullname.split()

    # Nothing to separate with zero or one word. This check must come before
    # the Pipistrel rule, which reads the second word.
    if len(fields) <= 1:
        return fullname, fullname

    if fields[0] == "Pipistrel":
        # Only the second word is kept, with any trailing digits removed.
        return "Pipistrel", fields[1].rstrip("0123456789")

    # Longest manufacturer prefix first, so a four-word match wins over a
    # three-word one, and so on.
    if len(fields) > 3 and fields[3] in _FOURTH_WORD_MARKERS:
        prefix_len = 4
    elif len(fields) > 2 and fields[2] in _THIRD_WORD_MARKERS:
        prefix_len = 3
    elif fields[1] in _SECOND_WORD_MARKERS:
        prefix_len = 2
    elif fields[0] in _ONE_WORD_MANUFACTURERS:
        prefix_len = 1
    else:
        # Unknown layout: keep the full name in both columns.
        return fullname, fullname

    return " ".join(fields[:prefix_len]), " ".join(fields[prefix_len:])


def split(row):
    """Replace a row's ``Fullname`` with ``Manufacturer`` and ``Model``.

    Parameters
    ----------
    row : pandas.Series
        A row holding a ``Fullname`` entry. It is not modified.

    Returns
    -------
    pandas.Series
        A copy of ``row`` without ``Fullname`` and with ``Manufacturer`` and
        ``Model`` appended. Names that match no rule, as well as empty,
        single-word or missing names, are copied unchanged into both entries.
    """
    series = row.copy()
    manufacturer, model = _split_fullname(series.Fullname)
    series["Manufacturer"] = manufacturer
    series["Model"] = model
    return series.drop(labels="Fullname")

In [11]:
# Run split() once per row; axis="columns" passes each row in as a Series.
df2 = df.apply(split, axis="columns")

And now let's map manufacturer names:

In [12]:
# Map the manufacturer spellings found in the table to one canonical name.
# The result is assigned back to the column rather than calling
# ``replace(..., inplace=True)`` on ``df2.Manufacturer``: an in-place method
# called on a column accessor may act on a temporary object, and with pandas
# Copy-on-Write it never reaches ``df2``.
df2["Manufacturer"] = df2["Manufacturer"].replace({
    "BAe": "British Aerospace",
    "British Aerospace (BAC)": "British Aerospace",
    "Airbus Industrie": "Airbus",
    "Gulfstream/Rockwell (Aero)": "Gulfstream/Rockwell",
    "Gulfstream/Rockwell": "Gulfstream/Rockwell",
    "Eurocopter (Aerospatiale)": "Eurocopter",
    "Eurocopter (MBB)": "Eurocopter",
    "Beechcfrat": "Beechcraft",
    "Aerospatiale (Sud Aviation)": "Aerospatiale",
    "Aerospatiale (Nord)": "Aerospatiale",
    "Aerospatiale/Alenia": "ATR",
    "CASA / IPTN": "CASA/IPTN",
    "Dassault (Breguet Mystere)": "Dassault",
    "MIL": "Mil",
    "De Havilland Canada": "De Havilland",
})

In [14]:
# Distinct manufacturer names, sorted in place, then printed.
manuf = df2["Manufacturer"].unique()
manuf.sort()
print(manuf)

['ATR' 'Aeroprakt' 'Aerospatiale' 'Airbus' 'Antonov' 'Avro' 'Beechcraft'
 'Bell' 'Beriev' 'Boeing' 'Bombardier' 'British Aerospace' 'CASA/IPTN'
 'COMAC' 'Canadair' 'Cessna' 'Convair' 'Curtiss' 'Dassault' 'De Havilland'
 'Douglas' 'Embraer' 'Eurocopter' 'Evektor-Aerotechnik'
 'Fairchild Dornier' 'Fokker' 'Government Aircraft Factories' 'Grumman'
 'Gulfstream' 'Gulfstream/Rockwell' 'Harbin Yunshuji' 'Hawker Siddeley'
 'Honda' 'ICON' 'Ilyushin' 'Israel Aircraft Industries' 'Junkers' 'LET'
 'Learjet' 'Lockheed' 'MD Helicopters' 'McDonnell Douglas' 'Mil'
 'Mitsubishi' 'NAMC' 'Partenavia' 'Pilatus' 'Pilatus Britten-Norman'
 'Piper' 'Pipistrel' 'Reims-Cessna' 'Robin' 'Saab' 'Shorts' 'Sikorsky'
 'Socata' 'Sukhoi' 'TL Ultralight' 'Team Rocket' 'Tecnam' 'Tupolev'
 'Yakovlev']


In [15]:
# Distinct model names, sorted in place, then printed.
models = df2["Model"].unique()
models.sort()
print(models)

['100' '1124 Westwind'
 '125 series / Hawker/Raytheon 700/800/800XP/850/900'
 '125-1000 series / Hawker/Raytheon 1000' '146-100' '146-200' '146-300'
 '152' '162' '170' '172' '172 Cutlass RG' '175 (long wing)'
 '175 (short wing)' '177 Cardinal RG' '182 Skylane' '190' '1900' '195'
 '2000' '208 Caravan' '210 Centurion' '212' '212 Aviocar' '262' '328JET'
 '35 / 36 / C-21A' '410' '412' '415' '429' '50' '60' '70' '707' '717'
 '720B' '727-100' '727-200' '737 MAX 10' '737 MAX 7' '737 MAX 8'
 '737 MAX 9' '737-100' '737-200' '737-300' '737-400' '737-500' '737-600'
 '737-700' '737-800' '737-900' '747 LCF Dreamlifter' '747-100' '747-200'
 '747-300' '747-400' '747-8' '747SP' '747SR' '757-200' '757-300' '767-200'
 '767-300' '767-400' '777-200 / Boeing 777-200ER'
 '777-200LR / Boeing 777F' '777-300' '777-300ER' '777-8' '777-9' '787-10'
 '787-8' '787-9' 'A-22 Foxbat / A-22 Valor / A-22 Vision' 'A220-100'
 'A220-300' 'A300-600' 'A300-600ST "Super Transporter" / "Beluga"'
 'A300B1' 'A300B2, A300B4, and 

Now save the dataset for later reuse:

In [16]:
# Write the cleaned table as UTF-8 CSV, leaving out the row index.
df2.to_csv(
    "../data/aircrafts_codes.csv",
    index=False,
    encoding='utf-8',
)