# Web scraping using pandas

pandas provides a `read_html` function that makes it easy to parse tables from web pages. As an example, we will use the Wikipedia page https://en.wikipedia.org/wiki/List_of_ICAO_aircraft_type_designators#cite_note-ICAOcode-2 to get the IATA and ICAO codes for aircraft.

In [1]:
import pandas as pd

In [2]:
# Download the Wikipedia page and parse the HTML tables it contains.
out = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_ICAO_aircraft_type_designators#cite_note-ICAOcode-2"
)

This returns a list of all tables found in the page:

In [3]:
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(type(out))

<type 'list'>


Each item in this list is a dataframe:

In [4]:
# Preview the first five rows of the first table parsed from the page.
out[0].head(n=5)

Unnamed: 0,0,1,2
0,ICAO Code[2],IATA Code,Model
1,A124,A4F,Antonov AN-124 Ruslan
2,A140,A40,Antonov AN-140
3,A148,A81,Antonov An-148
4,A158,A58,Antonov An-158


We can see that the first row holds the column labels; we can fix this easily:

In [5]:
# The row labelled 0 holds the header text rather than data: discard it,
# then give the three columns explicit names.
header_row_label = 0
df = out[0].drop(header_row_label, axis=0)
df.columns = ["ICAO", "IATA", "Fullname"]

In [6]:
# Preview the first five rows now that the columns are named.
df.head(n=5)

Unnamed: 0,ICAO,IATA,Fullname
1,A124,A4F,Antonov AN-124 Ruslan
2,A140,A40,Antonov AN-140
3,A148,A81,Antonov An-148
4,A158,A58,Antonov An-158
5,A19N,319,Airbus A319neo


We can clean the dataset by removing duplicates (if any):

In [7]:
# Keep the first occurrence of every fully identical row.
df = df.drop_duplicates()

Split manufacturer from model:

In [8]:
# Full names that no positional rule handles: Fullname -> (Manufacturer, Model).
_SPECIAL_FULLNAMES = {
    "Launch - Boat": (u"Launch", u"Boat"),
    "Road Feeder Service - Cargo Truck": (u"Road Feeder Service",
                                          u"Cargo Truck"),
    "Concorde": (u"Aerospatiale", u"Concorde"),
    "EV-97 EuroStar / EV-97 Harmony": (u"Evektor-Aerotechnik",
                                       u"EV-97 EuroStar / EV-97 Harmony"),
    "Evektor SportStar": (u"Evektor-Aerotechnik", u"SportStar"),
    "Gulfstream Aerospace G-159 Gulfstream I": (u"Grumman",
                                                u"G-159 Gulfstream I"),
}

# A fourth word from this set means the manufacturer is the first four words.
_FOURTH_WORD_MARKERS = frozenset(["Siddeley)"])

# A third word from this set means the manufacturer is the first three words.
_THIRD_WORD_MARKERS = frozenset([
    "Factories", "Aviation)", "IPTN", "Mystere)", "Canada", "Industries",
    "(BAC)",
])

# A second word from this set means the manufacturer is the first two words.
_SECOND_WORD_MARKERS = frozenset([
    "Industrie", "(Aero)", "(Aerospatiale)", "Aerospace", "(BAC)",
    "Britten-Norman", "Dornier", "Havilland", "Siddeley", "Douglas", "(MBB)",
    "Helicopters", "(Nord)", "(Swearingen)", "Yunshuji", "Ultralight",
    "Rocket",
])

# A first word from this set is, on its own, the manufacturer. The misspelt
# "Beechcfrat" is matched on purpose so that the row is still split.
_SINGLE_WORD_MANUFACTURERS = frozenset([
    "Fokker", "BAe", "Airbus", "Boeing", "Antonov", "Agusta", "Ayres", "Avro",
    "Aerospatiale/Alenia", "Beechcraft", "Bell", "Canadair", "Cessna",
    "Convair", "Curtiss", "Douglas", "Embraer", "Eurocopter", "Fairchild",
    "Grumman", "Helio", "Ilyushin", "Junkers", "Lockheed", "LET", "Gates",
    "MIL", "Mitsubishi", "Piper", "Pilatus", "Partenavia", "Saab", "Sikorsky",
    "Shorts", "Aerospatiale/BAC", "Sukhoi", "Tupolev", "Vickers", "Yakovlev",
    "NAMC", "Beechcfrat", "Aerospatiale", "ICON", "Aeroprakt", "Robin",
    "Bombardier", "Beriev", "COMAC", "CASA/IPTN", "Tecnam", "Dassault",
    "Gulfstream", "Honda", "Learjet", "Mil", "Pipistrel", "Reims-Cessna",
    "Socata",
])


def split(row):
    """Split a row's ``Fullname`` into ``Manufacturer`` and ``Model`` entries.

    The rules are tried in this order and the first match wins:

    1. an exact full name listed in ``_SPECIAL_FULLNAMES``;
    2. a multi-word name starting with "Pipistrel": the model is the second
       word with its trailing digits removed;
    3. a name of at most one word: manufacturer and model are both the
       full name;
    4. a known fourth, third or second word: the manufacturer is the first
       four, three or two words and the model is the rest;
    5. a known first word: that word is the manufacturer, the rest the model;
    6. anything else: manufacturer and model are both the full name.

    Parameters
    ----------
    row : pandas.Series
        A row with a string ``Fullname`` entry. It is not modified.

    Returns
    -------
    pandas.Series
        A copy of ``row`` without ``Fullname`` and with ``Manufacturer`` and
        ``Model`` appended.
    """
    series = row.copy()
    fullname = series["Fullname"]
    fields = fullname.split()
    count = len(fields)

    if fullname in _SPECIAL_FULLNAMES:
        manufacturer, model = _SPECIAL_FULLNAMES[fullname]
    elif count > 1 and fields[0] == "Pipistrel":
        manufacturer = u"Pipistrel"
        # The character set below contains all ten digits.
        model = fields[1].rstrip('01234569789')
    elif count <= 1:
        # Covers empty and whitespace-only names as well as single words,
        # so that none of the positional lookups below can fail.
        manufacturer = model = fullname
    elif count > 3 and fields[3] in _FOURTH_WORD_MARKERS:
        manufacturer = " ".join(fields[0:4])
        model = " ".join(fields[4:])
    elif count > 2 and fields[2] in _THIRD_WORD_MARKERS:
        manufacturer = u" ".join(fields[0:3])
        model = u" ".join(fields[3:])
    elif fields[1] in _SECOND_WORD_MARKERS:
        manufacturer = u" ".join(fields[0:2])
        model = u" ".join(fields[2:])
    elif fields[0] in _SINGLE_WORD_MANUFACTURERS:
        manufacturer = fields[0]
        model = u" ".join(fields[1:])
    else:
        manufacturer = model = fullname

    series["Manufacturer"] = manufacturer
    series["Model"] = model
    return series.drop(labels="Fullname")

In [9]:
# Run split on every row (axis=1) to build the manufacturer/model table.
df2 = df.apply(func=split, axis=1)

And now let's map manufacturer names:

In [10]:
# Collapse spelling variants, typos and qualified names onto one canonical
# manufacturer name. The result is assigned back to the column on purpose:
# calling replace(inplace=True) on the Series returned by df2.Manufacturer is
# a chained in-place update, which does not modify df2 when pandas
# Copy-on-Write is active.
df2["Manufacturer"] = df2["Manufacturer"].replace({
    u"BAe": u"British Aerospace",
    u"British Aerospace (BAC)": u"British Aerospace",
    u"Airbus Industrie": u"Airbus",
    u"Gulfstream/Rockwell (Aero)": u"Gulfstream/Rockwell",
    u"Eurocopter (Aerospatiale)": u"Eurocopter",
    u"Eurocopter (MBB)": u"Eurocopter",
    u"Beechcfrat": u"Beechcraft",
    u"Aerospatiale (Sud Aviation)": u"Aerospatiale",
    u"Aerospatiale (Nord)": u"Aerospatiale",
    u"Aerospatiale/Alenia": u"ATR",
    u"CASA / IPTN": u"CASA/IPTN",
    u"Dassault (Breguet Mystere)": u"Dassault",
    u"MIL": u"Mil",
    u"De Havilland Canada": u"De Havilland",
})

In [11]:
# List the distinct manufacturer names in sorted order to eyeball the mapping.
manuf = df2.Manufacturer.unique()
manuf.sort()
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(manuf)

[u'ATR' u'Aeroprakt' u'Aerospatiale' u'Airbus' u'Antonov' u'Avro'
 u'Beechcraft' u'Bell' u'Beriev' u'Boeing' u'Bombardier'
 u'British Aerospace' u'CASA/IPTN' u'COMAC' u'Canadair' u'Cessna'
 u'Convair' u'Curtiss' u'Dassault' u'De Havilland' u'Douglas' u'Embraer'
 u'Eurocopter' u'Evektor-Aerotechnik' u'Fairchild Dornier' u'Fokker'
 u'Government Aircraft Factories' u'Grumman' u'Gulfstream'
 u'Gulfstream/Rockwell' u'Harbin Yunshuji' u'Hawker Siddeley' u'Honda'
 u'ICON' u'Ilyushin' u'Israel Aircraft Industries' u'Junkers' u'LET'
 u'Learjet' u'Lockheed' u'MD Helicopters' u'McDonnell Douglas' u'Mil'
 u'Mitsubishi' u'NAMC' u'Partenavia' u'Pilatus' u'Pilatus Britten-Norman'
 u'Piper' u'Pipistrel' u'Reims-Cessna' u'Robin' u'Saab' u'Shorts'
 u'Sikorsky' u'Socata' u'Sukhoi' u'TL Ultralight' u'Team Rocket' u'Tecnam'
 u'Tupolev' u'Yakovlev']


In [12]:
# List the distinct model names in sorted order to eyeball the split.
models = df2.Model.unique()
models.sort()
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(models)

[u'100' u'1124 Westwind'
 u'125 series / Hawker/Raytheon 700/800/800XP/850/900'
 u'125-1000 series / Hawker/Raytheon 1000' u'146-100' u'146-200'
 u'146-300' u'152' u'162' u'170' u'172' u'172 Cutlass RG'
 u'175 (long wing)' u'175 (short wing)' u'177 Cardinal RG' u'182 Skylane'
 u'190' u'1900' u'195' u'2000' u'208 Caravan' u'210 Centurion' u'212'
 u'212 Aviocar' u'262' u'328JET' u'35 / 36 / C-21A' u'410' u'412' u'415'
 u'429' u'50' u'60' u'70' u'707' u'717' u'720B' u'727-100' u'727-200'
 u'737 MAX 10' u'737 MAX 7' u'737 MAX 8' u'737 MAX 9' u'737-200'
 u'737-300' u'737-400' u'737-500' u'737-600' u'737-700' u'737-800'
 u'737-900' u'747 LCF Dreamlifter' u'747-100' u'747-200' u'747-300'
 u'747-400' u'747-8' u'747SP' u'747SR' u'757-200' u'757-300' u'767-200'
 u'767-300' u'767-400' u'777-200 / Boeing 777-200ER'
 u'777-200LR / Boeing 777F' u'777-300' u'777-300ER' u'777-8' u'777-9'
 u'787-10' u'787-8' u'787-9' u'A-22 Foxbat / A-22 Valor / A-22 Vision'
 u'A300' u'A300-600' u'A300-600ST Beluga Fre

Now save the dataset for later reuse:

In [13]:
# Write the cleaned table to disk as UTF-8 encoded CSV.
df2.to_csv(path_or_buf="../data/aircrafts_codes.csv", encoding="utf-8")