In [10]:
#1. Quiz: Carrier List

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task in this exercise is to modify 'extract_carriers()' to get a list of
all airlines. Exclude all of the combination values like "All U.S. Carriers"
from the data that you return. You should return a list of codes for the
carriers.

All your changes should be in the 'extract_carriers()' function. The
'options.html' file in the tab above is a stripped down version of what is
actually on the website, but should provide an example of what you should get
from the full file.

Please note that the function 'make_request()' is provided for your reference
only. You will not be able to actually use it from within the Udacity web UI.
"""

from bs4 import BeautifulSoup
import pprint

# Saved page that extract_carriers() reads when this cell is run.
html_page = "DataElements.htm"


def extract_carriers(page):
    """Return the two-character carrier codes found in a saved HTML page.

    The file at `page` is parsed with BeautifulSoup, the element whose id is
    "CarrierList" is located, and the `value` attribute of each <option>
    beneath it is read. Only values whose string form is exactly two
    characters long are kept; this length check is how combination entries
    (e.g. "All U.S. Carriers") are meant to be excluded.

    Returns a list of strings in document order.
    """
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "html.parser")
        options = soup.find(id='CarrierList').find_all('option')
        values = [str(opt['value']) for opt in options]

    return [code for code in values if len(code) == 2]


def make_request(data):
    """POST the carrier/airport form and return the response text.

    data: mapping with the keys "eventvalidation", "viewstate", "airport"
        and "carrier". It may also carry "viewstategenerator"; when it does,
        that value is sent as the __VIEWSTATEGENERATOR form field.

    NOTE(review): `s` is not defined anywhere in this notebook. It must be a
    module-level object with a `post` method (presumably a requests session --
    confirm) before this function can run.
    """
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]
    # This function used to read `viewstategenerator` as a bare name that
    # nothing here assigns. Prefer the value supplied in `data`; fall back to
    # the module-level name so callers that relied on a global keep working.
    try:
        generator = data["viewstategenerator"]
    except KeyError:
        generator = viewstategenerator

    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data = (("__EVENTTARGET", ""),
                       ("__EVENTARGUMENT", ""),
                       ("__VIEWSTATE", viewstate),
                       ("__VIEWSTATEGENERATOR", generator),
                       ("__EVENTVALIDATION", eventvalidation),
                       ("CarrierList", carrier),
                       ("AirportList", airport),
                       ("Submit", "Submit")))

    return r.text


def test():
    """Check extract_carriers() against the quiz's expected carrier list."""
    carriers = extract_carriers(html_page)
    assert len(carriers) == 16
    for expected_code in ("FL", "NK"):
        assert expected_code in carriers

if __name__ == "__main__":
    # test() is left disabled: it won't pass because the actual quiz uses
    # old data. Pretty-print the extracted codes instead.
    carrier_codes = extract_carriers(html_page)
    pprint.pprint(carrier_codes)

['AS',
 'G4',
 'AA',
 '5Y',
 'DL',
 'MQ',
 'EV',
 'F9',
 'HA',
 'B6',
 'OO',
 'WN',
 'NK',
 'UA',
 'VX']


In [17]:
#2. Quiz: Airport List

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Complete the 'extract_airports()' function so that it returns a list of airport
codes, excluding any combinations like "All".

Refer to the 'options.html' file in the tab above for a stripped down version
of what is actually on the website. The test() assertions are based on the
given file.
"""

from bs4 import BeautifulSoup
# Saved page that extract_airports() reads when this cell is run.
html_page = "DataElements.htm"


def extract_airports(page):
    """Return the airport codes found in a saved HTML page.

    The file at `page` is parsed with BeautifulSoup, the element whose id is
    "AirportList" is located, and the `value` attribute of each <option>
    beneath it is read. Any value containing the substring 'All' is dropped,
    which is how the combination entries are excluded.

    Returns a list of strings in document order.
    """
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "html.parser")
        options = soup.find(id='AirportList').find_all('option')
        values = [str(opt['value']) for opt in options]

    return [code for code in values if 'All' not in code]


def test():
    """Check extract_airports() against the quiz's expected airport list."""
    airports = extract_airports(html_page)
    assert len(airports) == 15
    for expected_code in ("ATL", "ABR"):
        assert expected_code in airports

if __name__ == "__main__":
    # test() is left disabled here; print the extracted codes instead.
    airport_codes = extract_airports(html_page)
    print(airport_codes)

['ATL', 'BWI', 'BOS', 'CLT', 'MDW', 'ORD', 'DAL', 'DFW', 'DEN', 'DTW', 'FLL', 'IAH', 'LAS', 'LAX', 'MIA', 'MSP', 'JFK', 'LGA', 'EWR', 'MCO', 'PHL', 'PHX', 'PDX', 'SLC', 'SAN', 'SFO', 'SEA', 'TPA', 'DCA', 'IAD', 'UXM', 'ABR', 'ABI', 'DYS', 'ADK', 'VZF', 'BQN', 'AKK', 'KKI', 'AKI', 'AKO', 'CAK', '7AK', 'KQA', 'AUK', 'ALM', 'ALS', 'ABY', 'ALB', 'ABQ', 'ZXB', 'WKK', 'AED', 'AEX', 'AXN', 'AET', 'ABE', 'AIA', 'APN', 'DQH', 'AOO', 'AMA', 'ABL', 'OQZ', 'AOS', 'OTS', 'AKP', 'EDF', 'DQL', 'MRI', 'ANC', 'AND', 'AGN', 'ANI', 'ANN', 'ANB', 'ANV', 'ATW', 'ACV', 'ARC', 'ADM', 'AVL', 'HTS', 'ASE', 'AST', 'AHN', 'AKB', 'PDK', 'FTY', 'ACY', 'ATT', 'ATK', 'MER', 'AUO', 'AGS', 'AUG', 'AUS', 'A28', 'BFL', 'BGR', 'BHB', 'BRW', 'BTI', 'BQV', 'A2K', 'BTR', 'BTL', 'AK2', 'A56', 'BTY', 'BPT', 'BVD', 'WBQ', 'BKW', 'BED', 'A11', 'KBE', 'BLV', 'BLI', 'BLM', 'JVL', 'BVU', 'BJI', 'RDM', 'BEH', 'BET', 'BTT', 'BVY', 'OQB', 'A50', 'BIC', 'BIG', 'BGQ', 'BMX', 'PWR', 'A85', 'BIL', 'BIX', 'BGM', 'KBC', 'BHM', 'BIS', 'BYW'

In [82]:
#3. Quiz: Processing All

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Let's assume that you combined the code from the previous 2 exercises with code
from the lesson on how to build requests, and downloaded all the data locally.
The files are in a directory "data", named after the carrier and airport:
"{}-{}.html".format(carrier, airport), for example "FL-ATL.html".

The table with flight info has a table class="dataTDRight". Your task is to
use 'process_file()' to extract the flight data from that table as a list of
dictionaries, each dictionary containing relevant data from the file and table
row. This is an example of the data structure you should return:

data = [{"courier": "FL",
         "airport": "ATL",
         "year": 2012,
         "month": 12,
         "flights": {"domestic": 100,
                     "international": 100}
        },
         {"courier": "..."}
]

Note - year, month, and the flight data should be integers.
You should skip the rows that contain the TOTAL data for a year.

There are a couple of helper functions to deal with the data files.
Please do not change them for grading purposes.
All your changes should be in the 'process_file()' function.

The 'data/FL-ATL.html' file in the tab above is only a part of the full data,
covering data through 2003. The test() code will be run on the full table, but
the given file should provide an example of what you will get.
"""
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

# Name shared by the zip archive ("<datadir>.zip", see open_zip) and the
# directory that process_all() lists.
datadir = "data"
# Single page handed to process_file() when this cell is run directly.
page = 'FL-ATL.htm'

def open_zip(datadir):
    """Extract every member of "<datadir>.zip" into the current directory."""
    archive_name = '{0}.zip'.format(datadir)
    with ZipFile(archive_name, 'r') as archive:
        archive.extractall()


def process_all(datadir):
    """Return the names of the entries found in the `datadir` directory."""
    return os.listdir(datadir)


def process_file(f):
    """
    Extract the flight rows from one downloaded carrier/airport page.

    The courier and airport codes come from the first six characters of the
    file's base name ("FL-ATL" in "FL-ATL.htm"), so `f` may be a bare file
    name or a path with a directory prefix such as "data/FL-ATL.html".

    Returns a list with one new dictionary per data row, shaped like:

    data = [{"courier": "FL",
             "airport": "ATL",
             "year": 2012,
             "month": 12,
             "flights": {"domestic": 100,
                         "international": 100}
            },
            {"courier": "..."}
    ]

    Note - year, month, and the flight data are integers. The
    "international" key is left out when that cell is blank.

    Rows are the <tr class="dataTDRight"> elements. Cells that contain a <b>
    tag are ignored, and a row with no remaining cells is skipped; this is
    how the rows holding the TOTAL data for a year are excluded.
    """
    # Slice the base name, not the raw argument, so a directory prefix does
    # not end up inside the six-character "CC-AAA" window.
    courier, airport = os.path.basename(f)[:6].split("-")

    with open(f, "r") as html:
        soup = BeautifulSoup(html, "html.parser")

    data = []
    for row in soup.find_all("tr", class_="dataTDRight"):
        cells = [td.text for td in row.find_all("td") if td.find("b") is None]
        if not cells:
            # Every cell was bold: nothing to record for this row.
            continue

        year = int(cells[0])
        month = int(cells[1])
        # Counts are written with thousands separators, e.g. "327,185".
        flights = {"domestic": int(cells[2].replace(",", ""))}
        if cells[3].strip():
            flights["international"] = int(cells[3].replace(",", ""))

        # A fresh dictionary per row, so entries never share state.
        data.append({"courier": courier,
                     "airport": airport,
                     "year": year,
                     "month": month,
                     "flights": flights})

    return data


def test():
    """Unzip the data, run process_file() on every listed file and check the result.

    print() is called with a single parenthesised argument so the cell parses
    under Python 3 and prints the same text under Python 2.
    """
    print("Running a simple test...")
    open_zip(datadir)
    files = process_all(datadir)
    data = []
    # Test will loop over three data files.
    for f in files:
        data += process_file(f)

    assert len(data) == 399  # Total number of rows
    for entry in data[:3]:
        assert isinstance(entry["year"], int)
        assert isinstance(entry["month"], int)
        assert isinstance(entry["flights"]["domestic"], int)
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[0]["courier"] == 'FL'
    assert data[0]["month"] == 10
    assert data[-1]["airport"] == "ATL"
    assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}

    print("... success!")

if __name__ == "__main__":
    # pprint is not among this cell's imports; import it here so the cell
    # runs on its own instead of depending on an earlier cell having run.
    import pprint

    # test() is left disabled; show the rows parsed from the single page.
    pprint.pprint(process_file(page))

[{'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 327185, 'international': 1681},
  'month': 10,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 314306, 'international': 1851},
  'month': 11,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 358883, 'international': 2399},
  'month': 12,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 315363, 'international': 2364},
  'month': 1,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 292336, 'international': 2251},
  'month': 2,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 375124, 'international': 2998},
  'month': 3,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 363176, 'international': 2841},
  'month': 4,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 377485, 'international': 2550},
  'm

In [84]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# So, the problem is that the gigantic file is actually not valid XML, because
# it has several root elements and XML declarations.
# It is, as a matter of fact, a collection of many concatenated XML documents.
# So, one solution would be to split the file into separate documents,
# so that you can process the resulting files as valid XML documents.

import xml.etree.ElementTree as ET
# Input file that test() passes to split_file(); the pieces are written next
# to it as "<PATENTS>-<n>".
PATENTS = 'patent.data'

def get_root(fname):
    """Parse the XML file `fname` and return its root element."""
    return ET.parse(fname).getroot()


def split_file(filename):
    """
    Split the input file into separate files, each containing a single patent.

    A new output file is started at every line that begins with "<?xml"; that
    line and all following lines up to the next such line are written to it.

    The new files are saved with filename in the following format:
    "{}-{}".format(filename, n) where n is a counter, starting from 0.

    Every output file is closed before the next one is opened, and the last
    one is closed even if an error interrupts the loop.

    Raises ValueError if the file has content before its first "<?xml" line,
    since there is no output file for that content to go to.
    """
    counter = 0
    out = None
    try:
        with open(filename, 'r') as f:
            for line in f:
                if line.startswith("<?xml"):
                    # Finish the previous document before starting a new one.
                    if out is not None:
                        out.close()
                    out = open("{}-{}".format(filename, counter), "w")
                    counter += 1
                elif out is None:
                    raise ValueError(
                        "{} has content before the first XML declaration".format(filename))
                out.write(line)
    finally:
        if out is not None:
            out.close()


def test():
    """Split PATENTS and check the first four pieces.

    For n in 0..3 the file "<PATENTS>-<n>" is opened and its first line must
    start with "<?xml". A problem is reported with print() rather than
    raised, so every piece gets checked.
    """
    split_file(PATENTS)
    for n in range(4):
        fname = "{}-{}".format(PATENTS, n)
        try:
            with open(fname, "r") as f:
                first_line = f.readline()
        except IOError:
            # Only a failure to open/read the piece means "file not found";
            # other errors are no longer hidden behind this message.
            print("Could not find file {}. Check if the filename is correct!".format(fname))
            continue
        if not first_line.startswith("<?xml"):
            print("You have not split the file {} in the correct boundary!".format(fname))


# Run the split-and-verify check whenever this cell executes.
test()