In [34]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import datetime
import os

# States whose airport codes are looked up by the scraping loop below.
states = ['Kansas', 'Nebraska', 'Oklahoma']

# Maps state name -> list of airport codes; starts empty and is filled per state.
state_airports = {}

In [35]:
# get airport codes for states of interest
# Matches hrefs containing "history/airport/<CODE>" and captures <CODE>.
# The pattern does not depend on the state, so it is compiled once up front.
airport_link_re = re.compile(r"\/?history\/?airport\/(\w+)")

for st in states:
    print(st)
    # Let requests build the query string so state names containing spaces or
    # other reserved characters are URL-encoded correctly.
    r = requests.get(
        "https://www.wunderground.com/history/index.html",
        params={"error": "AMBIGUOUS", "query": st},
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    # Fail loudly on an HTTP error instead of silently storing an empty list.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')

    codes = []
    # href=True skips anchors that carry no href attribute at all.
    for anchor in soup.find_all('a', href=True):
        match = airport_link_re.search(anchor['href'])
        if match:
            codes.append(match.group(1))

    state_airports[st] = codes

Kansas
Nebraska
Oklahoma


In [36]:
%time
def non_empty_length(list_):
    """Return how many items of ``list_`` are not equal to the empty string."""
    return sum(1 for item in list_ if item != '')

def name_list(list_named):
    """Split a sequence into ``(first_item, remaining_items)``.

    The remainder is produced by slicing, so it has the same sequence type
    as the input.
    """
    first = list_named[0]
    remainder = list_named[1:]
    return first, remainder

def list_to_dict(rows):
    """Build a dict from rows, keyed by each row's first item.

    Only rows with more than one non-empty entry are kept. The value stored
    for a key is the rest of the row; if two kept rows share a first item,
    the later row wins.
    """
    kept_rows = (row for row in rows if non_empty_length(row) > 1)
    return {key: values for key, values in map(name_list, kept_rows)}

# get weather for a day at a given airport
def get_airport_weather(airport, year = 2018, day = 29, month = 3):
    """Download and parse the daily history summary table for one airport.

    Parameters
    ----------
    airport : str
        Airport code inserted into the history URL.
    year, day, month : int
        Date to request. Note the positional order is (year, day, month);
        callers are safest passing these by keyword.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has more than one non-empty cell, indexed
        by ``labels`` with the columns ``current``, ``average`` and ``record``.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page contains no ``airport-history-summary-table`` table.
    """
    link = "https://www.wunderground.com/history/airport/%s/%s/%s/%s/DailyHistory.html" % (airport, year, month, day)

    # The original wrapped this call in an unfinished ``try/except e:`` block,
    # which is a syntax error. Request failures are now simply propagated so
    # the caller sees which airport/date could not be fetched.
    r = requests.get(link, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'html.parser')
    table = soup.find('table', {'class':'airport-history-summary-table'})
    if table is None:
        # soup.find returns None when nothing matches; report that clearly
        # instead of failing later with an AttributeError on None.
        raise ValueError("no 'airport-history-summary-table' table found at %s" % link)

    rows = []
    for row in table.find_all('tr'):
        # Drop non-breaking spaces, trim, and flatten embedded newlines.
        cells = [val.text.replace('\xa0' , '').strip().replace('\n', ' ') for val in row.find_all('td')]
        # Keep only rows with more than one non-empty cell.
        if non_empty_length(cells) > 1:
            rows.append(cells)

    return pd.DataFrame(rows, columns = ['labels', 'current', 'average', 'record']).set_index('labels')

Wall time: 0 ns


In [44]:
# https://stackoverflow.com/questions/993358/creating-a-range-of-dates-in-python
# Dates to fetch: `num_days` consecutive days, starting at yesterday and
# stepping backwards one day at a time.
num_days = 1
base = datetime.date.today() - datetime.timedelta(days = 1)
date_list = [base - datetime.timedelta(days=days_back) for days_back in range(num_days)]


In [45]:
%%time
# For every airport of the chosen state and every date in date_list, fetch the
# daily summary and write it to ../WeatherData/<state>/<airport>/<year>/<month>/<day>.csv
state = 'Kansas'
for airport_code in state_airports[state]:
    print(airport_code)
    for day in date_list:
        print(day)
        daily_summary = get_airport_weather(
            airport_code,
            year = day.year,
            month = day.month,
            day=day.day,
        )
        out_dir = "../WeatherData/%s/%s/%s/%s" % (state, airport_code, day.year, day.month)
        # exist_ok lets re-runs reuse directories created earlier.
        os.makedirs(out_dir, exist_ok = True)
        csv_path = "%s/%s.csv" % (out_dir, day.day)
        daily_summary.to_csv(csv_path)

KCNU
2018-03-30
KCFV
2018-03-30
KAAO
2018-03-30
KCBK
2018-03-30
KCNK
2018-03-30
KDDC
2018-03-30
KEHA
2018-03-30
KEMP
2018-03-30
KGCK
2018-03-30
KGLD
2018-03-30
KHYS
2018-03-30
KHLC
2018-03-30
KHUT
2018-03-30
KLWC
2018-03-30
KLBL
2018-03-30
KMHK
2018-03-30
KIAB
2018-03-30
KP28
2018-03-30
KEWK
2018-03-30
KOJC
2018-03-30
KPPF
2018-03-30
KRSL
2018-03-30
KSLN
2018-03-30
KTOP
2018-03-30
KICT
2018-03-30
KWLD
2018-03-30
Wall time: 1min 26s


In [None]:
# https://stackoverflow.com/questions/914821/producer-consumer-problem-with-python-multiprocessing

from multiprocessing import Process, Queue, cpu_count
import random
import time

In [None]:
def serve(queue):
    """Producer loop: endlessly put randomly chosen task names on ``queue``.

    Sleeps 0.01 s before each put. The loop has no exit condition, so this
    function only stops when an exception is raised.
    """
    task_names = ["task_1", "task_2"]
    while True:
        time.sleep(0.01)
        chosen = random.choice(task_names)
        queue.put(chosen)

In [33]:
def work(id, queue):
    """Consumer loop: take tasks from ``queue`` until a ``None`` sentinel arrives.

    Parameters
    ----------
    id : int
        Worker number, used only in the printed message (formatted with %d).
    queue :
        Object providing blocking ``get()`` and ``put(item)``.

    Each task is "processed" by sleeping 0.05 s and printing it. When ``None``
    is received the loop ends and ``None`` is put back on the queue, so the
    next consumer reading the queue also sees the sentinel.
    """
    while True:
        task = queue.get()
        if task is None:
            break
        time.sleep(0.05)
        # print() call instead of the Python 2 print statement, which is a
        # SyntaxError on Python 3; the emitted text is unchanged.
        print("%d task:" % id, task)
    queue.put(None)


class Manager:
    """Owns the shared task queue and the pool of worker processes."""

    def __init__(self):
        self.queue = Queue()
        self.NUMBER_OF_PROCESSES = cpu_count()
        # Initialised here so stop() is safe to call even if start() never ran.
        self.workers = []

    def start(self):
        """Start one ``work`` process per CPU, then run the producer.

        ``serve`` is called in the current process after the workers are
        started; this method returns only when ``serve`` does.
        """
        print("starting %d workers" % self.NUMBER_OF_PROCESSES)
        self.workers = [Process(target=work, args=(i, self.queue,))
                        for i in range(self.NUMBER_OF_PROCESSES)]
        for w in self.workers:
            w.start()

        serve(self.queue)

    def stop(self):
        """Enqueue the ``None`` sentinel, wait for all workers, close the queue."""
        self.queue.put(None)
        # Join the workers actually started. The original indexed with the
        # misspelled attribute NUMBER_OF_PROCESS, which raised AttributeError.
        for w in self.workers:
            w.join()
        # The original called close() on an undefined bare name `queue`.
        self.queue.close()


# NOTE: this call does not return on its own -- start() ends by calling
# serve(), which loops forever putting tasks on the queue.
Manager().start()

{}

In [22]:
# Hard-coded list of airport codes kept for experimentation.
# NOTE(review): looks like a copy of the 'Kansas' entry of state_airports -- confirm before relying on it.
test = ['KCNU', 'KCFV', 'KAAO', 'KCBK', 'KCNK', 'KDDC', 'KEHA', 'KEMP', 'KGCK', 'KGLD', 'KHYS', 'KHLC', 'KHUT', 'KLWC', 'KLBL', 'KMHK', 'KIAB', 'KP28', 'KEWK', 'KOJC', 'KPPF', 'KRSL', 'KSLN', 'KTOP', 'KICT', 'KWLD']

{'Kansas': ['KCNU',
  'KCFV',
  'KAAO',
  'KCBK',
  'KCNK',
  'KDDC',
  'KEHA',
  'KEMP',
  'KGCK',
  'KGLD',
  'KHYS',
  'KHLC',
  'KHUT',
  'KLWC',
  'KLBL',
  'KMHK',
  'KIAB',
  'KP28',
  'KEWK',
  'KOJC',
  'KPPF',
  'KRSL',
  'KSLN',
  'KTOP',
  'KICT',
  'KWLD'],
 'Nebraska': ['KANW',
  'KAIA',
  'KAUH',
  'KBIE',
  'KBBW',
  'KCDR',
  'KOLU',
  'KFNB',
  'KFET',
  'KGRI',
  'KHSI',
  'KHDE',
  'KIML',
  'KEAR',
  'KLXN',
  'KLNK',
  'KMCK',
  'KOFK',
  'KLBF',
  'KOFF',
  'KOGA',
  'KOMA',
  'KONL',
  'KODX',
  'KBFF',
  'KSNY',
  'KVTN',
  'KJYR'],
 'Oklahoma': ['KADH',
  'KLTS',
  'KAVK',
  'K1F0',
  'KAQR',
  'KBVO',
  'KCQB',
  'KCHK',
  'KGCM',
  'KDUC',
  'KDUA',
  'KWDG',
  'KFDR',
  'KGAG',
  'KGMJ',
  'KGOK',
  'KHBR',
  'KLAW',
  'KMLC',
  'KMKO',
  'KOUN',
  'KOKC',
  'KOKM',
  'KPVJ',
  'KPNC',
  'KRKR',
  'KSNL',
  'KSWO',
  'KTQH',
  'KTIK',
  'KTUL',
  'KWWR']}

In [3]:
# Display the state -> airport-code mapping as the cell output.
state_airports

{'Kansas': ['KCNU',
  'KCFV',
  'KAAO',
  'KCBK',
  'KCNK',
  'KDDC',
  'KEHA',
  'KEMP',
  'KGCK',
  'KGLD',
  'KHYS',
  'KHLC',
  'KHUT',
  'KLWC',
  'KLBL',
  'KMHK',
  'KIAB',
  'KP28',
  'KEWK',
  'KOJC',
  'KPPF',
  'KRSL',
  'KSLN',
  'KTOP',
  'KICT',
  'KWLD'],
 'Nebraska': ['KANW',
  'KAIA',
  'KAUH',
  'KBIE',
  'KBBW',
  'KCDR',
  'KOLU',
  'KFNB',
  'KFET',
  'KGRI',
  'KHSI',
  'KHDE',
  'KIML',
  'KEAR',
  'KLXN',
  'KLNK',
  'KMCK',
  'KOFK',
  'KLBF',
  'KOFF',
  'KOGA',
  'KOMA',
  'KONL',
  'KODX',
  'KBFF',
  'KSNY',
  'KVTN',
  'KJYR'],
 'Oklahoma': ['KADH',
  'KLTS',
  'KAVK',
  'K1F0',
  'KAQR',
  'KBVO',
  'KCQB',
  'KCHK',
  'KGCM',
  'KDUC',
  'KDUA',
  'KWDG',
  'KFDR',
  'KGAG',
  'KGMJ',
  'KGOK',
  'KHBR',
  'KLAW',
  'KMLC',
  'KMKO',
  'KOUN',
  'KOKC',
  'KOKM',
  'KPVJ',
  'KPNC',
  'KRKR',
  'KSNL',
  'KSWO',
  'KTQH',
  'KTIK',
  'KTUL',
  'KWWR']}