In [1]:
import json
import random
import time

import numpy as np
import pandas as pd
from bokeh.io import show, output_notebook
from bokeh.layouts import row, widgetbox
from bokeh.models import (
    ColumnDataSource,
    HoverTool,
    LogColorMapper,
    Range1d, CustomJS, Slider
)
from bokeh.palettes import RdBu11 as palette
from bokeh.plotting import figure
from bokeh.sampledata.unemployment import data as unemployment
from bokeh.sampledata.us_counties import data as counties
from bokeh.sampledata.us_states import data as states

from ufo_downloader import LocationFinder
from ufo_parser import DurationParser

# Configure Bokeh so that the later `show(...)` call renders inside the notebook.
output_notebook()


In [2]:
# Build `state_df`: one row per entry of the Bokeh `states` sample data,
# enriched in place with two extra fields ("number" and "code").
states_accumulated = {}
available_state_codes = states.keys()

# Give every state a "number": the first component of the key of the first
# county encountered for that state. States without a matching county, or
# that already carry a "number", are left untouched.
for county_key, county in counties.items():
    state_name = county["state"].upper()
    if state_name not in states:
        continue
    if "number" in states[state_name]:
        continue
    states[state_name]["number"] = county_key[0]

# Store each state's own dictionary key under "code" and collect the records
# in iteration order so they can be turned into a DataFrame.
state_list = []
for code, state in states.items():
    state["code"] = code
    state_list.append(state)

state_df = pd.DataFrame(state_list)

In [3]:
# Load the scraped UFO reports from `data.json` and turn them into a DataFrame
# (one row per record). JSON text is UTF-8 by specification, so the encoding
# is given explicitly instead of depending on the platform default.
with open("data.json", "r", encoding="utf-8") as f:
    ufo_data = json.load(f)

ufo_df = pd.DataFrame.from_records(ufo_data)
# Last expression of the cell: display the available column names.
ufo_df.columns

Index(['City', 'Date', 'Duration', 'Posted', 'Shape', 'State', 'Summary',
       'link'],
      dtype='object')

In [21]:
# Debugging aid: list the attributes of the LocationFinder instance `lf`.
# NOTE(review): `lf` is only created in the cell that follows (execution count 19),
# so this cell fails under Restart & Run All - consider removing or moving it.
dir(lf)

['_LocationFinder__chk_word',
 '_LocationFinder__lookup_cache',
 '_LocationFinder__match_country_code',
 '_LocationFinder__match_usa',
 '_LocationFinder__match_usa_state',
 '_LocationFinder__remove_brackets',
 '_LocationFinder__remove_div',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'download_geodata',
 'find',
 'find_ugly',
 'get_geodata',
 'get_geodata_http_used',
 'missing',
 'save_cache',
 'states']

In [19]:
# Clean the raw UFO reports row by row:
#   * parse the "Date" and "Posted" timestamps (NaT when missing/unparseable),
#   * validate the state code against STATES (NaN when missing/unknown),
#   * normalise the shape (missing -> "unspecified"),
#   * geolocate the city through the LocationFinder,
#   * parse the duration through the DurationParser.
# Result: `data` (accepted rows) and `wrong_data` (rows whose sighting date is
# later than the date they were posted), both as DataFrames. The elapsed
# wall-clock time of the whole pass is printed at the end.
t = time.time()
lf = LocationFinder("location_cache.json")
durparse = DurationParser()
STATES = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
          "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
          "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
          "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
          "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]
# Set view of STATES for constant-time membership tests inside the loop.
VALID_STATES = frozenset(STATES)


def _parse_timestamp(value):
    """Return `value` as a pandas timestamp, or ``pd.NaT`` when it is missing
    or cannot be parsed."""
    try:
        parsed = pd.to_datetime(value)
    except (ValueError, TypeError, OverflowError):
        return pd.NaT
    # pd.to_datetime(None) hands back None instead of NaT; normalise it so the
    # caller can always rely on pandas' missing-value semantics.
    return pd.NaT if parsed is None else parsed


data, wrong_data, skipped_lines = [], [], 0
for _, d in ufo_df.iterrows():
    date = _parse_timestamp(d["Date"])
    posted = _parse_timestamp(d["Posted"])

    # State transformation and validation against the prebuilt set.
    # pd.isna covers both NaN and None, unlike an identity check against NaN.
    state = d["State"]
    if pd.isna(state):
        state = np.nan
    else:
        state = state.upper()
        if state not in VALID_STATES:
            state = np.nan

    # Unknown shapes are, according to the website, categorized as unspecified
    raw_shape = d["Shape"]
    shape = "unspecified" if pd.isna(raw_shape) else raw_shape.lower()

    # Use the city parser to get geolocation. The call is unpacked into the
    # lookup result and a flag telling whether an HTTP request was needed.
    res, http_used = lf.find_ugly(city=d["City"], state_code=d["State"], http_used=True)
    lat = res["latitude"] if res["latitude"] is not None else np.nan
    lon = res["longitude"] if res["longitude"] is not None else np.nan

    # Print the city whenever the lookup reported using HTTP.
    if http_used:
        print(d["City"])

    # Use the duration parser to parse duration.
    # NOTE(review): this goes through the class, not the `durparse` instance
    # created above - confirm which one is intended.
    duration = DurationParser.parse_duration(d["Duration"])

    # Named `record` (not `row`) so that `row` imported from bokeh.layouts
    # stays callable for the plotting cell.
    record = (date, d["City"], state, shape, duration, d["Summary"], posted,
              d["link"], lat, lon, res["confidence"])

    # Sanity check: a sighting cannot be posted before it happened. Rows with
    # a missing timestamp cannot be checked and are kept.
    if pd.notna(date) and pd.notna(posted) and date.date() > posted.date():
        wrong_data.append(record)
        continue

    # Add data to cleaned array
    data.append(record)

names = ["Date", "City", "State", "Shape", "Duration", "Summary", "Posted", "Link", "Lat", "Lon", "Confidence"]
data = pd.DataFrame.from_records(data, columns=names)
wrong_data = pd.DataFrame.from_records(wrong_data, columns=names)
print(time.time() - t)

131.92645859718323


In [None]:
# Choropleth of the US states with a year slider. The "used" column of the
# data source is what gets drawn; moving the slider copies the column of the
# selected year into "used" on the JavaScript side.
# NOTE(review): `state_df["value"]` is read below, but the visible cells only
# add "number" and "code" to the state records - confirm where "value" is set.
FIRST_YEAR, LAST_YEAR = 1999, 2008
PLACEHOLDER_SEED = 0  # makes the random placeholder columns reproducible

color_mapper = LogColorMapper(palette=palette)

state_xy = (list(state_df["lons"].values), list(state_df["lats"].values))

# Bounding box over all state outlines (only min_y/max_y are used below).
max_x = max(max(l) for l in state_xy[0])
max_y = max(max(l) for l in state_xy[1])
min_x = min(min(l) for l in state_xy[0])
min_y = min(min(l) for l in state_xy[1])

# Named `plot_data` so the cleaned `data` DataFrame built by the cleaning
# cell is not overwritten by this dictionary.
plot_data = dict(
    x=state_xy[0],
    y=state_xy[1],
    name=list(state_df["name"].values),
    used=list(state_df["value"].values)
)

# The first year holds the values from `state_df`; the remaining years are
# random placeholder integers in [0, 50), one per state.
plot_data[str(FIRST_YEAR)] = list(state_df["value"].values)
placeholder_rng = random.Random(PLACEHOLDER_SEED)
for placeholder_year in range(FIRST_YEAR + 1, LAST_YEAR + 1):
    plot_data[str(placeholder_year)] = [
        placeholder_rng.randrange(0, 50) for _ in range(len(state_xy[0]))
    ]

source = ColumnDataSource(plot_data)

TOOLS = "pan,wheel_zoom,reset,hover,save"

p = figure(
    title="States", tools=TOOLS,
    x_axis_location=None, y_axis_location=None
)
p.width = 450
p.height = 450
p.x_range = Range1d(-170, -60)
p.y_range = Range1d(min_y - 10, max_y + 10)
p.grid.grid_line_color = None

renderer = p.patches('x', 'y', source=source,
                     fill_color={'field': 'used', 'transform': color_mapper},
                     fill_alpha=0.7, line_color="white", line_width=0.5)

hover = p.select_one(HoverTool)
hover.point_policy = "follow_mouse"
hover.tooltips = [
    ("Name", "@name"),
    ("Unemployment rate", "@used%"),
    ("(Long, Lat)", "($x, $y)"),
]

# All JavaScript variables are declared explicitly; assigning to undeclared
# names creates implicit globals and is an error in strict mode.
# NOTE(review): `source.trigger('change')` and `Slider(callback=...)` are
# legacy Bokeh APIs (newer releases use `source.change.emit()` and
# `js_on_change`) - confirm against the installed Bokeh version.
callback = CustomJS(args=dict(source=source, plot=p, color_mapper=color_mapper, renderer=renderer), code="""
    var data = source.data;
    var selected_year = year.value;
    var used = data['used'];
    var should_be = data[String(selected_year)];
    for (var i = 0; i < should_be.length; i++) {
        used[i] = should_be[i];
    }

    source.trigger('change');
""")


year_slider = Slider(start=FIRST_YEAR, end=LAST_YEAR, value=FIRST_YEAR, step=1,
                     title="year", callback=callback)
# The slider is attached after construction because the callback has to
# exist before the slider that references it.
callback.args["year"] = year_slider

layout = row(
    p,
    widgetbox(year_slider),
)
show(layout)

In [None]:
# Collect the per-state longitude sequences of `state_df` into a plain list.
lons = list(state_df["lons"])

In [None]:
# Print the smallest and largest longitude found across all states.
# The sentinels (2000 for the minimum, -1000 for the maximum) are the starting
# values of the scan, so they are also the result when no value beats them.
all_lons = [lon for state_lons in lons for lon in state_lons]
min_val = min([2000] + all_lons)
max_val = max([-1000] + all_lons)
print(min_val, max_val)

In [None]:
# NOTE(review): `sc` is not defined in any visible cell; this looks like a
# leftover partial input and would raise NameError when run - confirm and remove.
sc