In [None]:
import pandas as pd

## Pair Programming

Do the following as Pairs.

During Class:

 * Person One: is the "driver"
 * Person Two/Three: is the "observer"
 
Attempt in 30 minutes:

 1. Complete your notebook
 2. Send to your team members to code review
 3. Address their comments until happy
 4. Submit
 

# The Exercise

You and your pair will dive headfirst into the world of pair programming. The goal is to build one combined dataframe out of four data sets.

There are four gapminder data sets:


 * [fertility](http://spreadsheets.google.com/pub?key=phAwcNAVuyj0TAlJeCEzcGQ)
 * [life_expectancy](http://spreadsheets.google.com/pub?key=tiAiXcrneZrUnnJ9dBU-PAw)
 * [population](http://spreadsheets.google.com/pub?key=phAwcNAVuyj0XOoBL_n5tAQ)
 * [regions](https://docs.google.com/spreadsheets/d/1OxmGUNWeADbPJkQxVPupSOK5MbAECdqThnvyPrwG5Os/pub?gid=1)




In [None]:
import os
# Show which files are available in the local "data" directory.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# cell output stable across machines and re-runs.
sorted(os.listdir("data"))

### Step 1: read all the files into data frames

> Hint: look here http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.from_csv.html#pandas-dataframe-from-csv
>  or http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

Note that pandas has already been imported as `pd`. You might want to tab-complete or use a '?' to explore it more.

*output variables named: df_fertility, df_life, df_population, df_regions*


### Step 2: create list of years as integers

> Hint: look here http://pandas.pydata.org/pandas-docs/stable/api.html#attributes-and-underlying-data (columns)

You can get a list of columns using the `.columns` attribute of a dataframe. Now convert those to integers and store them in a dict whose keys are the string version of each year and whose values are the integer version, like so:

```python

{'1964': 1964,
 '1965': 1965,
 '1966': 1966,
 '1967': 1967,
  ... }
 ```

*output called: rename_dict*

### Step 3: rename all the columns with the year as str, to year as int

> hint: look here http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html#pandas-dataframe-rename


*output variables named: df_fertility, df_life, df_population, df_regions*


### Step 4: add some extra info

Since we will be presenting with bokeh (a graphing library), let's assign each group a color and get a population size.

In [None]:
from bokeh.palettes import Spectral6
import numpy as np

def assign_region_color(regions, scale=200, min_size=3):
    """Assign a bubble colour to every row of ``regions`` based on its Group.

    The ``Group`` column is converted to a categorical, and a new
    ``region_color`` column is filled with the ``Spectral6`` entry whose
    position equals the position of the row's group in the category list.

    Parameters
    ----------
    regions : pandas.DataFrame
        Frame with a ``Group`` column. It is modified in place.
    scale, min_size
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The same ``regions`` object that was passed in, with ``Group``
        converted to a categorical and ``region_color`` added.

    Raises
    ------
    IndexError
        If ``Group`` has more distinct values than ``Spectral6`` has colours.

    Notes
    -----
    Rows whose ``Group`` is missing get a missing ``region_color`` instead of
    raising.
    """
    regions['Group'] = regions['Group'].astype('category')
    group_names = list(regions['Group'].cat.categories)

    # Fail loudly rather than silently leaving some groups without a colour.
    if len(group_names) > len(Spectral6):
        raise IndexError(
            "%d groups but only %d colours in the palette"
            % (len(group_names), len(Spectral6))
        )

    # One vectorised lookup instead of a Python-level call per row; this also
    # copes with an empty frame, where a row-wise apply does not yield a Series.
    color_by_group = dict(zip(group_names, Spectral6))
    # Map on plain objects so region_color holds ordinary strings rather than
    # inheriting the categorical dtype of Group.
    regions['region_color'] = regions['Group'].astype(object).map(color_by_group)
    return regions

def normalize_popuation_size(population, scale_factor=200, min_size=3):
    """Convert population counts into bubble sizes for plotting.

    Each value is divided by pi, square-rooted, then divided by
    ``scale_factor``. Any result that is not at least ``min_size``
    (including missing values) is replaced by ``min_size``.
    """
    radius = np.sqrt(population / np.pi)
    bubble_size = radius / scale_factor
    # NaN compares False here, so missing values fall through to min_size too.
    is_big_enough = bubble_size >= min_size
    return bubble_size.where(is_big_enough, other=min_size)
    

### Step 5: concatenate all the data

Let's create a dictionary called sources. Make it have a key for each year, but start that key with a "_" so it looks like this:

```python
sources.keys()
# ['_1989', '_2010', '_2004', '_1999', ...] 
```

Pseudocode:

```python

# some extra stuff
region_color = df_regions['region_color']
region_color.name = 'region_color'
dictionary_of_sources = dict(zip([x for x in years], ['_%s' % x for x in years]))
js_source_array = str(dictionary_of_sources).replace("'", "")

#function you will call
def make_data(fertility, life, population, region_color):
    sources = {}
    
    ### FINISH THIS CODE
    for each year:
        get fertility
        add fertility.name = 'fertility'
        # do same for: life, population
        concatenate into a new data frame: fertility, life, population, region_color
        create a key with "_"+year and assign ColumnDataSource(new_df)
    return sources
```


### display a pretty graph

In [None]:
from IPython.display import display, HTML

import pandas as pd

from jinja2 import Template

# using bokeh=0.11.0=py35_0
from bokeh.models import (
    ColumnDataSource, Plot, Circle, Range1d, 
    LinearAxis, HoverTool, Text,
    SingleIntervalTicker, Slider, CustomJS
)
from bokeh.palettes import Spectral6
from bokeh.plotting import vplot
from bokeh.resources import JSResources
from bokeh.embed import file_html


In [None]:
# Build the per-year data sources. make_data, region_color, years and
# js_source_array are expected to exist already (see Step 5); `years` is
# assumed to be an ordered sequence of the integer years.
sources = make_data(df_fertility, df_life, df_population, region_color)

# Set up the plot
xdr = Range1d(1, 9)
ydr = Range1d(20, 100)
plot = Plot(
    x_range=xdr,
    y_range=ydr,
    title="",
    plot_width=800,
    plot_height=400,
    outline_line_color=None,
    toolbar_location=None,    
)
# Styling shared by both axes.
AXIS_FORMATS = dict(
    minor_tick_in=None,
    minor_tick_out=None,
    major_tick_in=None,
    major_label_text_font_size="10pt",
    major_label_text_font_style="normal",
    axis_label_text_font_size="10pt",

    axis_line_color='#AAAAAA',
    major_tick_line_color='#AAAAAA',
    major_label_text_color='#666666',

    major_tick_line_cap="round",
    axis_line_cap="round",
    axis_line_width=1,
    major_tick_line_width=1,
)

xaxis = LinearAxis(ticker=SingleIntervalTicker(interval=1), axis_label="Children per woman (total fertility)", **AXIS_FORMATS)
yaxis = LinearAxis(ticker=SingleIntervalTicker(interval=20), axis_label="Life expectancy at birth (years)", **AXIS_FORMATS)   
plot.add_layout(xaxis, 'below')
plot.add_layout(yaxis, 'left')

# Add the year in background (add before circle).
# Only the currently displayed year belongs here: the slider callback below
# replaces this data with a one-element list, so start with the first year
# rather than every year at once (which would draw them on top of each other).
text_source = ColumnDataSource({'year': ['%s' % years[0]]})
text = Text(x=2, y=35, text='year', text_font_size='150pt', text_color='#EEEEEE')
plot.add_glyph(text_source, text)

# Add the circle, initially showing the first year's data
renderer_source = sources['_%s' % years[0]]
circle_glyph = Circle(
    x='fertility', y='life', size='population',
    fill_color='region_color', fill_alpha=0.8, 
    line_color='#7c7e71', line_width=0.5, line_alpha=0.5)
circle_renderer = plot.add_glyph(renderer_source, circle_glyph)

# Add the hover (only against the circle and not other plot elements)
tooltips = "@index"
plot.add_tools(HoverTool(tooltips=tooltips, renderers=[circle_renderer]))

# Legend: one label and one colour swatch per region.
# The names are taken in category order because assign_region_color picks
# Spectral6[i] by the position of each group in Group's category list, so the
# swatch colours line up with the bubble colours.
region_names = list(df_regions['Group'].astype('category').cat.categories)
text_x = 7
text_y = 95
for i, region in enumerate(region_names):
    plot.add_glyph(Text(x=text_x, y=text_y, text=[region], text_font_size='10pt', text_color='#666666'))
    plot.add_glyph(Circle(x=text_x - 0.1, y=text_y + 2, fill_color=Spectral6[i], size=10, line_color=None, fill_alpha=0.8))
    text_y = text_y - 5 

# Add the slider.
# The JS callback looks up the data source for the selected year and swaps
# its data into the circle renderer and the background year label.
code = """
    var year = slider.get('value'),
        sources = %s,
        new_source_data = sources[year].get('data');
    renderer_source.set('data', new_source_data);
    text_source.set('data', {'year': [String(year)]});
""" % js_source_array

callback = CustomJS(args=sources, code=code)
# Start the slider on the first year so it agrees with the data shown
# initially and lies inside its own [start, end] range.
slider = Slider(start=years[0], end=years[-1], value=years[0], step=1, title="Year", callback=callback, name='testy')
callback.args["renderer_source"] = renderer_source
callback.args["slider"] = slider
callback.args["text_source"] = text_source


# Stick the plot and the slider together
layout = vplot(plot, slider)

# Open our custom template
with open('gapminder_template.jinja', 'r') as f:
    template = Template(f.read())

# Use inline resources, render the html and show it in the notebook
js_resources = JSResources(mode='inline')
title = "Bokeh - Gapminder Bubble Plot"
html = file_html(layout, resources=(js_resources, None), title=title, template=template)

display(HTML(html))