In [1]:
# Import all the things

import pandas as pd
# import seaborn as sns # not using this now
import matplotlib.pyplot as plt

from ipywidgets import interactive
from IPython.display import display
import ipywidgets as widgets
from ipywidgets import interact, widgets

import numpy as np

import collections

%matplotlib inline

In [2]:
# Load the workshop records from workshops.csv (path is relative to the working directory)

workshops = pd.read_csv('workshops.csv')

In [3]:
# A bare last expression makes the notebook render the DataFrame that was just loaded
workshops

Unnamed: 0,slug,tag_name,month,year,day,attendance,host_name,administrator_name,country
0,2011-11-07-toronto,SWC,11,2011,7,22.0,University of Toronto,Compute Canada,CA
1,2012-01-18-stsci,SWC,1,2012,18,14.0,Space Telescope Science Institute,Software Carpentry,US
2,2012-02-20-itcp,SWC,2,2012,20,50.0,International Centre for Theoretical Physics,Cyfronet,IT
3,2012-02-23-toronto,SWC,2,2012,23,28.0,University of Toronto,Compute Canada,CA
4,2012-03-07-indiana,SWC,3,2012,7,39.0,Indiana University,Software Carpentry,US
5,2012-03-26-mbari,SWC,3,2012,26,38.0,Monterey Bay Aquarium Research Institute,Software Carpentry,US
6,2012-03-28-nersc,SWC,3,2012,28,35.0,NERSC,Software Carpentry,US
7,2012-04-02-chicago,SWC,4,2012,2,35.0,University of Chicago,Software Carpentry,US
8,2012-04-14-utahstate,SWC,4,2012,14,32.0,Utah State University,Software Carpentry,US
9,2012-04-30-ucl,SWC,4,2012,30,44.0,University College London,SSI Software Sustainability Institute,GB


In [4]:
# Count number of values in each instructors cell
# See https://stackoverflow.com/questions/30202011/how-can-i-count-comma-separated-values-in-one-column-of-my-panda-table

# Convert the cell to list
# df['Category'] = df.Category.map(lambda x: [i.strip() for i in x.split(",")])
# Count values
# business['# Categories'] = business.Category.apply(len)


# Instructors are not included in this data set, so the instructor count below is left commented out
# workshops['instructors'] = workshops.instructors.map(lambda x: [i.strip() for i in x.split(",")])
# workshops['count_instructors'] = workshops.instructors.apply(len)

## Preview the whole dataframe

In [5]:
# Preview data: display the `workshops` frame again
# (the cell above contains only comments, so the frame is unchanged since it was loaded)

workshops

Unnamed: 0,slug,tag_name,month,year,day,attendance,host_name,administrator_name,country
0,2011-11-07-toronto,SWC,11,2011,7,22.0,University of Toronto,Compute Canada,CA
1,2012-01-18-stsci,SWC,1,2012,18,14.0,Space Telescope Science Institute,Software Carpentry,US
2,2012-02-20-itcp,SWC,2,2012,20,50.0,International Centre for Theoretical Physics,Cyfronet,IT
3,2012-02-23-toronto,SWC,2,2012,23,28.0,University of Toronto,Compute Canada,CA
4,2012-03-07-indiana,SWC,3,2012,7,39.0,Indiana University,Software Carpentry,US
5,2012-03-26-mbari,SWC,3,2012,26,38.0,Monterey Bay Aquarium Research Institute,Software Carpentry,US
6,2012-03-28-nersc,SWC,3,2012,28,35.0,NERSC,Software Carpentry,US
7,2012-04-02-chicago,SWC,4,2012,2,35.0,University of Chicago,Software Carpentry,US
8,2012-04-14-utahstate,SWC,4,2012,14,32.0,Utah State University,Software Carpentry,US
9,2012-04-30-ucl,SWC,4,2012,30,44.0,University College London,SSI Software Sustainability Institute,GB


In [6]:
def workshops_by_year(data, year):
    """Load the workshops CSV and keep only the rows for one year.

    Parameters
    ----------
    data : str or file-like
        CSV source handed straight to ``pd.read_csv``; it must contain a
        ``year`` column.
    year : int
        Value compared for equality against the ``year`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping the index labels they had in the full file.
    """
    all_workshops = pd.read_csv(data)
    is_requested_year = all_workshops['year'] == year
    return all_workshops[is_requested_year]



## Select a year to see a dataframe of all workshops (across Carpentries) that year

In [7]:
# interact() is given the callback first, then one keyword per callback argument;
# the value supplied for each keyword decides which widget is built for it.
interact(
    workshops_by_year,
    data='workshops.csv',
    year=range(2011, 2019),
)

interactive(children=(Text(value='workshops.csv', description='data'), Dropdown(description='year', options=(2…

<function __main__.workshops_by_year(data, year)>

In [8]:
def workshops_by_carpentry(data, carpentry):
    """Load the workshops CSV and keep the rows whose tag mentions a Carpentry.

    Parameters
    ----------
    data : str or file-like
        CSV source handed straight to ``pd.read_csv``; it must contain a
        ``tag_name`` column.
    carpentry : str
        Pattern searched for inside ``tag_name`` (e.g. ``'SWC'``, ``'DC'``,
        ``'LC'``). It is passed to ``Series.str.contains`` and is therefore
        treated as a regular expression.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``tag_name`` contains ``carpentry``. Rows with a missing
        ``tag_name`` never match.
    """
    df = pd.read_csv(data)
    # na=False: a missing tag_name would otherwise put NaN into the mask, and
    # pandas refuses to index with a mask that contains NaN.
    matches = df['tag_name'].str.contains(carpentry, na=False)
    return df[matches]

## Select a Carpentry to see a dataframe of all workshops for that Carpentry (across all years)

In [9]:
# Text box for the CSV path, dropdown for the Carpentry whose workshops are listed
interact(
    workshops_by_carpentry,
    data='workshops.csv',
    carpentry=['SWC', 'DC', 'LC'],
)

interactive(children=(Text(value='workshops.csv', description='data'), Dropdown(description='carpentry', optio…

<function __main__.workshops_by_carpentry(data, carpentry)>

In [10]:
def attendance_by_carpentry_and_year(data, carpentry):
    """Total attendance per year for one Carpentry, drawn as a labelled bar chart.

    Parameters
    ----------
    data : str or file-like
        CSV source handed straight to ``pd.read_csv``; it must contain
        ``tag_name``, ``year`` and ``attendance`` columns.
    carpentry : str
        Pattern searched for inside ``tag_name`` (treated as a regular
        expression by ``Series.str.contains``).

    Returns
    -------
    pandas.Series
        Attendance summed per year, indexed by year. If no workshop matches,
        the empty series is returned and nothing is plotted.
    """
    df = pd.read_csv(data)
    # na=False: rows without a tag cannot match and must not put NaN in the mask.
    df = df[df['tag_name'].str.contains(carpentry, na=False)]
    attendance_by_year = df.groupby('year')['attendance'].sum()

    # Nothing matched: there is no maximum to scale labels by and nothing to draw.
    if attendance_by_year.empty:
        return attendance_by_year

    # Lift each label 2.5% of the tallest bar above its own bar.
    offset = attendance_by_year.max() * .025
    ax = attendance_by_year.plot(kind='bar')
    # Label each bar by position: the loop counter is used as the x coordinate.
    for position, total in enumerate(attendance_by_year):
        ax.text(position, total + offset, str(int(total)))
    plt.show()

    return attendance_by_year

## Select a Carpentry to see a bar chart of attendance by year for that Carpentry

Note: workshops with no recorded attendance add nothing to these totals, so the yearly figures may be undercounts.

In [11]:
# Text box for the CSV path, dropdown for the Carpentry whose yearly attendance is charted
interact(
    attendance_by_carpentry_and_year,
    data='workshops.csv',
    carpentry=['SWC', 'DC', 'LC'],
)

interactive(children=(Text(value='workshops.csv', description='data'), Dropdown(description='carpentry', optio…

<function __main__.attendance_by_carpentry_and_year(data, carpentry)>

In [17]:
def workshops_by_carpentry_and_year(data, carpentry="All", yr="All", stacked=True, agg="workshop count"):
    """Summarise workshops by year, Carpentry and organiser, and plot the result.

    Parameters
    ----------
    data : str or file-like
        CSV source handed straight to ``pd.read_csv``; it must contain
        ``slug``, ``tag_name``, ``year``, ``attendance`` and
        ``administrator_name`` columns.
    carpentry : str, default "All"
        ``"All"`` keeps every workshop; any other value keeps rows whose
        normalised ``tag_name`` contains it.
    yr : int or "All", default "All"
        ``"All"`` keeps every year; any other value keeps rows whose ``year``
        equals it.
    stacked : bool, default True
        Forwarded to the bar plot: stacked bars when true, side-by-side
        otherwise.
    agg : {"workshop count", "attendance sum"}, default "workshop count"
        ``"workshop count"`` counts ``slug`` values per group;
        ``"attendance sum"`` sums ``attendance`` per group.

    Returns
    -------
    pandas.DataFrame
        One row per (year, tag_name, administrator_name) group with a single
        column, ``slug`` or ``attendance`` depending on ``agg``. If the
        filters leave no rows, the empty frame is returned and nothing is
        plotted.

    Raises
    ------
    ValueError
        If ``agg`` is not one of the two supported values.
    """
    # Validate up front so an unsupported choice fails with a clear message
    # instead of an unbound-variable error further down.
    if agg == "workshop count":
        value_column = 'slug'
    elif agg == "attendance sum":
        value_column = 'attendance'
    else:
        raise ValueError(
            'agg must be "workshop count" or "attendance sum", got {!r}'.format(agg)
        )

    df = pd.read_csv(data)

    # Collapse the administrator to two labels: 'self' for rows marked
    # 'self-organized', 'centrally' for everything else (missing values included).
    is_self_organized = df['administrator_name'].eq('self-organized')
    df['administrator_name'] = is_self_organized.map({True: 'self', False: 'centrally'})

    # Normalise tags to a single Carpentry label. The order matters: the mask is
    # recomputed after each assignment, so a tag naming several Carpentries takes
    # the first label in this sequence that it contains.
    for tag in ("SWC", "DC", "LC", "TTT"):
        df.loc[df["tag_name"].str.contains(tag, na=False), 'tag_name'] = tag

    if carpentry != "All":
        df = df[df['tag_name'].str.contains(carpentry, na=False)]
    if yr != "All":
        # The parameter is `yr`; comparing against an undefined `year` raised NameError.
        df = df[df['year'] == yr]

    grouped = df.groupby(['year', 'tag_name', 'administrator_name'])[value_column]
    summary = grouped.count() if agg == "workshop count" else grouped.sum()
    summary = summary.to_frame()

    # Nothing left after filtering: hand back the empty frame and skip plotting.
    if summary.empty:
        return summary

    # unstack() moves administrator_name into the columns, one bar series per organiser.
    summary.unstack().plot(kind='bar', stacked=stacked)

    return summary


## Select a Carpentry and a year to see a bar chart of workshop count OR attendance total, split into self-organized and centrally organized workshops, along with the chart's source dataframe. Use "stacked" to toggle between a stacked view and a side-by-side view.

This does not account for workshops missing attendance. For one way to fill missing values with means, see:
https://chrisalbon.com/python/data_wrangling/pandas_missing_data/

Need to figure out how to make the legends and labels more human readable.

This is not currently accurate - we need to fix some data behind the scenes about whether workshops were self organized or centrally organized.

In [18]:
# One widget per callback argument: each list below becomes a dropdown of its values
interact(
    workshops_by_carpentry_and_year,
    data='workshops.csv',
    stacked=[True, False],
    carpentry=['All', 'SWC', 'DC', 'LC', 'TTT'],
    yr=["All"] + list(range(2011, 2019)),
    agg=["attendance sum", "workshop count"],
)

interactive(children=(Text(value='workshops.csv', description='data'), Dropdown(description='carpentry', optio…

<function __main__.workshops_by_carpentry_and_year(data, carpentry='All', yr='All', stacked=True, agg='workshop count')>