In [1]:
## Import all the things

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline 

# The Carpentries: Programmatic Assessment Report
## 2017 Year End and 2018 Q1

# Part 1: Workshops over time

## Workshops by Carpentry by Year

### Table: All SWC & DC workshops, 2012-2018Q1
Instructor training events are analyzed separately.

In [2]:
# Read the raw workshop records; everything below normalises this frame.
workshops = pd.read_csv("data_files/workshops.csv")

# Collapse the raw tag names onto the four programme codes.
# Order matters: a tag_name matching several codes keeps the last one assigned.
for carpentry_code in ("SWC", "DC", "LC", "TTT"):
    workshops.loc[workshops['tag_name'].str.contains(carpentry_code), "tag"] = carpentry_code

# Attendance is stored inconsistently as either NaN or 0,
# so every zero is treated as a missing value.
workshops['attendance'] = workshops['attendance'].replace(0.0, np.nan)

# Parse the start date so the .dt accessors can be used further down.
workshops['start'] = pd.to_datetime(workshops['start'])

# Instructor training (TTT) events are analysed separately; keep everything else.
workshops_nonTTT = workshops.loc[workshops['tag'] != "TTT"]
workshops_nonTTT

Unnamed: 0,slug,tag_name,start,attendance,host_name,country,tag
0,2012-01-18-stsci,SWC,2012-01-18,14.0,Space Telescope Science Institute,US,SWC
1,2012-02-20-itcp,SWC,2012-02-20,50.0,International Centre for Theoretical Physics,IT,SWC
2,2012-02-23-toronto,SWC,2012-02-23,28.0,University of Toronto,CA,SWC
3,2012-03-07-indiana,SWC,2012-03-07,39.0,Indiana University,US,SWC
4,2012-03-26-mbari,SWC,2012-03-26,38.0,Monterey Bay Aquarium Research Institute,US,SWC
5,2012-03-28-nersc,SWC,2012-03-28,35.0,NERSC,US,SWC
6,2012-04-02-chicago,SWC,2012-04-02,35.0,University of Chicago,US,SWC
7,2012-04-14-utahstate,SWC,2012-04-14,32.0,Utah State University,US,SWC
8,2012-04-30-ucl,SWC,2012-04-30,44.0,University College London,GB,SWC
9,2012-05-07-michiganstate,SWC,2012-05-07,55.0,Michigan State University,US,SWC


### Workshops by Carpentry by year: Table view
The table and chart below show that the number of Software Carpentry and Data Carpentry workshops has remained roughly steady for the past several years.

This may not account for unreported self-organized workshops.  The Carpentries is working to improve data collection to have more accurate reflections of our scope of work.

In [None]:
# Count workshops per (year, carpentry tag) and pivot the tags into columns.
# fill_value=0 keeps the counts as integers: a plain unstack() inserts NaN for
# year/tag combinations with no workshops, which turns every column into floats.
workshops_by_carpentry_year = (
    workshops_nonTTT
    .groupby([workshops_nonTTT['start'].dt.year, 'tag'])['slug']
    .count()
    .unstack(fill_value=0)
)

# Row total across all tags for each year.
workshops_by_carpentry_year['total'] = workshops_by_carpentry_year.sum(axis=1)

# Relabel the 2018 row as '2018Q1'.
workshops_by_carpentry_year = workshops_by_carpentry_year.rename(index={2018: '2018Q1'})
workshops_by_carpentry_year


### Workshops by Carpentry by year: Chart view

In [None]:
# Bar chart of yearly DC and SWC workshop counts on a single 12x6 figure.
fig, ax = plt.subplots(figsize=(12, 6))

width = .5
title = "Carpentries workshops count by year"

# Only the DC and SWC columns are drawn.
workshops_by_carpentry_year.plot.bar(
    y=["DC", "SWC"],
    ax=ax,
    width=width,
    position=1,
    title=title,
)

ax.grid(True)

plt.show()

### Table: Count Total SWC and DC workshops by country by year

In [None]:
# Count workshops per (country, year) and pivot the years into columns.
# fill_value=0 fills country/year combinations with no workshops while keeping
# the counts as integers (unstack() followed by fillna(0) leaves them as floats).
workshops_by_country_year = (
    workshops_nonTTT
    .groupby(['country', workshops_nonTTT['start'].dt.year])['slug']
    .count()
    .unstack(fill_value=0)
    .rename(columns={2018: '2018Q1'})  # relabel the 2018 column as '2018Q1'
)

# Row total across all years for each country; appended as the last column.
workshops_by_country_year['total'] = workshops_by_country_year.sum(axis=1)
workshops_by_country_year

### Chart: Count Total SWC and DC workshops by country by year
*How can we make this chart easier to read?*

In [None]:
# Grouped bar chart: one group per country, one bar per year column.
fig, ax = plt.subplots(figsize=(12, 6))

title = "Carpentries workshops by country by year"

# Plot every column except the last one.
workshops_by_country_year.plot.bar(
    y=workshops_by_country_year.columns[:-1].tolist(),
    ax=ax,
    position=1,
    title=title,
)

ax.grid(True)

plt.show()

# Show the full list of column labels for reference.
workshops_by_country_year.columns.tolist()

### Compare Q1 2017 and Q1 2018 Workshops by country: table view

In [None]:
# First-quarter (January-March) workshops for each of the two years compared.
workshops_2017Q1 = workshops_nonTTT[
    (workshops_nonTTT['start'].dt.year == 2017) & (workshops_nonTTT['start'].dt.month <= 3)
]
workshops_2018Q1 = workshops_nonTTT[
    (workshops_nonTTT['start'].dt.year == 2018) & (workshops_nonTTT['start'].dt.month <= 3)
]

# Workshop count per country for each quarter, as a one-column frame.
# Each subset already holds a single year, so grouping by country alone is
# enough. This avoids passing a year key taken from a different (larger) frame
# and relying on implicit index alignment.
workshops_by_country_year_2017Q1 = (
    workshops_2017Q1.groupby('country')['slug'].count().to_frame('2017Q1')
)
workshops_by_country_year_2018Q1 = (
    workshops_2018Q1.groupby('country')['slug'].count().to_frame('2018Q1')
)

# Side-by-side table of both quarters. A country missing from one quarter gets
# NaN from the outer join, so fill with 0 and restore integer counts.
workshops_by_country_year_1718Q1 = (
    pd.concat(
        [workshops_by_country_year_2017Q1, workshops_by_country_year_2018Q1],
        axis=1,
    )
    .fillna(0)
    .astype(int)
)

workshops_by_country_year_1718Q1

### Compare Q1 2017 and Q1 2018 Workshops by country: chart view

In [None]:
# Paired bar chart comparing per-country workshop counts for the two quarters.
fig, ax = plt.subplots(figsize=(12, 6))
width = 0.25

# Both series share one axes; the opposite `position` values align the two
# sets of bars on either side of each tick.
workshops_by_country_year_1718Q1['2017Q1'].plot.bar(
    ax=ax, width=width, position=1,
    color='lightgreen', edgecolor='green', linewidth=1,
)
workshops_by_country_year_1718Q1['2018Q1'].plot.bar(
    ax=ax, width=width, position=0,
    color='lightblue', edgecolor='blue', linewidth=1,
)

ax.legend(["2017 Q1", "2018 Q1"])
ax.grid(True)
ax.set_title("Count workshops by country, Q1 2017 & Q1 2018")

plt.show()

### Proportion of 2017 workshops that were in 2017 Q1
Rather than comparing 2017 Q1 and 2018 Q1, can we compare all 2017 and projected 2018?  This value is very close to 25%.

In [None]:
# All 2017 workshops, and the subset that started in January-March.
total2017 = workshops_nonTTT.loc[workshops_nonTTT['start'].dt.year == 2017]
Q1_2017 = total2017.loc[total2017['start'].dt.month <= 3]

# Row counts of each selection.
total2017_count = total2017.shape[0]
Q1_2017_count = Q1_2017.shape[0]

# Share of the year's workshops that fell in the first quarter.
Q1_2017_proportion = Q1_2017_count / total2017_count
Q1_2017_proportion


### When did each country have its first workshop?

Data goes back to 2012; some countries were holding Carpentries workshops before this time.

In [None]:
# Work on a copy so the country/year table itself is left untouched.
w2 = workshops_by_country_year.copy()

# Approach taken from:
# https://stackoverflow.com/questions/31828240/first-non-null-value-per-row-from-a-list-of-pandas-columns
# https://stackoverflow.com/questions/38467749/find-first-non-zero-value-in-each-row-of-pandas-dataframe

# Mask zero counts as NaN so they no longer count as valid values.
res = w2.where(w2 != 0.0)

# For each country (row), take the label of the first column holding a valid value.
first_workshop = res.apply(lambda counts: counts.first_valid_index(), axis=1)
first_workshop

### Which countries had their first workshop in 2017?


In [None]:
# Keep only the countries whose first recorded workshop column is 2017.
first_workshop_2017 = first_workshop.loc[first_workshop == 2017]
first_workshop_2017

## Attendance at workshops

### Workshops missing attendance

In [None]:
# Workshops (excluding instructor training) with no recorded attendance.
# The mask is built from workshops_nonTTT itself: a mask taken from the full
# `workshops` frame has a different index than the frame being filtered and
# only works through implicit reindexing.
missing_attendance = workshops_nonTTT[workshops_nonTTT['attendance'].isnull()]
missing_attendance

### Scatter plot showing frequency of attendance rates
Would like to add jitter to better visualize high frequencies in middle ranges.  Also overlay mean, median, mode for each year to show shift in measures of central tendency.

In [None]:
# Start date and attendance per workshop, plus the calendar year of the start date.
# .assign() returns a new frame, so the 'year' column is never written onto a
# slice of workshops_nonTTT (which would raise SettingWithCopyWarning).
attendance = (
    workshops_nonTTT[['start', 'attendance']]
    .assign(year=lambda frame: frame['start'].dt.year)
)

# One point per workshop: year on the x axis, attendance on the y axis.
attendance.plot.scatter('year', 'attendance')

### Total attendance by year: Table view

In [None]:
# Total recorded attendance per year as a one-column frame.
# Missing attendance values are skipped by sum(); the totals are head counts,
# so they are cast back from float to int.
total_attendance_by_year = (
    attendance.groupby('year')['attendance']
    .sum()
    .astype(int)
    .to_frame()
)
total_attendance_by_year

### Total attendance by year: Chart view

In [None]:
# Bar chart of the yearly attendance totals.
total_attendance_by_year.plot(kind='bar')

### Binned attendance by year

This table shows by year, how many workshops had 1-10 learners, 11-20 learners, and so on.  Need to change the column heading 2018 to 2018Q1

In [None]:
# Assign each workshop to an attendance band: 1-10, 11-20, ..., 91-100, over 100.
# pd.cut intervals are right-closed, so the first edge must be 0 for the first
# band to be (0, 10]; an edge of 1 would silently drop workshops with exactly
# one learner.
bins = pd.cut(
    attendance['attendance'],
    [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, np.inf],
)

# Number of workshops per (attendance band, year), with years pivoted into columns.
# Band/year combinations with no workshops are filled with 0 so counts stay ints.
binned_attendance_by_year = (
    attendance.groupby([bins, 'year'])['attendance']
    .agg(['count'])
    .unstack()
    .fillna(0)
    .astype(int)
)

# Row total across all years for each attendance band.
binned_attendance_by_year['total'] = binned_attendance_by_year.sum(axis=1)

binned_attendance_by_year

## Instructor Training

In [3]:
# One row per instructor training event, as read from the CSV.
badges_by_training_event = pd.read_csv("data_files/badges_by_training_event.csv")

# Percentage of attendees who were badged, rounded to one decimal place.
badges_by_training_event['pct_completion'] = (
    badges_by_training_event['count_badged']
    .div(badges_by_training_event['attendance'])
    .mul(100)
    .round(1)
)

# Parse the start date so the .dt accessors can be used further down.
badges_by_training_event['start'] = pd.to_datetime(badges_by_training_event['start'])

badges_by_training_event

Unnamed: 0,count_badged,slug,attendance,country,start,pct_completion
0,17,2012-08-26-ttt-online,20,W3,2012-08-26,85.0
1,19,2012-10-11-ttt-online,25,W3,2012-10-11,76.0
2,5,2013-01-06-ttt-online,12,W3,2013-01-06,41.7
3,18,2013-03-12-ttt-online,27,W3,2013-03-12,66.7
4,18,2013-05-12-ttt-online,45,W3,2013-05-12,40.0
5,24,2013-08-12-ttt-online,41,W3,2013-08-12,58.5
6,22,2013-09-30-ttt-online,57,W3,2013-09-30,38.6
7,25,2014-01-16-ttt-online,67,W3,2014-01-16,37.3
8,3,2014-04-14-ttt-pycon,34,W3,2014-04-14,8.8
9,24,2014-04-24-ttt-online,58,W3,2014-04-24,41.4


### Number of instructor training events by country

In [None]:
# Count training events per (year, country) and pivot the countries into columns.
# NOTE: the variable name keeps its original spelling ("trainigs") because
# later cells refer to it by that name.
trainigs_by_country = (
    badges_by_training_event
    .groupby([badges_by_training_event['start'].dt.year, 'country'])['slug']
    .count()
    .unstack()
)

# Row total across all countries for each year. This must sum this table:
# `trainings_online_inperson` is only created in a later cell, so referencing
# it here raises NameError on a fresh kernel.
trainigs_by_country['total'] = trainigs_by_country.sum(axis=1)

# Relabel the 2018 row as '2018Q1'.
trainigs_by_country = trainigs_by_country.rename(index={2018: '2018Q1'})
trainigs_by_country

### Binned completion rates for instructor training events by year

In [None]:
# Assign each training event to a completion-rate band.
# include_lowest=True makes the first band include 0, so events where nobody
# was badged are counted instead of silently dropped (pd.cut intervals are
# otherwise open on the left edge).
bins = pd.cut(
    badges_by_training_event['pct_completion'],
    [0, 25, 50, 75, 99, 100],
    include_lowest=True,
)

# Number of events per (completion band, year), with years pivoted into columns.
# Band/year combinations with no events are filled with 0 so counts stay ints.
binned_badges_by_year = (
    badges_by_training_event
    .groupby([bins, badges_by_training_event['start'].dt.year])['pct_completion']
    .agg(['count'])
    .unstack()
    .fillna(0)
    .astype(int)
)

# Row total across all years for each completion band.
binned_badges_by_year['total'] = binned_badges_by_year.sum(axis=1)

binned_badges_by_year

# TODO: add a total row
# TODO: remove 2018 (even Q1 - none of these people are expected to be badged)
# Table reads "6 events in 2016 had a completion rate under 25%"


### Attendance and number badged by year

#### Total attendance by year

In [None]:
# Sum training attendance per (year, country) and pivot the countries into columns.
attendance_by_year = (
    badges_by_training_event
    .groupby([badges_by_training_event['start'].dt.year, 'country'])['attendance']
    .agg('sum')
    .unstack('country')
)
attendance_by_year


#### Total badged by year

In [None]:
# Sum badged trainees per (year, country) and pivot the countries into columns.
badges_by_year = (
    badges_by_training_event
    .groupby([badges_by_training_event['start'].dt.year, 'country'])['count_badged']
    .agg('sum')
    .unstack('country')
)
badges_by_year


#### Percent of trainees badged per year
Note the country data shows the country the event was held in, not the country the trainee is based in.

In [None]:
# Badged trainees as a percentage of attendance, cell by cell (year x country),
# rounded to one decimal place.
percent_badged_by_year = (
    badges_by_year
    .div(attendance_by_year)
    .mul(100)
    .round(1)
)
percent_badged_by_year

### Comparison: In-person vs Online Training Events
#### Number of events per year

In [None]:
# In-person events: every country column except 'W3' (kept separately below as
# the online column) and the 'total' helper column. Filtering with a
# comprehension avoids the ValueError that list.remove() raises when a label
# is absent.
countries = [
    label for label in trainigs_by_country.columns
    if label not in ('W3', 'total')
]
inperson_trainings = trainigs_by_country[countries].sum(axis=1).to_frame()

# Online events: the 'W3' column on its own.
online_trainings = trainigs_by_country[['W3']]

# Side-by-side table of in-person and online event counts per year.
# The original concatenated an undefined name (`country_trainings`);
# `inperson_trainings` is the frame built above.
trainings_online_inperson = (
    pd.concat([inperson_trainings, online_trainings], axis=1)
    .rename(columns={0: 'in-person', 'W3': 'online'})
)

trainings_online_inperson


### Binned completion rates for instructor training events by year: online events only

### Binned completion rates for instructor training events by year: inperson events only
