**Purpose**: Analyze the feedback from the forms embedded at the end of the tutorials in the Galaxy Training Material

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from pprint import pprint
import datetime
import emoji

In [None]:
# Google Sheets export (TSV format) of the spreadsheet holding the feedback
url = 'https://docs.google.com/spreadsheets/d/1NfZhi5Jav7kl9zFCkeb7rIC2F8xW1isruv1TeO4WpNI/export?format=tsv'

In [None]:
# date and time of this run, shown as the cell output
str(datetime.datetime.now())

# Load the feedback

In [None]:
# short names used for the sheet columns in the rest of the notebook
feedback_columns = {'Timestamp': 'timestamp',
                    'How much did you like this tutorial?': 'note',
                    'What did you like?': 'pro',
                    'What could be improved?': 'con',
                    'Tutorial': 'tutorial_topic'}
# read the tab-separated sheet, drop the confidentiality column and rename the others
df = pd.read_csv(url, sep='\t')
df = df.drop(['Make feedback confidential?'], axis=1)
df = df.rename(columns=feedback_columns)

# split each entry at its last '(' by working on the reversed string:
# piece 1 is the part before it, piece 0 the part after it
# (presumably entries look like "Tutorial name (Topic)" -- TODO confirm)
reversed_parts = df['tutorial_topic'].str[::-1].str.split('(', n=1, expand=True)
# restore the reading order and drop the last character of each piece
df['tutorial'] = reversed_parts[1].str[::-1].str[:-1]
df['topic'] = reversed_parts[0].str[::-1].str[:-1]
df = df.drop(columns=['tutorial_topic'])

# keep only the rows with at least one of note, pro or con filled in
df = df.dropna(subset=['note', 'pro', 'con'], how='all')
# a missing note becomes 0 so that the column can be cast to integer
df = df.fillna(value={'note': 0})
df['note'] = df['note'].astype(int)
# casting to str turns missing comments into the literal 'nan'
df['pro'] = df['pro'].astype(str)
df['con'] = df['con'].astype(str)

# keep only the part of the timestamp before the first space and parse it day-first
day_only = df['timestamp'].str.split(' ', n=1, expand=True)[0]
df['timestamp'] = pd.to_datetime(day_only, dayfirst=True)

In [None]:
# change topic for some tutorials
# topic to assign to specific tutorials, replacing the one parsed from the sheet
topic_overrides = {
    'Epigenetics': [
        'Formation of the Super-Structures on the Inactive X',
        'Identification of the binding sites of the Estrogen receptor',
        'Identification of the binding sites of the T-cell acute lymphocytic leukemia protein 1 (TAL1)',
    ],
    'Ecology': [
        'RAD-Seq Reference-based data analysis',
        'RAD-Seq de-novo data analysis',
        'RAD-Seq to construct genetic maps',
    ],
}
for new_topic, tutorial_names in topic_overrides.items():
    df.loc[df['tutorial'].isin(tutorial_names), 'topic'] = new_topic

# Aggregate the feedback and notes

In [None]:
def get_notes(df, name):
    '''Count how many feedbacks gave each note
    
    :df: dataframe with all feedbacks (needs a `note` column)
    :name: label for the single column of the returned dataframe
    
    :return: dataframe object indexed by note value, whose only column
        `name` holds the number of feedbacks with that note
    '''
    # Name the column through to_frame(): the Series returned by
    # value_counts() is called 'note' before pandas 2.0 and 'count' since,
    # so renaming a 'note' column afterwards silently does nothing there.
    return (df.note
        .value_counts(sort=False)
        .to_frame(name=name))

In [None]:
# note counts over the whole dataset, stored in a column labelled 'All topics'
notes = get_notes(df, 'All topics')

In [None]:
def get_topic_df(grouped_by_topic, topic, notes):
    '''Extract the dataframe for a topic and add its note counts to `notes`
    
    :grouped_by_topic: groupby object grouping by topic
    :topic: topic to extract
    :notes: dataframe of aggregated notes gathered so far
    
    :return: tuple made of the dataframe object for the topic (without the
        `topic` column) and a new notes dataframe with one more column,
        named after the topic
    '''
    # keyword form of drop(): passing the axis positionally, as in
    # drop('topic', 1), is rejected by pandas 2.x
    topic_df = (grouped_by_topic
        .get_group(topic)
        .drop(columns='topic'))
    
    # columns are aligned on the note values; the input `notes` is not modified
    notes = pd.concat([notes, get_notes(topic_df, topic)], axis=1)
    return topic_df, notes


def extract_tutorial_feedbacks(topic_df, topic_name):
    '''Extract pro/con per tutorial for a topic and 
    write them as a Markdown list in `../results/<topic_name>.md`
    
    The path is relative to the current working directory and the
    `../results` directory must already exist.
    
    :topic_df: dataframe object for the topic, with `tutorial`, `pro`,
        `con` and `timestamp` columns; a `pro`/`con` value equal to the
        string 'nan' is treated as "no comment" and skipped
    :topic_name: name for the topic, name for the file
    '''
    def dated_comments(tuto_df, column):
        # one "comment (*timestamp*)" entry per row that has a comment in `column`
        return ["%s (*%s*)" % (comment, timestamp)
                for comment, timestamp in zip(tuto_df[column], tuto_df['timestamp'])
                if comment != 'nan']

    # explicit UTF-8: the free-text comments may contain characters that the
    # platform default encoding cannot write
    with open('../results/%s.md' % topic_name, 'w', encoding='utf-8') as f:
        # iterating the groupby already yields the rows of each tutorial
        for tuto, tuto_df in topic_df.groupby(by="tutorial"):
            pros = dated_comments(tuto_df, 'pro')
            cons = dated_comments(tuto_df, 'con')
            # write in report file
            f.write("- **%s**\n" % tuto)
            if pros:
                f.write("  - Pro:\n    - ")
                f.write("\n    - ".join(pros))
            if cons:
                f.write("\n  - Con:\n    - ")
                f.write("\n    - ".join(cons))
            f.write("\n\n")

In [None]:
# start again from the global counts so that re-running this cell does not
# append a second copy of every topic column to `notes`
notes = get_notes(df, 'All topics')
grouped_by_topic = df.groupby(by="topic")
for topic in grouped_by_topic.groups:
    print(topic)
    # rows of the topic, plus `notes` extended with a column for this topic
    topic_df, notes = get_topic_df(grouped_by_topic, topic, notes)
    # report file name: lower-case topic with spaces replaced by dashes
    topic_name = topic.lower().replace(' ', '-')
    extract_tutorial_feedbacks(topic_df, topic_name)

Details (pros/cons) for each tutorial are available at: https://github.com/bebatut/galaxy-training-material-stats/tree/master/results

# General stats about feedback

Feedback number:

In [None]:
# number of rows, i.e. feedback entries left after the cleaning done at load time
len(df)

Feedback number over time

In [None]:
# monthly period of every feedback entry
months = df['timestamp'].dt.to_period("M")
# number of entries with a timestamp in each month
nb_per_months = df.groupby(months)['timestamp'].count()
nb_per_months

In [None]:
# running total of the monthly counts, drawn on an explicitly created axes
fig, ax = plt.subplots()
nb_per_months.cumsum().plot(ax=ax)
ax.set_xlabel('Months')
ax.set_ylabel('Cumulative number of feedback')
plt.show()

Feedback number per topic

In [None]:
# number of entries with a timestamp in each topic, largest first
grouped_by_topic['timestamp'].count().sort_values(ascending=False)

Top 10 tutorials by number of feedback entries

In [None]:
# number of entries with a timestamp per tutorial, then the ten largest counts
feedback_per_tutorial = df.groupby(by="tutorial")['timestamp'].count()
feedback_per_tutorial.sort_values(ascending=False).head(10)

# Notes

In [None]:
def plot_note_histogram(s, title):
    '''Show a horizontal bar chart of a series in a new figure
    
    :s: series to plot, one black bar per index entry
    :title: title of the plot
    '''
    # new figure and axes for each call; the x axis is fixed to [0, 1]
    fig, ax = plt.subplots()
    s.plot(kind='barh', ax=ax, color='k', ylim=(0,5), xlim=(0,1), title=title)
    ax.set_xlabel('Proportion of feedback')
    plt.show()

In [None]:
# a note missing from a column means no feedback with that note: use 0 and
# go back to integer counts
notes = notes.fillna(0.0).astype(int)
# note 0 is the placeholder given at load time to feedback without a note
notes = notes.rename(index={0: 'No value'})
# possible emoji labels for the lowest and highest notes:
# 1: emoji.emojize(':-1:', use_aliases=True)
# 5: emoji.emojize(':heart:', use_aliases=True)
notes

In [None]:
# share of each note within its column, rounded to 2 decimals; the
# parentheses make round() apply to the ratio and not to the column totals
notes_prop = (notes / notes.sum()).round(2)
# one bar chart per column, titled with the column name
for col in notes_prop.columns:
    plot_note_histogram(notes_prop[col], col)

In [None]:
# show the table of note proportions, one column per label
notes_prop