## Common Stuff

First we place the code we use for importing modules and creating connections.

In [1]:
import certifi
import configparser
import json
import os
import sys

import plotly as plotly
import plotly.graph_objs as go

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search


import pandas as pd

def create_conn():
    """Create an Elasticsearch connection from the '../.settings' file.

    The file is read with configparser, relative to the current working
    directory, and must contain an [ElasticSearch] section such as:

        [ElasticSearch]
        user=john_smith
        password=aDifficultOne
        host=my.es.host
        port=80
        path=es_path_if_any

    Returns:
        An ``Elasticsearch`` client built for the URL
        ``https://user:password@host:port/path``.

    Raises:
        KeyError: if the [ElasticSearch] section or one of its options is
            missing. ``ConfigParser.read`` silently skips files it cannot
            open, so an absent '../.settings' also surfaces as this error.
    """

    parser = configparser.ConfigParser()
    parser.read('../.settings')

    section = parser['ElasticSearch']
    user = section['user']
    password = section['password']
    host = section['host']
    port = section['port']
    path = section['path']

    connection = "https://{}:{}@{}:{}/{}".format(user, password, host, port,
                                                 path)

    # 'verify_certs' and 'ca_certs' are the option names the client
    # understands; misspelled variants are not recognised, which would leave
    # TLS certificate verification and the certifi CA bundle unconfigured.
    es_read = Elasticsearch([connection], use_ssl=True,
                            verify_certs=True, ca_certs=certifi.where(),
                            scroll='300m', timeout=1000)

    return es_read


## Onion Model

Groups of contributors, by level of activity: core, regular, casual
Our 'onion' has three groups:

* **Core**: minimum number of authors who made **80%** of contributions.
* **Regular**: minimum number of authors who made **between 80% and 95%** of contributions.
* **Casual**: the rest of contributors, who made the last **5%** of contributions.

A particular feature we include in the following code is storing not only the global count of people in each group, but also how many of them are Employees. We can expect most of **core** developers will be employees.

Finally, we compute Onion **by quarter**, so we can show the evolution through time.

In [2]:
def onion(df, bucket_column, time_column, value_column):
    """Count contributors in the core, regular and casual groups.

    Rows are walked in the order they appear in ``df`` while the values in
    ``value_column`` are accumulated:

    * a row is *core* while the accumulated value is still below 80% of
      the column total,
    * otherwise it is *regular* while the accumulated value is still below
      95% of the column total,
    * otherwise it is *casual*.

    Each row is also counted as non-employee when its 'org' column equals
    'Non-Employees', and as employee otherwise.

    Args:
        df: data frame with an 'org' column and a ``value_column`` column.
        bucket_column: accepted for interface compatibility; not used.
        time_column: accepted for interface compatibility; not used.
        value_column: name of the column holding the contribution counts.

    Returns:
        dict with the keys 'core', 'regular', 'casual' plus the same three
        names suffixed with '-non' and '-emp'.
    """
    grand_total = df[value_column].sum()
    core_limit = grand_total * 0.8
    regular_limit = grand_total * 0.95

    # Every counter starts at zero: totals first, then the -non / -emp split.
    counts = {group + suffix: 0
              for suffix in ('', '-non', '-emp')
              for group in ('core', 'regular', 'casual')}

    # Two accumulators: the core one stops growing once the core group is
    # closed, the regular one keeps growing until the regular group closes.
    core_running = 0
    regular_running = 0

    for _, record in df.iterrows():
        amount = record[value_column]
        suffix = '-non' if record['org'] == 'Non-Employees' else '-emp'

        if core_limit > core_running:
            group = 'core'
            core_running = core_running + amount
            regular_running = regular_running + amount
        elif regular_limit > regular_running:
            group = 'regular'
            regular_running = regular_running + amount
        else:
            group = 'casual'

        counts[group] += 1
        counts[group + suffix] += 1

    return counts

def onion_evolution(df, bucket_field, time_field, metric_field):
    """Compute the onion breakdown for every period found in ``df``.

    For each distinct value of ``time_field`` the matching rows are sorted
    by ``metric_field`` in descending order and handed to ``onion``; the
    resulting counters become one row of the returned data frame.

    Args:
        df: data frame with the columns named by ``time_field`` and
            ``metric_field`` plus whatever ``onion`` requires.
        bucket_field: column identifying the contributor; forwarded to
            ``onion`` as ``bucket_column``.
        time_field: column holding the period each row belongs to.
        metric_field: column holding the contribution counts.

    Returns:
        A data frame with one row per period and the columns 'Time',
        'Core', 'Core-Non', 'Core-Emp', 'Regular', 'Regular-Non',
        'Regular-Emp', 'Casual', 'Casual-Non' and 'Casual-Emp'.
    """
    onion_df = pd.DataFrame(
        columns=['Time',
                 'Core', 'Core-Non', 'Core-Emp',
                 'Regular', 'Regular-Non', 'Regular-Emp',
                 'Casual', 'Casual-Non', 'Casual-Emp'])

    for period in df[time_field].unique():
        # Select through time_field (not a fixed column name) so the
        # function works whatever the caller's time column is called.
        slice_df = df.loc[df[time_field] == period]
        # Largest contributions first, so onion() accumulates them in
        # descending order.
        slice_df = slice_df.sort_values(by=metric_field, ascending=False)
        onion_result = onion(slice_df,
                             bucket_column=bucket_field,
                             time_column=time_field,
                             value_column=metric_field)

        onion_df.loc[len(onion_df)] = [period,
                                       onion_result['core'],
                                       onion_result['core-non'],
                                       onion_result['core-emp'],
                                       onion_result['regular'],
                                       onion_result['regular-non'],
                                       onion_result['regular-emp'],
                                       onion_result['casual'],
                                       onion_result['casual-non'],
                                       onion_result['casual-emp']]

    return onion_df

## Some filtering to clean data

First we take data **from 2010**.

We filter out **bots** to avoid noise.

We filter out **commits with no actions on files**.

In [3]:
# Bounds applied to grimoire_creation_date: on or after 2010-01-01 and
# strictly before 'now/y' (an Elasticsearch date-math expression).
date_range = dict(gte='2010-01-01', lt='now/y')


def add_bot_filter(s):
    """Return ``s`` filtered to documents whose author_bot term is 'false'."""
    human_authors_only = {'author_bot': 'false'}
    return s.filter('term', **human_authors_only)

def add_merges_filter(s):
    """Return ``s`` filtered to documents whose ``files`` value is above 0."""
    more_than_zero = {'gt': 0}
    return s.filter('range', files=more_than_zero)

def add_date_filter(s):
    """Return ``s`` filtered to the module-level ``date_range`` bounds.

    The range is applied to the ``grimoire_creation_date`` field.
    """
    creation_date_bounds = {'grimoire_creation_date': date_range}
    return s.filter('range', **creation_date_bounds)


## Stack results in a Data Frame

We need a helper function to create a Data Frame from ES results. What we need here is to group data **by time and uuid**, in order **to know how many commits each author made in a given period of time (quarter)**. Notice we also split them in Employees/Non-employees.

In [4]:
def stack_by(result, group_column, time_column, value_column, group_field, time_field, value_field):
    """Flatten a nested time -> group -> org aggregation into a data frame.

    ``result.to_dict()`` must expose
    ``['aggregations'][time_field]['buckets']``; every time bucket holds a
    ``group_field`` sub-aggregation, every group bucket holds an 'org'
    sub-aggregation, and every org bucket holds the metric ``value_field``.

    Args:
        result: search response offering a ``to_dict()`` method.
        group_column: name of the output column for the group bucket keys.
        time_column: name of the output column for the time bucket
            'key_as_string' values.
        value_column: name of the output column for the metric values.
        group_field: name of the group sub-aggregation in the response.
        time_field: name of the top-level time aggregation in the response.
        value_field: name of the metric inside each org bucket.

    Returns:
        A data frame with the columns
        ``[time_column, group_column, 'org', value_column]`` and one row
        per (time, group, org) bucket; empty when there are no buckets.
    """
    columns = [time_column, group_column, 'org', value_column]

    # Collect plain rows first and build the frame once: enlarging a
    # DataFrame row by row is needlessly slow for big aggregations.
    rows = []
    for time_bucket in result.to_dict()['aggregations'][time_field]['buckets']:
        for group_bucket in time_bucket[group_field]['buckets']:
            for org_bucket in group_bucket['org']['buckets']:
                rows.append([time_bucket['key_as_string'],
                             group_bucket['key'],
                             org_bucket['key'],
                             org_bucket[value_field]['value']])

    return pd.DataFrame(rows, columns=columns)

## Querying ES

We are ready to query ES. As explained above when describing how the dataframe is built, we query ES to get the number of commits of each and every author, bucketized by organization, for each quarter since 2010.

In [5]:

es_conn = create_conn()

# Search object over the 'git' index, narrowed (in this order) by the bot
# filter, the files filter and the date range filter.
s = add_date_filter(
    add_merges_filter(
        add_bot_filter(
            Search(using=es_conn, index='git'))))


# Unique count of commits, nested per quarter -> author -> organization
(s.aggs
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .bucket('uuid', 'terms', field='author_uuid', size=100000)
    .bucket('org', 'terms', field='author_org_name', size=100000)
    .metric('commits', 'cardinality', field='hash', precision_threshold=3000))
result = s.execute()

authors_df = stack_by(result,
                      group_column='uuid', time_column='time', value_column='commits',
                      group_field='uuid', time_field='time', value_field='commits')

# Divide authors in Employees and Non-Employees based on org name:
# 'Bitergia' (or an org already called 'Employees') counts as Employees,
# anything else as Non-Employees.
authors_df['org'] = authors_df['org'].map(
    lambda org_name: 'Employees' if org_name in ('Bitergia', 'Employees') else 'Non-Employees')

## Compute onion

Finally we just need to call our onion functions to get results computed. To ease data visualization, we add a column containing quarters as strings, which should be more human-friendly than dates.

In [6]:
onion_df = onion_evolution(authors_df,
                           bucket_field='uuid',
                           time_field='time',
                           metric_field='commits')

# Calculate quarters: turn each 'Time' value into a label such as 2010Q1
onion_df['Quarter'] = onion_df['Time'].apply(
    lambda period_start: str(pd.Period(period_start, freq='Q')))

onion_df

Unnamed: 0,Time,Core,Core-Non,Core-Emp,Regular,Regular-Non,Regular-Emp,Casual,Casual-Non,Casual-Emp,Quarter
0,2010-01-01T00:00:00.000Z,4,4,0,1,1,0,2,2,0,2010Q1
1,2010-04-01T00:00:00.000Z,4,4,0,2,2,0,3,3,0,2010Q2
2,2010-07-01T00:00:00.000Z,3,3,0,1,1,0,1,1,0,2010Q3
3,2010-10-01T00:00:00.000Z,3,3,0,1,1,0,1,1,0,2010Q4
4,2011-01-01T00:00:00.000Z,4,4,0,1,1,0,3,3,0,2011Q1
5,2011-04-01T00:00:00.000Z,4,4,0,3,3,0,3,3,0,2011Q2
6,2011-07-01T00:00:00.000Z,4,4,0,1,1,0,2,2,0,2011Q3
7,2011-10-01T00:00:00.000Z,4,4,0,1,1,0,1,1,0,2011Q4
8,2012-01-01T00:00:00.000Z,4,4,0,3,3,0,3,3,0,2012Q1
9,2012-04-01T00:00:00.000Z,6,5,1,3,3,0,4,4,0,2012Q2


## Plot some charts

Because tables are useful but charts come in handy to have a quick glance of data.

As in the newcomers & people leaving case, we are going to use [Plot.ly](https://plot.ly/python/). Here we use bar charts to compare the numbers above and their evolution quarter by quarter.

First define a general function to build the charts:

In [7]:
def print_grouped_bar(df, x_column, value_columns, title):
    """Draw a grouped bar chart in the notebook, one series per column.

    Args:
        df: data frame holding the data to plot.
        x_column: name of the column used for the x axis values.
        value_columns: names of the columns to plot; each one becomes a
            bar series named after the column.
        title: chart title.
    """
    plotly.offline.init_notebook_mode(connected=True)

    categories = df[x_column].tolist()
    traces = [go.Bar(x=categories, y=df[column].tolist(), name=column)
              for column in value_columns]

    figure = go.Figure(data=traces,
                       layout=go.Layout(barmode='group', title=title))
    plotly.offline.iplot(figure, filename='grouped-bar')
    
def print_stacked_bar(df, x_column, value_columns, title):
    """Draw a stacked bar chart in the notebook, one series per column.

    Args:
        df: data frame holding the data to plot.
        x_column: name of the column used for the x axis values.
        value_columns: names of the columns to plot; each one becomes a
            bar series named after the column.
        title: chart title.
    """
    plotly.offline.init_notebook_mode(connected=True)

    categories = df[x_column].tolist()
    traces = [go.Bar(x=categories, y=df[column].tolist(), name=column)
              for column in value_columns]

    figure = go.Figure(data=traces,
                       layout=go.Layout(barmode='stack', title=title))
    plotly.offline.iplot(figure, filename='stacked-bar')

### And now we are ready to have some fun using them!!

Example:
* Core vs Core-Non: show the evolution of core developers among your community (non-employees). **How many of your core devs are not employees**.

In [8]:
# Totals of each group next to their non-employee share
print_grouped_bar(
    df=onion_df,
    x_column='Quarter',
    value_columns=['Core', 'Core-Non', 'Regular', 'Regular-Non', 'Casual', 'Casual-Non'],
    title='Contribution Groups: all developers / non-employees')

# Employees vs non-employees per group, stacked
print_stacked_bar(
    df=onion_df,
    x_column='Quarter',
    value_columns=['Core-Emp', 'Core-Non', 'Regular-Emp', 'Regular-Non','Casual-Emp', 'Casual-Non'],
    title='Contribution Groups: employees / non-employees')

# Same series as the stacked chart, drawn side by side
print_grouped_bar(
    df=onion_df,
    x_column='Quarter',
    value_columns=['Core-Emp', 'Core-Non', 'Regular-Emp', 'Regular-Non','Casual-Emp', 'Casual-Non'],
    title='Contribution Groups: employees / non-employees')
