### This is Microtask-1 for the project: Reporting of CHAOSS Metrics.

In [1]:
from datetime import datetime, timedelta
from elasticsearch import Elasticsearch
from pprint import pprint
from dateutil.relativedelta import relativedelta
from calendar import monthrange, month_name
from collections import defaultdict, OrderedDict

# Module-level Elasticsearch client, pointed at "localhost:9200"; the query
# helper get_all_commit_records() reads this global when it runs its searches.
es = Elasticsearch("localhost:9200")

In [2]:
def get_all_commit_records(index=None, custom_source=False):
    """Query the elasticsearch instance and return all the documents it has indexed.

    Two searches are issued through the module-level ``es`` client: a first
    ``match_all`` query whose only purpose is to read the total number of
    documents, and a second one that asks for that many documents at once.

    Parameters:
        index: name of the index to search (passed straight to ``es.search``).
        custom_source: when true, restrict the returned ``_source`` to a fixed
            list of commit fields so that the result is smaller in size.

    Returns:
        A list with the ``_source`` dict of every hit returned by the second
        search, in the order elasticsearch returned them.
    """
    # First query: only used to learn how many documents match.
    probe = es.search(index=index, body={"query": {"match_all": {}}})
    total = probe["hits"]["total"]
    # Newer elasticsearch versions report the total as an object
    # ({"value": N, "relation": ...}) rather than a bare integer; accept both.
    if isinstance(total, dict):
        total = total["value"]

    query = {
        "size": total,
        "query": {"match_all": {}},
    }
    # Use custom source so that the result is smaller in size
    if custom_source:
        query["_source"] = ["author_name", "committer_name", "commit_date", "is_git_commit", "lines_added",
                            "lines_changed", "lines_removed", "utc_commit", "grimoire_creation_date"]

    res = es.search(index=index, body=query)
    # Walk the hits that were actually returned instead of indexing up to the
    # reported total: the two numbers can differ, and indexing by total would
    # then raise IndexError.
    return [hit["_source"] for hit in res["hits"]["hits"]]

In [3]:
# Fetch every document from the "aima_python_git" index; passing True for
# custom_source keeps only the trimmed list of commit fields.
result = get_all_commit_records("aima_python_git", True)

#### This is what a document looks like

In [4]:
# Display the first returned document to show which fields each record carries.
result[0]

{'author_name': 'spottedMetal',
 'commit_date': '2007-07-13T21:12:24',
 'committer_name': 'spottedMetal',
 'grimoire_creation_date': '2007-07-13T21:12:24+00:00',
 'is_git_commit': 1,
 'lines_added': 181,
 'lines_changed': 247,
 'lines_removed': 66,
 'utc_commit': '2007-07-13T21:12:24'}

Custom functions to parse dates, get the start and end of months, and divide commits according to the months in which they were made

In [26]:
def parse_date(date, custom_format=None):
    """Convert a date string into a datetime.datetime object.

    When custom_format is given (and non-empty) it is used as the strptime
    format; otherwise the string is expected to look like
    YYYY-MM-DDTHH:MM:SS."""

    date_format = custom_format or "%Y-%m-%dT%H:%M:%S"
    return datetime.strptime(date, date_format)

In [22]:
def get_end_date_of_month(date):
    """Given a date, return the same date moved to the last day of its month.

    Only the day component changes; any time of day carried by the input is
    kept as it is."""
    last_day = monthrange(date.year, date.month)[1]
    return date.replace(day=last_day)

In [23]:
def get_start_date_of_month(date):
    """Given a date, return the same date moved to the first day of its month.

    Only the day component changes; any time of day carried by the input is
    kept as it is."""
    return date.replace(day=1)

In [24]:
def get_bucket_name(date):
    "Given a date, return a name of the form MonthYYYY (e.g. July2007)."
    return "{}{}".format(month_name[date.month], date.year)

In [25]:
def get_extreme_commits_dates(commit_list):
    """Given a list of commits, return the dates of the first and the last commits.

    Parameters:
        commit_list: iterable of commit records, each with a 'commit_date'
            string that parse_date() can read with its default format.

    Returns:
        A (first_commit_date, last_commit_date) tuple of datetime objects.

    Raises:
        ValueError: if commit_list contains no commits.
    """
    # Work on the argument itself (not on a notebook-level variable) and parse
    # each date only once.
    commit_dates = [parse_date(item['commit_date']) for item in commit_list]
    if not commit_dates:
        raise ValueError("commit_list is empty: cannot determine first and last commit dates")
    return min(commit_dates), max(commit_dates)

In [27]:
def _new_month_bucket(start, end):
    "Build an empty container for a single month spanning start..end."
    return {
        "start": start,
        "end": end,
        "new_committers": defaultdict(int),
        "old_committers": defaultdict(int),
        "commits": [],
    }


def make_buckets(commit_list):
    """Given a list of commits, return containers of months from when
    the project was started to the month of the latest commit. Each container
    is a month and details about all the commits and committers for that month.

    Parameters:
        commit_list: commit records, handed to get_extreme_commits_dates() to
            find the first and last commit dates.

    Returns:
        An OrderedDict keyed by get_bucket_name() (MonthYYYY) in chronological
        order. Every value is a dict with the keys "start", "end",
        "new_committers", "old_committers" and "commits"; the committer
        counters and the commit list start out empty.
    """

    buckets = OrderedDict()

    first_date, last_date = get_extreme_commits_dates(commit_list)
    last_month = (last_date.year, last_date.month)

    month_start_date = get_start_date_of_month(first_date)
    while True:
        month_end_date = get_end_date_of_month(month_start_date)
        buckets[get_bucket_name(month_start_date)] = _new_month_bucket(month_start_date, month_end_date)

        # Stop on the calendar month of the last commit. Comparing (year, month)
        # rather than full datetimes matters because the month boundaries carry
        # the first commit's time of day: a last commit later in the day on the
        # final day of its month would otherwise add an extra, empty month.
        if (month_start_date.year, month_start_date.month) >= last_month:
            break

        # The day after this month's end is the start of the next month.
        month_start_date = month_end_date + timedelta(days=1)

    return buckets

In [28]:
def distribute_commits_in_buckets(commit_list):
    """Place every commit into the bucket of the month in which it was made.

    The buckets come from make_buckets(); each commit is appended to the
    "commits" list of the bucket named after its 'commit_date'. The filled
    buckets are returned."""
    buckets = make_buckets(commit_list)

    for record in commit_list:
        committed_on = parse_date(record['commit_date'])
        bucket_key = get_bucket_name(committed_on)
        buckets[bucket_key]["commits"].append(record)

    return buckets

Main function which calculates the number of new committers for each month

In [29]:
def analyse_repository(commit_list):
    """Count, for each month, the commits made by new and by returning committers.

    The commits are first distributed into monthly buckets with
    distribute_commits_in_buckets(). Walking the months in order, every commit
    is attributed to its 'author_name': if that name was seen in an earlier
    month the commit is counted under the month's "old_committers", otherwise
    under its "new_committers".

    Parameters:
        commit_list: commit records with 'author_name' and 'commit_date' keys.

    Returns:
        The monthly buckets, with "new_committers" and "old_committers" mapping
        each name to the number of commits made in that month.
    """
    months = distribute_commits_in_buckets(commit_list)
    seen_committers = set()

    for month in months.values():
        for commit in month['commits']:
            committer = commit['author_name']
            if committer in seen_committers:
                month['old_committers'][committer] += 1
            else:
                month['new_committers'][committer] += 1
        # Register this month's newcomers only once the whole month is counted,
        # so all commits of a first-time committer in that month count as "new".
        seen_committers.update(month['new_committers'])

    return months

In [30]:
# Run the analysis on the fetched commits: the result holds one bucket per
# month with its commits and per-name counts of new and old committers.
Output = analyse_repository(result)