### This is Microtask-1 for the project: Reporting of CHAOSS Metrics.

In [1]:
from datetime import datetime, timedelta
from elasticsearch import Elasticsearch
from IPython.display import display
from dateutil.relativedelta import relativedelta
from calendar import monthrange, month_name
from collections import defaultdict, OrderedDict
import pandas as pd

# Client used by every query in this notebook.
# NOTE(review): assumes an Elasticsearch instance is reachable at localhost:9200 — confirm for your setup.
es = Elasticsearch("localhost:9200")

# Name of the index we are analysing
index_name = "aima_python_git"

We will query the index that we created as shown in the Microtask-1.md  
We'd like to get **all** the commits (documents in the index) from the repository.

In [2]:
def get_all_commit_records(index=None, custom_source=False):
    """Query the Elasticsearch instance and return all the documents in the index.

    Two requests are made: the first only asks for the number of matching
    documents, the second asks for that many documents in a single page.

    Parameters
    ----------
    index : str, optional
        Name of the index to query; passed straight to ``es.search``.
    custom_source : bool, optional
        When true, restrict ``_source`` to the handful of fields used by this
        notebook so that the response is smaller.

    Returns
    -------
    list of dict
        The ``_source`` of every hit returned by the second request.
    """
    # size=0: we only need the hit count here, not the documents themselves.
    count_res = es.search(index=index, body={"size": 0, "query": {"match_all": {}}})
    total = count_res["hits"]["total"]
    # Newer Elasticsearch versions report the total as {"value": N, "relation": ...}
    # instead of a bare integer; accept both shapes.
    if isinstance(total, dict):
        total = total["value"]

    # NOTE(review): a single page this large presumably only works while the
    # index is smaller than the server's result-window limit — confirm, and
    # switch to the scroll/scan helpers for bigger indices.
    query = {
        "size": total,
        "query": {"match_all": {}},
    }
    # Use custom source so that the result is smaller in size
    if custom_source:
        query["_source"] = ["author_name", "committer_name", "commit_date", "is_git_commit", "lines_added",
                            "lines_changed", "lines_removed", "utc_commit", "grimoire_creation_date"]

    res = es.search(index=index, body=query)
    # Iterate over the hits that actually came back rather than range(total):
    # the reported total is not guaranteed to equal the number of returned hits.
    return [hit["_source"] for hit in res["hits"]["hits"]]

We store it in the *result* variable

In [3]:
# Fetch every commit; the second argument (custom_source=True) limits each document to the listed fields.
result = get_all_commit_records(index_name, True)

#### This is what a document looks like

In [4]:
# Show the first returned document so the available fields are visible.
result[0]

{'author_name': 'spottedMetal',
 'commit_date': '2007-07-13T21:12:24',
 'committer_name': 'spottedMetal',
 'grimoire_creation_date': '2007-07-13T21:12:24+00:00',
 'is_git_commit': 1,
 'lines_added': 181,
 'lines_changed': 247,
 'lines_removed': 66,
 'utc_commit': '2007-07-13T21:12:24'}

#### Custom functions: 
To parse dates, get start and end dates of months and to divide commits according to the months in which they were made

In [5]:
def parse_date(date, custom_format=None):
    """Convert a date string into a ``datetime.datetime`` object.

    ``custom_format`` is an optional ``strptime`` format string. When it is
    omitted (or empty), the string is parsed as ``YYYY-MM-DDTHH:MM:SS``.
    """
    chosen_format = custom_format if custom_format else "%Y-%m-%dT%H:%M:%S"
    return datetime.strptime(date, chosen_format)

In [6]:
def get_end_date_of_month(date):
    """Given a date, return the last day of that date's month.

    Only the day is changed: year, month and any time-of-day component of
    ``date`` are carried over unchanged.
    """
    # monthrange returns (weekday of the 1st, number of days in the month).
    last_day = monthrange(date.year, date.month)[1]
    return date.replace(day=last_day)

In [7]:
def get_start_date_of_month(date):
    """Given a date, return the first day of that date's month.

    Only the day is changed: year, month and any time-of-day component of
    ``date`` are carried over unchanged.
    """
    return date.replace(day=1)

In [8]:
def get_bucket_name(date):
    """Given a date, return a string of the form MonthYYYY (e.g. June2007).

    The month part comes from ``calendar.month_name``.
    """
    month_label = month_name[date.month]
    return "{}{}".format(month_label, date.year)

In [9]:
def get_extreme_commits_dates(commit_list):
    """Given a list of commits, return the dates of the first and the last commits.

    Each commit is expected to be a dict whose ``'commit_date'`` value can be
    parsed by ``parse_date`` with its default format.

    Returns
    -------
    (datetime, datetime)
        The earliest and the latest commit date found in ``commit_list``.

    Raises
    ------
    ValueError
        If ``commit_list`` is empty.
    """
    # Read from the argument, not from a notebook-level variable, so the
    # function works for whatever list the caller passes in.
    commit_dates = [parse_date(item['commit_date']) for item in commit_list]
    if not commit_dates:
        raise ValueError("commit_list is empty: there is no first or last commit date")
    return min(commit_dates), max(commit_dates)

In [10]:
def make_buckets(first_date, last_date):
    """Given the project start date and the last commit date, return
    containers for the months in between those dates (both ends included).

    Each container is a dict with three entries: ``'new_committers'`` and
    ``'old_committers'`` (both ``defaultdict(int)``) and ``'commits'`` (a list).
    Containers are keyed by the name produced by ``get_bucket_name`` and are
    stored in chronological order.

    The month of ``first_date`` always gets a bucket. Months are compared by
    (year, month) only, so the time of day of either argument never causes an
    extra month to be appended after the month of ``last_date``.
    """

    def _empty_bucket():
        # Fresh container so that no two months share mutable state.
        return {
            'new_committers': defaultdict(int),
            'old_committers': defaultdict(int),
            'commits': [],
        }

    buckets = OrderedDict()
    final_month = (last_date.year, last_date.month)

    # Pin the cursor to day 1 so that stepping to the next month can never
    # produce an invalid day (e.g. 31 February).
    cursor = first_date.replace(day=1)
    while True:
        buckets[get_bucket_name(cursor)] = _empty_bucket()
        if (cursor.year, cursor.month) >= final_month:
            break
        if cursor.month == 12:
            cursor = cursor.replace(year=cursor.year + 1, month=1)
        else:
            cursor = cursor.replace(month=cursor.month + 1)

    return buckets

#### Analysis:

Here we analyse the repository. We take in a list of the commits, starting from the very first. Then we create buckets of months and put all the commits of each month in that month's bucket.

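Then we classify each commit as made by an `old_committer` (an author who already committed in an earlier month) or a `new_committer` (an author whose first commit falls in the month being processed).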

In [11]:
def analyse_repository(commit_list):
    """Group commits by month and split their authors into new and old committers.

    Parameters
    ----------
    commit_list : list of dict
        Commits carrying at least ``'commit_date'`` and ``'author_name'``.

    Returns
    -------
    OrderedDict
        One entry per month, keyed by the name from ``get_bucket_name``. Each
        value holds ``'new_committers'`` and ``'old_committers'``: mappings
        from author name to the number of commits made in that month. An
        author is "new" in the first month they appear in and "old" in every
        later month. An empty ``commit_list`` gives an empty mapping.
    """
    if not commit_list:
        return OrderedDict()

    first_date, last_date = get_extreme_commits_dates(commit_list)
    months = make_buckets(first_date, last_date)

    # Drop every commit into the bucket of the month it was made in.
    for commit in commit_list:
        bucket = get_bucket_name(parse_date(commit['commit_date']))
        months[bucket]["commits"].append(commit)

    # Authors seen in any earlier month; a set keeps the membership test cheap.
    seen_authors = set()

    for month in months.values():
        for commit in month['commits']:
            author = commit['author_name']
            if author in seen_authors:
                month['old_committers'][author] += 1
            else:
                month['new_committers'][author] += 1
        # Update only after the whole month is processed, so that all commits
        # made by a first-time author during their first month count as "new".
        seen_authors.update(month['new_committers'])
        # The raw commits are no longer needed once they have been counted.
        del month['commits']

    return months

In [12]:
# Per-month buckets of new and old committers for the fetched commits.
Output = analyse_repository(result)

The number of new committers per month:

In [13]:
# Row layout: month label left-aligned in a 20-character column, then the count.
fmt = '{:<20}{}'

print(fmt.format('Month', 'No. of new committers'))

# The loop variable is deliberately not called `month_name`: at notebook level
# that would overwrite `calendar.month_name` imported in the first cell and
# break `get_bucket_name` for any cell executed afterwards.
for bucket_name, month in Output.items():
    print(fmt.format(bucket_name, len(month["new_committers"])))

Month               No. of new committers
June2007            1
July2007            1
August2007          0
September2007       0
October2007         0
November2007        0
December2007        0
January2008         0
February2008        0
March2008           0
April2008           0
May2008             0
June2008            0
July2008            0
August2008          0
September2008       0
October2008         0
November2008        0
December2008        0
January2009         0
February2009        0
March2009           0
April2009           0
May2009             0
June2009            0
July2009            0
August2009          0
September2009       0
October2009         0
November2009        0
December2009        0
January2010         0
February2010        0
March2010           0
April2010           0
May2010             1
June2010            0
July2010            0
August2010          0
September2010       0
October2010         0
November2010        0
December2010        0
January2011 

We convert the buckets in Output to a list of dictionaries. Each dictionary contains 3 elements: `month`, `Author of commit` and `Number of commits`. Then we use the `pandas` library to convert this list into a table.

In [14]:
def get_table_from_dict(commit_dict):
    """Flatten the per-month buckets into a table of new committers.

    Produces one row per (month, new committer) pair with the columns
    ``month``, ``Author of commit`` and ``Number of commits``. Rows follow
    the iteration order of ``commit_dict`` and of each month's
    ``'new_committers'`` mapping.

    Returns
    -------
    pandas.DataFrame
    """
    rows = [
        {
            'month': bucket_label,
            "Author of commit": author,
            'Number of commits': commit_count,
        }
        for bucket_label, bucket in commit_dict.items()
        for author, commit_count in bucket['new_committers'].items()
    ]
    return pd.DataFrame(rows)

In [15]:
# One row per (month, new committer) pair.
table = get_table_from_dict(Output)

In [16]:
# Render the table with the notebook's rich display.
display(table)

Unnamed: 0,Author of commit,Number of commits,month
0,peter.norvig,1,June2007
1,spottedMetal,18,July2007
2,srburnet,1,May2010
3,withal,13,August2011
4,norvig,3,February2016
5,greyshadows,1,March2016
6,abhishek garg,2,March2016
7,utk1610,1,March2016
8,SnShine,33,March2016
9,Tamer Tas,5,March2016


`get_table_from_dict` gives us the required table of new committers for each month and the number of commits that they made.  
Now we'll just use the `DataFrame.to_csv` method to write this table to a CSV file.

In [17]:
# Write the table to "<index_name>.csv"; index=False leaves the row index out of the file.
table.to_csv(index_name + ".csv", index=False)