#                                                Microtask 1


#### Produce a listing of the number of new committers per month, and the number of commits for each of them, as a table and as a CSV file. Use the GrimoireLab enriched index for git.


In [5]:
# importing the neccesary libraries

from datetime import datetime
from pprint import pprint
from elasticsearch_dsl import Search

import elasticsearch as ES
import elasticsearch_dsl
import subprocess
import calendar
import pandas as pd

#### Configuring ElasticSearch URL, Repository name and indices name

In [8]:
# Url for the git repo to analyze
repo_url = "http://github.com/grimoirelab/perceval.git"

#url at which Elasticsearch must be running
es_url = "http://localhost:9200"

# Directory for letting Perceval clone the git repo
raw_index = "commits_raw"
enriched_index = "commits"

# ElasticSearch instance (url)
es = ES.Elasticsearch([es_url])

#### Before retriving data from git repo, first we need to make sure any index of same name doesn't already exits and delete them if they exist

In [9]:

if(es.indices.exists(raw_index)):
    es.indices.delete(raw_index)
    
if(es.indices.exists(enriched_index)):
    es.indices.delete(enriched_index)

#### Retrieving data 

In [10]:
# We will be using p2o.py to retrive data,upload raw_data, enrich data and 
# upload enriched indices to elasticsearch with the help of python subprocess

subprocess.run(['p2o.py', '--enrich', '--index', raw_index , '--index-enrich', enriched_index, 
 '-e',es_url, '--no_inc', '--debug', 'git', repo_url ])

CompletedProcess(args=['p2o.py', '--enrich', '--index', 'commits_raw', '--index-enrich', 'commits', '-e', 'http://localhost:9200', '--no_inc', '--debug', 'git', 'http://github.com/grimoirelab/perceval.git'], returncode=0)

In [11]:
# We will search in enriched index using Elasticsearch's built in search function

response=es.search(index=enriched_index)
total_commits = (response['hits']['total'])
response = es.search(index=enriched_index, body={"size":total_commits})
commits_list = []
for x in range(total_commits):
    commits_list.append(response['hits']['hits'][x]['_source'])
    
print ("Total number of commits on this repository", total_commits)

Total number of commits on this repository 1016


In [12]:
# output 1 element to get the idea of structure of data
pprint (commits_list[0])

In [24]:
# We will segregate out useful data from the complete search output.
# We will store first commit's date and total commits for each different user.

commiters = []

def check_commiter(name):
    for x in range(len(commiters)):
        if(commiters[x]['Name']==name):
            return x
    return -1
    
for x in range(total_commits):
    author_name=commits_list[x]['author_name']
    commit_date=commits_list[x]['commit_date']
    commit_date = pd.tslib.Timestamp(commit_date)
    #print (author_name)
    tmp=check_commiter(author_name) 
    if( tmp == -1):
        commiters.append({'Name':author_name , 'first_commit_Date': commit_date.strftime('%y-%b') ,'commit_count':1})
    else:
        commiters[tmp]['commit_count']=commiters[tmp]['commit_count']+1
pprint (commiters[0])

{'Name': 'Santiago Dueñas', 'commit_count': 708, 'first_commit_Date': '15-Nov'}


In [19]:
# We will calculate month of the very first and the latest commit
def getmonthname(date):
    a=date.split('T')
    return ((a[0]))
    
first_commit_date=commits_list[0]['commit_date']
last_commit_date=commits_list[total_commits-1]['commit_date']
first_commit_month = getmonthname(first_commit_date)
last_commit_month = getmonthname(last_commit_date)

# We will construct an array corresponding to each month in the range

daterange = pd.date_range(first_commit_month,last_commit_month , freq='1M') 

daterange = [d.strftime('%y-%b') for d in daterange]

print (daterange)

['15-Nov', '15-Dec', '16-Jan', '16-Feb', '16-Mar', '16-Apr', '16-May', '16-Jun', '16-Jul', '16-Aug', '16-Sep', '16-Oct', '16-Nov', '16-Dec', '17-Jan', '17-Feb', '17-Mar', '17-Apr', '17-May', '17-Jun', '17-Jul', '17-Aug', '17-Sep', '17-Oct', '17-Nov', '17-Dec', '18-Jan', '18-Feb']


#### Producing List and dataframe for Author's total commits and New Commiters each month.

In [20]:
# producing a list containg total commits of each user
authors_commit_count={}
total=0
for x in commiters:
    authors_commit_count[x['Name']]=x['commit_count']
    total+=x['commit_count']
    
#converting it into dataframe using pandas
df = pd.DataFrame(list(authors_commit_count.items()), columns=["Author","Total commits"])

In [21]:
#producing csv file of the dataframe contaning every authors total number of commits
df.to_csv('authors_commits_count.csv', index=False ,sep=',')
df

Unnamed: 0,Author,Total commits
0,Valerio Cosentino,169
1,Miguel Ángel Fernández,3
2,quan,5
3,J. Manrique Lopez de la Fuente,1
4,Alvaro del Castillo,45
5,David Pose Fernández,1
6,valerio,2
7,camillem,2
8,David Esler,1
9,valerio cosentino,6


In [22]:
#Counting new commiters for each month in form of a dataframe

new_commiters_per_month = {}
for x in range(len(daterange)):
    new_commiters_per_month[daterange[x]]=0
for x in commiters:
    new_commiters_per_month[x['first_commit_Date']] +=1
df = pd.DataFrame(list(new_commiters_per_month.items()), columns=["Year-Month","Number of New Commiters"])

In [23]:
df.to_csv('new_commiters_permonth.csv', index=False ,sep=',')
df

Unnamed: 0,Year-Month,Number of New Commiters
0,16-Jan,2
1,17-Sep,1
2,17-Jul,0
3,17-Jan,1
4,16-May,0
5,16-Jul,1
6,18-Jan,1
7,17-Nov,2
8,16-Sep,1
9,16-Feb,1
