#                                        Microtask 3

##### Produce a listing of repositories, as a table and as CSV file, with the number of commits authored, issues opened, and pull requests opened, during the last three months, ordered by the total number (commits plus issues plus pull requests).

In [1]:
#importing the necessary Libraries 
from pprint import pprint
from elasticsearch_dsl import Search

import datetime
import elasticsearch as ES
import subprocess
import calendar
import requests
import pandas as pd
import json
import string

#### Configuring Repository name, Organisation's name, elasticsearch url, indices name and github access token.

#### I'll be analysing  all repositories of a particular organisation.


In [2]:
# Name of organization.
org_name = "chaoss"

# Url from where github api will retrieve repositories data.
url =  'https://api.github.com/users/'+org_name+'/repos'

#url at which Elasticsearch must be running
es_url = "http://localhost:9200"

# ElasticSearch instance (url)
es = ES.Elasticsearch([es_url])

#verification token to avoid problems regarding unauthenticated access to the GitHub API
token = ''


####  Here I have used [github developers api](https://developer.github.com/v3/) to retrieve repositories details from github.
#### I'll be storing the number of repos, name and other useful urls for each repo.

In [5]:
# Getting data using github api
r = requests.get(url)

api_response=json.loads(r.text)
no_of_repos=len(api_response)
repos = []
for x in range(no_of_repos):
    tmp=[]
    tmp.append(api_response[x]['name'])
    tmp.append(api_response[x]['clone_url'])
    tmp.append(api_response[x]['html_url'])
    repos.append(tmp)

# Displaying collected data for analysis.
print('The number of repositories '+ org_name +' organisation :',no_of_repos)

#printing the first repo's details.
repos[0]

The number of repositories chaoss organisation : 23


['governance',
 'https://github.com/chaoss/governance.git',
 'https://github.com/chaoss/governance']

### Here I collected commits data for all the above git repositories.

#### Firstly running p2o.py to retrieve data from git, enriching it and uploading it to elasticsearch. I used python subprocess for running p2o.py.
#### ElasticSearch_dsl for quering enriched indices. Filtering them on the basis of commit date, not being older than 3 months. And storing relevant fields in a dictionary.

In [15]:
commit_list = {}
for x in range(no_of_repos):
    total_commits=0
    raw_index = repos[x][0]+'_raw'
    enriched_index = repos[x][0]+'_enriched'
    repo_url = repos[x][1]
   
    subprocess.run(['p2o.py', '--enrich', '--index', raw_index , '--index-enrich', enriched_index, 
 '-e',es_url, '--no_inc', '--debug', 'git', repo_url ])
    
    response=es.search(index=enriched_index)
    no_of_commits = (response['hits']['total'])
    
    response = es.search(index=enriched_index, body={"size":no_of_commits})
    
    def date_modify(dat):    
                a=dat.split(" ")
                b=a[0].split('-')
                return (b[0]+'/'+b[1]+'/'+b[2])

            
    for i in range(no_of_commits):
        date = pd.tslib.Timestamp(response['hits']['hits'][i]['_source']['commit_date'])
                
        current_date=date_modify(str(datetime.datetime.now()))
        commit_date =date_modify(str(date))
        date_format = "%Y/%m/%d"
        b = datetime.datetime.strptime(current_date, date_format)
        a = datetime.datetime.strptime(commit_date, date_format)
        delta = b - a
        
        if (delta.days <=90):
            total_commits = total_commits+1
            
    commit_list[repos[x][0]] = total_commits


https://github.com/chaoss/governance.git
https://github.com/chaoss/grimoirelab.git
https://github.com/chaoss/grimoirelab-bestiary.git
https://github.com/chaoss/grimoirelab-elk.git
https://github.com/chaoss/grimoirelab-hatstall.git
https://github.com/chaoss/grimoirelab-kibiter.git
https://github.com/chaoss/grimoirelab-kidash.git
https://github.com/chaoss/grimoirelab-kingarthur.git
https://github.com/chaoss/grimoirelab-manuscripts.git
https://github.com/chaoss/grimoirelab-mordred.git
https://github.com/chaoss/grimoirelab-perceval.git
https://github.com/chaoss/grimoirelab-perceval-mozilla.git
https://github.com/chaoss/grimoirelab-perceval-opnfv.git
https://github.com/chaoss/grimoirelab-perceval-puppet.git
https://github.com/chaoss/grimoirelab-sigils.git
https://github.com/chaoss/grimoirelab-sortinghat.git
https://github.com/chaoss/grimoirelab-toolkit.git
https://github.com/chaoss/grimoirelab-tutorial.git
https://github.com/chaoss/metrics.git
https://github.com/chaoss/prospector.git
https:

In [16]:
commit_list

{'governance': 16,
 'grimoirelab': 70,
 'grimoirelab-bestiary': 141,
 'grimoirelab-elk': 169,
 'grimoirelab-hatstall': 51,
 'grimoirelab-kibiter': 65,
 'grimoirelab-kidash': 14,
 'grimoirelab-kingarthur': 27,
 'grimoirelab-manuscripts': 0,
 'grimoirelab-mordred': 0,
 'grimoirelab-perceval': 0,
 'grimoirelab-perceval-mozilla': 0,
 'grimoirelab-perceval-opnfv': 0,
 'grimoirelab-perceval-puppet': 0,
 'grimoirelab-sigils': 0,
 'grimoirelab-sortinghat': 0,
 'grimoirelab-toolkit': 0,
 'grimoirelab-tutorial': 0,
 'metrics': 0,
 'prospector': 0,
 'website': 0,
 'wg-diversity-inclusion': 0,
 'whitepaper': 0}

### Here I collected Issues and pull request data for all the above repositories.
#### Firstly running p2o.py to retrieve data from github, enriching it and uploading it to elasticsearch. I used python subprocess for running p2o.py.
#### ElasticSearch_dsl for quering enriched indices. Filtering them on the basis of commit date, not being older than 3 months. And storing relevant fields in a dictionary.  

In [34]:
Issue_count={}
PR_count={}
for x in range(no_of_repos):
    issue_count=0
    pr_count=0
    repo_name = repos[x][0]
    raw_index = repos[x][0]+'_raw'
    enriched_index = repos[x][0]+'_enriched_index'
    subprocess.run(['p2o.py', '--enrich', '--index', raw_index,
      '--index-enrich', enriched_index, '-e', es_url,
      '--no_inc', '--debug', 'github', org_name , repo_name,
      '-t',  token, '--sleep-for-rate'])
    response = Search(using=es ,index=enriched_index)
    response = response.filter("terms", item_type = ['issue', 'pull request'])
    response = response.filter('range', created_at ={'gte': 'now-3M'})
    s=response.execute()
    for i in s:
        if(i.item_type == 'pull request'):
            pr_count+=1
        else:
            issue_count+=1
    Issue_count[repos[x][0]] = issue_count
    PR_count [repos[x][0]] = pr_count

In [21]:
pprint (Issue_count)
pprint (PR_count)

{'governance': 2,
 'grimoirelab': 6,
 'grimoirelab-bestiary': 0,
 'grimoirelab-elk': 0,
 'grimoirelab-hatstall': 0,
 'grimoirelab-kibiter': 0,
 'grimoirelab-kidash': 2,
 'grimoirelab-kingarthur': 1,
 'grimoirelab-manuscripts': 5,
 'grimoirelab-mordred': 1,
 'grimoirelab-perceval': 1,
 'grimoirelab-perceval-mozilla': 1,
 'grimoirelab-perceval-opnfv': 1,
 'grimoirelab-perceval-puppet': 1,
 'grimoirelab-sigils': 0,
 'grimoirelab-sortinghat': 0,
 'grimoirelab-toolkit': 0,
 'grimoirelab-tutorial': 6,
 'metrics': 0,
 'prospector': 3,
 'website': 0,
 'wg-diversity-inclusion': 0,
 'whitepaper': 0}
{'governance': 8,
 'grimoirelab': 4,
 'grimoirelab-bestiary': 10,
 'grimoirelab-elk': 10,
 'grimoirelab-hatstall': 10,
 'grimoirelab-kibiter': 10,
 'grimoirelab-kidash': 6,
 'grimoirelab-kingarthur': 9,
 'grimoirelab-manuscripts': 5,
 'grimoirelab-mordred': 9,
 'grimoirelab-perceval': 9,
 'grimoirelab-perceval-mozilla': 9,
 'grimoirelab-perceval-opnfv': 7,
 'grimoirelab-perceval-puppet': 7,
 'grimoir

### Storing data the commits, issues and Pull requests data togther along with their total number.

In [45]:
data = []
for x in range(no_of_repos):
    tmp = {}
    tmp ['Repository'] = repos[x][0]
    tmp ['Number of commits'] = commit_list[repos[x][0]]
    tmp ['Number of Issues'] = Issue_count[repos[x][0]]
    tmp ['Number of pull requests'] = PR_count[repos[x][0]]
    tmp ['Total'] = commit_list[repos[x][0]] + Issue_count[repos[x][0]] + PR_count[repos[x][0]]
    data.append(tmp)
pprint(data)

[{'Number of Issues': 2,
  'Number of commits': 16,
  'Number of pull requests': 8,
  'Repository': 'governance',
  'Total': 26},
 {'Number of Issues': 6,
  'Number of commits': 70,
  'Number of pull requests': 4,
  'Repository': 'grimoirelab',
  'Total': 80},
 {'Number of Issues': 0,
  'Number of commits': 141,
  'Number of pull requests': 10,
  'Repository': 'grimoirelab-bestiary',
  'Total': 151},
 {'Number of Issues': 0,
  'Number of commits': 169,
  'Number of pull requests': 10,
  'Repository': 'grimoirelab-elk',
  'Total': 179},
 {'Number of Issues': 0,
  'Number of commits': 51,
  'Number of pull requests': 10,
  'Repository': 'grimoirelab-hatstall',
  'Total': 61},
 {'Number of Issues': 0,
  'Number of commits': 65,
  'Number of pull requests': 10,
  'Repository': 'grimoirelab-kibiter',
  'Total': 75},
 {'Number of Issues': 2,
  'Number of commits': 14,
  'Number of pull requests': 6,
  'Repository': 'grimoirelab-kidash',
  'Total': 22},
 {'Number of Issues': 1,
  'Number of c

#### Producing dataframe of the stored data using pandas. And visualizing the features in form of a table.

In [43]:
df = pd.DataFrame(data, columns=["Repository", "Number of Issues", "Number of commits", "Number of pull requests", "Total"])
df.sort_values(by=['Total'], inplace=True)
df

Unnamed: 0,Repository,Number of Issues,Number of commits,Number of pull requests,Total
19,prospector,3,0,0,3
22,whitepaper,2,0,4,6
16,grimoirelab-toolkit,1,0,5,6
13,grimoirelab-perceval-puppet,1,0,7,8
12,grimoirelab-perceval-opnfv,1,0,7,8
20,website,10,0,0,10
18,metrics,4,0,6,10
17,grimoirelab-tutorial,6,0,4,10
15,grimoirelab-sortinghat,2,0,8,10
14,grimoirelab-sigils,2,0,8,10


#### producing csv file of the dataframe contaning every repositories total number of Commits, Issues and Pull requests in the last three months.

In [41]:
df.to_csv('repo_analysis.csv', index=False ,sep=',')
