# Microtask 3
---
Produce a listing of repositories, as a table and as a .csv file, with the number of commits authored, issues opened, and pull requests opened, during the last three months, ordered by total number (commits plus issues plus pull requests).

In [None]:
import os
import subprocess
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

%matplotlib inline

This notebook assumes that an Elasticsearch instance is running locally at http://localhost:9200.

In [None]:
# Client for the local Elasticsearch instance; every query below goes through it.
es = Elasticsearch(
    hosts=['http://localhost:9200'],
    verify_certs=False,
)

In [None]:
# Repositories analysed in this notebook. Each entry holds the GitHub
# coordinates (`org`/`repo`) and the names of the four Elasticsearch indices
# passed to p2o.py: raw + enriched git data and raw + enriched GitHub data.
# The index names all derive from one short prefix per project, so they are
# generated instead of being spelled out by hand.
repos = [
    {'org': 'chaoss',
     'repo': f'grimoirelab-{project}',
     'index_raw': f'{prefix}_r',
     'index_enriched': prefix,
     'index_github_raw': f'{prefix}_g_r',
     'index_github_enriched': f'{prefix}_g'}
    for project, prefix in [
        ('perceval', 'perceval'),
        ('kingarthur', 'arthur'),  # index prefix differs from the project name
        ('sortinghat', 'sortinghat'),
        ('mordred', 'mordred'),
        ('manuscripts', 'manuscripts'),
    ]
]

# GitHub API token. It is read from the GITHUB_TOKEN environment variable so
# the secret never has to be pasted into (and committed with) the notebook.
# Falls back to the empty string when the variable is not set.
token = os.environ.get('GITHUB_TOKEN', '')

Run `p2o.py` to fetch the git and GitHub data of these repositories and store it in the raw and enriched Elasticsearch indices listed above.

In [None]:
# Index every repository twice with p2o.py: once with the `git` backend and
# once with the `github` backend. Each run writes a raw and an enriched index.
for repo in repos:
    git_url = 'https://github.com/' + repo['org'] + '/' + repo['repo'] + '.git'

    # (label for the progress message, raw index, enriched index, backend arguments)
    jobs = [
        ('git', repo['index_raw'], repo['index_enriched'],
         ['git', git_url]),
        ('GitHub', repo['index_github_raw'], repo['index_github_enriched'],
         ['github', repo['org'], repo['repo'], '-t', token, '--sleep-for-rate']),
    ]

    for label, raw_index, enriched_index, backend_args in jobs:
        print(f"Fetching {label} indices for {repo['repo']}")
        completed = subprocess.run(
            ['p2o.py', '--enrich',
             '--index', raw_index, '--index-enrich', enriched_index,
             '-e', 'http://localhost:9200', '--no_inc', '--debug',
             *backend_args])
        # Surface failures instead of ignoring them: a failed import would
        # otherwise only show up later as missing indices or wrong counts.
        if completed.returncode != 0:
            print(f"WARNING: p2o.py exited with status {completed.returncode} "
                  f"while fetching {label} indices for {repo['repo']}")
    print('\n')

In [None]:
# Per-repository counts for the last three months, in the same order as `repos`.
# (Despite the `df_` prefix these are plain lists; the names are kept because
# later cells refer to them.)
df_issues = []
df_pulls = []
df_commits = []
for repo in repos:
    print(repo['repo'])

    # Issues and pull requests share the enriched GitHub index and are told
    # apart by `item_type`. scan() iterates over *every* matching document;
    # a plain execute() only returns the first 10 hits by default, which
    # would silently cap both counts at 10.
    s = Search(using=es, index=repo['index_github_enriched'])
    s = s.filter('range', created_at={'gte': 'now-3M'})
    s = s.source(['item_type'])  # the only field needed for counting
    # .get() tolerates documents without `item_type`; they match neither label.
    item_types = [hit.to_dict().get('item_type') for hit in s.scan()]
    df_issues.append(item_types.count('issue'))
    df_pulls.append(item_types.count('pull request'))

    # Commits: only the number of matches is needed, so ask Elasticsearch for
    # the count directly. count() returns a plain int regardless of the
    # server version, unlike response['hits']['total'].
    # NOTE(review): this filters on `commit_date`, as before; if "authored"
    # in the task statement means the author date, use that field instead.
    s = Search(using=es, index=repo['index_enriched'])
    s = s.filter('range', commit_date={'gte': 'now-3M'})
    df_commits.append(s.count())

In [None]:
# Repository names, in the same order as the count lists built above.
names = [entry['repo'] for entry in repos]

In [None]:
# One row per repository: (name, issues, pull requests, commits).
# Columns keep their default integer labels 0..3 at this point.
rows = [row for row in zip(names, df_issues, df_pulls, df_commits)]
df = pd.DataFrame(rows)

In [None]:
# Total activity = issues (column 1) + pull requests (column 2) + commits (column 3).
df['total'] = df[1].add(df[2]).add(df[3])

In [None]:
# Final listing: readable column names, most active repository first.
summary = (
    df.rename(columns={0: 'name', 1: 'issues', 2: 'pull_requests', 3: 'commits'})
      .sort_values(by='total', ascending=False)
)

# The task asks for the listing as a .csv file as well as a table.
summary.to_csv('repositories_activity.csv', index=False)

# Last expression of the cell: render the table.
summary