In [None]:
import requests
import json
import sys
import os
sys.path.insert(0, '../ghtesting')

import matplotlib.pyplot as plt
import pandas as pd
from ghdatabase import GHDatabase
from ghrepo import GHRepo
from tqdm.auto import tqdm
from datetime import datetime

In [None]:
# Apply the 'science' plotting style to every figure in this notebook.
# NOTE(review): 'science' is not defined anywhere in this file; presumably it is
# provided by an installed style package (e.g. SciencePlots) - confirm.
plt.style.use('science')

In [None]:
# Open the 'webframework_repos' collection of the 'ecs260' database; the
# connection string is read from the environment so it never lives in the notebook.
repo_db = GHDatabase(
    'ecs260',
    'webframework_repos',
    os.environ['CONNECTION_STRING'],
)

# Materialise the repositories once so they can be counted and iterated repeatedly.
repos = [*repo_db.get_repos()]
num_repos = len(repos)

print(f'Number of repos: {num_repos}')

In [None]:
# Open the 'codecov_api' collection, which holds the Codecov responses saved so far,
# and load every stored record into memory.
codecov_db = GHDatabase(
    'ecs260',
    'codecov_api',
    os.environ['CONNECTION_STRING'],
)
codecov_api = [*codecov_db.get_repos()]

# Get initial responses from the Codecov API

In [None]:
def get_codecov_response(_id, page=None, timeout=30):
    """Fetch one page of commit data for a repository from the Codecov API.

    Args:
        _id: Repository identifier placed in the URL path after ``gh/``
            (presumably ``owner/name`` - confirm against the stored records).
        page: Page number to request. When ``None`` no query string is sent
            at all, so the server's default page and limit apply.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error; without it one stalled connection would block the
            whole crawl indefinitely.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        checked, so error payloads are returned to the caller as-is.
    """
    query = '?page=%s&limit=250' % page if page is not None else ''
    # Request over HTTPS directly instead of plain HTTP.
    url = 'https://codecov.io/api/gh/%s/commits%s' % (_id, query)
    r = requests.get(url, timeout=timeout)
    return r.json()

In [None]:
# Repositories that already have a stored Codecov record are skipped, so this
# cell can be re-run to resume a crawl that was interrupted part-way.
ids_done = {entry['_id'] for entry in codecov_api}
remaining = [repo for repo in repos if repo['_id'] not in ids_done]

for repo in tqdm(remaining):

    _id = repo['_id']
    page = 1
    first_response = None
    repo_commits = []

    while True:

        # Fetch the next page of commits from the API.
        response = get_codecov_response(_id, page)

        # The first page is kept whole; it is stored below as the record's 'data'.
        if page == 1:
            first_response = response

        # Responses without a 'commits' key contribute nothing and end the
        # pagination below.
        commits = response.get('commits', [])
        repo_commits.extend(commits)

        page += 1

        # An empty page means there is nothing further to fetch.
        if len(commits) == 0:
            break

    # Persist the first response together with the commits gathered from all pages.
    codecov_db.update_repo(dict(
        _id=_id,
        data=first_response,
        commits=repo_commits
    ))

# `codecov_api` was loaded before the crawl; reload it when new records were
# written so the analysis cells below do not run on a stale snapshot.
if remaining:
    codecov_api = list(codecov_db.get_repos())

# Check coverage

In [None]:
# Split the stored Codecov records into three groups:
#   errors   - the saved first response contains an 'error' key
#   active   - the response's repo is flagged active
#   inactive - everything else
active = []
inactive = []
errors = []

for response in codecov_api:
    payload = response['data']
    if 'error' in payload:
        bucket = errors
    elif payload['repo']['active']:
        bucket = active
    else:
        bucket = inactive
    bucket.append(response)

In [None]:
# Report how many records landed in each group.
for label, group in (('Errors', errors), ('Inactive', inactive), ('Active', active)):
    print(label, len(group))

In [None]:
# Number of stored commits for every active repository, keyed by repository id.
dc = {}
for record in active:
    dc[record['_id']] = {'commits': len(record['commits'])}

# Transpose so each repository is a row, then summarise the 'commits' column.
commit_counts = pd.DataFrame(dc).T
commit_counts.describe()

## Introduction point

In [None]:
# Readable metric name -> key used inside a commit's totals dictionary.
metric_map = {
    'coverage': 'c',
    'files': 'f',
    'lines': 'n',
    'hits': 'h',
    'missed': 'm',
    'partials': 'p',
    'branches': 'b',
    'messages': 'M',
    'sessions': 's',
}


def get_metric_values(commits, metric):
    """Collect one metric, as floats, from each commit's ``parent_totals``.

    Args:
        commits: Iterable of commit dictionaries.
        metric: A key of ``metric_map`` (e.g. ``'coverage'``); an unknown
            name raises ``KeyError``.

    Returns:
        A list with one float per commit whose ``parent_totals`` is present
        and not ``None``, in the order the commits were given. Commits
        without parent totals are skipped.
    """
    # NOTE(review): values come from 'parent_totals', not a commit's own totals - confirm intended.
    short_key = metric_map[metric]
    all_totals = (commit.get('parent_totals') for commit in commits)
    return [float(totals[short_key]) for totals in all_totals if totals is not None]

def get_author(commit):
    """Return the author name stored under ``commit['author']['name']``."""
    author = commit['author']
    return author['name']

def get_timestamp(commit):
    """Return the value stored under the commit's ``'timestamp'`` key, unparsed."""
    timestamp = commit['timestamp']
    return timestamp

def chronological_order(commits):
    """Return a new list of the commits sorted from oldest to newest.

    Each commit's ``'timestamp'`` string is parsed with the format
    ``%Y-%m-%d %H:%M:%S``; a value in any other format raises ``ValueError``.
    The input is left untouched and commits with equal timestamps keep
    their relative order.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    def parsed_timestamp(commit):
        return datetime.strptime(commit['timestamp'], timestamp_format)

    ordered = list(commits)
    ordered.sort(key=parsed_timestamp)
    return ordered

In [None]:
# Pick one active repository to inspect (index 150 is presumably an arbitrary sample).
repo = active[150]['data']
# 'data' is only the first API response, so its 'commits' entry is just the
# first page. The commits gathered from every page are stored under the
# record's top-level 'commits' key, so use that list.
commits = chronological_order(active[150]['commits'])

In [None]:
# Pull the two series that are plotted against each other below.
coverage, lines = (
    get_metric_values(commits, metric_name)
    for metric_name in ('coverage', 'lines')
)

In [None]:
# Plot coverage against line count for the selected repository.
fig, ax = plt.subplots(dpi=300)

ax.plot(lines, coverage)

ax.set_xlabel('Lines')
ax.set_ylabel('Coverage')

# Fix the y-axis to the 0-100 range.
ax.set_ylim([0, 100])

# plt.show() instead of fig.show(): on the notebook's non-GUI inline backend
# Figure.show() only emits a UserWarning, while pyplot.show() renders the figure.
plt.show()