In [None]:
import requests
import json
import sys
import os
sys.path.insert(0, '../ghtesting')

import pandas as pd
from ghdatabase import GHDatabase
from ghrepo import GHRepo
from tqdm.auto import tqdm
from datetime import datetime
from git import Repo
import pickle

In [None]:
# Open the 'webframework_repos' collection and pull every repository record
# into memory so later cells can iterate over it more than once.
repo_db = GHDatabase(
    'ecs260',
    'webframework_repos',
    os.environ['CONNECTION_STRING'],
)
repos = [*repo_db.get_repos()]

num_repos = len(repos)
print(f'Number of repos: {num_repos}')

In [None]:
# Open the 'codecov_api' collection and snapshot the entries stored so far.
# `codecov_api` is a plain list taken at this point; it is not refreshed when
# later cells write new entries through `codecov_db`.
codecov_db = GHDatabase(
    'ecs260',
    'codecov_api',
    os.environ['CONNECTION_STRING'],
)
codecov_api = [*codecov_db.get_repos()]

# Get initial responses from the Codecov API

In [None]:
def get_codecov_response(_id, page=None, limit=250, timeout=30):
    """Fetch one page of a repository's commit listing from the Codecov API.

    Parameters
    ----------
    _id : str
        Value interpolated into the URL after ``/api/gh/``.
    page : int, optional
        Page number to request. When ``None`` no query string is sent at all
        (neither ``page`` nor ``limit``).
    limit : int, optional
        Page size sent together with ``page``. Defaults to 250.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout ``requests`` can
        block indefinitely on a stalled connection.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    ValueError
        If the response body is not valid JSON.
    """
    url = 'https://codecov.io/api/gh/%s/commits' % _id
    params = {'page': page, 'limit': limit} if page is not None else None

    # Deliberately no raise_for_status(): whatever JSON the API returns,
    # including error payloads, is handed back to the caller unchanged.
    response = requests.get(url, params=params, timeout=timeout)
    return response.json()

In [None]:
# Only query repositories that do not have a stored Codecov entry yet.
ids_done = {entry['_id'] for entry in codecov_api}
remaining = [repo for repo in repos if repo['_id'] not in ids_done]

for repo in tqdm(remaining):
    _id = repo['_id']
    first_response, repo_commits = None, []

    # Walk the paginated commit listing until a page comes back without commits.
    page = 1
    while True:
        response = get_codecov_response(_id, page)

        # The first page is stored verbatim as the entry's `data`.
        if page == 1:
            first_response = response

        # A response without a 'commits' key counts as an empty page.
        commits = response.get('commits', [])
        repo_commits += commits
        page += 1

        if len(commits) == 0:
            break

    # Persist the first response together with every commit collected.
    codecov_db.update_repo({
        '_id': _id,
        'data': first_response,
        'commits': repo_commits,
    })

# Check coverage

In [None]:
# Sort the stored Codecov entries into three buckets based on their `data`:
# entries carrying an 'error' key, and otherwise by the repo's 'active' flag.
active, inactive, errors = [], [], []

for response in codecov_api:
    if 'error' in response['data']:
        errors.append(response)
        continue

    # Anything that is not an error is either active or inactive.
    (active if response['data']['repo']['active'] else inactive).append(response)

In [None]:
# Report the size of each bucket, one per line.
for label, bucket in (('Errors', errors), ('Inactive', inactive), ('Active', active)):
    print(label, len(bucket))

# Collecting valid coverage reports and saving files

In [None]:
def get_repository_path(repo):
    """Return the path under ``repositories/`` for `repo`.

    The directory name is the second ``/``-separated segment of ``repo.name``.
    """
    segments = repo.name.split('/')
    return os.path.join('repositories', segments[1])
    
def get_repos():
    """Return the list of ``GHRepo`` objects with their Codecov reports attached.

    The result is cached in ``../data/repos.pickle``. When that file exists it
    is loaded and returned as-is; otherwise the list is rebuilt from the two
    database collections, written to the cache, and returned.

    Only repositories whose directory (see ``get_repository_path``) exists on
    disk are kept. A repository without an entry in the ``codecov_api``
    collection gets an empty report list instead of aborting the whole build;
    note that this empty list is then part of the pickled cache.

    Raises
    ------
    KeyError
        If ``CONNECTION_STRING`` is not set when the cache has to be rebuilt,
        or if a Codecov entry lacks the ``'commits'`` key.
    """
    repos_pickle = '../data/repos.pickle'

    # Load from disk if already cached.
    # NOTE(security): pickle.load can execute arbitrary code from the file;
    # only use a cache file that this notebook wrote itself.
    if os.path.exists(repos_pickle):
        with open(repos_pickle, 'rb') as f:
            return pickle.load(f)

    connection_string = os.environ['CONNECTION_STRING']

    # Repositories known to the database that are also present on disk.
    db = GHDatabase('ecs260', 'webframework_repos', connection_string)
    repos = [GHRepo(i) for i in db.get_repos()]
    repos = [repo for repo in repos if os.path.exists(get_repository_path(repo))]

    # Index the Codecov entries by id once instead of scanning the whole list
    # for every repository. setdefault keeps the first entry per id, which is
    # the one the previous list scan selected.
    cc = GHDatabase('ecs260', 'codecov_api', connection_string)
    entries_by_id = {}
    for entry in cc.get_repos():
        entries_by_id.setdefault(entry['_id'], entry)

    # Attach each repository's Codecov reports.
    for repo in repos:
        entry = entries_by_id.get(repo.name)
        repo.set_codecov_reports(entry['commits'] if entry is not None else [])

    # Write to disk, creating the cache directory on first use.
    os.makedirs(os.path.dirname(repos_pickle), exist_ok=True)
    with open(repos_pickle, 'wb') as f:
        pickle.dump(repos, f)

    return repos

def get_webframework(repo):
    """Return the display name of the first framework found in ``repo.topics``.

    Topics are checked in the order angular, vue, react. Returns ``None`` when
    none of them is present.
    """
    frameworks = (
        ('angular', 'Angular'),
        ('vue', 'Vue'),
        ('react', 'React'),
    )
    for topic, label in frameworks:
        if topic in repo.topics:
            return label
    return None

def get_contributors(report):
    """Unfinished helper: has no ``return`` statement and never uses `report`.

    NOTE(review): as written this cannot run. ``repo`` is assigned inside the
    function, which makes it a local name, so reading it on the right-hand
    side raises ``UnboundLocalError`` before ``Repo`` is ever constructed.
    TODO: confirm what the argument is meant to be (a repository object or a
    Codecov report) and what should be returned, then finish or remove this.
    """
    # Fails here: `repo` is read before it has been assigned in this scope.
    repo = Repo(get_repository_path(repo))

def get_branch(repo):
    """Return the name of the active branch of `repo`'s local Git repository."""
    local_clone = Repo(get_repository_path(repo))
    return local_clone.active_branch.name

def get_reports(repo):
    """Return `repo`'s Codecov reports for its active branch, oldest first.

    Reports are taken from ``repo.codecov_reports``, kept when their
    ``'branch'`` equals the branch returned by ``get_branch``, and ordered by
    their ``'timestamp'`` with any fractional seconds ignored.
    """
    def _timestamp(report):
        # Drop everything after the first '.' before parsing.
        whole_seconds = report['timestamp'].split('.')[0]
        return datetime.strptime(whole_seconds, '%Y-%m-%d %H:%M:%S')

    main_branch = get_branch(repo)
    on_main_branch = filter(
        lambda report: report['branch'] == main_branch,
        repo.codecov_reports,
    )
    return sorted(on_main_branch, key=_timestamp)

In [None]:
# Resume from a previous run when results were already saved, otherwise start
# with an empty mapping, so this cell works on a fresh kernel.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load a results file that this notebook wrote itself.
final_reports_pickle = '../data/final_reports.pickle'

if os.path.exists(final_reports_pickle):
    with open(final_reports_pickle, 'rb') as f:
        final_reports = pickle.load(f)
else:
    final_reports = {}


def _first_checkoutable(git, reports, branch):
    """Return the first report whose commit can be checked out, or None.

    For each report in order, the commit in ``report['commitid']`` is checked
    out and then `branch` is checked out again; the first report for which both
    steps succeed is returned. Whatever happens, `branch` is checked out once
    more before returning so the working tree is left on it.
    """
    try:
        for report in reports:
            try:
                git.git.checkout(report['commitid'])
                git.git.checkout(branch)
            except Exception:
                # This commit is not usable in the local repository; try the
                # next one. KeyboardInterrupt is not an Exception subclass and
                # therefore still stops the run.
                continue
            return report
        return None
    finally:
        git.git.checkout(branch)


for repo in tqdm(get_repos()):

    # Skip repositories that already have a result.
    if repo.name in final_reports:
        continue

    git = Repo(get_repository_path(repo))

    reports = get_reports(repo)

    if len(reports) == 0:
        continue

    branch = get_branch(repo)

    # `reports` is sorted oldest first: scan forwards for the earliest usable
    # report and backwards for the latest one.
    earliest_report = _first_checkoutable(git, reports, branch)
    latest_report = _first_checkoutable(git, reports[::-1], branch)

    # Either value may be None when no commit could be checked out.
    final_reports[repo.name] = dict(latest=latest_report, earliest=earliest_report)

with open(final_reports_pickle, 'wb') as f:
    pickle.dump(final_reports, f)

In [None]:
# Number of repositories that have an entry in final_reports.
len(final_reports)