In [None]:
import sys
sys.path.insert(0, '../ghtesting')

from util import *
from tqdm.auto import tqdm
import json
from matplotlib import pyplot as plt
import numpy as np
import subprocess

In [None]:
# Apply the 'science' style sheet to every figure created below.
# NOTE(review): 'science' is presumably supplied by the third-party SciencePlots
# package rather than by matplotlib itself — confirm it is installed in the kernel
# environment (recent SciencePlots releases may also need `import scienceplots` first).
plt.style.use(['science'])

In [None]:
# Each framework under study paired with the matplotlib colour code of its curve.
FRAMEWORK_COLORS = {
    'Angular': 'r',
    'React': 'b',
    'Vue': 'g',
}

# Parallel lists (same order) that the plotting cells walk through with zip().
WEBFRAMEWORKS = list(FRAMEWORK_COLORS.keys())
COLORS = list(FRAMEWORK_COLORS.values())

In [None]:
# Repositories to analyse. `get_repos` is not defined in this notebook; it presumably
# arrives through `from util import *` — TODO confirm. The cells below iterate over the
# result and read `.name` from each element.
repos = get_repos()

In [None]:
def get_commit_history(git):
    """Return the commit ids found in the output of ``git.git.log()``.

    The log text is split into lines; every line that starts with ``commit`` is
    treated as a commit header and its second whitespace-separated token is
    collected. The ids are returned in the order in which the log lists them.
    """
    commit_ids = []
    for line in git.git.log().split('\n'):
        if not line.startswith('commit'):
            continue
        commit_ids.append(line.split()[1])
    return commit_ids

def read_file(git, commitid, file):
    """Return the contents of ``file`` at revision ``commitid``.

    The lookup is delegated to ``git.git.show('<commitid>:<file>')``.

    Returns:
        Whatever ``show`` returns, or the empty string ``""`` when ``show`` raises
        an ordinary exception (for example because the file does not exist at
        that revision).
    """
    try:
        return git.git.show(f'{commitid}:{file}')
    except Exception:
        # Best effort: a failed lookup is reported as an empty file. Only
        # `Exception` is caught so that KeyboardInterrupt / SystemExit still
        # propagate and a long-running loop calling this can be interrupted.
        return ""

def get_ci(readme):
    """Return the name of the first CI service whose URL pattern occurs in ``readme``.

    ``service_urls`` maps a regular expression to a service name; the patterns are
    tried in the mapping's iteration order with ``re.search``. ``None`` is returned
    when no pattern matches.
    """
    matching_names = (
        name
        for pattern, name in service_urls.items()
        if re.search(pattern, readme)
    )
    # The generator is lazy, so the search stops at the first hit.
    return next(matching_names, None)

In [None]:
def has_ci(git, commit_id, ci):
    """Tell whether the README.md at ``commit_id`` advertises the CI service ``ci``.

    The README is fetched with ``read_file`` and classified with ``get_ci``; the
    result is compared to ``ci`` with plain equality, so ``ci=None`` matches a
    README for which ``get_ci`` yields ``None``.
    """
    detected = get_ci(read_file(git, commit_id, 'README.md'))
    return detected == ci

In [None]:
def get_first_ci_commit(git, commit_history, current_ci):
    """Bisect ``commit_history`` for the first commit at which ``has_ci`` holds.

    Args:
        git: repository handle, passed through to ``has_ci``.
        commit_history: sequence of commit ids. The bisection assumes that
            ``has_ci`` is False for a leading run of commits and True for the
            rest; callers pass the history ordered oldest first.
        current_ci: CI service name the commits are tested for.

    Returns:
        The id of the commit found by the bisection, or ``None`` when the history
        is empty or neither of the two remaining candidates satisfies ``has_ci``.
    """
    if not commit_history:
        return None

    lo = 0
    hi = len(commit_history) - 1
    # Narrow [lo, hi] until the two bounds are adjacent (or identical).
    while lo < hi - 1:
        # round() is kept (rather than //) so the sequence of probed commits is
        # the same as in earlier runs of this notebook.
        mid = round((lo + hi) / 2)
        if has_ci(git, commit_history[mid], current_ci):
            hi = mid
        else:
            lo = mid

    # After the loop the last `mid` equals `lo` or `hi`, so checking the two
    # bounds is sufficient; this also covers histories of one or two commits,
    # for which the loop body never runs and `mid` is never assigned.
    for idx in (lo, hi):
        if has_ci(git, commit_history[idx], current_ci):
            return commit_history[idx]
    return None

In [None]:
# Load the cache of already computed results (repo name -> commit id as returned by
# get_first_ci_commit, which may be null). The file must already exist: open() in
# read mode raises FileNotFoundError otherwise.
with open('../data/ci_introduction_commit.json') as f:
    ci_introduction = json.load(f)

In [None]:
# Determine, for every repository not yet in the cache, the commit at which the CI
# service advertised by its current README first appears.
for repo in tqdm(repos):
    # Check the cache before touching the repository so that cached entries do not
    # cost a Repo() construction (same order as the contributors cell below).
    if repo.name in ci_introduction:
        continue

    git = Repo(get_repository_path(repo))

    # CI service named by the README on the currently active branch.
    current_readme = read_file(git, git.active_branch.name, 'README.md')
    current_ci = get_ci(current_readme)

    # get_commit_history follows `git log` order; reverse it for the bisection.
    commit_history = get_commit_history(git)
    commit_history = commit_history[::-1]

    ci_introduction[repo.name] = get_first_ci_commit(git, commit_history, current_ci)

# Write the (possibly extended) cache back to disk.
with open('../data/ci_introduction_commit.json', 'w') as f:
    json.dump(ci_introduction, f)

## CDF Generation

### By Contributors

In [None]:
# Load the cache of contributors per repository (repo name -> value returned by
# get_contributors_by_commit_id). The file must already exist.
with open('../data/contributors_ci_introduction.json') as f:
    contributors = json.load(f)

In [None]:
# Collect the contributors as of each repository's CI-introduction commit;
# repositories already present in the cache are left untouched.
for repo in tqdm(repos):
    if repo.name not in contributors:
        git = Repo(get_repository_path(repo))
        commit_id = ci_introduction[repo.name]
        contributors[repo.name] = get_contributors_by_commit_id(git, commit_id)

# Write the (possibly extended) cache back to disk.
with open('../data/contributors_ci_introduction.json', 'w') as f:
    json.dump(contributors, f)

In [None]:
# Group the per-repository contributor counts by web framework:
# framework -> list with one count per repository.
contrib_dist = {}
for repo in repos:
    wf = get_webframework(repo)
    contrib_dist.setdefault(wf, []).append(len(contributors[repo.name]))

In [None]:
# CDF of the number of contributors, one curve per web framework.
fig, ax = plt.subplots(dpi=300)
# Set the scale on the axes object directly instead of going through pyplot's
# implicit "current axes".
ax.set_xscale('log')

for wf, color in zip(WEBFRAMEWORKS, COLORS):
    x = sorted(contrib_dist[wf])
    # The sorted values are paired with evenly spaced percentages from 0 to 100.
    y = np.linspace(0, 100, len(x))
    ax.plot(x, y, label=wf, color=color)

# Raw strings: "\#" and "\%" are not valid Python escape sequences, so the non-raw
# form triggers an invalid-escape warning. The text handed to matplotlib is unchanged.
ax.set_xlabel(r"\# of contributors")
ax.set_ylabel(r"\% of repositories")
ax.legend(loc='lower right')
ax.set_ylim([0, 100])
fig.show()
fig.savefig('../plots/cdf-rq2-contributors.pdf', transparent = True, bbox_inches = 'tight', dpi=300)

### By SLOC

In [None]:
# Load the cache of SLOC reports (repo name -> value returned by
# get_sloc_by_commit_id). The file must already exist.
with open('../data/sloc.json') as f:
    sloc = json.load(f)

In [None]:
def get_sloc_by_commit_id(repo, commit_id, sloc_bin='/home/mharoon/.opt/nodejs/bin/sloc'):
    """Run the ``sloc`` tool on ``repo`` as checked out at ``commit_id``.

    The working tree is switched to ``commit_id``, ``sloc -f json <repo path>`` is
    executed, and the previously active branch is checked out again afterwards —
    also when running the tool raises.

    Args:
        repo: repository descriptor accepted by ``get_repository_path``; its
            ``name`` is printed when the tool output cannot be parsed.
        commit_id: revision to measure.
        sloc_bin: path of the ``sloc`` executable. The default is the location
            this notebook was originally run with; pass another path on other
            machines.

    Returns:
        The parsed JSON report, or ``{}`` when the tool's stdout is not valid JSON.
    """
    repo_path = get_repository_path(repo)
    git = Repo(repo_path)
    # Remember where we are so the working tree can be restored.
    branch = git.active_branch.name
    git.git.checkout(commit_id)
    try:
        proc = subprocess.run([sloc_bin, '-f', 'json', repo_path], stdout=subprocess.PIPE)
    finally:
        # Never leave the repository on a detached HEAD, even if sloc could not run.
        git.git.checkout(branch)
    try:
        return json.loads(proc.stdout)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        print(repo.name)
        return {}

In [None]:
# Measure SLOC at the CI-introduction commit for every repository that has no
# cached report yet.
for repo in tqdm(repos):
    if repo.name not in sloc:
        sloc[repo.name] = get_sloc_by_commit_id(repo, ci_introduction[repo.name])

# Write the (possibly extended) cache back to disk.
with open('../data/sloc.json', 'w') as f:
    json.dump(sloc, f)

In [None]:
def get_sloc(repo):
    """Look up the total from the cached SLOC report of ``repo``.

    Returns:
        ``sloc[repo.name]['summary']['total']`` when present; ``0`` when the report
        has a ``summary`` without a ``total``; ``None`` when there is no report for
        the repository or the report has no ``summary``.
    """
    if repo.name not in sloc:
        return None

    report = sloc[repo.name]
    if 'summary' not in report:
        return None

    summary = report['summary']
    return summary['total'] if 'total' in summary else 0

In [None]:
# Group the SLOC totals by web framework: framework -> list of totals.
# Only the cached SLOC reports are read here, so no repository is opened.
distributions = {}
for repo in tqdm(repos):
    wf = get_webframework(repo)

    # Register the framework even if none of its repositories has a usable total.
    if wf not in distributions:
        distributions[wf] = []

    loc = get_sloc(repo)
    # get_sloc returns None when no usable report exists; such repositories are skipped.
    if loc is not None:
        distributions[wf].append(loc)

In [None]:
# CDF of SLOC, one curve per web framework.
fig, ax = plt.subplots(dpi=300)
# Set the scale on the axes object directly instead of going through pyplot's
# implicit "current axes".
ax.set_xscale('log')

for wf, color in zip(WEBFRAMEWORKS, COLORS):
    x = sorted(distributions[wf])
    # The sorted values are paired with evenly spaced percentages from 0 to 100.
    y = np.linspace(0, 100, len(x))
    ax.plot(x, y, label=wf, color=color)

ax.set_xlabel("SLOC")
# Raw string: "\%" is not a valid Python escape sequence, so the non-raw form
# triggers an invalid-escape warning. The text handed to matplotlib is unchanged.
ax.set_ylabel(r"\% of repositories")
ax.set_ylim([0, 100])
ax.legend(loc='lower right')
fig.show()
fig.savefig('../plots/cdf-rq2-sloc.pdf', transparent = True, bbox_inches = 'tight', dpi=300)