In [3]:
import requests
from datetime import datetime
from tqdm import tqdm
import pickle
from os import chdir
import subprocess
from dateutil.parser import parse
import os
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

In [4]:
# Read the GitHub access token from the GITHUB_TOKEN environment variable so
# the secret never has to be typed into (or committed with) the notebook.
# The "..." placeholder is kept as the fallback for backward compatibility.
token = os.environ.get("GITHUB_TOKEN", "...")
# Authorization header sent with the direct `requests.get` calls below.
headers = {'Authorization': 'token ' + token}

In [5]:
# Repository under study in "owner/name" form; interpolated into the API and clone URLs below.
repo = "pallets/flask"

In [6]:
def fetch_responses(token, url, timeout=30):
    """Fetch every page of a paginated API listing.

    Follows the ``next`` entry of ``Response.links`` until a response no
    longer advertises one.

    Args:
        token: Access token, sent as ``Authorization: token <token>``.
        url: URL of the first page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one a stalled connection would block the notebook
            indefinitely.

    Returns:
        List of responses, one per page, in the order they were fetched.
        Status codes are not checked here.
    """
    # The header is the same for every page, so build it once.
    headers = {'Authorization': 'token ' + token}
    responses = []
    while True:
        rsp = requests.get(url, headers=headers, timeout=timeout)
        responses.append(rsp)
        next_link = rsp.links.get('next')
        if next_link is None:
            # No 'next' link: this was the last page.
            return responses
        url = next_link['url']

def rate_limit(response):
    """Print the remaining request quota and its reset time from a response's rate-limit headers."""
    hdrs = response.headers
    print("Rate remaining:", hdrs['X-RateLimit-Remaining'])
    # The reset header is parsed as an integer and interpreted as a Unix timestamp.
    reset_at = datetime.fromtimestamp(int(hdrs['X-RateLimit-Reset']))
    print("Rate limit reset:", reset_at)

## Fetch PRs

In [7]:
# Search query: items in `repo` matching `is:merged` and `base:main`, 100 results per page.
url = f"https://api.github.com/search/issues?q=repo:{repo}+is:merged+base:main&per_page=100"
url

'https://api.github.com/search/issues?q=repo:pallets/flask+is:merged+base:main&per_page=100'

In [8]:
# Download every result page of the search; the list holds one response per page.
responses = fetch_responses(token, url)
responses

[<Response [200]>]

In [9]:
# Show the quota reported by the last search response.
rate_limit(responses[-1])

Rate remaining: 29
Rate limit reset: 2022-03-25 16:14:51


In [10]:
# Flatten the 'items' arrays of all result pages into a single list.
items = [item for response in responses for item in response.json()['items']]
len(items)

41

In [11]:
# The 'number' field of every search hit; used below to request each pull request individually.
numbers = [x['number'] for x in items]
numbers[:5]

[4492, 4491, 4488, 4487, 4486]

In [None]:
# Request the pull-request object for every number found by the search.
pulls_resps = []
for number in tqdm(numbers):
    url = f"https://api.github.com/repos/{repo}/pulls/{number}"
    pulls_resps.append(requests.get(url, headers=headers))
# Distinct HTTP status codes, as a quick sanity check.
print({resp.status_code for resp in pulls_resps})

 49%|████▉     | 20/41 [00:11<00:12,  1.68it/s]

In [None]:
# Show the quota reported by the last pull-request response.
rate_limit(pulls_resps[-1])

In [None]:
# Spot-check the merge commit SHA for a slice of the fetched PRs.
for pr in pulls_resps[20:30]:
    print(pr.json()['merge_commit_sha'])

In [None]:
# List the fields available in the first pull-request payload.
pulls_resps[0].json().keys()

In [None]:
# Ids of all milestones attached to the fetched PRs (PRs without a milestone are skipped).
milestones = {
    payload['milestone']['id']
    for payload in (resp.json() for resp in pulls_resps)
    if payload['milestone']
}
milestones

In [None]:
# Index the PRs by merge-commit SHA so they can be looked up by commit later on.
prs = {
    payload['merge_commit_sha']: {
        'title': payload['title'],
        'sha': payload['merge_commit_sha'],
    }
    for payload in (resp.json() for resp in pulls_resps)
}
len(prs)

## Fetch releases

In [None]:
# Fetch all pages of the repository's release list and show the distinct status codes.
url = f"https://api.github.com/repos/{repo}/releases?per_page=100"
release_resps = fetch_responses(token, url)
print({resp.status_code for resp in release_resps})

In [None]:
# Tag names of published (non-draft, non-prerelease) releases that target main.
# Every response page is scanned: fetch_responses returns one response per
# page, so reading only release_resps[0] would silently drop releases as soon
# as the list spans more than one page.
release_tags = {
    release['tag_name']
    for resp in release_resps
    for release in resp.json()
    if release['target_commitish'] == 'main'
    and not release['draft']
    and not release['prerelease']
}
print(len(release_tags))
# Peek at one tag; yields None instead of raising IndexError when nothing matched.
next(iter(release_tags), None)

In [None]:
# Show the quota reported by the last releases response.
rate_limit(release_resps[-1])

In [None]:
# List the fields available on the first release of the first page.
release_resps[0].json()[0].keys()

In [None]:
# Show the 'html_url' field of the first release of the first page.
release_resps[0].json()[0]['html_url']

## Kontrollera att alla PR:er och taggar finns med lokalt

- Plocka ut en lista med alla sha lokalt
- Map pr->sha för att kolla om alla pr finns i historiken
- Kan även kolla för release tags

In [None]:
# Clone the repository into the current directory unless a previous run already did.
# `git clone` into an existing non-empty directory fails, and without check=True
# that failure (or a network error) would go unnoticed until later cells break.
# An argument list (no shell=True) keeps `repo` from being interpreted by a shell.
if not os.path.isdir(repo.split('/')[-1]):
    subprocess.run(["git", "clone", f"https://github.com/{repo}"], check=True)

In [None]:
# Name part of `repo` (after the slash); used below as the directory of the local clone.
repo_dir = repo.split('/')[-1]
repo_dir

In [None]:
chdir(repo_dir)
try:
    subprocess.run("git checkout main", shell=True)

    # Full history of main as SHAs; reversed so the list runs oldest -> newest
    # (git log prints the newest commit first).
    lines = subprocess.getoutput("git log --format=format:%H").split('\n')
    commits = lines[::-1]
    print(len(commits))
    print(commits[:3])

    print()
    # Map the commit each release tag points at to the tag name.
    # NOTE: this also includes tags that live on other branches.
    all_tags = {}
    for tag in release_tags:
        rev_list_output = subprocess.getoutput(f"git rev-list -n 1 {tag}")
        sha = rev_list_output.split('\n')[0]
        all_tags[sha] = tag
    print(all_tags)
finally:
    # Always return to the parent directory, even if a git call fails.
    chdir('..')

In [None]:
# Number of commits found in the history of main.
len(commits)

In [None]:
# Every PR merge commit must be present in the local history of main.
# A set makes each lookup O(1) instead of scanning the commit list per PR,
# and the message names the offending SHAs instead of a bare AssertionError.
commit_set = set(commits)
missing_prs = [sha for sha in prs if sha not in commit_set]
assert not missing_prs, (
    f"{len(missing_prs)} PR merge commits are missing from main, e.g. {missing_prs[:5]}"
)

In [None]:
# Keep only the tags whose commit is part of main's history.
tags = {sha: tag for sha, tag in all_tags.items() if sha in commits}

print(len(tags), "release tags on main")
# Sanity check: every retained tag commit is in the commit list.
assert all(sha in commits for sha in tags)

## Sektionera PR mellan releases

Gå igenom alla commits i 2 vändor O(2*n) = O(n)
1. Extrahera tags -> sections {start: tag, prs: [], end: tag}
2. Öka section för varje tag och lägg till pr på rätt plats

Ordningen baseras på commits i main.

In [None]:
# Tagged commits in the same order as `commits`.
tag_commits = [sha for sha in commits if sha in tags]

# Each pair of consecutive release tags bounds one section; its PR list is filled in later.
sections = [
    {'start': tags[first], 'prs': [], 'end': tags[second]}
    for first, second in zip(tag_commits, tag_commits[1:])
]

for section in sections:
    print(section)

In [None]:
# PR merge commits in the same order as `commits`.
pr_commits = [sha for sha in commits if sha in prs]
first_pr_pos = commits.index(pr_commits[0])
last_tag_pos = commits.index(tag_commits[-1])
# Warn (message: "all releases come before the first PR") when the first PR
# commit sits after the last tagged commit.
if last_tag_pos < first_pr_pos:
    print("WARNING - alla releases kommer innan första PR")

In [None]:
# Walk the commit list once, appending each PR commit to the section that is
# currently open. `sec` is the index of that section; it starts at -1 and is
# advanced each time a tagged commit is passed.
sec = -1
for commit in commits:
    if 0 <= sec < len(sections) and commit in prs:
        sections[sec]['prs'].append(prs[commit])
    if commit in tags:
        # The PR check above runs before this increment, so a PR commit that is
        # itself tagged is included as the last PR of the section it closes.
        # NOTE: it must not be included as the first PR of the next section - otherwise the diff would be wrong.
        sec += 1
        
for section in sections:
    print(len(section['prs']))

In [None]:
# Inspect the first section after PRs have been assigned.
sections[0]

Temporär lösning, hårdkoda några random sections

In [None]:
# Disabled code kept as a bare string literal (it is not executed): a temporary
# variant that built sections from hard-coded indices into pr_commits.
"""
print(len(prs))
release_indices = [5, 40, 100, 435]

sections = []
for i in range(len(release_indices)-1):
    start = release_indices[i]
    end = release_indices[i+1]
    sections.append({
        'start': pr_commits[start],
        'prs': pr_commits[start+1:end],
        'end': pr_commits[end],
    })

sections[0]
"""

**TODO: Changes - modified only (ignore adds, dels)**

In [None]:
def doc_file(path):
    """Return True if `path` ends with the '.md' or '.rst' extension."""
    # A restriction to paths starting with 'doc/' is noted in the original but not applied.
    return path.endswith(('.md', '.rst'))

chdir(repo_dir)
try:
    subprocess.run("git checkout main", shell=True)

    for section in sections:
        # Files that differ between the two release tags bounding the section,
        # narrowed down to documentation files.
        start, end = section['start'], section['end']
        diff_output = subprocess.getoutput(f"git diff '{start}' '{end}' --name-only")
        changes = [path for path in diff_output.split('\n') if doc_file(path)]
        section['changes'] = changes
        print(len(changes), 'changes', changes[:3], '...')

finally:
    # Always return to the parent directory, even if a git call fails.
    chdir('..')

In [None]:
# Inspect the first section, now including its 'changes' list.
sections[0]

Hämta docs-filerna, såsom de såg ut i början av varje PR

In [None]:
def find(start_dir, ext):
    """Recursively collect the files under `start_dir` whose name ends with `ext`.

    Returned paths are built as ``start_dir + "/" + name`` at every level.
    """
    matches = []
    for entry in os.listdir(start_dir):
        candidate = f"{start_dir}/{entry}"
        if os.path.isdir(candidate):
            # Descend into the sub-directory and keep whatever it yields.
            matches.extend(find(candidate, ext))
        elif os.path.isfile(candidate) and entry.endswith(ext):
            matches.append(candidate)
    return matches

def read(path):
    """Return the text content of the file at `path`.

    The file is decoded as UTF-8 regardless of the platform's locale
    encoding, and undecodable bytes are replaced with U+FFFD instead of
    raising, so a single oddly-encoded file cannot abort a whole run.
    """
    with open(path, encoding='utf-8', errors='replace') as f:
        return f.read()

def extract_docs(exts=['.md', '.rst']):
    """Read every file under `repo_dir` that has one of the extensions in `exts`.

    Returns a dict mapping each file's path, with the leading
    ``<repo_dir>/`` removed, to its text content.
    """
    prefix_len = len(repo_dir) + 1  # length of "<repo_dir>/"
    return {
        path[prefix_len:]: read(path)
        for ext in exts
        for path in find(repo_dir, ext)
    }

In [None]:
# SHAs of every PR that was assigned to a section, in section order.
pr_shas = [pr['sha'] for section in sections for pr in section['prs']]
print(len(pr_shas))

In [None]:
docs = {}
try:
    for sha in tqdm(pr_shas):
        # The model needs the docs as they were *before* the PR, hence ^1
        # (the first parent of the merge commit). This assumes each PR is a
        # single commit on the main branch; other cases can be handled too,
        # but that is more involved and requires GitHub's API.
        #
        # cwd=repo_dir: the checkout must run inside the clone. The earlier
        # cells chdir back to the parent directory, so without it git would
        # act on whatever repository (if any) contains the notebook and every
        # snapshot would be read from an unchanged working tree.
        # check=True: a failed checkout must not be recorded as a snapshot.
        subprocess.run(
            ['git', 'checkout', '--quiet', f'{sha}^1'],
            cwd=repo_dir,
            check=True,
        )
        docs[sha] = extract_docs()
finally:
    # Best effort: leave the clone on main rather than on a detached HEAD.
    subprocess.run(['git', 'checkout', '--quiet', 'main'], cwd=repo_dir)



- **TODO: Extrahera filer, filnamn i uppdaterat format (ladda från sido-repo)**
- **TODO: Metadata för varje PR**
- **TODO: prova om det går att använda base_sha istället för ^1**
- **TODO: Hitta repo som fungerar**

## Evaluation-loop

In [None]:
def predict_all(pr, all_docs):
    """Baseline predictor: return `all_docs` unchanged, ignoring `pr`.

    The evaluation loop wraps the result in ``set()``; presumably `all_docs`
    is the path -> content dict built by ``extract_docs``, in which case that
    yields every doc path.
    """
    return all_docs

def predict_none(pr, all_docs):
    """Baseline predictor: return an empty list, ignoring both arguments."""
    return []

def predict_tfidf(pr, all_docs, top_k=10):
    """Rank doc files by TF-IDF similarity between their content and the PR title.

    Args:
        pr: Mapping with a 'title' entry (the query text).
        all_docs: Mapping of doc file path -> file content.
        top_k: Maximum number of paths to return (default 10).

    Returns:
        Up to `top_k` paths from `all_docs`, highest similarity first; equal
        scores are ordered by path, descending. An empty list when
        `all_docs` is empty.
    """
    # Fitting the vectorizer on an empty corpus raises ValueError, so an
    # empty snapshot simply yields no predictions.
    if not all_docs:
        return []

    doc_files = list(all_docs)
    corpus = [all_docs[path] for path in doc_files]
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus)

    # TfidfVectorizer L2-normalises rows by default, so the linear kernel
    # (dot product) equals the cosine similarity.
    query = vectorizer.transform([pr['title']])
    cosine_sims = linear_kernel(query, tfidf).flatten()

    ranked = sorted(zip(cosine_sims, doc_files), reverse=True)
    return [path for _, path in ranked[:top_k]]


In [None]:
predict = predict_tfidf

# Counts are accumulated over all sections before the metrics are computed.
tp = fp = fn = 0

for section in sections:
    actual_changes = set(section['changes'])

    # Union of the predictions made for every PR in the section.
    model_changes = set()
    for pr in section['prs']:
        model_changes |= set(predict(pr, docs[pr['sha']]))

    tp += len(actual_changes & model_changes)
    fp += len(model_changes - actual_changes)
    fn += len(actual_changes - model_changes)

try:
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
except ZeroDivisionError:
    # Any undefined ratio reports all three metrics as 0.
    precision = recall = f1 = 0

print("f1:", f1)
print("precision:", precision)
print("recall:", recall)