In [3]:
import requests
from datetime import datetime
from tqdm import tqdm
import pickle
from os import chdir
import subprocess
from dateutil.parser import parse
import os
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

In [4]:
token = "..."
headers = {'Authorization': 'token ' + token}

In [5]:
repo = "pallets/flask"

In [6]:
def fetch_responses(token, url):
    responses = []
    while True:
        headers = {'Authorization': 'token ' + token}
        rsp = requests.get(url, headers=headers)
        responses.append(rsp)
        if 'next' in rsp.links:
            url = rsp.links['next']['url']
        else:
            break
    return responses

def rate_limit(response):
    print("Rate remaining:", response.headers['X-RateLimit-Remaining'])
    print("Rate limit reset:", datetime.fromtimestamp(int(response.headers['X-RateLimit-Reset'])))

## Fetch PRs

In [7]:
url = f"https://api.github.com/search/issues?q=repo:{repo}+is:merged+base:main&per_page=100"
url

'https://api.github.com/search/issues?q=repo:pallets/flask+is:merged+base:main&per_page=100'

In [8]:
responses = fetch_responses(token, url)
responses

[<Response [200]>]

In [9]:
rate_limit(responses[-1])

Rate remaining: 29
Rate limit reset: 2022-03-25 16:14:51


In [10]:
items = []
for response in responses:
    items += response.json()['items']
len(items)

41

In [11]:
numbers = [x['number'] for x in items]
numbers[:5]

[4492, 4491, 4488, 4487, 4486]

In [12]:
pulls_resps = []
for number in tqdm(numbers):
    url = f"https://api.github.com/repos/{repo}/pulls/{number}"
    resp = requests.get(url, headers=headers)
    pulls_resps.append(resp)
print(set([x.status_code for x in pulls_resps]))

100%|██████████| 41/41 [00:23<00:00,  1.74it/s]

{200}





In [13]:
rate_limit(pulls_resps[-1])

Rate remaining: 4917
Rate limit reset: 2022-03-25 17:04:53


In [14]:
for pr in pulls_resps[20:30]:
    print(pr.json()['merge_commit_sha'])

d01d26e5210e3ee4cbbdef12f05c886e08e92852
048709c8e7d4d3916193bd2c735ee93c073ab686
d7199c87854c574f46288fd1086fa31fb42c1446
e4373eba1cdd4b288b9d37e784bd26e0ca32056c
0826be48ed50c9ac73160ba7225715659961a992
f5f51cd09c094cc15e7e6385cf27564bedc30b1b
4550beb695e9090259957d4ea1a7a8af9d7b2802
625595cb1a0a70c34a82b5c151d02e6359d5e191
e248e09399b21b0d21d3c8005a30c4dc4d430faa
48ee204dd5bdc47f397553fb45b5fe37003c07d1


In [15]:
pulls_resps[0].json().keys()

dict_keys(['url', 'id', 'node_id', 'html_url', 'diff_url', 'patch_url', 'issue_url', 'number', 'state', 'locked', 'title', 'user', 'body', 'created_at', 'updated_at', 'closed_at', 'merged_at', 'merge_commit_sha', 'assignee', 'assignees', 'requested_reviewers', 'requested_teams', 'labels', 'milestone', 'draft', 'commits_url', 'review_comments_url', 'review_comment_url', 'comments_url', 'statuses_url', 'head', 'base', '_links', 'author_association', 'auto_merge', 'active_lock_reason', 'merged', 'mergeable', 'rebaseable', 'mergeable_state', 'merged_by', 'comments', 'review_comments', 'maintainer_can_modify', 'commits', 'additions', 'deletions', 'changed_files'])

In [16]:
milestones = set()
for pr in pulls_resps:
    pr = pr.json()
    if pr['milestone']:
        milestones.add(pr['milestone']['id'])
milestones

{6751979, 6751980}

In [17]:
prs = {}
for pr in pulls_resps:
    pr = pr.json()
    sha = pr['merge_commit_sha']
    prs[sha] = {
        'title': pr['title'],
        'sha': sha
    }
len(prs)

41

## Fetch releases

In [18]:
url = f"https://api.github.com/repos/{repo}/releases?per_page=100"
release_resps = fetch_responses(token, url)
print(set([x.status_code for x in release_resps]))

{200}


In [19]:
release_tags = set()
for r in release_resps[0].json():
    if r['target_commitish'] == 'main' and r['draft'] == False and r['prerelease'] == False:
        release_tags.add(r['tag_name'])
print(len(release_tags))
list(release_tags)[0]

4


'2.0.3'

In [20]:
rate_limit(release_resps[-1])

Rate remaining: 4916
Rate limit reset: 2022-03-25 17:04:53


In [21]:
release_resps[0].json()[0].keys()

dict_keys(['url', 'assets_url', 'upload_url', 'html_url', 'id', 'author', 'node_id', 'tag_name', 'target_commitish', 'name', 'draft', 'prerelease', 'created_at', 'published_at', 'assets', 'tarball_url', 'zipball_url', 'body', 'reactions'])

In [22]:
release_resps[0].json()[0]['html_url']

'https://github.com/pallets/flask/releases/tag/2.0.3'

## Kontrollera att alla PR, tags finns med lokalt

- Plocka ut en lista med alla sha lokalt
- Map pr->sha för att kolla om alla pr finns i historiken
- Kan även kolla för release tags

In [23]:
subprocess.run(f"git clone https://github.com/{repo}", shell=True)

CompletedProcess(args='git clone https://github.com/pallets/flask', returncode=128)

In [24]:
repo_dir = repo.split('/')[-1]
repo_dir

'flask'

In [25]:
chdir(repo_dir)
try:
    subprocess.run("git checkout main", shell=True)
    
    lines = subprocess.getoutput("git log --format=format:%H").split('\n')
    commits = list(reversed(lines))
    print(len(commits))
    print(commits[:3])
    
    print()
    all_tags = {}
    for tag in release_tags:
        # OBS: Detta inkluderar även tags från andra branches...
        lines = subprocess.getoutput(f"git rev-list -n 1 {tag}").split('\n')
        sha = lines[0]
        all_tags[sha] = tag
    print(all_tags)
finally:
    chdir('..')

4577
['33850c0ebd23ae615e6823993d441f46d80b1ff0', 'b15ad394279fc3b7f998fa56857f334a7c0156f6', '4ec7d2a0d8eac4f915dc0d38a886cd57045bb0c4']

{'ef557b3ff2602b9956a2f3ac02c6e134c529fccc': '2.0.3', '6f7762538bffe3ce9d03508ecab230bfff3e3dcd': '2.0.2', 'bc90801c2ada42d3cf112a3b5701bfdbb8b6211c': '2.0.1', '2f0c62f5e6e290843f03c1fa70817c7a3c7fd661': '2.0.0'}


In [26]:
len(commits)

4577

In [27]:
for key in prs.keys():
    assert key in commits

In [28]:
# Endast tag från main
tags = {}
for sha, tag in all_tags.items():
    if sha in commits:
        tags[sha] = tag

print(len(tags), "release tags on main")
for key in tags.keys():
    assert key in commits

4 release tags on main


## Sektionera PR mellan releases

Gå igenom alla commits i 2 vändor O(2*n) = O(n)
1. Extrahera tags -> sections {start: tag, prs: [], end: tag}
2. Öka section för varje tag och lägg till pr på rätt plats

Ordningen baseras på commits i main.

In [29]:
tag_commits = [c for c in commits if c in tags]

sections = []
for i in range(len(tag_commits)-1):
    sections.append({
        'start': tags[tag_commits[i]],
        'prs': [],
        'end': tags[tag_commits[i+1]],
    })

for c in sections:
    print(c)

{'start': '2.0.0', 'prs': [], 'end': '2.0.1'}
{'start': '2.0.1', 'prs': [], 'end': '2.0.2'}
{'start': '2.0.2', 'prs': [], 'end': '2.0.3'}


In [30]:
pr_commits = [c for c in commits if c in prs]
if commits.index(pr_commits[0]) > commits.index(tag_commits[-1]):
    print("WARNING - alla releases kommer innan första PR")

In [31]:
sec = -1
for commit in commits:
    if 0 <= sec < len(sections) and commit in prs:
        sections[sec]['prs'].append(prs[commit])
    if commit in tags:
        # Inkludera sista PR om den skulle vara en tag
        # OBS: den första ska inte inkuderas - annars bli diffen fel
        sec += 1
        
for section in sections:
    print(len(section['prs']))

3
19
10


In [32]:
sections[0]

{'start': '2.0.0',
 'prs': [{'title': 'update click minimum version',
   'sha': '905e5c23e8c5f6362b38ec1b5526fe999f491229'},
  {'title': 'Fix typo in the example of nesting bp docs',
   'sha': 'd575de5159a6e40944275763c9ada2801214058b'},
  {'title': 'converters have access to session',
   'sha': '9039534eee6a87da98a1dee9e4338d1b73e861f8'}],
 'end': '2.0.1'}

Temporär lösning, hårdkoda några random sections

In [33]:
"""
print(len(prs))
release_indices = [5, 40, 100, 435]

sections = []
for i in range(len(release_indices)-1):
    start = release_indices[i]
    end = release_indices[i+1]
    sections.append({
        'start': pr_commits[start],
        'prs': pr_commits[start+1:end],
        'end': pr_commits[end],
    })

sections[0]
"""

"\nprint(len(prs))\nrelease_indices = [5, 40, 100, 435]\n\nsections = []\nfor i in range(len(release_indices)-1):\n    start = release_indices[i]\n    end = release_indices[i+1]\n    sections.append({\n        'start': pr_commits[start],\n        'prs': pr_commits[start+1:end],\n        'end': pr_commits[end],\n    })\n\nsections[0]\n"

**TODO: Changes - modified only (ignore adds, dels)**

In [34]:
def doc_file(path):
    l = path
    return (l.endswith('.md') or l.endswith('.rst')) # and l.startswith('doc/')

chdir(repo_dir)
try:
    subprocess.run("git checkout main", shell=True)
    
    for section in sections:
        # Extract actual changes from history
        start = section['start']
        end = section['end']
        command = f"git diff '{start}' '{end}' --name-only"
        lines = subprocess.getoutput(command).split('\n')
        changes = list(filter(doc_file, lines))
        section['changes'] = changes
        print(len(changes), 'changes', changes[:3], '...')
        
finally:
    chdir('..')

8 changes ['CHANGES.rst', 'CONTRIBUTING.rst', 'README.rst'] ...
25 changes ['CHANGES.rst', 'CONTRIBUTING.rst', 'README.rst'] ...
16 changes ['CHANGES.rst', 'docs/api.rst', 'docs/cli.rst'] ...


In [35]:
sections[0]

{'start': '2.0.0',
 'prs': [{'title': 'update click minimum version',
   'sha': '905e5c23e8c5f6362b38ec1b5526fe999f491229'},
  {'title': 'Fix typo in the example of nesting bp docs',
   'sha': 'd575de5159a6e40944275763c9ada2801214058b'},
  {'title': 'converters have access to session',
   'sha': '9039534eee6a87da98a1dee9e4338d1b73e861f8'}],
 'end': '2.0.1',
 'changes': ['CHANGES.rst',
  'CONTRIBUTING.rst',
  'README.rst',
  'docs/blueprints.rst',
  'docs/deploying/index.rst',
  'docs/patterns/viewdecorators.rst',
  'docs/quickstart.rst',
  'examples/tutorial/README.rst']}

Hämta docs filer, så som de såg ut i början av varje PR

In [36]:
def find(start_dir, ext):
    """ Search files recursively """
    files = []
    for file in os.listdir(start_dir):
        path = start_dir + "/" + file
        if os.path.isdir(path):
            files += find(path, ext)
        elif os.path.isfile(path) and file.endswith(ext):
            files.append(path)
    return files

def read(path):
    with open(path) as f:
        content = f.read()
    return content

def extract_docs(exts=['.md', '.rst']):
    docs = {}
    for ext in exts:
        paths = find(repo_dir, ext)
        for path in paths:
            key = path[len(repo_dir)+1:]
            docs[key] = read(path)
    return docs

In [37]:
pr_shas = []
for section in sections:
    pr_shas += [pr['sha'] for pr in section['prs']]
print(len(pr_shas))

32


In [38]:
docs = {}
for sha in tqdm(pr_shas):
    # Modellen behöver veta hur docs var innan PR, därför ^1
    # antag att varje PR består av 1 commit i master
    # det går att lösa annars också, men det blir mer komplicerat och involverar GitHub's API
    subprocess.run(f'git checkout {sha}^1', shell=True)
    docs[sha] = extract_docs()



100%|██████████| 32/32 [00:01<00:00, 24.81it/s]


- **TODO: Extrahera filer, filnamn i uppdaterat format (ladda från sido-repo)**
- **TODO: Metadata för varje PR**
- **TODO: prova om det går att använda base_sha istället för ^1**
- **TODO: Hitta repo som fungerar**

## Evaluation-loop

In [39]:
def predict_all(pr, all_docs):
    return all_docs

def predict_none(pr, all_docs):
    return []

def predict_tfidf(pr, all_docs):
    
    def search(text, tfidf):
        query = vectorizer.transform([text])
        cosine_sims = linear_kernel(query, tfidf).flatten()
        return sorted(zip(cosine_sims, doc_files), reverse=True)
    
    doc_files = list(all_docs.keys())
    corpus = [all_docs[doc] for doc in doc_files]
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus)
    
    result = []
    for match in search(pr['title'], tfidf):
        result.append(match[1])
    return result[:10]


In [41]:
predict = predict_tfidf

tp = 0
fp = 0
fn = 0

for section in sections:
    actual_changes = set(section['changes'])
    
    model_changes = set()
    for pr in section['prs']:
        prediction = set(predict(pr, docs[pr['sha']]))
        model_changes = model_changes.union(prediction)
    
    tp += len(actual_changes.intersection(model_changes))
    fp += len(model_changes.difference(actual_changes))
    fn += len(actual_changes.difference(model_changes))
    

try:
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
except ZeroDivisionError:
    precision = 0
    recall = 0
    f1 = 0


print("f1:", f1)
print("precision:", precision)
print("recall:", recall)

f1: 0.3670886075949367
precision: 0.26605504587155965
recall: 0.5918367346938775
