In [1]:
import requests
from datetime import datetime
from tqdm import tqdm
import pickle
from os import chdir
import subprocess
from dateutil.parser import parse
import os
import random

# Import the project's `extract` module, which is looked up one directory
# above the notebook. The working directory is changed only for the duration
# of the import and is then restored to wherever the notebook was started,
# rather than to a hard-coded 'notebooks' folder.
_notebook_dir = os.getcwd()
chdir('..')
try:
    # NOTE(review): the star import hides which names come from `extract`;
    # the visible cells appear to use only `Extractor` - consider importing
    # it explicitly.
    from extract import *
finally:
    chdir(_notebook_dir)

In [2]:
# GitHub credentials and target repository.
# A token supplied through the GITHUB_TOKEN environment variable takes
# precedence, so the secret never has to be stored in the notebook; the "..."
# placeholder is kept only as a backward-compatible fallback.
token = os.environ.get("GITHUB_TOKEN", "...")
headers = {'Authorization': 'token ' + token}
repo = "pandas-dev/pandas"

In [3]:
def fetch_responses(token, url):
    """Fetch every page of a paginated API listing.

    Starting at ``url``, keeps requesting the URL found under
    ``rsp.links['next']['url']`` until a response has no ``'next'`` link.

    Args:
        token: access token, sent as ``Authorization: token <token>``.
        url: URL of the first page to request.

    Returns:
        List of response objects, one per requested page, in request order.
        Status codes are not checked here; callers inspect them.
    """
    # The header is identical for every page, so build it once up front.
    headers = {'Authorization': 'token ' + token}
    responses = []
    while True:
        rsp = requests.get(url, headers=headers)
        responses.append(rsp)
        next_link = rsp.links.get('next')
        if next_link is None:
            # No 'next' relation: this was the last page.
            return responses
        url = next_link['url']

def rate_limit(response):
    """Print the rate-limit status carried in a response's headers.

    Reads ``X-RateLimit-Remaining`` and ``X-RateLimit-Reset`` from
    ``response.headers``; the reset value is converted with ``int`` and
    ``datetime.fromtimestamp`` before printing.
    """
    limit_headers = response.headers
    remaining = limit_headers['X-RateLimit-Remaining']
    print("Rate remaining:", remaining)
    reset_epoch = int(limit_headers['X-RateLimit-Reset'])
    print("Rate limit reset:", datetime.fromtimestamp(reset_epoch))

## Fetch PRs

In [4]:
# Search-API URL for merged pull requests with base `main` in `repo`,
# requesting 100 results per page.
url = "https://api.github.com/search/issues?q=" + "+".join(
    ["repo:" + repo, "is:merged", "base:main"]
) + "&per_page=100"
url

'https://api.github.com/search/issues?q=repo:pandas-dev/pandas+is:merged+base:main&per_page=100'

In [5]:
# Download every page of the search results (fetch_responses follows the 'next' links).
responses = fetch_responses(token, url)
responses

[<Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>]

In [6]:
# Print the rate-limit headers carried by the last search response.
rate_limit(responses[-1])

Rate remaining: 24
Rate limit reset: 2022-03-24 11:12:10


In [7]:
# URL that was requested for the last page of search results.
responses[-1].url

'https://api.github.com/search/issues?q=repo%3Apandas-dev%2Fpandas+is%3Amerged+base%3Amain&per_page=100&page=6'

In [8]:
# Flatten the 'items' arrays of all result pages into a single list.
items = [item for page in responses for item in page.json()['items']]
len(items)

532

In [9]:
# Pull request numbers, in the same order as `items`.
numbers = [item['number'] for item in items]
numbers[:5]

[46476, 46472, 46466, 46462, 46461]

In [10]:
# Fetch the full pull-request object for every PR number found by the search.
# The repository name comes from `repo` (instead of a second hard-coded copy)
# and one Session is reused so the sequential requests can share a connection.
pulls_resps = []
with requests.Session() as session:
    session.headers.update(headers)
    for number in tqdm(numbers):
        url = f"https://api.github.com/repos/{repo}/pulls/{number}"
        pulls_resps.append(session.get(url))
# Distinct HTTP status codes seen; anything besides 200 needs a closer look.
print({resp.status_code for resp in pulls_resps})

100%|██████████| 532/532 [04:44<00:00,  1.87it/s]

{200}





In [11]:
# Print the rate-limit headers carried by the last pull-request response.
rate_limit(pulls_resps[-1])

Rate remaining: 4468
Rate limit reset: 2022-03-24 12:11:22


In [12]:
# Spot-check the merge commit SHA for a small slice of the PR responses.
for pull_response in pulls_resps[20:30]:
    pull = pull_response.json()
    print(pull['merge_commit_sha'])

6033ed4b3383d874ee4a8a461724c0b8c2ca968d
171cdad93349a10d88cd9ac002a5d0ad3c751ed3
d2478f55d115b86f08e017043132709198e194fd
abfb4b7a9b6ac188045ec85246919dc9b816d74f
b14d62e35e13b313c8d4436667437bb0a6204bc0
f469af7b381075a5a0e974462169ce8800258f0d
55d1acac77c1f85031c637c7cddd3bad8d7c8bf8
2d80d92ce35597992beb68e7bdc24c04db4061f7
a875c23f0def5f728e913a38e063737112ea21f1
ac532461a6840b4a1f1da8b565322818432690f7


In [13]:
# List the fields of a pull-request object, using the first response as a sample.
pulls_resps[0].json().keys()

dict_keys(['url', 'id', 'node_id', 'html_url', 'diff_url', 'patch_url', 'issue_url', 'number', 'state', 'locked', 'title', 'user', 'body', 'created_at', 'updated_at', 'closed_at', 'merged_at', 'merge_commit_sha', 'assignee', 'assignees', 'requested_reviewers', 'requested_teams', 'labels', 'milestone', 'draft', 'commits_url', 'review_comments_url', 'review_comment_url', 'comments_url', 'statuses_url', 'head', 'base', '_links', 'author_association', 'auto_merge', 'active_lock_reason', 'merged', 'mergeable', 'rebaseable', 'mergeable_state', 'merged_by', 'comments', 'review_comments', 'maintainer_can_modify', 'commits', 'additions', 'deletions', 'changed_files'])

In [14]:
# Ids of all milestones referenced by the fetched PRs; PRs whose 'milestone'
# field is falsy are skipped.
milestones = {
    pull['milestone']['id']
    for pull in (resp.json() for resp in pulls_resps)
    if pull['milestone']
}
milestones

{6840253, 7530006, 7593147, 7667127}

In [15]:
# Map merge commit SHA -> PR number.
prs = {
    pull['merge_commit_sha']: pull['number']
    for pull in (resp.json() for resp in pulls_resps)
}
len(prs)

532

## Fetch releases

In [16]:
# Fetch all release pages for the repository configured in `repo`
# (previously the repository name was hard-coded a second time here).
url = f"https://api.github.com/repos/{repo}/releases?per_page=100"
release_resps = fetch_responses(token, url)
# Distinct HTTP status codes seen across the pages.
print({resp.status_code for resp in release_resps})

{200}


In [17]:
# Collect tag names of releases that target `main` and are neither drafts
# nor pre-releases. Every page returned by the paginated request is scanned,
# not only the first one.
release_tags = set()
for release_resp in release_resps:
    for r in release_resp.json():
        if r['target_commitish'] == 'main' and not r['draft'] and not r['prerelease']:
            release_tags.add(r['tag_name'])
print(len(release_tags))
# Sets are unordered, so sort to make the displayed sample reproducible.
sorted(release_tags)[0]

17


'v0.14.0'

In [18]:
# Print the rate-limit headers carried by the last releases response.
rate_limit(release_resps[-1])

Rate remaining: 4467
Rate limit reset: 2022-03-24 12:11:22


In [19]:
# List the fields of a release object, using the first release of the first page.
release_resps[0].json()[0].keys()

dict_keys(['url', 'assets_url', 'upload_url', 'html_url', 'id', 'author', 'node_id', 'tag_name', 'target_commitish', 'name', 'draft', 'prerelease', 'created_at', 'published_at', 'assets', 'tarball_url', 'zipball_url', 'body'])

In [20]:
# Web URL of the first release on the first page.
release_resps[0].json()[0]['html_url']

'https://github.com/pandas-dev/pandas/releases/tag/v1.4.1'

## Kontrollera att alla PR, tags finns med lokalt

- Plocka ut en lista med alla sha lokalt
- Map pr->sha för att kolla om alla pr finns i historiken
- Kan även kolla för release tags

In [22]:
!git clone https://github.com/pandas-dev/pandas

Cloning into 'pandas'...
remote: Enumerating objects: 307148, done.[K
remote: Total 307148 (delta 0), reused 0 (delta 0), pack-reused 307148[K
Receiving objects: 100% (307148/307148), 261.26 MiB | 5.07 MiB/s, done.
Resolving deltas: 100% (257011/257011), done.


In [23]:
# Read the commit history of `main` and resolve each release tag to a commit.
# Git runs with cwd="pandas" and an argument list (no shell), so the
# notebook's working directory is never changed and tag names coming from the
# API are not interpreted by a shell.
subprocess.run(["git", "checkout", "main"], cwd="pandas", check=True)

# `commits` holds the lines of `git log` in reversed order.
log_output = subprocess.check_output(
    ["git", "log", "--format=format:%H"], cwd="pandas", text=True
)
commits = list(reversed(log_output.splitlines()))
print(len(commits))
print(commits[:3])

print()
all_tags = {}
for tag in release_tags:
    # NOTE: this also includes tags from other branches...
    resolved = subprocess.run(
        ["git", "rev-list", "-n", "1", tag],
        cwd="pandas", capture_output=True, text=True,
    )
    if resolved.returncode != 0:
        # Skip tags git cannot resolve instead of storing its error text as a SHA.
        print(f"WARNING - could not resolve tag {tag!r}: {resolved.stderr.strip()}")
        continue
    sha = resolved.stdout.split('\n')[0]
    all_tags[sha] = tag
print(all_tags)

29228
['9d0080576446de475d34b0dbb58389b15cd4f529', 'ec1a0a2a2571dc2c1c26612b374d4a66b22f0938', '1eeadf4e401647faa20911f531bc05c1872262ea']

{'da0f7ae362bb0ee747c3c5c141327d1d8ba161bc': 'v0.14.0', 'ca9eefc3c4733f368c054e33537ff18384114b43': 'v0.16.1', '27b783986230a3d044d045604b72a51acd13b7be': 'v0.19.1', 'e346c663cf76186c22f4d3b703461b1b60db280f': 'v0.20.1', '2814061730893bc8122caa4e01197c699da352e6': 'v0.20.2', '017adeaa4b70d63fe6c788db457dc9d31562f4d6': 'v0.15.0', '06d230151e6f18fdb8139d09abf539867a8cd481': 'v1.4.1', 'bb1f651536508cdfef8550f93ace7849b00046ee': 'v1.4.0', '18ea1d856d45c87fe18a41d1a267ede46e10880e': 'v0.15.2', 'a31c96d34d00dc757908b564dc93991e867d83e2': 'v0.20.0', 'b97dbd01e49f54ae6fa8df382d6f6e4c771d2bc0': 'v0.19.0', 'c91bdbadfdbf9f60879ead8dd86bd1e72ca18ccf': 'v0.16.0', 'd8ab3415373ea42713c096a97c3d9ed5c9cb82ee': 'v0.15.1', '06832891870119984c6a5404bc7f7a471f43b99c': 'v0.16.2', 'fe48704835323c140846d1bde5e1387aa0cac3d4': 'v0.17.0', '6c30cbecf8e5ae610f2a37ba821116bd9f7

In [24]:
# Number of commits read from the local history.
len(commits)

29228

In [25]:
# Every PR merge commit must be part of the local commit list.
# Membership is tested against a set so each lookup does not scan the whole
# list, and a failure reports which SHAs are missing.
commit_set = set(commits)
missing = [sha for sha in prs if sha not in commit_set]
assert not missing, f"{len(missing)} PR merge commits not found locally, e.g. {missing[:3]}"

In [26]:
# Only tags from main: keep the tags whose commit is in `commits`.
commit_set = set(commits)  # set membership avoids rescanning the list per tag
tags = {sha: tag for sha, tag in all_tags.items() if sha in commit_set}

print(len(tags), "release tags on main")
# Sanity check; holds by construction of `tags` above.
assert all(sha in commit_set for sha in tags)

12 release tags on main


## Sektionera PR mellan releases

Gå igenom alla commits i 2 vändor O(2*n) = O(n)
1. Extrahera tags -> sections {start: tag, prs: [], end: tag}
2. Öka section för varje tag och lägg till pr på rätt plats

Ordningen baseras på commits i main.

In [33]:
# Tagged commits, kept in the order in which they occur in `commits`.
tag_commits = [sha for sha in commits if sha in tags]

# One section per pair of consecutive tagged commits; 'prs' starts out empty.
sections = [
    {'start': tags[first], 'prs': [], 'end': tags[second]}
    for first, second in zip(tag_commits, tag_commits[1:])
]

for section in sections:
    print(section)

{'start': 'v0.14.0', 'prs': [], 'end': 'v0.15.0'}
{'start': 'v0.15.0', 'prs': [], 'end': 'v0.15.1'}
{'start': 'v0.15.1', 'prs': [], 'end': 'v0.15.2'}
{'start': 'v0.15.2', 'prs': [], 'end': 'v0.16.0'}
{'start': 'v0.16.0', 'prs': [], 'end': 'v0.16.1'}
{'start': 'v0.16.1', 'prs': [], 'end': 'v0.16.2'}
{'start': 'v0.16.2', 'prs': [], 'end': 'v0.17.0'}
{'start': 'v0.17.0', 'prs': [], 'end': 'v0.17.1'}
{'start': 'v0.17.1', 'prs': [], 'end': 'v0.19.0'}
{'start': 'v0.19.0', 'prs': [], 'end': 'v0.20.0'}
{'start': 'v0.20.0', 'prs': [], 'end': 'v0.20.1'}


In [34]:
# PR merge commits, kept in the order in which they occur in `commits`.
pr_commits = [sha for sha in commits if sha in prs]
# Warn when the first PR commit sits after the last tagged commit.
first_pr_pos = commits.index(pr_commits[0])
last_tag_pos = commits.index(tag_commits[-1])
if last_tag_pos < first_pr_pos:
    print("WARNING - alla releases kommer innan första PR")



Temporär lösning: hårdkoda några godtyckliga sektioner (index i `pr_commits`), eftersom alla release-taggar på main ligger före den första PR:en.

In [35]:
print(len(prs))
# Hand-picked positions in `pr_commits` used as stand-in section boundaries.
release_indices = [5, 40, 100, 435]

# Each section spans two consecutive boundaries; 'prs' holds the commits
# strictly between them.
sections = [
    {
        'start': pr_commits[lo],
        'prs': pr_commits[lo + 1:hi],
        'end': pr_commits[hi],
    }
    for lo, hi in zip(release_indices, release_indices[1:])
]

sections[0]

532


{'start': 'e53bda9e64c067f5612e069830acbf314ece7242',
 'prs': ['83ea173d150e715790ce7a1b2d329b3eeb4c3ea0',
  '96f2f2aff10666aa2b8118174a3f39a2e88a0bf4',
  'e84b9ee40075d8d9ad2eb217a5d9671e7b9073dd',
  'e255e56fa086e06127268e409adb82f440326273',
  'f2a0125663bbcf8701347c00329c311e7391608e',
  '51675d0839480ba7ada44cc93ba8a8df94d33de0',
  '5acb14b786b3e2f5cdcb54e158323608d19c0422',
  '5ceeb43a908b9e885c7fe3cc91e4b446255156a9',
  '94f976fd43c6ce00adb6d4b4d3e7c0bef8d78788',
  'd2ef7e9a3f46489a1fa57c957d8c3ce664622723',
  'db5c087a7e126c9e87f75968067ba756b3747333',
  '37c43da9ec4ced1108a4ed5320a1d3175f88eaec',
  'c80f656aad987881dac5cbccfc957dc6c3cef866',
  '39b7a6d5614dc2a3e0ce9d799fa1dc98c1e3cfd5',
  'e3ff3d95d7f87e6c0b24c5ad2983e84778c95741',
  'a659f1df7aa13282122fed472756e3b38b34ce55',
  'be20b2d903c21b961f45a22d51137944732cfcbb',
  '5357f795757ac026c506134b4500a14068196f3f',
  'ae5b2b7bab84a9c276ca14d0a46060735a0bd601',
  '09d693f445186a3e67ba2f0652e61ca7650ed4f4',
  '43245c93bf286a46

**TODO: Changes - modified only (ignore adds, dels)**

In [36]:
# For each section, list the documentation files that differ between its
# start and end commits. Git runs with cwd="pandas" and an argument list, so
# the notebook's working directory is left untouched and a failing git
# command raises instead of silently yielding an empty change list.
subprocess.run(["git", "checkout", "main"], cwd="pandas", check=True)

for section in sections:
    # Extract actual changes from history
    start = section['start']
    end = section['end']
    diff_output = subprocess.check_output(
        ["git", "diff", "--name-only", start, end], cwd="pandas", text=True
    )
    # Keep only .md / .rst files whose path starts with doc/.
    changes = [
        path for path in diff_output.splitlines()
        if path.startswith('doc/') and path.endswith(('.md', '.rst'))
    ]
    section['changes'] = changes
    print(len(changes), changes[:3])

6 ['doc/source/reference/arrays.rst', 'doc/source/user_guide/gotchas.rst', 'doc/source/user_guide/scale.rst']
35 ['doc/source/development/code_style.rst', 'doc/source/development/contributing_codebase.rst', 'doc/source/development/contributing_environment.rst']
34 ['doc/source/development/code_style.rst', 'doc/source/development/contributing_codebase.rst', 'doc/source/development/contributing_documentation.rst']


In [37]:
"""
# Jag tror detta funkar,
# men svårt att veta eftersom alla PR kom efter sista taggen i pandas/main
# behöver ett nytt repo / branch - alternativt testa med milestones istället...

sec = -1
for commit in commits:
    if sec >= 0 and commit in prs and sec < len(sections):
        print('add')
        break
        sections[sec]['prs'].append(commit) # todo: prs[commit]
    if commit in tags:
        # Inkludera sista PR om den skulle vara en tag
        # OBS: den första ska inte inkuderas - annars bli diffen fel
        sec += 1
        
for section in sections:
    print(len(section['prs']))
"""

"\n# Jag tror detta funkar,\n# men svårt att veta eftersom alla PR kom efter sista taggen i pandas/main\n# behöver ett nytt repo / branch - alternativt testa med milestones istället...\n\nsec = -1\nfor commit in commits:\n    if sec >= 0 and commit in prs and sec < len(sections):\n        print('add')\n        break\n        sections[sec]['prs'].append(commit) # todo: prs[commit]\n    if commit in tags:\n        # Inkludera sista PR om den skulle vara en tag\n        # OBS: den första ska inte inkuderas - annars bli diffen fel\n        sec += 1\n        \nfor section in sections:\n    print(len(section['prs']))\n"

In [38]:
# Inspect the first section.
sections[0]

{'start': 'e53bda9e64c067f5612e069830acbf314ece7242',
 'prs': ['83ea173d150e715790ce7a1b2d329b3eeb4c3ea0',
  '96f2f2aff10666aa2b8118174a3f39a2e88a0bf4',
  'e84b9ee40075d8d9ad2eb217a5d9671e7b9073dd',
  'e255e56fa086e06127268e409adb82f440326273',
  'f2a0125663bbcf8701347c00329c311e7391608e',
  '51675d0839480ba7ada44cc93ba8a8df94d33de0',
  '5acb14b786b3e2f5cdcb54e158323608d19c0422',
  '5ceeb43a908b9e885c7fe3cc91e4b446255156a9',
  '94f976fd43c6ce00adb6d4b4d3e7c0bef8d78788',
  'd2ef7e9a3f46489a1fa57c957d8c3ce664622723',
  'db5c087a7e126c9e87f75968067ba756b3747333',
  '37c43da9ec4ced1108a4ed5320a1d3175f88eaec',
  'c80f656aad987881dac5cbccfc957dc6c3cef866',
  '39b7a6d5614dc2a3e0ce9d799fa1dc98c1e3cfd5',
  'e3ff3d95d7f87e6c0b24c5ad2983e84778c95741',
  'a659f1df7aa13282122fed472756e3b38b34ce55',
  'be20b2d903c21b961f45a22d51137944732cfcbb',
  '5357f795757ac026c506134b4500a14068196f3f',
  'ae5b2b7bab84a9c276ca14d0a46060735a0bd601',
  '09d693f445186a3e67ba2f0652e61ca7650ed4f4',
  '43245c93bf286a46

Hämta docs-filerna så som de såg ut precis innan varje PR (dvs. vid commiten `sha^1`).

In [40]:
# Concatenate the 'prs' lists of all sections into one flat list.
pr_shas = [sha for section in sections for sha in section['prs']]
print(len(pr_shas))


427


In [41]:
extracts = {}
for sha in tqdm(pr_shas):
    # The model needs to know what the docs looked like before the PR, hence ^1.
    # Assumes every PR consists of a single commit on the main branch; other
    # cases can be handled too, but that is more complicated and involves
    # GitHub's API.
    # An argument list keeps `^` away from the shell, and check=True stops the
    # loop if a checkout fails instead of extracting from whatever revision
    # happens to be checked out.
    subprocess.run(["git", "checkout", f"{sha}^1"], cwd="pandas", check=True)
    extracts[sha] = {
        'md': Extractor('md').extract('pandas/doc'),
        'rst': Extractor('rst').extract('pandas/doc')
    }

100%|██████████| 427/427 [00:36<00:00, 11.73it/s]


In [42]:
# Combine the 'md' and 'rst' extracts of each commit with `+`.
docs = {sha: parts['md'] + parts['rst'] for sha, parts in extracts.items()}
len(docs)

427

- **TODO: Extrahera filer, filnamn i uppdaterat format (ladda från sido-repo)**
- **TODO: Metadata för varje PR**
- **TODO: Evaluation loop**

## Evaluation-loop

In [47]:
# Micro-averaged precision / recall / F1 over all sections, comparing the
# predicted set of changed doc files with the actual one.
tp = 0
fp = 0
fn = 0

# Dedicated, seeded generator so the placeholder "model" below gives the same
# picks on every run without touching the global `random` state.
rng = random.Random(0)

for section in sections:
    actual_changes = set(section['changes'])

    # TODO: Predict based on PR, docs
    # Placeholder: a random-length prefix of the sorted actual changes.
    # randrange(n) raises ValueError for n < 1, so sections with fewer than
    # two changed files get an empty prediction instead.
    upper = len(actual_changes) - 1
    n_predicted = rng.randrange(upper) if upper >= 1 else 0
    model_changes = set(sorted(actual_changes)[:n_predicted])

    tp += len(actual_changes & model_changes)
    fp += len(model_changes - actual_changes)
    fn += len(actual_changes - model_changes)

# Report 0.0 instead of raising ZeroDivisionError when a denominator is zero.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

print("f1:", f1)

f1: 0.6725663716814159
