In [4]:
# Shell escape: print the path of the `python` executable that the shell resolves first on PATH.
!which python

/Users/shabo/Documents/Backtick/exjobb/venv/bin/python


In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
from utils import load, dump
from extract import Extractor
from os import getcwd
from sklearn.metrics import f1_score, accuracy_score
from markdown import markdown
from bs4 import BeautifulSoup as Bfs
from itertools import chain
import re

In [79]:
# Read the dataset file from ../data relative to the current working directory,
# using the project-local `utils.load` helper (presumably parses JSON — TODO confirm).
dataset = load(f'{getcwd()}/../data/prd_backtick-se_cowait_annotated.json')

In [81]:
class TFIDFMapper():
    """Rank the documentation files of each pull request against the PR text using TF-IDF.

    Every element of ``data`` is a dict describing one pull request. The keys
    read by this class are ``docs`` (a sequence of ``(path, markdown)`` pairs),
    ``title``, ``body``, ``commits`` (each with ``commit['commit']['message']``),
    ``target`` (the annotated list of expected paths) and ``number``.
    The methods are meant to be called in order:
    ``prepare() -> fit() -> predict() -> evaluate()``.
    Note that the PR dicts are mutated in place; results are stored on them
    under ``documents``, ``locations``, ``vectorizer``, ``tfidf`` and ``prediction``.
    """

    # Granularity levels: whole document, section, single statement.
    DOCT = 1
    SECT = 2
    STMT = 3

    # A statement starts with a capital letter and runs (non-greedily, within
    # one line) up to the first '.', '!' or '?' that is followed by whitespace.
    _STATEMENT_RE = re.compile(r'[A-Z].*?[\.!?][\s]', re.M)

    # Front-matter block that only carries the page title.
    _TITLE_TABLE_RE = re.compile(r'(---\ntitle:.*\n---\n)')

    def __init__(self, data, granularity):
        """Store the PR records and the granularity (one of DOCT, SECT, STMT)."""
        self.gran = granularity
        self.data = data

    def prepare(self):
        """Build ``pr['documents']`` / ``pr['locations']`` for every PR and drop empty PRs.

        ``documents`` holds the plain-text units to index and ``locations`` the
        path each unit came from (same length, same order). PRs whose units are
        all empty, or that have no docs at all, are removed from ``self.data``.

        Raises:
            NotImplementedError: for ``SECT`` granularity, which is not implemented.
            ValueError: for any other unknown granularity value.
        """
        for pr in self.data:
            docs = pr['docs']
            # Explicit indexing instead of zip(*docs) so an empty list is handled.
            paths = [doc[0] for doc in docs]
            contents = [self.rendered(self.clear_doc(doc[1])) for doc in docs]

            if self.gran == self.DOCT:
                pr['documents'] = np.array(contents)
                pr['locations'] = np.array(paths)

            elif self.gran == self.STMT:
                statements = []
                locations = []
                for path, content in zip(paths, contents):
                    for match in self._STATEMENT_RE.findall(content):
                        # Drop the closing punctuation mark and the whitespace
                        # character that the pattern consumed after it.
                        statements.append(match[:-2])
                        locations.append(path)
                pr['documents'] = np.array(statements)
                pr['locations'] = np.array(locations)

            elif self.gran == self.SECT:
                # Fail loudly instead of leaving 'documents' unset (or stale
                # from a previous run on the same dicts).
                raise NotImplementedError('SECT granularity is not implemented')

            else:
                raise ValueError(f'Unknown granularity: {self.gran!r}')

        def has_content(pr):
            return any(len(doc) > 0 for doc in pr['documents'])

        self.data = [pr for pr in self.data if has_content(pr)]

    def fit(self):
        """Fit one ``TfidfVectorizer`` per PR on that PR's own documents.

        Stores the fitted vectorizer in ``pr['vectorizer']`` and the resulting
        document-term matrix in ``pr['tfidf']``.
        """
        for pr in self.data:
            vectorizer = TfidfVectorizer()
            pr['vectorizer'] = vectorizer
            pr['tfidf'] = vectorizer.fit_transform(pr['documents'])

    def predict(self):
        """Score every document unit against the PR query text.

        ``pr['prediction']`` becomes a list of ``(score, location)`` tuples
        sorted from highest to lowest score. The score is the dot product of
        the TF-IDF vectors (equal to cosine similarity when the vectorizer
        L2-normalises rows, which is its documented default).
        """
        for pr in self.data:
            query_input = self.query_input(pr)
            query = pr['vectorizer'].transform([query_input])
            cosine_sims = linear_kernel(query, pr['tfidf']).flatten()
            pr['prediction'] = sorted(zip(cosine_sims, pr['locations']), reverse=True)

    def evaluate(self):
        """Print targets and top-k predictions per PR, then the overall hit rate.

        For a PR with ``k`` annotated targets, the ``k`` highest ranked
        predictions are taken. A target counts as a hit when its path appears
        anywhere among them; the comparison is deliberately order-independent
        because the target list is treated as an unordered set. The printed
        ``accuracy`` is the fraction of all targets that were hit. PRs without
        targets are skipped.
        """
        y_tru = []
        y_hat = []

        for pr in self.data:
            target = pr['target']
            tarlen = len(target)
            if not tarlen:
                continue

            preds = pr['prediction'][:tarlen]
            predicted_paths = {location for _, location in preds}

            for tar in target:
                y_tru.append(True)
                y_hat.append(tar in predicted_paths)

            print(pr['number'], pr['title'])

            print('Targets:')
            for tar in target:
                print(tar)
            print('Predictions:')
            for pred in preds:
                print(pred)
            print()

        if not y_tru:
            # Nothing to score; avoid calling accuracy_score on empty input.
            print('No annotated targets to evaluate.')
            return

        accuracy = accuracy_score(y_tru, y_hat)
        print(f'{accuracy=}')

    def query_input(self, pr):
        """Build the plain-text query for a PR.

        The query is the title, the body (empty when missing) and every commit
        message, one per line, passed through ``rendered``:

            PR title
            PR body
            commit msg 1
            commit msg 2
            ...
        """
        lines = [str(pr['title']), str(pr['body'] or '')]
        lines.extend(str(commit['commit']['message']) for commit in pr['commits'])
        return self.rendered('\n'.join(lines))

    def dump(self, file):
        """Write ``self.data`` to ``file`` using the project-level ``dump`` helper."""
        dump(self.data, file)

    @staticmethod
    def clear_doc(md):
        """Return ``md`` without its title front matter.

        Everything up to and including the first ``---\\ntitle: ...\\n---\\n``
        block is removed. A document that has no such block is returned
        unchanged rather than being emptied.
        """
        match = TFIDFMapper._TITLE_TABLE_RE.search(md)
        if match is None:
            return md
        return md[match.end():]

    @staticmethod
    def rendered(md):
        """Convert markdown to plain text (markdown -> HTML -> concatenated text nodes).

        Returns an empty string for empty or ``None`` input.
        """
        if not md:
            return ''
        html = markdown(md)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        return ''.join(Bfs(html, 'html.parser').find_all(string=True))
    
# Run the document-level pipeline end to end on the loaded dataset:
# prepare the text units, fit one TF-IDF model per PR, rank the documents
# against each PR's text and print the evaluation.
# The granularity constant lives on TFIDFMapper itself; no SemanticMapper
# class is defined in this notebook, so referencing it fails on a fresh kernel.
mapper = TFIDFMapper(dataset, TFIDFMapper.DOCT)
mapper.prepare()
mapper.fit()
mapper.predict()
mapper.evaluate()

320 Pytest marks support for cowait test
Targets:
cowait/docs/kubernetes/testing.md
cowait/docs/get-started/tests.md
Predictions:
(0.4152569532370608, 'cowait/docs/get-started/tests.md')
(0.339346764847835, 'cowait/docs/kubernetes/testing.md')
Pytest marks support for cowait test
add marks argument to cowait test

325 Improve cowait test
Targets:
cowait/docs/kubernetes/testing.md
cowait/docs/get-started/tests.md
Predictions:
(0.2747287863532659, 'cowait/docs/get-started/tests.md')
(0.1722613877015308, 'cowait/docs/kubernetes/testing.md')
Improve cowait test
Adds two new flags:
- --verbose enables verbose output from pytest (false by default)
- --capture toggles output capturing (true by default)
Improved the pytest argument generation code
add verbosity, output capture arguments to cowait test

327 Improve logs command
Targets:
cowait/docs/kubernetes/pushing-and-running.md
cowait/docs/get-started/first-steps.md
Predictions:
(0.20002402860645369, 'cowait/docs/get-started/dashboard.md')


In [48]:
def clear_content(md):
    # Remove title table
    pattern = r'(---\ntitle:.*\n---\n)'
    txt = ''.join(re.split(pattern, md)[2:])
    
    # Split on sections
    #pattern = r'## .*\n'
    #split = lambda d: re.split(pattern, d)
    #data = [*map(split, data)]