In [10]:
import pandas as pd
import src as CoreEngine
from time import sleep

df = pd.read_csv('data/prs_issues.csv', delimiter='\a')
df.head()

Unnamed: 0,Issue_Number,PR_Number,Issue_Title,Issue_Body,Issue_Comments,PR_Title,PR_Body,PR_Comments,PR_Author_Name,PR_Author_Username,PR_Closed_Date
0,959.0,12,"Cleanup entries: Rely on ""field formatter""",The first two cleanups can be achieved using t...,All the points are already fixed in the latest...,"BugFix for #959 ""StringIndexOutOfBoundsExcepti...","Fixed #959 ""StringIndexOutOfBoundsException wi...",,Ingvar Jackal,IngvarJackal,"07/16/14, 06:57:22 PM"
1,155.0,172,Consistent encoding strings,JabRef's [current encoding list](https://githu...,I fully agree with what you write and just had...,Fix encoding strings,Fixes #155,,Jörg Lenhard,lenhard,"09/18/15, 02:21:46 PM"
2,290.0,300,SourceForge references,In PR #286 I have replaced most references to ...,http://jabref.sourceforge.net/journals/journal...,Fix #290 by removing the suggested url and text,Removes the reference to a missing file earlie...,,Oscar Gustafsson,oscargus,"11/07/15, 07:48:52 PM"
3,498.0,452,Open pdf file when clicking pdf icon,,"... is working fine.\n\nSeriously, you expect ...",Revise maintable,Complete check and rewrite of MainTable and re...,,Matthias Geiger,matthiasgeiger,"12/15/15, 10:39:22 AM"
4,545.0,595,JabRef 3.0: ACM Fetcher: Cannot parse number o...,When I do a search in the ACM in JabRef (no ma...,I can confirm this. I've started to see if I c...,Fixed #545 - ACM fetcher works again,Fixed #545 and some PMD cleanup.,,Oscar Gustafsson,oscargus,"12/28/15, 12:58:31 AM"


In [11]:
## this class will keep on changing tokens when one will have its limit exceeeded
class tokens():
    def __init__(self):
        self.token_list = [
            ''
        ]
        self.cur = 0
    
    def get_next(self) -> str:
        self.cur = (self.cur+1) % len(self.token_list)
        return self.token_list[self.cur]

In [12]:
tokens = tokens()
my_token = tokens.get_next()
my_token

''

# Generate predictions

In [4]:
# drop issues where body is NaN, since RF needs body
df = df[ df["Issue_Body"].notna() ]

In [4]:
# get the prediction from random forest model from issue number, issue title, issue body
def get_prediction( i_no, i_title, i_body ):
    issue = CoreEngine.issue_class.Issue( i_no, i_title, i_body )
    
    db = CoreEngine.DatabaseManager(
        dbfile="./output/main.db",
        cachefile="./ai_result_backup.db",
        label_file="./data/subdomain_labels.json",
    )
    
    external_rf = CoreEngine.External_Model_Interface(
        '#no need of OPENAI API key',
        db,
        "./output/rf_model.pkl",
        "./data/domain_labels.json",
        "./data/subdomain_labels.json",
        "./data/formatted_domain_labels.json",
        "./output/cache",
        "./output/response_cache/" 
    )

    prediction_rf = external_rf.predict_issue(issue)

    return prediction_rf    

In [18]:
# run at last
# df['RF_Predictions'] = df.apply( lambda x: 
#                                  get_prediction( x['Issue_Number'], 
#                                                  x['Issue_Title'], 
#                                                  x['Issue_Body'] )
#                                , axis=1)

In [6]:
df.head()

Unnamed: 0,Issue_Number,PR_Number,Issue_Title,Issue_Body,Issue_Comments,PR_Title,PR_Body,PR_Comments,PR_Author_Name,PR_Author_Username,PR_Closed_Date
0,959.0,12,"Cleanup entries: Rely on ""field formatter""",The first two cleanups can be achieved using t...,All the points are already fixed in the latest...,"BugFix for #959 ""StringIndexOutOfBoundsExcepti...","Fixed #959 ""StringIndexOutOfBoundsException wi...",,Ingvar Jackal,IngvarJackal,"07/16/14, 06:57:22 PM"
1,155.0,172,Consistent encoding strings,JabRef's [current encoding list](https://githu...,I fully agree with what you write and just had...,Fix encoding strings,Fixes #155,,Jörg Lenhard,lenhard,"09/18/15, 02:21:46 PM"
2,290.0,300,SourceForge references,In PR #286 I have replaced most references to ...,http://jabref.sourceforge.net/journals/journal...,Fix #290 by removing the suggested url and text,Removes the reference to a missing file earlie...,,Oscar Gustafsson,oscargus,"11/07/15, 07:48:52 PM"
4,545.0,595,JabRef 3.0: ACM Fetcher: Cannot parse number o...,When I do a search in the ACM in JabRef (no ma...,I can confirm this. I've started to see if I c...,Fixed #545 - ACM fetcher works again,Fixed #545 and some PMD cleanup.,,Oscar Gustafsson,oscargus,"12/28/15, 12:58:31 AM"
5,599.0,600,Import from external database fails for 3.1 an...,"Hi,\n\nJust tried out 3.1. Thank you for all ...",I guess I need to look a bit closer or have mo...,Avoid ConcurrentModificationException when rea...,Fixes #599,,Jörg Lenhard,lenhard,"12/29/15, 01:23:26 PM"


In [6]:
df.to_csv( 'prs_issues_rf_predictions.csv', header = True, encoding = 'utf-8', index = False, sep = '\a' )

# Build the ground truth

In [5]:
import requests

def get_pr_files(owner: str, repo: str, pr_number: str, token: str = my_token):
    """
    Fetch filenames and their contents changed in a pull request
    Returns: list[tuple] containing: (filename, file_content) 
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pr_number}/files"
    headers = {'Authorization': f'token {token}'} if token else {}

    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        if response.status_code == 403:
            print("rate limit exceeded, change token")
            my_token = tokens.get_next()
            token = my_token
            response = requests.get(url, {'Authorization': f'token {token}'})
        else:
            raise Exception(f"Error fetching PR files: {response.status_code}")

    files_data = response.json()
    
    files = []
    for f in files_data:
        # only files ending in java
        if ( f['filename'].endswith('.java') ):
            # try getting the file contents
            fc_res = requests.get( f['raw_url'], headers=headers )

            # success
            if fc_res.status_code == 200:
                files.append(    ( f['filename'], fc_res.content )   )
            else:
                if  fc_res.status_code == 403:
                    print('rate limit exceeded, change token')
                    my_token = tokens.get_next()
                    token = my_token
                else:
                    print(f"could not get content of file: {f['filename']}. statuscode: {fc_res.status_code}")

    return files

In [6]:
#  needs filename of the java program -> return AST object
generate_ast = CoreEngine.generate_ast.generate_ast
# needs AST object -> parses AST. returns parse object
java_program = CoreEngine.java_ast.JavaProgram

def get_symbol_blob( pr_number, owner = 'jabref', repo = 'jabref', token = my_token ):
    """
    generates a symbol blob containing all the symbols in all files changed in 
    a pull request. returns the blob as string
    """
    # get the files changed and their contenets
    try:
        files = get_pr_files( owner, repo, pr_number, token )
    except Exception as e:
        print(e)
        return None

    all_symbols = ""
    
    for f in files:
        # generate ast from file blob
        ast = generate_ast( f[1] )
        prgm = java_program(ast)
        classes, functions = prgm.extract_classes_and_methods()
        
        all_symbols += '    ' + 'classes: ' + '    '.join(classes) +  'functions: ' + '    '.join(functions)

    return all_symbols if all_symbols else None

In [7]:
import numpy as np
import spacy
nlp = spacy.load("en_core_web_md")

def get_labels( pr_number: int, owner: str ='JabRef', repo: str = 'jabref', token: str = my_token ):
    """
        Generates symbol blob for a pull request. Compares it with all the domain and subdomain labels 
        using spacy similarity functions. Then generates the top 3 labels based on similarity scores. 
        Output them as list of labels(strings)
    """
    labels: List[str] = []
    
    try:
        symbol_blob = get_symbol_blob( pr_number, owner, repo, token )
    except Exception as e:
        print(e)
        return []

    if not symbol_blob:
        return []

    domain_path = "./data/domain_labels.json"
    subdomain_path = "./data/subdomain_labels.json"
    api_labels = CoreEngine.utils.read_jsonfile_into_dict(domain_path)
    sub_labels = CoreEngine.utils.read_jsonfile_into_dict(subdomain_path)

    domains_available = {}
    for item in api_labels["Items"]:
        domains_available[list(item.keys())[0]] = item[list(item.keys())[0]]

    base = nlp( symbol_blob )

    # store similarity score, domain name
    domain_scores = []
    
    for domain_label, desc_label in domains_available.items():
        cmp_str = f"{domain_label.lower()}: {desc_label.lower()}"
        domain_info = nlp( cmp_str )
        similarity_score = base.similarity( domain_info )
        domain_scores.append(   (similarity_score, domain_label) )

    # retain top 3 domains 
    domain_scores.sort( key=lambda x: x[0], reverse = True )
    domain_scores = domain_scores[:3]

    for score, domain in domain_scores:
        subdomains = sub_labels[domain]

        # there is a bug in the current pipeline, because of which 
        # the subdomain for 'Event Handling' is never predicted. Instead
        # the whole 'Event Handling' is returned as it is.
        if domain == "Event Handling":
            labels.append( domain )
            continue
            
        # only get the max subdomain score for this domain
        max_score = 0
        max_subdomain = None
        
        for sd in subdomains:
            cur_subdomain, cur_subdomain_info = list(sd.keys())[0], list(sd.values())[0]
            cmp_text = cur_subdomain + ': ' + cur_subdomain_info
            subdomain_info = nlp( cmp_text )
            score = base.similarity(subdomain_info)
            if score > max_score:
                max_score = score
                max_subdomain = cur_subdomain
            
        labels.append( f"{domain}-{max_subdomain}" )

    return labels

In [8]:
labels = get_labels(172)

In [9]:
labels

['Databases-Database Security',
 'Error Handling-Debugging Tools',
 'Data Structure-Data Sorting']

In [9]:
df["Ground_Truth"] = df.apply( lambda x: get_labels( x["PR_Number"] ) , axis = 1 )

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
HTTPSConnectionPool(host='github.com', port=443): Max retries exceeded with url: /JabRef/jabref/raw/55c9e794e36af33972b059e8436669881979a695/src%2Fmain%2Fjava%2Forg%2Fjabref%2Fgui%2Fcustomentrytypes%2FEntryCustomizationDialog.java (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7f2a57095bb0>: Failed to resolve 'github.com' ([Errno -5] No address associated with hostname)"))
list index out of range
list index out of range
list index out of range
list index out of range


In [13]:
df.head()

Unnamed: 0,Issue_Number,PR_Number,Issue_Title,Issue_Body,Issue_Comments,PR_Title,PR_Body,PR_Comments,PR_Author_Name,PR_Author_Username,PR_Closed_Date,Ground_Truth
0,959.0,12,"Cleanup entries: Rely on ""field formatter""",The first two cleanups can be achieved using t...,All the points are already fixed in the latest...,"BugFix for #959 ""StringIndexOutOfBoundsExcepti...","Fixed #959 ""StringIndexOutOfBoundsException wi...",,Ingvar Jackal,IngvarJackal,"07/16/14, 06:57:22 PM","[Databases-Database Security, Data Structure-D..."
1,155.0,172,Consistent encoding strings,JabRef's [current encoding list](https://githu...,I fully agree with what you write and just had...,Fix encoding strings,Fixes #155,,Jörg Lenhard,lenhard,"09/18/15, 02:21:46 PM","[Databases-Database Security, Error Handling-E..."
2,290.0,300,SourceForge references,In PR #286 I have replaced most references to ...,http://jabref.sourceforge.net/journals/journal...,Fix #290 by removing the suggested url and text,Removes the reference to a missing file earlie...,,Oscar Gustafsson,oscargus,"11/07/15, 07:48:52 PM","[Databases-Database Security, Error Handling-E..."
4,545.0,595,JabRef 3.0: ACM Fetcher: Cannot parse number o...,When I do a search in the ACM in JabRef (no ma...,I can confirm this. I've started to see if I c...,Fixed #545 - ACM fetcher works again,Fixed #545 and some PMD cleanup.,,Oscar Gustafsson,oscargus,"12/28/15, 12:58:31 AM","[Databases-Database Security, Error Handling-E..."
5,599.0,600,Import from external database fails for 3.1 an...,"Hi,\n\nJust tried out 3.1. Thank you for all ...",I guess I need to look a bit closer or have mo...,Avoid ConcurrentModificationException when rea...,Fixes #599,,Jörg Lenhard,lenhard,"12/29/15, 01:23:26 PM","[Databases-Database Security, Error Handling-E..."


In [16]:
# save data frame for backup
# df.to_csv( "ground_truth.csv", index=False, header=True, sep="\a", encoding="utf-8" )

In [19]:
# generate random forest predictions
df['RF_Predictions'] = df.apply( lambda x: 
                                 get_prediction( x['Issue_Number'], 
                                                 x['Issue_Title'], 
                                                 x['Issue_Body'] )
                               , axis=1)

Predict Issue Cache Miss - ./output/cache_rf_None_./output/rf_model.pkl_1873.0_Group names are matched wrong
Predict Issue Cache Miss - ./output/cache_rf_None_./output/rf_model.pkl_1721.0_Wrong usage of BibEntry field `id`
Predict Issue Cache Miss - ./output/cache_rf_None_./output/rf_model.pkl_2370.0_Relevance and read status cells: double-click behavior inverted
Predict Issue Cache Miss - ./output/cache_rf_None_./output/rf_model.pkl_2011.0_Add cleanup for escaping comments
Predict Issue Cache Miss - ./output/cache_rf_None_./output/rf_model.pkl_2390.0_Integrity check shows wrongfully that institution is a biblatex only element
Predict Issue Cache Miss - ./output/cache_rf_None_./output/rf_model.pkl_2394.0_Group names including brackets and spaces do not filter group anymore in v3.8
Predict Issue Cache Miss - ./output/cache_rf_None_./output/rf_model.pkl_2426.0_Backslashes are repeatedly escaped on save in Content-Selectors
Predict Issue Cache Miss - ./output/cache_rf_None_./output/rf_mod

In [43]:
df = df[df['Ground_Truth'].map(len) > 0] 
df = df[df['RF_Predictions'].map(len) > 0] 

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 573 entries, 0 to 634
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Issue_Number        573 non-null    float64
 1   PR_Number           573 non-null    int64  
 2   Issue_Title         573 non-null    object 
 3   Issue_Body          573 non-null    object 
 4   Issue_Comments      509 non-null    object 
 5   PR_Title            573 non-null    object 
 6   PR_Body             573 non-null    object 
 7   PR_Comments         0 non-null      float64
 8   PR_Author_Name      533 non-null    object 
 9   PR_Author_Username  573 non-null    object 
 10  PR_Closed_Date      573 non-null    object 
 11  Ground_Truth        573 non-null    object 
 12  RF_Predictions      573 non-null    object 
dtypes: float64(2), int64(1), object(10)
memory usage: 62.7+ KB


In [29]:
# save data frame for backup
# df.to_csv( "ground_truth_with_RF_prediction.csv", index=False, header=True, sep="\a", encoding="utf-8" )

In [40]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

mlb = MultiLabelBinarizer()
y_true_bin = mlb.fit_transform(df['Ground_Truth'])
y_pred_bin = mlb.transform(df['RF_Predictions'])



In [41]:
accuracy = accuracy_score(y_true_bin, y_pred_bin)
precision = precision_score(y_true_bin, y_pred_bin, average='micro')
recall = recall_score(y_true_bin, y_pred_bin, average='micro')
f1 = f1_score(y_true_bin, y_pred_bin, average='micro')

print(f"Subset Accuracy: {accuracy:.4f}")
print(f"Precision (micro): {precision:.4f}")
print(f"Recall (micro): {recall:.4f}")
print(f"F1 Score (micro): {f1:.4f}")

Subset Accuracy: 0.0000
Precision (micro): 0.9298
Recall (micro): 0.1233
F1 Score (micro): 0.2178


In [42]:
print("\nClassification Report:")
print(classification_report(y_true_bin, y_pred_bin, target_names=mlb.classes_))


Classification Report:
                                                          precision    recall  f1-score   support

                        Computer Graphics-Visual Effects       0.00      0.00      0.00        11
                             Data Structure-Data Sorting       0.93      0.40      0.56       527
                             Databases-Database Security       0.00      0.00      0.00       527
                                 Databases-Schema Design       0.00      0.00      0.00        38
                          Error Handling-Debugging Tools       0.00      0.00      0.00        89
                            Error Handling-Error Logging       0.00      0.00      0.00        10
                       Error Handling-Exception Handling       0.00      0.00      0.00       384
           Geographic Information System-Data Collection       0.00      0.00      0.00        15
                   Geographic Information System-Mapping       0.00      0.00      0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
