In [1]:
import os
import sys
import json
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pathlib
from pprint import pprint
from IPython.display import display
from functools import reduce
import pandas as pd
import pathlib
from sklearn.metrics import classification_report
import numpy as np
import json
from pprint import pprint

# NOTE: despite its name, `current_path` is the PARENT of the working directory:
# abspath("./") is the cwd and dirname() strips its last path component.
current_path = os.path.dirname(os.path.abspath("./"))
print(current_path)
# Make that directory importable; presumably this is what lets the local
# `helper` / `commonpath` modules imported below resolve — TODO confirm.
sys.path.append(current_path)

# Local helper modules
import helper
from helper import read_json, matchMLKeyWords
import commonpath
from commonpath import FIG_DIR, OUTPUT_DIR, PROJ_REPOS_CHUNKS_QA_DIR, PROJ_REPOS_PATH
from collections import defaultdict

# !pip install rapidfuzz
# !pip install tldextract
import tldextract
from rapidfuzz import fuzz, utils

/home/jupyter-lihao/mlbindings/src
import facets_overview error!


# Model performance

In [2]:
# Fixed presentation order of the models, grouped by family.
# A model's position in this list is its sort rank in the result tables.
MODEL_ORDER = [
    # DistilBERT
    "distilbert-base-uncased",
    "distilbert-base-cased",
    # BERT
    "bert-base-uncased",
    "bert-large-uncased",
    "bert-base-cased",
    "bert-large-cased",
    # ALBERT
    "albert-base-v2",
    "albert-large-v2",
    "albert-xlarge-v2",
    "albert-xxlarge-v2",
    # RoBERTa (SQuAD2 checkpoints)
    "roberta-base-squad2",
    "roberta-large-squad2",
]

## Labelled dataset overview

In [3]:
# Locations of the hand-labelled QA splits.
LABELLED_DATA = commonpath.DATA_DIR / "labelled_data"
train_set_fp = LABELLED_DATA / "binding_QA_train.csv"
validation_set_fp = LABELLED_DATA / "binding_QA_validation.csv"
test_set_fp = LABELLED_DATA / "binding_QA_test.csv"

# Maps the file name of a model output to the labelled split it is scored against.
LABLLED_GT = {
    "eval_predictions.json": validation_set_fp,
    "predict_predictions.json": test_set_fp,
    "test_results_binding.txt": test_set_fp,
}


def _load_and_summarize_split(split_title, csv_fp):
    """Print a banner for one split, read its CSV and report its "is binding" column.

    Returns the frame produced by ``helper.readCSV`` so the caller can keep it.
    """
    rule = "-"*10
    print(rule, split_title, rule)
    split_df = helper.readCSV(csv_fp)
    helper.printValueCountsPercentage(split_df["is binding"])
    return split_df


train_set_df = _load_and_summarize_split("train set", train_set_fp)
validation_set_df = _load_and_summarize_split("validation set", validation_set_fp)
test_set_df = _load_and_summarize_split("test set", test_set_fp)

---------- train set ----------
1954 / 2346 = 83.29070758738278% False
392 / 2346 = 16.70929241261722% True
---------- validation set ----------
50 / 100 = 50.0% True
50 / 100 = 50.0% False
---------- test set ----------
50 / 100 = 50.0% True
50 / 100 = 50.0% False


In [4]:
def report_classification_res(prediction_filepath, no_answer=""):
    """Score a QA prediction file as a binary "is binding" classifier.

    The predictions JSON (a mapping of example id -> predicted answer text) is
    inner-joined on ``id`` with the ground-truth CSV registered for this file
    name in the module-level ``LABLLED_GT`` mapping.  An example counts as
    ``is_binding`` on the ground-truth side when its ``answer`` cell is not
    NaN, and on the predicted side when the predicted text differs from
    ``no_answer``; otherwise it is ``not_binding``.

    Args:
        prediction_filepath: Path object of the predictions JSON; its ``.name``
            must be a key of ``LABLLED_GT``.
        no_answer: Predicted text that stands for "no answer" (default ``""``).

    Returns:
        dict: the result of ``classification_report(..., output_dict=True)``
        over the ``is_binding`` / ``not_binding`` labels.

    Raises:
        KeyError: if ``prediction_filepath.name`` is not registered in
            ``LABLLED_GT``.
    """
    with open(prediction_filepath, 'r', encoding="utf-8") as file:
        predictions = json.load(file)

    # Materialise the dict views so the frame is built from plain lists.
    predictions_df = pd.DataFrame({
        "id": list(predictions.keys()),
        "predicted_label": list(predictions.values()),
    })
    # JSON object keys are always strings; the ground-truth ids are joined as ints.
    predictions_df["id"] = predictions_df["id"].astype(int)

    # Ground truth for this prediction file, reduced to the join key and the answer.
    actual_labels_df = pd.read_csv(LABLLED_GT[prediction_filepath.name])
    actual_labels_df = actual_labels_df[['id', 'answer']].rename(columns={'answer': 'actual_label'})

    # Inner join: examples present on only one side are dropped from the score.
    merged_df = pd.merge(predictions_df, actual_labels_df, on='id', how='inner')

    # A missing (NaN) ground-truth answer means the example is not a binding.
    merged_df['binary_actual'] = merged_df['actual_label'].apply(
        lambda x: 'not_binding' if pd.isna(x) else 'is_binding'
    )
    # A prediction equal to `no_answer` means the model predicted "not a binding".
    merged_df['binary_predicted'] = merged_df['predicted_label'].apply(
        lambda x: 'not_binding' if x == no_answer else 'is_binding'
    )

    # The report is computed once; the earlier text-form report was never used.
    return classification_report(
        merged_df['binary_actual'], merged_df['binary_predicted'], output_dict=True
    )
    
# Metric names read from the QA result files and from the classification report.
QA_METRICS = ["f1", "NoAns_f1", "HasAns_exact", "HasAns_f1"]
CLS_METRICS = ["f1-score", "precision", "recall"]

# Column layout of the summary table: the model name, the binary-classification
# metrics of the "is_binding" class, then the QA metrics of each dataset.
cols = ["Model", "eval_f1-score", "eval_precision", "eval_recall", "test_f1-score", "test_precision", "test_recall"]
for dataset in ["eval", "test"]:
    for metric in QA_METRICS:
        cols.append(f"qa_{dataset}_{metric}")

MODEL_PATH = commonpath.DATA_DIR / "binding_qa"

# Best model by QA metric (`best`) and by classification F1 (`best_cls`).
best = {
    "model": "",
}
compare_metric = "eval_HasAns_f1"
best[compare_metric] = -1.0

best_cls = {
    "model": "",
}
compare_metric_cls = "cls_eval_f1"
best_cls[compare_metric_cls] = -1.0


def _find_output_file(model_dir, dataset, suffix):
    """Return ``<dataset>_<suffix>.json`` in ``model_dir``, or None if absent.

    For the "test" dataset, ``predict_<suffix>.json`` is tried as a fallback
    name.  When nothing is found, the last path tried is reported on stdout.
    """
    candidate = model_dir / f"{dataset}_{suffix}.json"
    if candidate.exists():
        return candidate
    if dataset == "test":
        candidate = model_dir / f"predict_{suffix}.json"
        if candidate.exists():
            return candidate
    print(f"{candidate} does not exist")
    return None


# One dict per model; the table is built once after the loop instead of being
# grown row by row through the private DataFrame._append.
result_rows = []

for json_path in MODEL_PATH.rglob("trainer_state.json"):
    # Skip intermediate checkpoints; only the final output directory is scored.
    if "checkpoint" in str(json_path.parent):
        continue
    p = json_path.parent
    dataset_metrics = {"Model": p.name}

    metrics = {}
    metrics_cls = {}
    for dataset in ["eval", "test"]:
        results_fp = _find_output_file(p, dataset, "results")
        if results_fp is None:
            continue

        res = read_json(results_fp)
        for metric in QA_METRICS:
            key = f'{dataset}_{metric}'
            # Stored values are divided by 100 (presumably percentages — TODO confirm).
            value = res[key] / 100.0
            metrics[key] = value
            dataset_metrics[f"qa_{key}"] = value

        predictions_fp = _find_output_file(p, dataset, "predictions")
        if predictions_fp is None:
            continue
        classification_res = report_classification_res(predictions_fp)

        for metric in CLS_METRICS:
            dataset_metrics[f"{dataset}_{metric}"] = classification_res["is_binding"][metric]
        metrics_cls[f"cls_{dataset}"] = classification_res
        metrics_cls[f"cls_{dataset}_f1"] = classification_res["is_binding"]["f1-score"]

    result_rows.append(dataset_metrics)

    # .get(): a model with only test files has no eval metric and must not
    # raise KeyError here; it simply cannot become the best model.
    if metrics.get(compare_metric, float("-inf")) > best[compare_metric]:
        best.update(metrics)
        best["model"] = p.name

    if metrics_cls.get(compare_metric_cls, float("-inf")) > best_cls[compare_metric_cls]:
        best_cls.update(metrics_cls)
        best_cls["model"] = p.name

results_df = pd.DataFrame(result_rows, columns=cols)
results_df_qa = results_df
results_df_qa.sort_values("qa_test_HasAns_f1")

  results_df = results_df._append(dataset_metrics, ignore_index=True)


Unnamed: 0,Model,eval_f1-score,eval_precision,eval_recall,test_f1-score,test_precision,test_recall,qa_eval_f1,qa_eval_NoAns_f1,qa_eval_HasAns_exact,qa_eval_HasAns_f1,qa_test_f1,qa_test_NoAns_f1,qa_test_HasAns_exact,qa_test_HasAns_f1
0,albert-xlarge-v2,0.903226,0.976744,0.84,0.876404,1.0,0.78,0.866667,0.98,0.68,0.753333,0.848,1.0,0.64,0.696
6,bert-large-cased,0.914894,0.977273,0.86,0.93617,1.0,0.88,0.914667,0.98,0.82,0.849333,0.883556,1.0,0.72,0.767111
3,bert-base-uncased,0.905263,0.955556,0.86,0.924731,1.0,0.86,0.891333,0.96,0.78,0.822667,0.885,1.0,0.76,0.77
9,bert-large-uncased,0.9375,0.978261,0.9,0.924731,1.0,0.86,0.921333,0.98,0.82,0.862667,0.899667,1.0,0.76,0.799333
7,distilbert-base-cased,0.927835,0.957447,0.9,0.93617,1.0,0.88,0.897667,0.96,0.74,0.835333,0.900857,1.0,0.74,0.801714
11,bert-base-cased,0.926316,0.977778,0.88,0.947368,1.0,0.9,0.901333,0.98,0.74,0.822667,0.901333,1.0,0.76,0.802667
5,distilbert-base-uncased,0.918367,0.9375,0.9,0.903226,0.976744,0.84,0.913333,0.94,0.86,0.886667,0.893,0.98,0.78,0.806
4,albert-large-v2,0.947368,1.0,0.9,0.924731,1.0,0.86,0.923,1.0,0.78,0.846,0.909,1.0,0.8,0.818
2,albert-base-v2,0.9375,0.978261,0.9,0.918367,0.9375,0.9,0.921,0.98,0.78,0.862,0.882048,0.94,0.76,0.824095
10,roberta-large-squad2,0.948454,0.978723,0.92,0.969072,1.0,0.94,0.943333,0.98,0.88,0.906667,0.934667,1.0,0.84,0.869333


In [5]:
# Order the rows as listed in MODEL_ORDER.  A model that is not listed gets the
# rank len(MODEL_ORDER) and therefore sorts last, instead of aborting the cell
# with the ValueError that MODEL_ORDER.index would raise.
_model_rank = {}
for _position, _model_name in enumerate(MODEL_ORDER):
    # setdefault keeps the first position, matching list.index semantics.
    _model_rank.setdefault(_model_name, _position)

results_df_qa["Rank"] = results_df_qa["Model"].map(
    lambda model_name: _model_rank.get(model_name, len(MODEL_ORDER))
)
# A stable sort keeps equally ranked rows in their original relative order.
results_df_qa = results_df_qa.sort_values("Rank", kind="stable")
display(results_df_qa)


def formatDataFrame(df, columns, digits=3):
    """Return a copy of ``df[columns]`` with float cells rendered as fixed-point text.

    Each cell that is a ``float`` instance is rounded to ``digits`` decimals and
    replaced by a string with exactly ``digits`` decimals; every other cell is
    passed through unchanged.  The input frame is not modified.
    """
    template = "{:." + str(digits) + "f}"

    def _render(cell):
        # Only floats are reformatted; strings, ints, etc. are kept as they are.
        if not isinstance(cell, float):
            return cell
        return template.format(round(cell, digits))

    formatted = df[columns].copy()
    for column_name in columns:
        formatted[column_name] = formatted[column_name].apply(_render)
    return formatted

# Compact view of the test-set metrics, using formatDataFrame's default precision.
report_columns = [
    "Model",
    "qa_test_HasAns_f1",
    "qa_test_HasAns_exact",
    "test_f1-score",
    "test_precision",
    "test_recall",
]
formatDataFrame(results_df_qa, report_columns)

Unnamed: 0,Model,eval_f1-score,eval_precision,eval_recall,test_f1-score,test_precision,test_recall,qa_eval_f1,qa_eval_NoAns_f1,qa_eval_HasAns_exact,qa_eval_HasAns_f1,qa_test_f1,qa_test_NoAns_f1,qa_test_HasAns_exact,qa_test_HasAns_f1,Rank
5,distilbert-base-uncased,0.918367,0.9375,0.9,0.903226,0.976744,0.84,0.913333,0.94,0.86,0.886667,0.893,0.98,0.78,0.806,0
7,distilbert-base-cased,0.927835,0.957447,0.9,0.93617,1.0,0.88,0.897667,0.96,0.74,0.835333,0.900857,1.0,0.74,0.801714,1
3,bert-base-uncased,0.905263,0.955556,0.86,0.924731,1.0,0.86,0.891333,0.96,0.78,0.822667,0.885,1.0,0.76,0.77,2
9,bert-large-uncased,0.9375,0.978261,0.9,0.924731,1.0,0.86,0.921333,0.98,0.82,0.862667,0.899667,1.0,0.76,0.799333,3
11,bert-base-cased,0.926316,0.977778,0.88,0.947368,1.0,0.9,0.901333,0.98,0.74,0.822667,0.901333,1.0,0.76,0.802667,4
6,bert-large-cased,0.914894,0.977273,0.86,0.93617,1.0,0.88,0.914667,0.98,0.82,0.849333,0.883556,1.0,0.72,0.767111,5
2,albert-base-v2,0.9375,0.978261,0.9,0.918367,0.9375,0.9,0.921,0.98,0.78,0.862,0.882048,0.94,0.76,0.824095,6
4,albert-large-v2,0.947368,1.0,0.9,0.924731,1.0,0.86,0.923,1.0,0.78,0.846,0.909,1.0,0.8,0.818,7
0,albert-xlarge-v2,0.903226,0.976744,0.84,0.876404,1.0,0.78,0.866667,0.98,0.68,0.753333,0.848,1.0,0.64,0.696,8
8,albert-xxlarge-v2,0.916667,0.956522,0.88,0.959184,0.979167,0.94,0.908,0.96,0.8,0.856,0.933667,0.98,0.84,0.887333,9


Unnamed: 0,Model,qa_test_HasAns_f1,qa_test_HasAns_exact,test_f1-score,test_precision,test_recall
5,distilbert-base-uncased,0.806,0.78,0.903,0.977,0.84
7,distilbert-base-cased,0.802,0.74,0.936,1.0,0.88
3,bert-base-uncased,0.77,0.76,0.925,1.0,0.86
9,bert-large-uncased,0.799,0.76,0.925,1.0,0.86
11,bert-base-cased,0.803,0.76,0.947,1.0,0.9
6,bert-large-cased,0.767,0.72,0.936,1.0,0.88
2,albert-base-v2,0.824,0.76,0.918,0.938,0.9
4,albert-large-v2,0.818,0.8,0.925,1.0,0.86
0,albert-xlarge-v2,0.696,0.64,0.876,1.0,0.78
8,albert-xxlarge-v2,0.887,0.84,0.959,0.979,0.94


In [6]:
# Load the RQ2 tables: one row per binding package and one row per ML repository.
rq2_ml_bindings = pd.read_csv(commonpath.DATA_DIR / "rq2_ml_bindings.csv")
# presumably casts the ID-like columns to str so ids compare equal across tables — TODO confirm in helper
rq2_ml_bindings = helper.formatIDColumnsToStr(rq2_ml_bindings)
rq2_ml_repos = pd.read_csv(commonpath.DATA_DIR / "rq2_ml_repos.csv")
rq2_ml_repos = helper.formatIDColumnsToStr(rq2_ml_repos)
# "Host Repo IDs" is read from the CSV as text; eval() turns each cell back into
# a Python object (presumably a list of repo ids — it is later used with `in` and `.isin`).
# NOTE(review): eval() executes whatever expression the CSV contains. If the cells are
# plain literals, ast.literal_eval would be the safe equivalent — confirm and switch.
rq2_ml_bindings["Host Repo IDs"] = rq2_ml_bindings["Host Repo IDs"].apply(lambda x: eval(x))

In [7]:
ml_repos = helper.formatIDColumnsToStr(
    pd.read_csv(commonpath.DATA_DIR / "not_toy_ml_repos.csv")
)
# Drop repositories whose "Language" is not a programming language.
non_programming_languages = ["Jupyter Notebook", "HTML"]
is_programming_language = ~ml_repos["Language"].isin(non_programming_languages)
ml_repos = ml_repos[is_programming_language]

# Distinct bindings and distinct ML repos that have bindings, then the share of
# those repos among all remaining ML repos.
n_bindings = len(rq2_ml_bindings["ID"].unique())
n_repos_with_bindings = len(rq2_ml_repos["ID"].unique())
print(n_bindings, "bindings")
print(n_repos_with_bindings, "ML repos")
helper.printPercentage(n_repos_with_bindings, len(ml_repos))

def isOfficial(df):
    """Classify how one binding relates to the repositories of its host library.

    Despite the parameter name, ``df`` is a single row of ``rq2_ml_bindings``
    (the function is applied with ``axis=1``).  The host repositories are
    looked up in the module-level ``rq2_ml_repos`` table through the row's
    "Host Repo IDs".  The checks run in this order and the first hit wins:

    * ``"same_repo"``            - the binding's "Repository ID" is a host repo id
    * ``"same_repo_name"``       - same "owner/name" as a host repo
    * ``"same_repo_owner"``      - same owner (text before the first "/") as a host repo
    * ``"same_repo_url_owner"``  - owner taken from a Bitbucket/GitLab/GitHub
      "Repository URL" equals a host repo owner
    * ``"same_homepage_url"``    - same "Homepage URL" as a host repo
    * ``"community"``            - none of the above

    Returns:
        str: one of the labels listed above.
    """
    host_repo_ids = df["Host Repo IDs"]
    if df["Repository ID"] in host_repo_ids:
        return "same_repo"

    host_repo_info = rq2_ml_repos[rq2_ml_repos["ID"].isin(host_repo_ids)]

    binding_repo_name = df["Repository Name with Owner"]
    if (host_repo_info["Name with Owner"] == binding_repo_name).any():
        return "same_repo_name"

    # Owner = text before the first "/".  Non-string cells (e.g. NaN) are passed
    # through untouched instead of crashing on .split(); they never compare equal.
    host_owners = host_repo_info["Name with Owner"].apply(
        lambda name: name.split("/")[0] if isinstance(name, str) else name
    )
    if isinstance(binding_repo_name, str):
        if (host_owners == binding_repo_name.split("/")[0]).any():
            return "same_repo_owner"

    repository_url = df["Repository URL"]
    if isinstance(repository_url, str):
        known_hosts = ("//bitbucket.org/", "//gitlab.com/", "//github.com/")
        if any(host in repository_url for host in known_hosts):
            # Strip a trailing "/" first: with ".../owner/repo/" the second-to-last
            # component would otherwise be the repository name, not the owner.
            repository_owner = repository_url.rstrip("/").split("/")[-2]
            if (host_owners == repository_owner).any():
                return "same_repo_url_owner"

    homepage_url = df["Homepage URL"]
    if isinstance(homepage_url, str):
        if (host_repo_info["Homepage URL"] == homepage_url).any():
            return "same_homepage_url"
    return "community"


# Show the available columns before the classification column is written.
print(rq2_ml_bindings.columns)

# Label every binding row, store the label on the table and report the distribution.
rq2_ml_bindings_is_official = rq2_ml_bindings.apply(isOfficial, axis=1)
rq2_ml_bindings["is official"] = rq2_ml_bindings_is_official
helper.printValueCountsPercentage(rq2_ml_bindings_is_official)

# Community-maintained vs. everything else.
is_community = rq2_ml_bindings_is_official == "community"
print("Community maintained:")
helper.printValueCountsPercentage(is_community)

# Among the non-community bindings, how many live in the host repository itself.
official_labels = rq2_ml_bindings_is_official[~is_community]
print("Amoung official maintained - same repo")
helper.printValueCountsPercentage(official_labels == "same_repo")

2347 bindings
546 ML repos
546 / 11763 = 4.641673042591176% 
Index(['Unnamed: 0', 'ID', 'Platform', 'Name', 'Description', 'Keywords',
       'Licenses', 'Repository URL', 'Versions Count',
       'Dependent Projects Count', 'Status', 'Dependent Repositories Count',
       'Repository ID', 'Repository Host Type', 'Repository Name with Owner',
       'Repository Description', 'Repository Fork?', 'Repository Stars Count',
       'Repository License', 'Repository Status', 'Repository Keywords',
       'Binding Host', 'Homepage URL', 'Homepage Domain',
       'Binding Host Normalized', 'Binding Host Normalized - No Space',
       'Host Repo IDs', 'is official', 'Type'],
      dtype='object')
2206 / 2347 = 93.992330634853% community
78 / 2347 = 3.3233915636983387% same_repo
53 / 2347 = 2.258201959948871% same_repo_owner
7 / 2347 = 0.2982530890498509% same_repo_url_owner
3 / 2347 = 0.1278227524499361% same_homepage_url
Community maintained:
2206 / 2347 = 93.992330634853% True
141 / 2347 = 6.