### ABE0 with Hyperopt - Within Repo

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import hyperopt
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials
import mlflow


def objective(params):
    reg = KNeighborsRegressor(**params)
    mae = cross_val_score(reg, X_train, y_train, scoring='neg_mean_absolute_error').mean()
    # fmin() tries to minimize the objective, so we need to redirect neg_mean_absolute_error with "-"
    return {'loss': -mae, 'status': STATUS_OK}

SEARCH_SPACE = hp.choice('classifier_type', [
    {
        'n_neighbors': hp.choice('n_neighbors', list(np.arange(2, 10+1, 1, dtype=int))),
        'weights': hp.choice('weights', ['uniform', 'distance']),
        'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'leaf_size': hp.choice('leaf_size', list(np.arange(10, 50+1, 10, dtype=int))),
        'p': hp.choice('p', list(np.arange(1, 2+1, 1, dtype=int))),
        'n_jobs': hp.choice('n_jobs', [16])
    }
])

PATH = "../sp_dataset/marked_data/"
# cross project - within repo
WITHIN_REPO = [
                {'train': ['mesos'], 'test': ['usergrid']},
                {'train': ['usergrid'], 'test': ['mesos']},
                {'train': ['appceleratorstudio'], 'test': ['aptanastudio']},
                {'train': ['appceleratorstudio'], 'test': ['titanium']},
                {'train': ['titanium'], 'test': ['appceleratorstudio']},
                {'train': ['aptanastudio'], 'test': ['titanium']},
                {'train': ['mule'], 'test': ['mulestudio']},
                {'train': ['mulestudio'], 'test': ['mule']}
              ]

report = []
for pair in WITHIN_REPO:
    # split to 60% training and 40% validation
    data = pd.read_csv(PATH + pair["train"][0] + ".csv")
    
    train_data = data[: int(len(data)*0.6)]
    train_data = data
    
    X_train = train_data["title"].tolist()
    y_train = train_data["storypoint"].tolist()
    val_data = data[int(len(data)*0.6):]

    test_data = pd.read_csv(PATH + pair["test"][0] + ".csv")
    X_test = test_data["title"].tolist()
    y_test = np.array(test_data["storypoint"].tolist())
    
    # apply BoW feature extraction
    vectorizer = TfidfVectorizer(norm='l2')
    vectorizer = vectorizer.fit(X_train)
    vectorizer = vectorizer.fit(X_test)
    
    X_train = vectorizer.transform(X_train).todense()
    X_test = vectorizer.transform(X_test).todense()

    algo = tpe.suggest
    with mlflow.start_run():
        best_result = fmin(fn=objective, 
                           space=SEARCH_SPACE,
                           algo=algo,
                           max_evals=32)
    best_model = hyperopt.space_eval(SEARCH_SPACE, best_result)    
    neigh = KNeighborsRegressor(n_neighbors=best_model['n_neighbors'],
                                weights=best_model['weights'],
                                algorithm=best_model['algorithm'],
                                leaf_size=best_model['leaf_size'],
                                p=best_model['p'])
    neigh.fit(X_train, y_train)
    preds = neigh.predict(X_test)
    mae = round(sum(abs(preds - y_test)) / len(preds), 2)
    report.append(f"""Train {pair["train"][0]} | Test {pair["test"][0]}: {mae}""")

100%|████████████████████████████████████████████████| 32/32 [00:33<00:00,  1.06s/trial, best loss: 1.5335119047619048]
100%|████████████████████████████████████████████████| 32/32 [00:11<00:00,  2.81trial/s, best loss: 0.9992291905307369]
100%|█████████████████████████████████████████████████| 32/32 [01:41<00:00,  3.17s/trial, best loss: 2.392136622101083]
100%|████████████████████████████████████████████████| 32/32 [01:54<00:00,  3.57s/trial, best loss: 2.3625957943189797]
100%|█████████████████████████████████████████████████| 32/32 [00:48<00:00,  1.51s/trial, best loss: 3.462289600863719]
100%|█████████████████████████████████████████████████| 32/32 [00:28<00:00,  1.11trial/s, best loss: 4.240739686016794]
100%|████████████████████████████████████████████████| 32/32 [00:18<00:00,  1.69trial/s, best loss: 2.6912651558433316]
100%|████████████████████████████████████████████████| 32/32 [00:22<00:00,  1.45trial/s, best loss: 3.7320596636119645]


In [2]:
maes = []
train_file = []
test_file = []

for record in report:
    print(record)
    train_file.append(record.split(" ")[1])
    test_file.append(record.split(" ")[4].strip(":"))
    mae_val = record.split(":")[-1]
    mae_val = float(mae_val.strip())
    maes.append(mae_val)

Train mesos | Test usergrid: 1.19
Train usergrid | Test mesos: 1.57
Train appceleratorstudio | Test aptanastudio: 4.22
Train appceleratorstudio | Test titanium: 3.45
Train titanium | Test appceleratorstudio: 2.45
Train aptanastudio | Test titanium: 4.16
Train mule | Test mulestudio: 3.45
Train mulestudio | Test mule: 2.93


In [3]:
pd.DataFrame(data={"train_file": train_file,
                   "test_file": test_file,
                   "mae": maes}).to_csv("within_repo_abe0_hyperopt.csv", index=False)

### ABE0 with Hyperopt - Cross Repo

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import hyperopt
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials
import mlflow


def objective(params):
    reg = KNeighborsRegressor(**params)
    mae = cross_val_score(reg, X_train, y_train, scoring='neg_mean_absolute_error').mean()
    # fmin() tries to minimize the objective, so we need to redirect neg_mean_absolute_error with "-"
    return {'loss': -mae, 'status': STATUS_OK}

SEARCH_SPACE = hp.choice('classifier_type', [
    {
        'n_neighbors': hp.choice('n_neighbors', list(np.arange(2, 10+1, 1, dtype=int))),
        'weights': hp.choice('weights', ['uniform', 'distance']),
        'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        'leaf_size': hp.choice('leaf_size', list(np.arange(10, 50+1, 10, dtype=int))),
        'p': hp.choice('p', list(np.arange(1, 2+1, 1, dtype=int))),
        'n_jobs': hp.choice('n_jobs', [16])
    }
])

PATH = "../sp_dataset/marked_data/"

# cross project - cross repo
CROSS_REPO = [
                {'train': ['clover'], 'test': ['usergrid']},
                {'train': ['talendesb'], 'test': ['mesos']},
                {'train': ['talenddataquality'], 'test': ['aptanastudio']},
                {'train': ['mule'], 'test': ['titanium']},
                {'train': ['talenddataquality'], 'test': ['appceleratorstudio']},
                {'train': ['mulestudio'], 'test': ['titanium']},
                {'train': ['appceleratorstudio'], 'test': ['mulestudio']},
                {'train': ['appceleratorstudio'], 'test': ['mule']}
             ]
report = []
for pair in CROSS_REPO:
    # split to 60% training and 40% validation
    data = pd.read_csv(PATH + pair["train"][0] + ".csv")
    
    train_data = data[: int(len(data)*0.6)]
    train_data = data
    
    X_train = train_data["title"].tolist()
    y_train = train_data["storypoint"].tolist()
    val_data = data[int(len(data)*0.6):]

    test_data = pd.read_csv(PATH + pair["test"][0] + ".csv")
    X_test = test_data["title"].tolist()
    y_test = np.array(test_data["storypoint"].tolist())
    
    # apply BoW feature extraction
    vectorizer = TfidfVectorizer(norm='l2')
    vectorizer = vectorizer.fit(X_train)
    vectorizer = vectorizer.fit(X_test)
    
    X_train = vectorizer.transform(X_train).todense()
    X_test = vectorizer.transform(X_test).todense()

    algo = tpe.suggest
    with mlflow.start_run():
        best_result = fmin(fn=objective, 
                           space=SEARCH_SPACE,
                           algo=algo,
                           max_evals=32)
    best_model = hyperopt.space_eval(SEARCH_SPACE, best_result)    
    neigh = KNeighborsRegressor(n_neighbors=best_model['n_neighbors'],
                                weights=best_model['weights'],
                                algorithm=best_model['algorithm'],
                                leaf_size=best_model['leaf_size'],
                                p=best_model['p'])
    neigh.fit(X_train, y_train)
    preds = neigh.predict(X_test)
    mae = round(sum(abs(preds - y_test)) / len(preds), 2)
    report.append(f"""Train {pair["train"][0]} | Test {pair["test"][0]}: {mae}""")

100%|████████████████████████████████████████████████| 32/32 [00:15<00:00,  2.04trial/s, best loss: 3.4074143829663925]
100%|████████████████████████████████████████████████| 32/32 [00:23<00:00,  1.35trial/s, best loss: 1.0738987442694836]
100%|█████████████████████████████████████████████████| 32/32 [00:39<00:00,  1.25s/trial, best loss: 3.831566938151462]
100%|█████████████████████████████████████████████████| 32/32 [00:30<00:00,  1.06trial/s, best loss: 2.839010982035168]
100%|████████████████████████████████████████████████| 32/32 [01:02<00:00,  1.96s/trial, best loss: 3.7783367168756854]
100%|█████████████████████████████████████████████████| 32/32 [00:43<00:00,  1.35s/trial, best loss: 4.383114341627062]
100%|████████████████████████████████████████████████| 32/32 [01:33<00:00,  2.92s/trial, best loss: 2.3658354284640146]
100%|█████████████████████████████████████████████████| 32/32 [01:17<00:00,  2.42s/trial, best loss: 2.324721680490613]


In [5]:
maes = []
train_file = []
test_file = []

for record in report:
    print(record)
    train_file.append(record.split(" ")[1])
    test_file.append(record.split(" ")[4].strip(":"))
    mae_val = record.split(":")[-1]
    mae_val = float(mae_val.strip())
    maes.append(mae_val)

Train clover | Test usergrid: 1.51
Train talendesb | Test mesos: 1.57
Train talenddataquality | Test aptanastudio: 4.2
Train mule | Test titanium: 3.32
Train talenddataquality | Test appceleratorstudio: 2.7
Train mulestudio | Test titanium: 4.73
Train appceleratorstudio | Test mulestudio: 3.51
Train appceleratorstudio | Test mule: 2.71


In [6]:
pd.DataFrame(data={"train_file": train_file,
                   "test_file": test_file,
                   "mae": maes}).to_csv("cross_repo_abe0_hyperopt.csv", index=False)

### ABE0 without Hyperopt - Within Repo

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
import numpy as np


PATH = "../sp_dataset/marked_data/"
# cross project - within repo
WITHIN_REPO = [
                {'train': ['mesos'], 'test': ['usergrid']},
                {'train': ['usergrid'], 'test': ['mesos']},
                {'train': ['appceleratorstudio'], 'test': ['aptanastudio']},
                {'train': ['appceleratorstudio'], 'test': ['titanium']},
                {'train': ['titanium'], 'test': ['appceleratorstudio']},
                {'train': ['aptanastudio'], 'test': ['titanium']},
                {'train': ['mule'], 'test': ['mulestudio']},
                {'train': ['mulestudio'], 'test': ['mule']}
              ]

report = []
for pair in WITHIN_REPO:
    # split to 60% training and 40% validation
    data = pd.read_csv(PATH + pair["train"][0] + ".csv")
    
    train_data = data[: int(len(data)*0.6)]
    train_data = data
    
    X_train = train_data["title"].tolist()
    y_train = train_data["storypoint"].tolist()
    val_data = data[int(len(data)*0.6):]

    test_data = pd.read_csv(PATH + pair["test"][0] + ".csv")
    X_test = test_data["title"].tolist()
    y_test = np.array(test_data["storypoint"].tolist())
    
    # apply BoW feature extraction
    vectorizer = TfidfVectorizer(norm='l2')
    vectorizer = vectorizer.fit(X_train)
    vectorizer = vectorizer.fit(X_test)
    
    X_train = vectorizer.transform(X_train).todense()
    X_test = vectorizer.transform(X_test).todense()
    
    neigh = KNeighborsRegressor(n_neighbors=3)
    neigh.fit(X_train, y_train)
    preds = neigh.predict(X_test)
    mae = round(sum(abs(preds - y_test)) / len(preds), 2)
    report.append(f"""Train {pair["train"][0]} | Test {pair["test"][0]}: {mae}""")

In [8]:
maes = []
train_file = []
test_file = []

for record in report:
    print(record)
    train_file.append(record.split(" ")[1])
    test_file.append(record.split(" ")[4].strip(":"))
    mae_val = record.split(":")[-1]
    mae_val = float(mae_val.strip())
    maes.append(mae_val)

Train mesos | Test usergrid: 1.24
Train usergrid | Test mesos: 1.63
Train appceleratorstudio | Test aptanastudio: 4.27
Train appceleratorstudio | Test titanium: 3.61
Train titanium | Test appceleratorstudio: 2.62
Train aptanastudio | Test titanium: 3.6
Train mule | Test mulestudio: 3.82
Train mulestudio | Test mule: 3.04


In [9]:
pd.DataFrame(data={"train_file": train_file,
                   "test_file": test_file,
                   "mae": maes}).to_csv("within_repo_abe0_no_hyperopt.csv", index=False)

### ABE0 without Hyperopt - Cross Repo

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
import numpy as np


PATH = "../sp_dataset/marked_data/"

# cross project - cross repo
CROSS_REPO = [
                {'train': ['clover'], 'test': ['usergrid']},
                {'train': ['talendesb'], 'test': ['mesos']},
                {'train': ['talenddataquality'], 'test': ['aptanastudio']},
                {'train': ['mule'], 'test': ['titanium']},
                {'train': ['talenddataquality'], 'test': ['appceleratorstudio']},
                {'train': ['mulestudio'], 'test': ['titanium']},
                {'train': ['appceleratorstudio'], 'test': ['mulestudio']},
                {'train': ['appceleratorstudio'], 'test': ['mule']}
             ]

report = []
for pair in CROSS_REPO:
    # split to 60% training and 40% validation
    data = pd.read_csv(PATH + pair["train"][0] + ".csv")
    
    train_data = data[: int(len(data)*0.6)]
    train_data = data
    
    X_train = train_data["title"].tolist()
    y_train = train_data["storypoint"].tolist()
    val_data = data[int(len(data)*0.6):]

    test_data = pd.read_csv(PATH + pair["test"][0] + ".csv")
    X_test = test_data["title"].tolist()
    y_test = np.array(test_data["storypoint"].tolist())
    
    # apply BoW feature extraction
    vectorizer = TfidfVectorizer(norm='l2')
    vectorizer = vectorizer.fit(X_train)
    vectorizer = vectorizer.fit(X_test)
    
    X_train = vectorizer.transform(X_train).todense()
    X_test = vectorizer.transform(X_test).todense()
    
    neigh = KNeighborsRegressor(n_neighbors=3)
    neigh.fit(X_train, y_train)
    preds = neigh.predict(X_test)
    mae = round(sum(abs(preds - y_test)) / len(preds), 2)
    report.append(f"""Train {pair["train"][0]} | Test {pair["test"][0]}: {mae}""")

In [11]:
maes = []
train_file = []
test_file = []

for record in report:
    print(record)
    train_file.append(record.split(" ")[1])
    test_file.append(record.split(" ")[4].strip(":"))
    mae_val = record.split(":")[-1]
    mae_val = float(mae_val.strip())
    maes.append(mae_val)

Train clover | Test usergrid: 1.02
Train talendesb | Test mesos: 1.56
Train talenddataquality | Test aptanastudio: 4.05
Train mule | Test titanium: 3.62
Train talenddataquality | Test appceleratorstudio: 3.18
Train mulestudio | Test titanium: 8.13
Train appceleratorstudio | Test mulestudio: 3.58
Train appceleratorstudio | Test mule: 3.27


In [12]:
pd.DataFrame(data={"train_file": train_file,
                   "test_file": test_file,
                   "mae": maes}).to_csv("cross_repo_abe0_no_hyperopt.csv", index=False)