In [7]:
import math
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from mlxtend.regressor import StackingRegressor
from xgboost import XGBRegressor, XGBRFRegressor

In [3]:
def read_data(filename):
    """Load one defect-dataset CSV and split it into features and label.

    The identifier columns 'name', 'version' and 'name.1' are dropped, the
    'bug' column is used as the label, and every remaining column is a feature.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(features, label)`` where ``features`` is a 2-D array with one row
        per CSV row and one column per feature column, and ``label`` is a
        column vector of shape ``(n_rows, 1)``.

    Raises
    ------
    KeyError
        If any of 'name', 'version', 'name.1' or 'bug' is missing.
    """
    data = pd.read_csv(filename)
    data = data.drop(['name', 'version', 'name.1'], axis='columns')

    features = data.drop(['bug'], axis='columns')
    label = data['bug']

    # The frame is already (n_rows, n_features). The previous reshape(-1, 20)
    # only worked for exactly 20 feature columns; any other count either
    # raised or re-tiled the values into rows that no longer match the labels.
    test_features = features.values
    # Label stays a column vector, as callers received before.
    test_label = label.values.reshape(-1, 1)

    return test_features, test_label

In [4]:
def mean_relative_error(actual, predicted):
    """Return the mean of ``|actual - predicted| / (actual + 1)`` over samples.

    Both inputs are converted to NumPy arrays and compared index by index
    along the first axis, so a column-vector ``actual`` of shape ``(n, 1)``
    can be paired with a flat ``predicted`` of shape ``(n,)``.
    """
    actual, predicted = np.array(actual), np.array(predicted)
    # Evaluated per index on purpose: subtracting the whole arrays would
    # broadcast an (n, 1) input against an (n,) input into an (n, n) matrix.
    per_sample_errors = [
        abs(truth - predicted[position]) / (truth + 1)
        for position, truth in enumerate(actual)
    ]
    return np.mean(per_sample_errors)

In [5]:
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

import warnings
warnings.filterwarnings("ignore")
def train_and_evaluate_model(model, params, train_features, test_features, train_label, test_label):
    """Tune ``model`` with a randomized search and score the best estimator.

    A ``RandomizedSearchCV`` with 10-fold cross-validation is fitted on the
    training split over ``params``; the resulting ``best_estimator_`` then
    predicts both splits.

    Returns
    -------
    dict
        MRE, MAE, RMSE and R2 for the train and test splits, keyed as
        ``"<METRIC>_train"`` / ``"<METRIC>_test"``.
    """
    search = RandomizedSearchCV(model, params, cv=10)
    search.fit(train_features, train_label)
    tuned_model = search.best_estimator_

    def _score(y_true, y_pred):
        # Returns (MRE, MAE, RMSE, R2) for one split.
        return (
            mean_relative_error(y_true, y_pred),
            mean_absolute_error(y_true, y_pred),
            math.sqrt(mean_squared_error(y_true, y_pred)),
            r2_score(y_true, y_pred),
        )

    train_mre, train_mae, train_rmse, train_r2 = _score(
        train_label, tuned_model.predict(train_features)
    )
    test_mre, test_mae, test_rmse, test_r2 = _score(
        test_label, tuned_model.predict(test_features)
    )

    # Key order is kept stable because callers write this dict to CSV as text.
    scores = {}
    scores["MRE_train"] = train_mre
    scores["MAE_train"] = train_mae
    scores["RMSE_train"] = train_rmse
    scores["MRE_test"] = test_mre
    scores["MAE_test"] = test_mae
    scores["RMSE_test"] = test_rmse
    scores["R2_train"] = train_r2
    scores["R2_test"] = test_r2
    return scores


def _evaluate_stacking(param_distributions, train_features, test_features, train_label, test_label, cv=5):
    """Tune the stacked ensemble with a randomized search and score both splits.

    The ensemble stacks SVR, ExtraTrees, KNeighbors and RandomForest
    regressors under an SVR meta-regressor. Returns a dict with MRE, MAE,
    RMSE and R2 for the train and test splits.
    """
    # Imported locally under an alias: the notebook also imports
    # sklearn.ensemble.StackingRegressor under the same global name, and that
    # class does not accept the regressors= / meta_regressor= keywords used here.
    from mlxtend.regressor import StackingRegressor as MlxtendStackingRegressor

    stack = MlxtendStackingRegressor(
        store_train_meta_features=True,
        regressors=[SVR(), ExtraTreesRegressor(), KNeighborsRegressor(), RandomForestRegressor()],
        meta_regressor=SVR(),
    )
    rand_search_sr = RandomizedSearchCV(stack, param_distributions, cv=cv)
    print('Fitting Stacking Regression')
    rand_search_sr.fit(train_features, train_label)
    best_tuned_sr = rand_search_sr.best_estimator_
    print('Predicting with Stacking Regression')
    y_pred_sr_train = best_tuned_sr.predict(train_features)
    y_pred_sr_test = best_tuned_sr.predict(test_features)

    return {
        "MRE_train": mean_relative_error(train_label, y_pred_sr_train),
        "MAE_train": mean_absolute_error(train_label, y_pred_sr_train),
        "RMSE_train": math.sqrt(mean_squared_error(train_label, y_pred_sr_train)),
        "MRE_test": mean_relative_error(test_label, y_pred_sr_test),
        "MAE_test": mean_absolute_error(test_label, y_pred_sr_test),
        "RMSE_test": math.sqrt(mean_squared_error(test_label, y_pred_sr_test)),
        "R2_train": r2_score(train_label, y_pred_sr_train),
        "R2_test": r2_score(test_label, y_pred_sr_test),
    }


def run_techniques(project_directories, per_version_output_csv, combined_project_output_csv):
    """Benchmark every model per dataset version and per combined project.

    For each directory in ``project_directories``, every ``.csv`` file is
    loaded with ``read_data`` and split 80/20. The five base models and the
    stacked ensemble are tuned and scored on that version, and the splits are
    accumulated so the same models can be scored once more on the project's
    combined train/test data.

    Parameters
    ----------
    project_directories : iterable of str
        One directory per project, each holding that project's version CSVs.
    per_version_output_csv : str
        Destination CSV for the per-version results (one row per CSV file).
    combined_project_output_csv : str
        Destination CSV for the combined results (one row per project).

    Notes
    -----
    Each metrics dict is stored as a single cell, so the written CSVs hold
    the dicts' string representation. Splits and searches are not seeded, so
    results vary between runs.
    """
    params_1 = {
        'svr__C': [0.001, 0.1, 1, 5, 10],
        'svr__gamma': [0.001, 0.1, 0.5, 1],
        'extratreesregressor__n_estimators': [50, 100, 126, 200],
        'extratreesregressor__min_samples_leaf': [20, 30, 50, 10],
        'kneighborsregressor__n_neighbors': [5, 7, 10, 15],
        'randomforestregressor__max_depth': [10, 20, 30, 50],
        'randomforestregressor__n_estimators': [50, 100, 126, 300],
        'meta_regressor__C': [0.001, 0.1, 1, 5, 10],
        'meta_regressor__gamma': [0.001, 0.1, 0.5, 1],
        'meta_regressor__kernel': ['rbf', 'linear'],
    }
    params_tree = {
        'n_estimators': [50, 126],
        'min_samples_leaf': [30, 50],
        'max_depth': [20, 50],
    }
    params_d_tree = {'max_depth': [20, 50]}
    params_knn = {'n_neighbors': [5, 10]}
    params_svr = {'C': [5, 10], 'gamma': [0.001, 0.1, 1]}

    # Defined once, outside the loops: the combined-project stage previously
    # relied on this list leaking out of the per-file loop. Sharing the
    # instances is safe because RandomizedSearchCV fits clones of them.
    models = [
        ("DecisionTree", DecisionTreeRegressor(), params_d_tree),
        ("RandomForest", RandomForestRegressor(), params_tree),
        ("ExtraTrees", ExtraTreesRegressor(), params_tree),
        ("KNeighbors", KNeighborsRegressor(), params_knn),
        ("SVR", SVR(), params_svr),
    ]

    per_version_results = []
    combined_project_results = []

    for project_directory in project_directories:
        project_name = os.path.basename(project_directory)

        combined_project_t_features = []
        combined_project_t_label = []
        combined_project_te_features = []
        combined_project_te_label = []

        for file in os.listdir(project_directory):
            if not file.endswith(".csv"):
                continue

            filename = os.path.join(project_directory, file)
            features, label = read_data(filename)
            train_features, test_features, train_label, test_label = train_test_split(
                features, label, test_size=0.2
            )
            combined_project_t_features.extend(train_features)
            combined_project_t_label.extend(train_label)
            combined_project_te_features.extend(test_features)
            combined_project_te_label.extend(test_label)

            versions_results = {"Version": file, "Project": project_name}

            for model_name, model, params in models:
                print(f"Running {model_name} for {file}")
                versions_results[model_name] = train_and_evaluate_model(
                    model, params, train_features, test_features, train_label, test_label
                )

            print('Running Stacking Regression')
            versions_results["StackingRegressor"] = _evaluate_stacking(
                params_1, train_features, test_features, train_label, test_label
            )

            # Appended inside the .csv branch so non-CSV directory entries
            # cannot duplicate the previous row or hit an undefined name.
            per_version_results.append(versions_results)

        if not combined_project_t_features:
            # Nothing was loaded for this project, so there is nothing to fit.
            print(f"No .csv files found in {project_directory}; skipping combined evaluation")
            continue

        # Train and test the models with the combined project dataset
        print(f"Running models for the combined project dataset: {project_name}")
        models_results_combined_project = {"Project": project_name}

        for model_name, model, params in models:
            print(f"Running {model_name} for combined project dataset")
            models_results_combined_project[model_name] = train_and_evaluate_model(
                model, params, combined_project_t_features, combined_project_te_features,
                combined_project_t_label, combined_project_te_label
            )

        print('Running Stacking Regression')
        # Store the stacking metrics themselves; the previous code stored the
        # last base model's metrics under this key.
        models_results_combined_project["StackingRegressor"] = _evaluate_stacking(
            params_1, combined_project_t_features, combined_project_te_features,
            combined_project_t_label, combined_project_te_label
        )

        combined_project_results.append(models_results_combined_project)

    # Save results to CSV
    combined_project_results_df = pd.DataFrame(combined_project_results)
    combined_project_results_df.to_csv(combined_project_output_csv, index=False)

    per_version_results_df = pd.DataFrame(per_version_results)
    per_version_results_df.to_csv(per_version_output_csv, index=False)


# Example usage (kept disabled, as it was in the original cell):
# datasets_folder = "C:\\Users\\PC2\\Downloads\\datasets"
# project_directories = [os.path.join(datasets_folder, project) for project in os.listdir(datasets_folder) if os.path.isdir(os.path.join(datasets_folder, project))]
# output_csv = "C://Users//PC2//Downloads//combinedd_results.csv"
# outputt_csv = "C://Users//PC2//Downloads//project_combined_results.csv"
# run_techniques(project_directories, outputt_csv,output_csv)
# NOTE(review): this call passes outputt_csv (project_combined_results.csv) as
# per_version_output_csv and output_csv as combined_project_output_csv -
# confirm that argument order is intended.

In [8]:
import os
import random
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import math


def choose_random_testing_project(project_directories):
    """Pick one entry of ``project_directories`` uniformly at random.

    Uses the module-level ``random`` generator, so the pick is only
    repeatable if the caller seeds it first.
    """
    selected_project = random.choice(project_directories)
    return selected_project

def run_experiments(test_project_directory, train_directories, n_repeats=10, output_csv="output/results.csv"):
    """Repeatedly tune and score all models on the pooled training projects.

    Every ``.csv`` under each directory in ``train_directories`` is loaded
    with ``read_data`` and pooled into one dataset. Each repetition draws a
    fresh random 80/20 split of that pool, tunes the five base models and the
    stacked ensemble, and records their train/test metrics.

    Parameters
    ----------
    test_project_directory : str
        Recorded in the "Project" column of every result row.
    train_directories : iterable of str
        Directories whose CSV files are pooled.
    n_repeats : int, optional
        Number of repetitions (default 10).
    output_csv : str, optional
        Destination CSV; its parent directory is created when missing.

    Raises
    ------
    ValueError
        If no ``.csv`` file is found in any training directory.

    Notes
    -----
    NOTE(review): no data is read from ``test_project_directory``; the test
    split is carved out of the pooled training projects. If the held-out
    project was meant to be the test set, this needs changing - confirm.
    """
    # Imported locally under an alias: the module-level name StackingRegressor
    # in this cell is sklearn.ensemble.StackingRegressor, which does not accept
    # the regressors= / meta_regressor= keywords used below.
    from mlxtend.regressor import StackingRegressor as MlxtendStackingRegressor

    # Load the pooled dataset once; it does not change between repetitions.
    feature_frames = []
    label_frames = []
    for project_directory in train_directories:
        for file in os.listdir(project_directory):
            if file.endswith(".csv"):
                filename = os.path.join(project_directory, file)
                features, label = read_data(filename)
                feature_frames.append(pd.DataFrame(features))
                label_frames.append(pd.DataFrame(label))

    if not feature_frames:
        raise ValueError("No .csv files found in any of the training directories")

    all_combined_features = pd.concat(feature_frames, ignore_index=True)
    all_combined_label = pd.concat(label_frames, ignore_index=True)

    # RandomizedSearchCV fits clones, so these instances can be reused.
    models = [
        ("DecisionTree", DecisionTreeRegressor(), {"max_depth": [20, 50]}),
        ("RandomForest", RandomForestRegressor(), {"n_estimators": [50, 126], "min_samples_leaf": [30, 50], "max_depth": [20, 50]}),
        ("ExtraTrees", ExtraTreesRegressor(), {"n_estimators": [50, 126], "min_samples_leaf": [30, 50], "max_depth": [20, 50]}),
        ("KNeighbors", KNeighborsRegressor(), {"n_neighbors": [5, 10]}),
        ("SVR", SVR(), {"C": [5, 10], "gamma": [0.001, 0.1, 1]})
    ]
    stacking_params = {
        "svr__C": [0.001, 0.1, 1, 5, 10], "svr__gamma": [0.001, 0.1, 0.5, 1],
        "meta_regressor__C": [0.001, 0.1, 1, 5, 10], "meta_regressor__gamma": [0.001, 0.1, 0.5, 1],
        "meta_regressor__kernel": ['rbf', 'linear']
    }

    all_results = []

    for _ in range(n_repeats):
        version_results = {"Project": test_project_directory}

        train_features, test_features, train_label, test_label = train_test_split(
            all_combined_features, all_combined_label, test_size=0.2
        )

        for model_name, model, params in models:
            print(f"Running {model_name} for test project {test_project_directory}")
            version_results[model_name] = train_and_evaluate_model(
                model, params, train_features, test_features, train_label, test_label
            )

        print('Running Stacking Regression')
        rand_search_sr = RandomizedSearchCV(
            MlxtendStackingRegressor(
                store_train_meta_features=True,
                regressors=[SVR(), ExtraTreesRegressor(), KNeighborsRegressor(), RandomForestRegressor()],
                meta_regressor=SVR(),
            ),
            stacking_params, cv=5
        )
        print('Fitting Stacking Regression')
        rand_search_sr.fit(train_features, train_label)
        best_tuned_sr = rand_search_sr.best_estimator_
        print('Predicting with Stacking Regression')
        y_pred_sr_train = best_tuned_sr.predict(train_features)
        y_pred_sr_test = best_tuned_sr.predict(test_features)

        version_results["StackingRegressor"] = {
            "MRE_train": mean_relative_error(train_label, y_pred_sr_train),
            "MAE_train": mean_absolute_error(train_label, y_pred_sr_train),
            "RMSE_train": math.sqrt(mean_squared_error(train_label, y_pred_sr_train)),
            "MRE_test": mean_relative_error(test_label, y_pred_sr_test),
            "MAE_test": mean_absolute_error(test_label, y_pred_sr_test),
            "RMSE_test": math.sqrt(mean_squared_error(test_label, y_pred_sr_test)),
            "R2_train": r2_score(train_label, y_pred_sr_train),
            "R2_test": r2_score(test_label, y_pred_sr_test),
        }

        all_results.append(version_results)

    # Save results to CSV. Create the target directory first so a long run
    # is not lost to a missing folder at the very end.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    combined_results_df = pd.DataFrame(all_results)
    combined_results_df.to_csv(output_csv, index=False)

# Usage
# Build the project list in this cell so it runs on a fresh kernel: as far as
# this notebook shows, `project_directories` is otherwise only assigned inside
# a disabled usage example. The folder is the one used by that example.
datasets_folder = "C:\\Users\\PC2\\Downloads\\datasets"
project_directories = [
    os.path.join(datasets_folder, project)
    for project in os.listdir(datasets_folder)
    if os.path.isdir(os.path.join(datasets_folder, project))
]

test_project_directory = choose_random_testing_project(project_directories)
# Every project except the randomly chosen one is used for training.
train_directories = [directory for directory in project_directories if directory != test_project_directory]

run_experiments(test_project_directory, train_directories)


Running DecisionTree for test project C:\Users\PC2\Downloads\datasets\ivy
Running RandomForest for test project C:\Users\PC2\Downloads\datasets\ivy


KeyboardInterrupt: 

In [None]:
import pandas as pd
import ast

# Input path is assigned here so the cell runs on a fresh kernel: as far as
# this notebook shows, `outputt_csv` is otherwise only assigned inside a
# disabled usage example, which uses this same value.
outputt_csv = "C://Users//PC2//Downloads//project_combined_results.csv"

# Columns whose cells hold the string form of a per-model metrics dict.
model_columns = ['DecisionTree', 'RandomForest', 'ExtraTrees', 'KNeighbors', 'SVR', 'StackingRegressor']

# Create a DataFrame from the CSV file
df = pd.read_csv(outputt_csv)

# Convert string representation of dictionaries to actual dictionaries
for model_column in model_columns:
    df[model_column] = df[model_column].apply(ast.literal_eval)

# Expand the dictionaries into separate columns named "<Model>_<metric>"
expanded_frames = [
    df[model_column].apply(pd.Series).add_prefix(f'{model_column}_')
    for model_column in model_columns
]
df = pd.concat([df] + expanded_frames, axis=1)

# Drop the original columns with dictionaries
df = df.drop(model_columns, axis=1)

df.to_csv("C://Users//PC2//Downloads//project.csv")


In [None]:
import pandas as pd
import ast

# Input path is assigned here so the cell runs on a fresh kernel: as far as
# this notebook shows, `output_csv` is otherwise only assigned inside a
# disabled usage example, which uses this same value.
output_csv = "C://Users//PC2//Downloads//combinedd_results.csv"

# Columns whose cells hold the string form of a per-model metrics dict.
model_columns = ['DecisionTree', 'RandomForest', 'ExtraTrees', 'KNeighbors', 'SVR', 'StackingRegressor']

# Create a DataFrame from the CSV file
df = pd.read_csv(output_csv)

# Convert string representation of dictionaries to actual dictionaries
for model_column in model_columns:
    df[model_column] = df[model_column].apply(ast.literal_eval)

# Expand the dictionaries into separate columns named "<Model>_<metric>"
expanded_frames = [
    df[model_column].apply(pd.Series).add_prefix(f'{model_column}_')
    for model_column in model_columns
]
df = pd.concat([df] + expanded_frames, axis=1)

# Drop the original columns with dictionaries
df = df.drop(model_columns, axis=1)

df.to_csv("C://Users//PC2//Downloads//version.csv")


In [67]:
import pandas as pd
import ast

# Columns whose cells hold the string form of a per-model metrics dict.
model_columns = ['DecisionTree', 'RandomForest', 'ExtraTrees', 'KNeighbors', 'SVR', 'StackingRegressor']

# Load the raw results written by the experiment run
df = pd.read_csv("C://Users//PC2//Downloads//all_results.csv")

# Parse each stringified dict back into a real dict
for model_column in model_columns:
    df[model_column] = df[model_column].apply(ast.literal_eval)

# Spread every dict into its own "<Model>_<metric>" columns, appended in model order
expanded_frames = [
    df[model_column].apply(pd.Series).add_prefix(f'{model_column}_')
    for model_column in model_columns
]
df = pd.concat([df] + expanded_frames, axis=1)

# The dict columns are redundant once expanded
df = df.drop(model_columns, axis=1)

df.to_csv("C://Users//PC2//Downloads//all.csv")
