# Preparation

## Module

In [None]:

import os
import json
from IPython.display import clear_output

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix, hstack, vstack

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, f1_score, precision_score, recall_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, learning_curve, validation_curve
from trainer import GridSearchCVTrainer

from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor

## Set random seeds

In [None]:
import numpy as np
import random
import os

# One seed value shared by every source of randomness configured in this cell
SEED = 42

# Python's built-in pseudo-random generator
random.seed(SEED)

# NumPy's global (legacy) generator
np.random.seed(SEED)

# Hash seed exported through the environment.
# NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so this
# presumably only influences processes launched after this cell - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

## Evaluation function

In [None]:
def evaluate_model(model, model_name, X_test, y_test, y_logscale=False):
    """Score a fitted model on the test split and return a text report.

    Parameters
    ----------
    model : object exposing ``predict(X_test)``.
    model_name : str
        Label used in the report header.
    X_test : features passed unchanged to ``model.predict``.
    y_test : true target values.
    y_logscale : bool, default False
        When true, predictions are passed through ``np.exp`` before scoring.

    Returns
    -------
    list of str
        Header line, four regression metrics, four classification metrics
        (computed on predictions rounded to the nearest integer), a separator
        line and a trailing empty string.
    """
    predictions = model.predict(X_test)
    if y_logscale:
        predictions = np.exp(predictions)

    # Regression metrics on the raw (continuous) predictions
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Classification metrics treat the rounded prediction as a class label
    rounded_predictions = np.round(predictions).astype(int)
    f1 = f1_score(y_test, rounded_predictions, average='weighted')
    precision = precision_score(y_test, rounded_predictions, average='weighted', zero_division=0)
    recall = recall_score(y_test, rounded_predictions, average='weighted', zero_division=0)
    accuracy = accuracy_score(y_test, rounded_predictions)

    return [
        model_name + '\'s evaluation results:',
        f' - Mean squared error:      {mse:.2f}',
        f' - Root mean squared error: {rmse:.2f}',
        f' - Mean absolute error:     {mae:.2f}',
        f' - R2 error:                {r2:.2f}',
        f' - F1 score:                {f1:.2f}',
        f' - Precision:               {precision:.2f}',
        f' - Recall:                  {recall:.2f}',
        f' - Accuracy:                {accuracy:.2f}',
        '-------------------------------------------------',
        '',
    ]

## Data function

In [None]:
def _read_project_split(project_name, split):
    """Read one CSV split (e.g. 'train', 'valid', 'test') of a project."""
    return pd.read_csv(f'data/{project_name}/{project_name}_{split}.csv')


def _build_feature_matrix(frame, title_vectorizer, description_vectorizer):
    """Stack bag-of-words counts and character lengths of title/description."""
    return hstack([
        title_vectorizer.transform(frame['title']).astype(float),
        description_vectorizer.transform(frame['description']).astype(float),
        # Character length of each text, shaped as a single column
        frame['title'].apply(len).to_numpy().reshape(-1, 1),
        frame['description'].apply(len).to_numpy().reshape(-1, 1),
    ])


def get_data(project_name):
    """Load a project's CSV files and build bag-of-words features.

    The train and valid files are concatenated into one training set; the
    test file is kept separate. Missing descriptions become empty strings.

    Parameters
    ----------
    project_name : str
        Name of the sub-folder of ``data/`` and prefix of its CSV files.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Sparse feature matrices (title n-gram counts, description n-gram
        counts, title length, description length) and float arrays of the
        ``storypoint`` column.
    """
    data_train = pd.concat([_read_project_split(project_name, 'train'),
                            _read_project_split(project_name, 'valid')])
    data_test = _read_project_split(project_name, 'test')

    # Assign the result back: an in-place call on a column selection works on
    # a temporary object and does not reliably update the frame.
    data_train['description'] = data_train['description'].fillna('')
    data_test['description'] = data_test['description'].fillna('')

    # NOTE(review): both vocabularies are fitted on train and test text
    # together; kept unchanged here.
    title_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2)
    title_vectorizer.fit(pd.concat([data_train['title'], data_test['title']]))

    description_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=2)
    description_vectorizer.fit(pd.concat([data_train['description'], data_test['description']]))

    X_train = _build_feature_matrix(data_train, title_vectorizer, description_vectorizer)
    y_train = data_train['storypoint'].to_numpy().astype(float)

    X_test = _build_feature_matrix(data_test, title_vectorizer, description_vectorizer)
    y_test = data_test['storypoint'].to_numpy().astype(float)

    return X_train, y_train, X_test, y_test

## Constants

In [None]:
# Folder holding one sub-folder of saved settings per project
folder_path = 'settings/BoW'

# Every sub-directory of the settings folder is treated as a project
project_names = [
    entry
    for entry in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, entry))
]

# Setting name -> (untrained estimator, label used in the evaluation report).
# The insertion order matters: later cells index the trained models by position.
models = {
    'Elastic Net': (ElasticNet(), 'Elastic Net model'),
    'Support Vector Regressor': (SVR(), 'SVR model'),
    'Random Forest Regressor': (RandomForestRegressor(), 'Random Forest model'),
    'XGBoost Regressor': (XGBRegressor(), 'XGBoost Regressor model'),
    'LightGBM Regressor': (LGBMRegressor(), 'LightGBM regressor model'),
}

# Setting names in the order the models are trained
model_names = list(models)

# Folder containing one saved report ('<project>.txt') per project
result_directory = 'results/BoW/'

# Assert

This part retrains every model and re-evaluates it to check that the results match the saved ones.

In [None]:
# Positions (within model_names order) of the base regressors handed to the stack
stacking_order = (3, 4, 1, 0, 2)

for project_name in project_names:
    # Report saved earlier for this project; the fresh report must match it exactly
    with open(result_directory + project_name + '.txt', 'r') as expected_file:
        result = expected_file.read()

    X_train, y_train, X_test, y_test = get_data(project_name)
    # Models are fitted on the log of the target; evaluation maps predictions back with exp
    log_y_train = np.log(y_train)

    report_chunks = []
    trained_models = []
    for model_name in model_names:
        model, eval_name = models[model_name]

        # Load the best hyperparameters saved for this project/model pair
        with open(f'settings/BoW/{project_name}/{model_name.lower()}_checkpoint.json') as checkpoint_file:
            best_params = json.load(checkpoint_file)['best_params']
        model.set_params(**best_params)

        # NOTE(review): only estimators whose instance already holds a non-None
        # n_jobs are switched to all cores; estimators with n_jobs=None are skipped.
        if vars(model).get('n_jobs') is not None:
            model.set_params(n_jobs=-1)

        model.fit(X_train, log_y_train)
        trained_models.append(model)

        report_chunks.append(
            '\n'.join(evaluate_model(model, eval_name, X_test, y_test, y_logscale=True)) + '\n'
        )

    # The meta-regressor is the base model with the lowest test-set MSE
    test_mse = [mean_squared_error(np.exp(fitted.predict(X_test)), y_test) for fitted in trained_models]
    best_single_model = trained_models[np.argmin(test_mse)]

    stack_gen = StackingCVRegressor(regressors=tuple(trained_models[position] for position in stacking_order),
                                    meta_regressor=best_single_model,
                                    use_features_in_secondary=True, n_jobs=-1, random_state=42, verbose=0)
    stack_gen.fit(X_train, log_y_train)
    report_chunks.append(
        '\n'.join(evaluate_model(stack_gen, 'Stacking model', X_test, y_test, y_logscale=True)) + '\n'
    )

    check_result = ''.join(report_chunks)
    assert check_result == result, f'{project_name}: error occured'
    clear_output(wait=True)

print('All models are correct!')
        