In [99]:
import pandas as pd
from utils import read_dataset

# Load the working dataset through the project helper; every later cell reads
# this module-level `dataset` variable.
dataset = read_dataset()

In [100]:
from utils import read_dataset_metadata

# Column metadata used by the cells below: the regression target plus three
# alternative feature lists, unpacked in the order read_dataset_metadata() returns them.
(
    target_column,
    feature_columns,
    feature_columns_with_delta,
    feature_columns_only_delta,
) = read_dataset_metadata()

In [101]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from utils import create_method_evaluator
#from sklearn.svm import SVC

# Metric label -> callable(y_true, y_pred). The labels are the column names
# that show up in the result tables further down.
regression_metrics_functions = dict([
    ("MSE", lambda y_true, y_pred: mean_squared_error(y_true, y_pred)),
    ("$R^2$", lambda y_true, y_pred: r2_score(y_true, y_pred)),
])

# Evaluator built from an ordinary least-squares model and the metrics above.
evaluate_regression = create_method_evaluator(
    LinearRegression(),
    regression_metrics_functions,
)

In [114]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from utils import get_X_y, get_X_y_without_duplicates
   
def run_evaluation(method_evaluator, dataset, feature_columns, target_column):
    """Evaluate a method on one fixed 70/30 train/test split.

    The features and target are extracted with ``get_X_y``; the split uses
    ``random_state=0`` so repeated runs see the same partition.

    Returns whatever ``method_evaluator(X_train, y_train, X_test, y_test)`` returns.
    """
    features, target = get_X_y(dataset, feature_columns, target_column)
    split = train_test_split(features, target, test_size=0.3, random_state=0)
    # train_test_split yields (X_train, X_test, y_train, y_test); the evaluator
    # expects the training pair first, then the test pair.
    features_train, features_test, target_train, target_test = split
    return method_evaluator(features_train, target_train, features_test, target_test)

def run_evaluation_without_duplicates(method_evaluator, dataset, feature_columns, target_column):
    """Evaluate a method on one fixed 70/30 split of the de-duplicated data.

    Same procedure as a plain hold-out evaluation, except that the features and
    target come from ``get_X_y_without_duplicates``. The split is seeded with
    ``random_state=0``.

    Returns whatever ``method_evaluator(X_train, y_train, X_test, y_test)`` returns.
    """
    features, target = get_X_y_without_duplicates(dataset, feature_columns, target_column)
    split = train_test_split(features, target, test_size=0.3, random_state=0)
    # Reorder from (X_train, X_test, y_train, y_test) to the evaluator's
    # (X_train, y_train, X_test, y_test) argument order.
    features_train, features_test, target_train, target_test = split
    return method_evaluator(features_train, target_train, features_test, target_test)

def run_evaluation_with_Kfolds_without_duplicates(method_evaluator, dataset, feature_columns, target_column, get_X_y = get_X_y_without_duplicates, splitter = None):
    """Evaluate a method on every split produced by ``splitter``.

    Parameters
    ----------
    method_evaluator : callable
        Called as ``method_evaluator(X_train, y_train, X_test, y_test)`` and
        expected to return a ``(model, score)`` pair. ``score`` is assumed to
        be a list of dict-like records (it is extended into one flat list and
        later turned into a DataFrame by the caller) -- TODO confirm.
    dataset, feature_columns, target_column
        Forwarded unchanged to ``get_X_y``.
    get_X_y : callable, optional
        Builds ``(X, y)`` from the dataset. Defaults to
        ``get_X_y_without_duplicates``.
    splitter : callable, optional
        ``splitter(X, y)`` must yield ``(X_train, X_test, y_train, y_test)``
        tuples. ``None`` (the default) means ``kfold_split_5``. The default is
        resolved at call time because ``kfold_split_5`` is defined further
        down in the same cell; naming it directly in the signature raises
        ``NameError`` on a fresh kernel.

    Returns
    -------
    (models, scores)
        ``models`` holds one fitted model per split; ``scores`` is the flat
        list of score records, each tagged with its 0-based ``"fold"`` index.
    """
    if splitter is None:
        splitter = kfold_split_5

    X, y = get_X_y(dataset, feature_columns, target_column)
    models = []
    scores = []

    # enumerate() supplies the fold number; the previous manual counter was
    # never incremented, so every fold was labelled 0.
    for fold_index, (X_train, X_test, y_train, y_test) in enumerate(splitter(X, y)):
        model, score = method_evaluator(X_train, y_train, X_test, y_test)

        models.append(model)
        # Tag every record of this fold, not just the first one.
        for record in score:
            record["fold"] = fold_index
        scores += score
    return models, scores

def train_test_split_03(X, y):
    """Splitter yielding a single seeded 70/30 hold-out partition.

    Returns a one-element list so it can be used wherever a splitter that
    produces several ``(X_train, X_test, y_train, y_test)`` splits is expected.
    """
    single_split = train_test_split(X, y, test_size=0.3, random_state=0)
    return [single_split]

def kfold_split_5(X, y):
    """Splitter producing the five ``KFold(n_splits=5)`` partitions of ``X``/``y``.

    ``X`` and ``y`` are sliced positionally with ``.iloc``. Returns a list of
    ``(X_train, X_test, y_train, y_test)`` tuples, one per fold.
    """
    return [
        (X.iloc[train_rows], X.iloc[test_rows], y.iloc[train_rows], y.iloc[test_rows])
        for train_rows, test_rows in KFold(n_splits=5).split(X)
    ]


In [103]:
# Playground
def run_all_regressions(run_data_process, method_evaluator, dataset, feature_columns_set):
    """Run one evaluation per feature set and gather the scores in one table.

    Parameters
    ----------
    run_data_process : callable
        Called as ``run_data_process(method_evaluator, dataset, feature_columns,
        target_column)`` and expected to return ``(models, score)``; ``score``
        must be something ``pd.DataFrame`` accepts.
    method_evaluator
        Passed through to ``run_data_process``.
    dataset
        Passed through to ``run_data_process``; its length is printed.
    feature_columns_set : iterable of (str, list)
        ``(label, feature_columns)`` pairs; the label is stored in the
        ``"features"`` column of the rows it produced.

    Returns
    -------
    pandas.DataFrame
        Columns start with ``fold, features, número de instâncias, MSE, $R^2$``;
        any extra score keys are appended after them.

    Notes
    -----
    ``target_column`` is not a parameter: it is read from the notebook's
    global namespace.
    """
    print(f"número de instâncias total: {len(dataset)}")

    # The empty template goes first so it fixes the column order of the result.
    frames = [pd.DataFrame(columns = ["fold", "features", "número de instâncias", "MSE", "$R^2$"])]

    for feature_set_name, feature_columns in feature_columns_set:
        # The fitted models are not needed for the summary table.
        _models, score = run_data_process(method_evaluator, dataset, feature_columns, target_column)
        row = pd.DataFrame(score)
        row["features"] = feature_set_name
        frames.append(row)

    # Concatenate once instead of re-copying the accumulated table on every iteration.
    return pd.concat(frames, sort=False)

def run_all_regressions_with_past_class(run_data_process):
    """Evaluate the linear-regression evaluator on the three unmodified feature lists.

    Uses the notebook-level ``dataset``, ``evaluate_regression`` and the three
    feature-column lists; ``run_data_process`` decides how the data is split.
    Returns the table built by ``run_all_regressions``.
    """
    labels = (
        "artigo atual e anterior",
        "artigo atual, anterior e a diferença",
        "diferença",
    )
    column_lists = (
        feature_columns,
        feature_columns_with_delta,
        feature_columns_only_delta,
    )
    feature_sets = list(zip(labels, column_lists))
    return run_all_regressions(run_data_process, evaluate_regression, dataset, feature_sets)

def list_except(list, except_item):
    """Return a new list with every element equal to ``except_item`` left out.

    The relative order of the remaining elements is preserved and the input
    is not modified.
    """
    kept = []
    for element in list:
        if element != except_item:
            kept.append(element)
    return kept

def run_all_regressions_without_past_class(run_data_process):
    """Evaluate the three feature lists after dropping the ``"past_category"`` column.

    Uses the notebook-level ``dataset``, ``evaluate_regression`` and the three
    feature-column lists; ``run_data_process`` decides how the data is split.
    Returns the table built by ``run_all_regressions``.
    """
    excluded_column = "past_category"
    labelled_columns = (
        ("artigo atual e anterior", feature_columns),
        ("artigo atual, anterior e a diferença", feature_columns_with_delta),
        ("diferença", feature_columns_only_delta),
    )
    # Same labels as the unfiltered run, with the excluded column removed from each list.
    feature_sets = [
        (label, list_except(columns, excluded_column))
        for label, columns in labelled_columns
    ]
    return run_all_regressions(run_data_process, evaluate_regression, dataset, feature_sets)

In [104]:
# Three unmodified feature lists, evaluated with run_evaluation
# (get_X_y followed by one seeded 70/30 split).
run_all_regressions_with_past_class(run_evaluation)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,,artigo atual e anterior,54834,0.074269,0.973971
0,,"artigo atual, anterior e a diferença",54834,0.074264,0.973973
0,,diferença,54834,0.076483,0.973195


In [105]:
# Same three feature lists, but X/y come from get_X_y_without_duplicates
# before the seeded 70/30 split.
run_all_regressions_with_past_class(run_evaluation_without_duplicates)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,,artigo atual e anterior,47063,0.072858,0.972244
0,,"artigo atual, anterior e a diferença",47063,0.072858,0.972244
0,,diferença,38234,0.108685,0.955649


In [115]:
# Same three feature lists through the splitter-based runner with its defaults.
# NOTE(review): the stored output of this cell is
# "ValueError: not enough values to unpack (expected 5, got 2)", which does not
# match the 4-name unpacking in the runner as currently written -- the output
# looks stale; re-run the cell after re-running the definitions.
run_all_regressions_with_past_class(run_evaluation_with_Kfolds_without_duplicates)

número de instâncias total: 54834


ValueError: not enough values to unpack (expected 5, got 2)

In [0]:
# Feature lists with "past_category" removed, evaluated with run_evaluation
# (get_X_y followed by one seeded 70/30 split).
run_all_regressions_without_past_class(run_evaluation)

In [0]:
# Feature lists with "past_category" removed; X/y come from
# get_X_y_without_duplicates before the seeded 70/30 split.
run_all_regressions_without_past_class(run_evaluation_without_duplicates)

In [0]:
# Feature lists with "past_category" removed, run through the splitter-based
# runner with its default arguments.
run_all_regressions_without_past_class(run_evaluation_with_Kfolds_without_duplicates)