In [1]:
import pandas as pd
from utils import read_dataset

# Load the dataset once; the experiment helpers defined further down read this
# notebook-level name instead of receiving it as an argument.
dataset = read_dataset()

In [2]:
from utils import read_dataset_metadata

# Column names used by every experiment: the regression target plus three
# alternative feature lists (as loaded, with the delta columns, delta columns only).
(
    target_column,
    feature_columns,
    feature_columns_with_delta,
    feature_columns_only_delta,
) = read_dataset_metadata()

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from utils import create_method_evaluator
#from sklearn.svm import SVC

# Metric label (also the column header in the result tables) -> scorer called
# as scorer(y_true, y_pred). The sklearn functions already have that
# signature, so they are registered directly.
regression_metrics_functions = {
    "MSE": mean_squared_error,
    "$R^2$": r2_score,
}

# Evaluator shared by all experiments: a linear regression scored with the
# metrics above.
evaluate_regression = create_method_evaluator(
    LinearRegression(),
    regression_metrics_functions,
)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from utils import get_X_y, get_X_y_without_duplicates

def train_test_split_03(X, y):
    """Split ``X`` and ``y`` once, holding out 30% of the rows for testing.

    The split uses ``random_state=0``. The single split is returned inside a
    one-element list so this function has the same return shape as
    ``kfold_split_5`` (a list of splits) and can be passed as a ``splitter``.
    """
    single_split = train_test_split(X, y, test_size=0.3, random_state=0)
    return [single_split]

def kfold_split_5(X, y):
    """Return five ``(X_train, X_test, y_train, y_test)`` tuples, one per fold.

    ``X`` and ``y`` are sliced positionally through ``.iloc`` with the index
    arrays yielded by ``KFold.split``. ``KFold`` is built with ``n_splits=5``
    only, so shuffling follows the library default (scikit-learn documents
    that default as no shuffling, i.e. folds are consecutive blocks of rows).
    """
    folds = KFold(n_splits=5)
    return [
        (X.iloc[train_rows], X.iloc[test_rows], y.iloc[train_rows], y.iloc[test_rows])
        for train_rows, test_rows in folds.split(X)
    ]

def run_evaluation(method_evaluator, dataset, feature_columns, target_column):
    """Evaluate on a single 70/30 hold-out split of the data from ``get_X_y``.

    ``get_X_y`` builds the feature matrix and target from ``dataset``; the
    split is seeded with ``random_state=0``. Returns whatever
    ``method_evaluator(X_train, y_train, X_test, y_test)`` returns.
    """
    features, target = get_X_y(dataset, feature_columns, target_column)
    split = train_test_split(features, target, test_size=0.3, random_state=0)
    features_train, features_test, target_train, target_test = split
    return method_evaluator(features_train, target_train, features_test, target_test)

def run_evaluation_without_duplicates(method_evaluator, dataset, feature_columns, target_column):
    """Hold-out evaluation on the data from ``get_X_y_without_duplicates``.

    Same 70/30 split (``random_state=0``) as ``run_evaluation``; only the
    helper that builds ``X`` and ``y`` differs (presumably it drops duplicated
    rows -- the helper lives in ``utils``, confirm there). Returns the result
    of ``method_evaluator(X_train, y_train, X_test, y_test)``.
    """
    X_unique, y_unique = get_X_y_without_duplicates(dataset, feature_columns, target_column)
    train_X, test_X, train_y, test_y = train_test_split(
        X_unique,
        y_unique,
        test_size=0.3,
        random_state=0,
    )
    return method_evaluator(train_X, train_y, test_X, test_y)

def run_evaluation_with_Kfolds_without_duplicates(method_evaluator, dataset, feature_columns, target_column, get_X_y = get_X_y_without_duplicates, splitter = kfold_split_5):
    """Evaluate once per split and collect the models and fold-tagged scores.

    Parameters
    ----------
    method_evaluator : callable
        Called as ``method_evaluator(X_train, y_train, X_test, y_test)``; must
        return ``(model, score_records)`` where ``score_records`` is an
        iterable of dicts.
    dataset, feature_columns, target_column
        Forwarded unchanged to ``get_X_y``.
    get_X_y : callable, optional
        Builds ``(X, y)`` from the three arguments above. Defaults to
        ``get_X_y_without_duplicates``.
    splitter : callable, optional
        ``splitter(X, y)`` must yield ``(X_train, X_test, y_train, y_test)``
        tuples. Defaults to ``kfold_split_5``.

    Returns
    -------
    tuple(list, list)
        The fitted models in fold order, and the flat list of score records,
        each carrying a 1-based ``"fold"`` key.
    """
    X, y = get_X_y(dataset, feature_columns, target_column)
    models = []
    scores = []

    for fold, (X_train, X_test, y_train, y_test) in enumerate(splitter(X, y), start=1):
        model, fold_scores = method_evaluator(X_train, y_train, X_test, y_test)
        models.append(model)
        # Tag every record, not only the first one: an untagged record would
        # reach the result table with a missing fold and could not be matched
        # in a later merge on ["features", "fold"].
        for record in fold_scores:
            record["fold"] = fold
        scores.extend(fold_scores)
    return models, scores




In [5]:
# Playground
def run_all_regressions(run_data_process, method_evaluator, dataset, feature_columns_set):
    """Run one evaluation per feature set and gather the scores in one table.

    Parameters
    ----------
    run_data_process : callable
        Called as ``run_data_process(method_evaluator, dataset,
        feature_columns, target_column)``; must return ``(models, scores)``
        where ``scores`` is accepted by ``pd.DataFrame`` (the notebook's
        runners return lists of dicts).
    method_evaluator : callable
        Forwarded unchanged to ``run_data_process``.
    dataset : sized
        Forwarded to ``run_data_process``; its length is printed.
    feature_columns_set : iterable of (str, list)
        ``(label, feature_columns)`` pairs; ``label`` fills the ``"features"``
        column of the rows produced for that set.

    Returns
    -------
    pandas.DataFrame
        Columns ``features, #, fold, MSE, $R^2$`` in that order (created as
        NaN when no score provides them), followed by any extra score keys.
        Row labels are those of the per-set score frames.

    Notes
    -----
    The target column is read from the notebook-level ``target_column``.
    """
    expected_columns = ["features", "#", "fold", "MSE", "$R^2$"]
    print(f"número de instâncias total: {len(dataset)}")

    # Collect the per-set frames and concatenate once. Growing the result with
    # pd.concat inside the loop copies it on every iteration, and seeding it
    # with an empty column-only frame goes through pandas' deprecated
    # concat-with-empty-frame handling.
    frames = []
    for feature_set_name, feature_columns in feature_columns_set:
        _, score = run_data_process(method_evaluator, dataset, feature_columns, target_column)
        frame = pd.DataFrame(score)
        frame["features"] = feature_set_name
        frames.append(frame)

    if not frames:
        # pd.concat refuses an empty list; keep the original empty-table result.
        return pd.DataFrame(columns=expected_columns)

    result = pd.concat(frames, sort=False)
    extra_columns = [column for column in result.columns if column not in expected_columns]
    # reindex restores the fixed column order and creates absent columns
    # (callers drop "fold" unconditionally, so it must always exist).
    return result.reindex(columns=expected_columns + extra_columns)

def run_all_regressions_with_past_class(run_data_process):
    """Run the regression experiments on the three feature lists as loaded.

    ``run_data_process`` is the runner to use (hold-out, de-duplicated
    hold-out or K-fold). The feature lists, ``dataset`` and
    ``evaluate_regression`` are read from notebook-level names.
    """
    labels = ("atual e anterior", "atual, anterior e a diferença", "diferença")
    column_sets = (feature_columns, feature_columns_with_delta, feature_columns_only_delta)
    params = [(label, columns) for label, columns in zip(labels, column_sets)]
    return run_all_regressions(run_data_process, evaluate_regression, dataset, params)

def list_except(list, except_item):
    """Return a new list with every element of ``list`` not equal to ``except_item``.

    Order is preserved and the input is not modified. The first parameter
    shadows the builtin ``list`` inside this function, so the builtin is not
    used here.
    """
    kept = []
    for element in list:
        if element != except_item:
            kept.append(element)
    return kept

def run_all_regressions_without_past_class(run_data_process):
    """Run the regression experiments with ``"past_category"`` removed.

    Same three feature lists and labels as the "with past class" variant, but
    each list is first passed through ``list_except`` to drop the
    ``"past_category"`` column. ``run_data_process`` selects the runner.
    """
    past_class = "past_category"
    labelled_columns = (
        ("atual e anterior", feature_columns),
        ("atual, anterior e a diferença", feature_columns_with_delta),
        ("diferença", feature_columns_only_delta),
    )
    params = [
        (label, list_except(columns, past_class))
        for label, columns in labelled_columns
    ]
    return run_all_regressions(run_data_process, evaluate_regression, dataset, params)

In [6]:
# Hold-out evaluation (run_evaluation: one 70/30 split) on the feature lists as loaded.
# run_evaluation assigns no fold number, so the "fold" column is dropped before display.
evaluatio_past_class = (
    run_all_regressions_with_past_class(run_evaluation)
    .drop(columns=["fold"])
)
evaluatio_past_class

número de instâncias total: 54834


Unnamed: 0,features,#,MSE,$R^2$
0,atual e anterior,54834,0.074269,0.973971
0,"atual, anterior e a diferença",54834,0.074264,0.973973
0,diferença,54834,0.076483,0.973195


In [7]:
# Same hold-out evaluation, but X/y come from get_X_y_without_duplicates.
# The runner assigns no fold number, so the "fold" column is dropped before display.
evaluatio_past_class_wo_duplicates = (
    run_all_regressions_with_past_class(run_evaluation_without_duplicates)
    .drop(columns=["fold"])
)
evaluatio_past_class_wo_duplicates

número de instâncias total: 54834


Unnamed: 0,features,#,MSE,$R^2$
0,atual e anterior,47063,0.072858,0.972244
0,"atual, anterior e a diferença",47063,0.072858,0.972244
0,diferença,38234,0.108685,0.955649


In [8]:
# K-fold evaluation (default splitter: kfold_split_5) on the feature lists as loaded;
# the "fold" column is kept because each row is one fold.
# NOTE(review): the saved output below lists only two of the three feature sets
# (no "diferença" rows) -- it may be truncated or stale; re-run from a fresh kernel to confirm.
evaluatio_past_class_wo_duplicates_cvk5 = run_all_regressions_with_past_class(run_evaluation_with_Kfolds_without_duplicates)
evaluatio_past_class_wo_duplicates_cvk5

número de instâncias total: 54834


Unnamed: 0,features,#,fold,MSE,$R^2$
0,atual e anterior,47063,1,0.225783,0.86838
1,atual e anterior,47063,2,0.165626,0.92243
2,atual e anterior,47063,3,0.006604,0.997763
3,atual e anterior,47063,4,0.006635,0.997688
4,atual e anterior,47063,5,0.006966,0.99758
0,"atual, anterior e a diferença",47063,1,0.225783,0.86838
1,"atual, anterior e a diferença",47063,2,0.165626,0.92243
2,"atual, anterior e a diferença",47063,3,0.006604,0.997763
3,"atual, anterior e a diferença",47063,4,0.006676,0.997674
4,"atual, anterior e a diferença",47063,5,0.006966,0.99758


In [9]:
# Hold-out evaluation (run_evaluation) with "past_category" removed from every feature list.
# run_evaluation assigns no fold number, so the "fold" column is dropped before display.
evaluatio_wo_past_class = (
    run_all_regressions_without_past_class(run_evaluation)
    .drop(columns=["fold"])
)
evaluatio_wo_past_class

número de instâncias total: 54834


Unnamed: 0,features,#,MSE,$R^2$
0,atual e anterior,54834,1.006452,0.647267
0,"atual, anterior e a diferença",54834,1.006452,0.647267
0,diferença,54834,2.840107,0.004621


In [10]:
# Hold-out evaluation with "past_category" removed and X/y taken from
# get_X_y_without_duplicates. No fold number is assigned, so "fold" is dropped.
evaluatio_wo_class_wo_duplicates = (
    run_all_regressions_without_past_class(run_evaluation_without_duplicates)
    .drop(columns=["fold"])
)
evaluatio_wo_class_wo_duplicates

número de instâncias total: 54834


Unnamed: 0,features,#,MSE,$R^2$
0,atual e anterior,47063,1.022249,0.610559
0,"atual, anterior e a diferença",47063,1.022247,0.61056
0,diferença,38192,2.31102,0.054841


In [11]:
# K-fold evaluation (default splitter: kfold_split_5) with "past_category" removed;
# the "fold" column is kept because each row is one fold.
# NOTE(review): the saved output below lists only two of the three feature sets
# (no "diferença" rows) -- it may be truncated or stale; re-run from a fresh kernel to confirm.
evaluatio_wo_past_class_wo_duplicates_cvk5 = run_all_regressions_without_past_class(run_evaluation_with_Kfolds_without_duplicates)
evaluatio_wo_past_class_wo_duplicates_cvk5

número de instâncias total: 54834


Unnamed: 0,features,#,fold,MSE,$R^2$
0,atual e anterior,47063,1,1.27932,0.254221
1,atual e anterior,47063,2,1.228306,0.424728
2,atual e anterior,47063,3,1.008683,0.658277
3,atual e anterior,47063,4,1.03387,0.639731
4,atual e anterior,47063,5,0.991782,0.655439
0,"atual, anterior e a diferença",47063,1,1.27932,0.254221
1,"atual, anterior e a diferença",47063,2,1.229792,0.424033
2,"atual, anterior e a diferença",47063,3,1.008683,0.658277
3,"atual, anterior e a diferença",47063,4,1.034242,0.639602
4,"atual, anterior e a diferença",47063,5,0.991634,0.655491


In [12]:
# Side-by-side hold-out results per feature set.
# Suffix _x = "past_category" removed (left frame), _y = feature lists as loaded (right frame).
evaluatio_wo_past_class.merge(evaluatio_past_class, on='features')

Unnamed: 0,features,#_x,MSE_x,$R^2$_x,#_y,MSE_y,$R^2$_y
0,atual e anterior,54834,1.006452,0.647267,54834,0.074269,0.973971
1,"atual, anterior e a diferença",54834,1.006452,0.647267,54834,0.074264,0.973973
2,diferença,54834,2.840107,0.004621,54834,0.076483,0.973195


In [13]:
# Side-by-side de-duplicated hold-out results per feature set.
# Suffix _x = "past_category" removed (left frame), _y = feature lists as loaded (right frame).
evaluatio_wo_class_wo_duplicates.merge(evaluatio_past_class_wo_duplicates, on='features')

Unnamed: 0,features,#_x,MSE_x,$R^2$_x,#_y,MSE_y,$R^2$_y
0,atual e anterior,47063,1.022249,0.610559,47063,0.072858,0.972244
1,"atual, anterior e a diferença",47063,1.022247,0.61056,47063,0.072858,0.972244
2,diferença,38192,2.31102,0.054841,38234,0.108685,0.955649


In [14]:
# Fold-by-fold comparison. Both keys are needed: joining on "features" alone would
# pair every fold of a feature set with every other fold of that same set.
# Suffix _x = feature lists as loaded (left frame), _y = "past_category" removed
# (right frame) -- the opposite orientation of the two hold-out comparisons above.
evaluatio_past_class_wo_duplicates_cvk5.merge(
    evaluatio_wo_past_class_wo_duplicates_cvk5,
    on=['features', 'fold'],
)

Unnamed: 0,features,#_x,fold,MSE_x,$R^2$_x,#_y,MSE_y,$R^2$_y
0,atual e anterior,47063,1,0.225783,0.86838,47063,1.27932,0.254221
1,atual e anterior,47063,2,0.165626,0.92243,47063,1.228306,0.424728
2,atual e anterior,47063,3,0.006604,0.997763,47063,1.008683,0.658277
3,atual e anterior,47063,4,0.006635,0.997688,47063,1.03387,0.639731
4,atual e anterior,47063,5,0.006966,0.99758,47063,0.991782,0.655439
5,"atual, anterior e a diferença",47063,1,0.225783,0.86838,47063,1.27932,0.254221
6,"atual, anterior e a diferença",47063,2,0.165626,0.92243,47063,1.229792,0.424033
7,"atual, anterior e a diferença",47063,3,0.006604,0.997763,47063,1.008683,0.658277
8,"atual, anterior e a diferença",47063,4,0.006676,0.997674,47063,1.034242,0.639602
9,"atual, anterior e a diferença",47063,5,0.006966,0.99758,47063,0.991634,0.655491
