In [63]:
import pandas as pd
from utils import read_dataset, read_dataset_metadata

# Load the dataset once through the project helper in utils; the
# run_all_regressions_* helpers defined later read this module-level name.
dataset = read_dataset()

In [64]:
from utils import read_dataset_metadata

# Unpack the dataset metadata: the regression target plus three alternative
# feature-column lists. Judging by the names these are (base features,
# base + delta features, delta-only features) -- TODO confirm against utils.
target_column, feature_columns, feature_columns_with_delta, feature_columns_only_delta = read_dataset_metadata()

In [65]:
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
#from sklearn.svm import SVC

def _mse_metric(y_true, y_pred):
    """Return the mean squared error of ``y_pred`` against ``y_true``."""
    return mean_squared_error(y_true, y_pred)


def _r2_metric(y_true, y_pred):
    """Return the R^2 score of ``y_pred`` against ``y_true``."""
    return r2_score(y_true, y_pred)


# Metric label -> callable(y_true, y_pred). The labels are used verbatim as
# keys of the score dictionaries produced by evaluate_functions.
regression_metrics_functions = {
    "MSE": _mse_metric,
    "$R^2$": _r2_metric,
}

def evaluate_functions(functions_dict, y_true, y_pred):
    """Apply every metric in ``functions_dict`` to one pair of targets.

    Args:
        functions_dict: Mapping of label -> callable taking ``(y_true, y_pred)``.
        y_true: Ground-truth values, passed through to each callable.
        y_pred: Predicted values, passed through to each callable.

    Returns:
        A new dict with the same labels, each mapped to its callable's result.
    """
    results = {}
    for label, metric in functions_dict.items():
        results[label] = metric(y_true, y_pred)
    return results

def evaluate_regression(X_train, y_train, X_test, y_test):
    """Fit a linear regression on the training split and score it on the test split.

    Args:
        X_train, y_train: Features and targets used to fit the model.
        X_test, y_test: Features and targets used to compute the metrics.

    Returns:
        A pair ``([model], [score])`` of one-element lists. ``score`` holds one
        entry per metric in ``regression_metrics_functions`` plus
        ``"número de instâncias"``, the combined size of both splits.
    """
    fitted = LinearRegression().fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    metrics = evaluate_functions(regression_metrics_functions, y_test, predictions)
    instance_count = len(X_train) + len(X_test)
    summary = {**metrics, "número de instâncias": instance_count}
    # Both results are wrapped in lists so they can be merged with the
    # per-fold results of the cross-validation runner.
    return [fitted], [summary]

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from utils import get_X_y, get_X_y_without_duplicates
   
def run_regression(dataset, feature_columns, target_column, test_size=0.3, random_state=0):
    """Evaluate a linear regression on a single hold-out split.

    Args:
        dataset: Data handed to ``get_X_y`` (structure defined by the utils helper).
        feature_columns: Names of the columns used as features.
        target_column: Name of the column to predict.
        test_size: Share of the data held out for testing; forwarded to
            ``train_test_split``. Defaults to the previously hard-coded 0.3.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to the previously hard-coded 0.

    Returns:
        Whatever ``evaluate_regression`` returns for the resulting split.
    """
    X, y = get_X_y(dataset, feature_columns, target_column)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return evaluate_regression(X_train, y_train, X_test, y_test)

def run_regression_without_duplicates(dataset, feature_columns, target_column, test_size=0.3, random_state=0):
    """Evaluate a linear regression on a single hold-out split of de-duplicated data.

    Same as a plain hold-out run, except that X and y come from
    ``get_X_y_without_duplicates`` (de-duplication rules live in utils).

    Args:
        dataset: Data handed to ``get_X_y_without_duplicates``.
        feature_columns: Names of the columns used as features.
        target_column: Name of the column to predict.
        test_size: Share of the data held out for testing; forwarded to
            ``train_test_split``. Defaults to the previously hard-coded 0.3.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to the previously hard-coded 0.

    Returns:
        Whatever ``evaluate_regression`` returns for the resulting split.
    """
    X, y = get_X_y_without_duplicates(dataset, feature_columns, target_column)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return evaluate_regression(X_train, y_train, X_test, y_test)

def run_regression_with_Kfolds_without_duplicates(dataset, feature_columns, target_column, n_splits=5):
    """Cross-validate a linear regression on de-duplicated data.

    Args:
        dataset: Data handed to ``get_X_y_without_duplicates``.
        feature_columns: Names of the columns used as features.
        target_column: Name of the column to predict.
        n_splits: Number of folds. Defaults to the previously hard-coded 5.

    Returns:
        A pair ``(models, scores)`` of flat lists with one entry per fold, in
        fold order. Every score dict also carries a 1-based ``"fold"`` key.

    Note:
        ``KFold`` is built with only ``n_splits``, so scikit-learn's defaults
        apply. Per its documentation that means no shuffling, so folds are
        contiguous row ranges and scores can depend on row order.
    """
    X, y = get_X_y_without_duplicates(dataset, feature_columns, target_column)
    models = []
    scores = []

    splitter = KFold(n_splits=n_splits)
    for fold, (train_index, test_index) in enumerate(splitter.split(X), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # evaluate_regression already returns lists, so extend (not append)
        # to keep `models` flat like the single-split runners do.
        fold_models, fold_scores = evaluate_regression(X_train, y_train, X_test, y_test)
        for fold_score in fold_scores:
            fold_score["fold"] = fold

        models.extend(fold_models)
        scores.extend(fold_scores)

    return models, scores


In [67]:
# Playground
def run_all_regressions(run_regression, dataset, feature_columns_set):
    """Run one regression strategy over several feature sets and tabulate the scores.

    Args:
        run_regression: Callable ``(dataset, feature_columns, target_column)``
            returning ``(models, scores)``, where ``scores`` is a list of dicts.
        dataset: Data forwarded unchanged to ``run_regression``; only its
            length is used here, for the printed summary.
        feature_columns_set: Iterable of ``(label, feature_columns)`` pairs.

    Returns:
        A DataFrame with one row per score dict. The standard columns come
        first, followed by any additional keys the score dicts contain.
        ``"features"`` holds the label of the originating feature set.

    Note:
        The target is read from the module-level ``target_column``.
    """
    base_columns = ["fold", "features", "número de instâncias", "MSE", "$R^2$"]
    print(f"número de instâncias total: {len(dataset)}")

    # Collect the per-feature-set frames and concatenate once, rather than
    # growing a DataFrame inside the loop from an empty seed frame.
    frames = []
    for feature_set_name, feature_columns in feature_columns_set:
        _models, scores = run_regression(dataset, feature_columns, target_column)
        frames.append(pd.DataFrame(scores).assign(features=feature_set_name))

    if not frames:
        # pd.concat raises on an empty list; return the empty table instead.
        return pd.DataFrame(columns=base_columns)

    combined = pd.concat(frames, sort=False)
    extra_columns = [column for column in combined.columns if column not in base_columns]
    return combined.reindex(columns=base_columns + extra_columns)

def run_all_regressions_with_past_class(run_regression):
    """Run ``run_regression`` over the three feature sets, used exactly as loaded.

    Args:
        run_regression: Regression strategy forwarded to ``run_all_regressions``.

    Returns:
        The score table produced by ``run_all_regressions``.

    Note:
        Reads the module-level ``dataset`` and the three feature-column lists.
    """
    labels = (
        "artigo atual e anterior",
        "artigo atual, anterior e a diferença",
        "diferença",
    )
    column_sets = (
        feature_columns,
        feature_columns_with_delta,
        feature_columns_only_delta,
    )
    labelled_sets = list(zip(labels, column_sets))
    return run_all_regressions(run_regression, dataset, labelled_sets)

def list_except(list, except_item):
    """Return a new list holding every element of ``list`` that differs from ``except_item``.

    Order is preserved and all occurrences of ``except_item`` are dropped.
    Note that the first parameter shadows the ``list`` builtin inside this
    function.
    """
    kept = []
    for element in list:
        if element != except_item:
            kept.append(element)
    return kept

def run_all_regressions_without_past_class(run_regression):
    """Run ``run_regression`` over the three feature sets with ``"past_category"`` removed.

    Args:
        run_regression: Regression strategy forwarded to ``run_all_regressions``.

    Returns:
        The score table produced by ``run_all_regressions``.

    Note:
        Reads the module-level ``dataset`` and the three feature-column lists;
        the lists themselves are not modified.
    """
    excluded_column = "past_category"
    labelled_sets = [
        ("artigo atual e anterior", feature_columns),
        ("artigo atual, anterior e a diferença", feature_columns_with_delta),
        ("diferença", feature_columns_only_delta),
    ]
    filtered_sets = [
        (label, list_except(columns, excluded_column))
        for label, columns in labelled_sets
    ]
    return run_all_regressions(run_regression, dataset, filtered_sets)

Unnamed: 0,MSE,$R^2$
0,0.074269,0.973971


In [68]:
# Single hold-out split (run_regression), feature lists used as loaded.
run_all_regressions_with_past_class(run_regression)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,,artigo atual e anterior,,0.074269,0.973971
0,,"artigo atual, anterior e a diferença",,0.074264,0.973973
0,,diferença,,0.076483,0.973195


In [69]:
# Same hold-out evaluation, but X/y come from get_X_y_without_duplicates.
run_all_regressions_with_past_class(run_regression_without_duplicates)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,,artigo atual e anterior,,0.072858,0.972244
0,,"artigo atual, anterior e a diferença",,0.072858,0.972244
0,,diferença,,0.108685,0.955649


In [75]:
# K-fold cross-validation on the de-duplicated data, feature lists used as
# loaded; produces one row per fold and feature set.
run_all_regressions_with_past_class(run_regression_with_Kfolds_without_duplicates)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,1,artigo atual e anterior,,0.225783,0.86838
1,2,artigo atual e anterior,,0.165626,0.92243
2,3,artigo atual e anterior,,0.006604,0.997763
3,4,artigo atual e anterior,,0.006635,0.997688
4,5,artigo atual e anterior,,0.006966,0.99758
0,1,"artigo atual, anterior e a diferença",,0.225783,0.86838
1,2,"artigo atual, anterior e a diferença",,0.165626,0.92243
2,3,"artigo atual, anterior e a diferença",,0.006604,0.997763
3,4,"artigo atual, anterior e a diferença",,0.006676,0.997674
4,5,"artigo atual, anterior e a diferença",,0.006966,0.99758


In [71]:
# Single hold-out split with "past_category" removed from every feature list.
run_all_regressions_without_past_class(run_regression)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,,artigo atual e anterior,,1.006452,0.647267
0,,"artigo atual, anterior e a diferença",,1.006452,0.647267
0,,diferença,,2.840107,0.004621


In [72]:
# Hold-out split on de-duplicated data with "past_category" removed from
# every feature list.
run_all_regressions_without_past_class(run_regression_without_duplicates)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,,artigo atual e anterior,,1.022249,0.610559
0,,"artigo atual, anterior e a diferença",,1.022247,0.61056
0,,diferença,,2.31102,0.054841


In [73]:
# K-fold cross-validation on de-duplicated data with "past_category" removed
# from every feature list.
# NOTE(review): the saved output of this cell (execution count 73) appears to
# predate the latest run of the K-fold definition cell (count 74) -- it shows
# raw score dicts in a column named 0. Re-run this cell to refresh it.
run_all_regressions_without_past_class(run_regression_with_Kfolds_without_duplicates)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$,0
0,,artigo atual e anterior,,,,"{'MSE': 1.2793200914328358, '$R^2$': 0.2542213..."
1,,artigo atual e anterior,,,,"{'MSE': 1.2283064583374865, '$R^2$': 0.4247283..."
2,,artigo atual e anterior,,,,"{'MSE': 1.0086825650239006, '$R^2$': 0.6582772..."
3,,artigo atual e anterior,,,,"{'MSE': 1.0338703739399262, '$R^2$': 0.6397310..."
4,,artigo atual e anterior,,,,"{'MSE': 0.9917820273636019, '$R^2$': 0.6554391..."
0,,"artigo atual, anterior e a diferença",,,,"{'MSE': 1.2793200914309095, '$R^2$': 0.2542213..."
1,,"artigo atual, anterior e a diferença",,,,"{'MSE': 1.229791632539883, '$R^2$': 0.42403281..."
2,,"artigo atual, anterior e a diferença",,,,"{'MSE': 1.008682565026632, '$R^2$': 0.65827729..."
3,,"artigo atual, anterior e a diferença",,,,"{'MSE': 1.0342415950694563, '$R^2$': 0.6396016..."
4,,"artigo atual, anterior e a diferença",,,,"{'MSE': 0.9916335460622238, '$R^2$': 0.6554907..."
