In [5]:
import pandas as pd
from utils import read_dataset

# Load the working dataset through the project helper `utils.read_dataset`.
# Later cells pass `dataset` to the evaluation runners and call `len()` on it.
dataset = read_dataset()

In [6]:
from utils import read_dataset_metadata

# Unpack the five values returned by `read_dataset_metadata()`: the target
# column plus four alternative feature-column selections used by the
# regression runs further down.
(
    target_column,
    feature_columns,
    feature_columns_only_actual,
    feature_columns_with_delta,
    feature_columns_only_delta,
) = read_dataset_metadata()

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from utils import create_method_evaluator
#from sklearn.svm import SVC

# Metric label -> scorer invoked as scorer(y_true, y_pred). The labels are
# reused as column names in the result tables ("$R^2$" renders as LaTeX).
regression_metrics_functions = {
    "MSE": mean_squared_error,
    "$R^2$": r2_score,
}

# Evaluator built by the project helper from a linear-regression estimator
# and the metric table above.
evaluate_regression = create_method_evaluator(
    LinearRegression(),
    regression_metrics_functions,
)

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from utils import get_X_y, get_X_y_without_duplicates

def train_test_split_03(X, y):
    """Return a one-element list holding a single 70/30 hold-out split.

    The element is exactly what ``train_test_split(X, y, test_size=0.3,
    random_state=0)`` returns, so the result can be iterated like the output
    of a multi-fold splitter.
    """
    holdout = train_test_split(X, y, test_size=0.3, random_state=0)
    return [holdout]

def kfold_split_5(X, y, n_splits=10, shuffle=False, random_state=None):
    """Split ``X`` and ``y`` into K-fold train/test partitions.

    NOTE(review): despite the ``_5`` suffix, the original implementation
    hard-coded 10 folds. The default stays at 10 so existing callers keep
    their behaviour; pass ``n_splits=5`` to get what the name suggests.

    Parameters
    ----------
    X, y
        Objects supporting positional selection through ``.iloc``
        (e.g. a pandas DataFrame and Series) with matching row order.
    n_splits : int, default 10
        Number of folds forwarded to ``KFold``.
    shuffle : bool, default False
        Forwarded to ``KFold``. With the default, folds are built in row order.
    random_state : int or None, default None
        Forwarded to ``KFold``; only meaningful together with ``shuffle=True``.

    Returns
    -------
    list of tuple
        One ``(X_train, X_test, y_train, y_test)`` tuple per fold.
    """
    folds = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    return [
        (X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index])
        for train_index, test_index in folds.split(X)
    ]

def run_evaluation(method_evaluator, dataset, feature_columns, target_column):
    """Evaluate on a single 70/30 hold-out split of the dataset.

    ``get_X_y`` builds the feature/target pair, ``train_test_split`` splits it
    with ``test_size=0.3`` and ``random_state=0``, and the result of
    ``method_evaluator(X_train, y_train, X_test, y_test)`` is returned as is.
    """
    features, target = get_X_y(dataset, feature_columns, target_column)
    train_X, test_X, train_y, test_y = train_test_split(
        features, target, test_size=0.3, random_state=0
    )
    # The evaluator takes both training parts first, then both test parts.
    return method_evaluator(train_X, train_y, test_X, test_y)

def run_evaluation_without_duplicates(method_evaluator, dataset, feature_columns, target_column):
    """Evaluate on a single 70/30 hold-out split built via ``get_X_y_without_duplicates``.

    Same flow as the plain hold-out evaluation, except that the feature/target
    pair comes from ``get_X_y_without_duplicates``. The split uses
    ``test_size=0.3`` and ``random_state=0``; the return value is whatever
    ``method_evaluator(X_train, y_train, X_test, y_test)`` returns.
    """
    features, target = get_X_y_without_duplicates(dataset, feature_columns, target_column)
    train_X, test_X, train_y, test_y = train_test_split(
        features, target, test_size=0.3, random_state=0
    )
    # The evaluator takes both training parts first, then both test parts.
    return method_evaluator(train_X, train_y, test_X, test_y)

def run_evaluation_with_Kfolds_without_duplicates(
    method_evaluator,
    dataset,
    feature_columns,
    target_column,
    get_X_y = get_X_y_without_duplicates,
    splitter = kfold_split_5,
):
    """Evaluate ``method_evaluator`` on every split produced by ``splitter``.

    Parameters
    ----------
    method_evaluator : callable
        Called as ``method_evaluator(X_train, y_train, X_test, y_test)`` and
        expected to return ``(model, score)``, where ``score`` is a list of
        dict-like records (each one gets a ``"fold"`` key written into it).
    dataset, feature_columns, target_column
        Forwarded unchanged to ``get_X_y``.
    get_X_y : callable, default ``get_X_y_without_duplicates``
        Builds the ``(X, y)`` pair from the dataset.
    splitter : callable, default ``kfold_split_5``
        Called as ``splitter(X, y)``; must yield
        ``(X_train, X_test, y_train, y_test)`` tuples.

    Returns
    -------
    (models, scores)
        ``models`` holds one fitted model per split. ``scores`` is the flat
        list of all score records, each tagged with its 1-based fold number.
    """
    X, y = get_X_y(dataset, feature_columns, target_column)
    models = []
    scores = []

    for fold, (X_train, X_test, y_train, y_test) in enumerate(splitter(X, y), start=1):
        model, fold_scores = method_evaluator(X_train, y_train, X_test, y_test)
        models.append(model)
        # Tag every record, not only the first one. This also avoids an
        # IndexError when an evaluator returns no records for a fold.
        for record in fold_scores:
            record["fold"] = fold
        scores.extend(fold_scores)
    return models, scores




In [9]:
# Playground
def run_all_regressions(run_data_process, method_evaluator, dataset, feature_columns_set):
    """Run one evaluation per feature set and stack the score rows into a DataFrame.

    Parameters
    ----------
    run_data_process : callable
        Called as ``run_data_process(method_evaluator, dataset,
        feature_columns, target_column)``; must return ``(models, score)``
        where ``score`` is accepted by ``pd.DataFrame``. The models are not
        used here.
    method_evaluator
        Forwarded unchanged to ``run_data_process``.
    dataset
        Forwarded unchanged; its ``len()`` is printed once up front.
    feature_columns_set : iterable of (str, list)
        ``(label, feature_columns)`` pairs; the label is stored in the
        ``"features"`` column of the corresponding rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``features, fold, #, MSE, $R^2$`` first (always present, even
        when a run does not report them), followed by any extra score columns.
        Row indices of the individual score frames are kept as they are.

    Notes
    -----
    ``target_column`` is read from module scope, not passed as an argument.
    """
    base_columns = ["features", "fold", "#", "MSE", "$R^2$"]
    print(f"número de instâncias total: {len(dataset)}")

    # Collect the per-feature-set frames and concatenate once. Growing a frame
    # with concat inside the loop, starting from an empty object-dtype frame,
    # is quadratic and degrades the numeric dtypes.
    frames = []
    for feature_set_name, feature_columns in feature_columns_set:
        _models, score = run_data_process(method_evaluator, dataset, feature_columns, target_column)
        frame = pd.DataFrame(score)
        frame["features"] = feature_set_name
        frames.append(frame)

    if not frames:
        return pd.DataFrame(columns=base_columns)

    combined = pd.concat(frames, sort=False)
    # Keep the documented columns first and guarantee they exist, because
    # callers drop "fold" unconditionally.
    extra_columns = [column for column in combined.columns if column not in base_columns]
    return combined.reindex(columns=base_columns + extra_columns)

def run_all_regressions_with_past_class(run_data_process):
    """Run ``run_all_regressions`` over the four feature sets, left unmodified.

    Uses the module-level ``evaluate_regression`` evaluator and ``dataset``.
    ``run_data_process`` is forwarded and decides how the data is split.
    """
    # Label -> feature columns; insertion order defines the run order.
    feature_sets = {
        "atual": feature_columns_only_actual,
        "atual e anterior": feature_columns,
        "atual, anterior e a diferença": feature_columns_with_delta,
        "diferença": feature_columns_only_delta,
    }
    params = list(feature_sets.items())
    return run_all_regressions(run_data_process, evaluate_regression, dataset, params)

def list_except(list, except_item):
    """Return a new list with every element equal to ``except_item`` removed.

    The input is not modified and the order of the remaining elements is kept.
    The first parameter shadows the built-in ``list`` inside this function;
    the name is kept because callers may pass it by keyword.
    """
    kept = []
    for element in list:
        if element != except_item:
            kept.append(element)
    return kept

def run_all_regressions_without_past_class(run_data_process):
    """Run ``run_all_regressions`` over the four feature sets minus ``"past_category"``.

    Each feature list is filtered with ``list_except`` before it is used.
    Uses the module-level ``evaluate_regression`` evaluator and ``dataset``.
    """
    past_class = "past_category"
    # Label -> unfiltered feature columns; insertion order defines the run order.
    feature_sets = {
        "atual": feature_columns_only_actual,
        "atual e anterior": feature_columns,
        "atual, anterior e a diferença": feature_columns_with_delta,
        "diferença": feature_columns_only_delta,
    }
    params = [
        (label, list_except(columns, past_class))
        for label, columns in feature_sets.items()
    ]
    return run_all_regressions(run_data_process, evaluate_regression, dataset, params)

In [10]:
# Hold-out evaluation (`run_evaluation`) with the past class kept among the
# features. The "fold" column is dropped before display.
evaluatio_past_class = (
    run_all_regressions_with_past_class(run_evaluation)
    .drop(columns=["fold"])
)
evaluatio_past_class

número de instâncias total: 54834


Unnamed: 0,features,#,MSE,$R^2$
0,atual,54834,0.076955,0.973029
0,atual e anterior,54834,0.074269,0.973971
0,"atual, anterior e a diferença",54834,0.074264,0.973973
0,diferença,54834,0.076483,0.973195


In [11]:
# Hold-out evaluation on data built by `get_X_y_without_duplicates`, with the
# past class kept among the features. The "fold" column is dropped before display.
evaluatio_past_class_wo_duplicates = (
    run_all_regressions_with_past_class(run_evaluation_without_duplicates)
    .drop(columns=["fold"])
)
evaluatio_past_class_wo_duplicates

número de instâncias total: 54834


Unnamed: 0,features,#,MSE,$R^2$
0,atual,38592,0.089481,0.96367
0,atual e anterior,47063,0.072858,0.972244
0,"atual, anterior e a diferença",47063,0.072858,0.972244
0,diferença,38234,0.108685,0.955649


In [12]:
# K-fold evaluation with the past class kept among the features; one row per
# (feature set, fold).
# NOTE(review): the "cvk5" suffix suggests 5 folds, but the default splitter
# is built with n_splits=10.
evaluatio_past_class_wo_duplicates_cvk5 = run_all_regressions_with_past_class(run_evaluation_with_Kfolds_without_duplicates)
evaluatio_past_class_wo_duplicates_cvk5

número de instâncias total: 54834


Unnamed: 0,features,fold,#,MSE,$R^2$
0,atual,1,38592,0.262095,0.853502
1,atual,2,38592,0.256972,0.832344
2,atual,3,38592,0.233123,0.861215
3,atual,4,38592,0.207902,0.897965
4,atual,5,38592,0.006511,0.997849
5,atual,6,38592,0.006949,0.997394
6,atual,7,38592,0.007682,0.997165
7,atual,8,38592,0.007616,0.997312
8,atual,9,38592,0.006675,0.997446
9,atual,10,38592,0.006977,0.997534


In [13]:
# Hold-out evaluation (`run_evaluation`) with "past_category" removed from
# every feature set. The "fold" column is dropped before display.
evaluatio_wo_past_class = (
    run_all_regressions_without_past_class(run_evaluation)
    .drop(columns=["fold"])
)
evaluatio_wo_past_class

número de instâncias total: 54834


Unnamed: 0,features,#,MSE,$R^2$
0,atual,54834,1.013912,0.644652
0,atual e anterior,54834,1.006452,0.647267
0,"atual, anterior e a diferença",54834,1.006452,0.647267
0,diferença,54834,2.840107,0.004621


In [14]:
# Hold-out evaluation on data built by `get_X_y_without_duplicates`, with
# "past_category" removed from every feature set. The "fold" column is
# dropped before display.
evaluatio_wo_class_wo_duplicates = (
    run_all_regressions_without_past_class(run_evaluation_without_duplicates)
    .drop(columns=["fold"])
)
evaluatio_wo_class_wo_duplicates

número de instâncias total: 54834


Unnamed: 0,features,#,MSE,$R^2$
0,atual,38400,1.056137,0.572643
0,atual e anterior,47063,1.022249,0.610559
0,"atual, anterior e a diferença",47063,1.022247,0.61056
0,diferença,38192,2.31102,0.054841


In [15]:
# K-fold evaluation with "past_category" removed from every feature set; one
# row per (feature set, fold).
# NOTE(review): the "cvk5" suffix suggests 5 folds, but the default splitter
# is built with n_splits=10.
evaluatio_wo_past_class_wo_duplicates_cvk5 = run_all_regressions_without_past_class(run_evaluation_with_Kfolds_without_duplicates)
evaluatio_wo_past_class_wo_duplicates_cvk5

número de instâncias total: 54834


Unnamed: 0,features,fold,#,MSE,$R^2$
0,atual,1,38400,1.330518,0.249636
1,atual,2,38400,1.169056,0.226622
2,atual,3,38400,1.114616,0.348039
3,atual,4,38400,1.305406,0.371025
4,atual,5,38400,0.947284,0.681357
5,atual,6,38400,1.021592,0.622827
6,atual,7,38400,1.052634,0.602399
7,atual,8,38400,1.065607,0.630432
8,atual,9,38400,0.970029,0.625177
9,atual,10,38400,1.016479,0.642234


In [16]:
# Side-by-side hold-out results per feature set. `_x` columns come from the
# left frame (without past class), `_y` columns from the right frame (with it).
evaluatio_wo_past_class.merge(evaluatio_past_class, on='features')

Unnamed: 0,features,#_x,MSE_x,$R^2$_x,#_y,MSE_y,$R^2$_y
0,atual,54834,1.013912,0.644652,54834,0.076955,0.973029
1,atual e anterior,54834,1.006452,0.647267,54834,0.074269,0.973971
2,"atual, anterior e a diferença",54834,1.006452,0.647267,54834,0.074264,0.973973
3,diferença,54834,2.840107,0.004621,54834,0.076483,0.973195


In [17]:
# Side-by-side hold-out results on de-duplicated data. `_x` columns come from
# the left frame (without past class), `_y` columns from the right frame (with it).
evaluatio_wo_class_wo_duplicates.merge(evaluatio_past_class_wo_duplicates, on='features')

Unnamed: 0,features,#_x,MSE_x,$R^2$_x,#_y,MSE_y,$R^2$_y
0,atual,38400,1.056137,0.572643,38592,0.089481,0.96367
1,atual e anterior,47063,1.022249,0.610559,47063,0.072858,0.972244
2,"atual, anterior e a diferença",47063,1.022247,0.61056,47063,0.072858,0.972244
3,diferença,38192,2.31102,0.054841,38234,0.108685,0.955649


In [18]:
# Pair every (features, fold) result of the run without the past class
# (`_x` columns, left frame) with the run that includes it (`_y` columns).
cvk5 = (
    evaluatio_wo_past_class_wo_duplicates_cvk5
    .merge(evaluatio_past_class_wo_duplicates_cvk5, on=['features', 'fold'])
    # Cast the metric columns explicitly so the aggregation below does not
    # depend on whatever dtypes the upstream concatenation produced.
    .astype({
        '#_x': 'int', 'MSE_x': 'float', '$R^2$_x': 'float',
        '#_y': 'int', 'MSE_y': 'float', '$R^2$_y': 'float',
    })
)
# Average over folds per feature set. The columns are listed explicitly
# because pandas >= 2.0 no longer drops non-numeric columns silently in
# groupby().mean().
cvk5.groupby('features')[['#_x', 'MSE_x', '$R^2$_x', '#_y', 'MSE_y', '$R^2$_y']].mean()

Unnamed: 0_level_0,#_x,MSE_x,$R^2$_x,#_y,MSE_y,$R^2$_y
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
atual,38400,1.099322,0.499975,38592,0.10025,0.942973
atual e anterior,47063,1.093289,0.529472,47063,0.080204,0.956176
"atual, anterior e a diferença",47063,1.093828,0.529202,47063,0.080274,0.95615
diferença,38192,2.320636,-0.008469,38234,0.100551,0.941857


In [19]:
def r_list(list, name="x1"):
    """Format a Python list as an R vector assignment, e.g. ``x1 = c(1, 2)``.

    Parameters
    ----------
    list
        Values to format; they are rendered with ``str()`` and every ``[`` and
        ``]`` character is stripped from that rendering.
    name : str, default "x1"
        R variable name on the left-hand side. Previously hard-coded, which
        made every printed vector overwrite the previous one when pasted
        into R.

    Returns
    -------
    str
        The assignment as a single line of R code.
    """
    # Strip brackets from the values only, so `name` is emitted verbatim.
    values = str(list).replace('[', '').replace(']', '')
    return f"{name} = c({values})"

def print_r(list, name="x1"):
    """Print ``list`` as an R vector assignment to the variable ``name``."""
    print(r_list(list, name))
    

# Per-fold MSE across all feature sets, formatted for pasting into R:
# `MSE_x` is the run without the past class, `MSE_y` the run with it.
a = list(cvk5['MSE_x'])
b = list(cvk5['MSE_y'])
# NOTE: both vectors are printed under the same `x1` label, so rename one of
# them after pasting into R.
print_r(a)
print_r(b)

x1 = c(1.3305177037194071, 1.1690564119163591, 1.114616061883867, 1.3054061845311862, 0.9472838034867953, 1.021591508626753, 1.052634202809754, 1.0656068150331433, 0.970029032412126, 1.016478539443852, 1.2721931323879212, 1.181796817844923, 1.1343726000778847, 1.283490428849286, 0.9836875042367854, 1.0369718294757249, 1.0317947983501004, 1.0203596366283363, 0.9029444753732814, 1.0852748905057643, 1.275060775118802, 1.1812739664932812, 1.1353348428662642, 1.285604466407252, 0.9836875042373532, 1.0369718294769803, 1.0315639867093367, 1.0205304832532582, 0.9029742596582351, 1.0852748905057918, 1.9621299922576183, 1.684905884553386, 1.8939985531947923, 2.0060865224561675, 2.706464057610556, 2.516794892353221, 2.5332272007641916, 2.7451189933708466, 2.533000163428874, 2.6246321632097636)
x1 = c(0.26209483264878863, 0.25697237909758686, 0.23312320227836741, 0.20790225270896454, 0.006510796407432374, 0.006949155294636323, 0.007682180565258078, 0.0076159122525771596, 0.006675249835870024, 0.00

In [20]:
# Restrict the merged K-fold table to two feature sets and, for each fold,
# add the MSE of the run without the past class (`MSE_x`) to the MSE of the
# run with it (`MSE_y`).
cvk5_1 = cvk5.loc[cvk5['features'] == 'atual e anterior']
cvk5_2 = cvk5.loc[cvk5['features'] == 'atual, anterior e a diferença']
a = (cvk5_1['MSE_x'] + cvk5_1['MSE_y']).tolist()
b = (cvk5_2['MSE_x'] + cvk5_2['MSE_y']).tolist()
# Both vectors are printed as R assignments.
print_r(a)
print_r(b)


x1 = c(1.4892150608435115, 1.4032810600437116, 1.3328044490759166, 1.4137455533398164, 0.9892130608874495, 1.0428418322315391, 1.037538634953457, 1.0258276468027923, 0.9090401594890005, 1.0914143066955213)
x1 = c(1.4920827035742126, 1.402763095483509, 1.333766691864214, 1.4162786151586206, 0.9892114536149152, 1.0431201676280868, 1.037307823312734, 1.0260015389573531, 0.9090699437739135, 1.0914143066955635)


In [21]:
from scipy import stats

# Paired t-test on two hard-coded samples of 15 values each.
# NOTE(review): these literals are not read from `cvk5`; they look like three
# groups of five values pasted from an earlier run. Confirm they are still
# the numbers that should be tested.
x1 = [
    1.279320, 1.228306, 1.008683, 1.033870, 0.991782,
    1.279320, 1.229792, 1.008683, 1.034242, 0.991634,
    1.875927, 1.991474, 2.625568, 2.683716, 2.586487,
]
x2 = [
    0.225783, 0.165626, 0.006604, 0.006635, 0.006966,
    0.225783, 0.165626, 0.006604, 0.006676, 0.006966,
    0.253829, 0.237799, 0.005805, 0.006829, 0.007014,
]

stats.ttest_rel(x1, x2)

Ttest_relResult(statistic=8.434834382206255, pvalue=7.356307816466631e-07)

In [22]:
# Display the merged per-fold table (`_x` = without past class, `_y` = with it).
cvk5

Unnamed: 0,features,fold,#_x,MSE_x,$R^2$_x,#_y,MSE_y,$R^2$_y
0,atual,1,38400,1.330518,0.249636,38592,0.262095,0.853502
1,atual,2,38400,1.169056,0.226622,38592,0.256972,0.832344
2,atual,3,38400,1.114616,0.348039,38592,0.233123,0.861215
3,atual,4,38400,1.305406,0.371025,38592,0.207902,0.897965
4,atual,5,38400,0.947284,0.681357,38592,0.006511,0.997849
5,atual,6,38400,1.021592,0.622827,38592,0.006949,0.997394
6,atual,7,38400,1.052634,0.602399,38592,0.007682,0.997165
7,atual,8,38400,1.065607,0.630432,38592,0.007616,0.997312
8,atual,9,38400,0.970029,0.625177,38592,0.006675,0.997446
9,atual,10,38400,1.016479,0.642234,38592,0.006977,0.997534
