In [1]:
import pandas as pd

# Load the two CSV exports and stack their rows into a single frame.
# The original row labels of each file are kept, so index values may repeat.
dataset_1, dataset_2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/6-join_multiple_class_without_C-evolution.csv",
        "data/6-join-single-class-without-C-evolution.csv",
    )
)
dataset = pd.concat([dataset_1, dataset_2])

In [2]:
def _read_first_column(path):
    """Return column 0 of a header-less CSV file as a pandas Series."""
    return pd.read_csv(path, header=None)[0]


# The target name is the first entry of its file; each feature file lists one
# column name per line.
target_column = _read_first_column("data/target-column")[0]
feature_columns = list(_read_first_column("data/features-columns"))
feature_columns_with_delta = list(_read_first_column("data/features-columns-with-delta"))
feature_columns_only_delta = list(_read_first_column("data/features-columns-only-delta"))

In [3]:
# Integer code assigned to each category label, from "stub" (0) up to "fa" (5).
number_class = {
    label: code
    for code, label in enumerate(["stub", "start", "b", "a", "ga", "fa"])
}

# Replace the string labels in both category columns with their integer codes.
# A label that is not a key of number_class raises KeyError, which also means
# this cell is not idempotent: re-running it on the already-encoded frame fails.
dataset["actual_category"] = dataset["actual_category"].apply(lambda label: number_class[label])
dataset["past_category"] = dataset["past_category"].apply(lambda label: number_class[label])

In [4]:
def get_X_y(dataset, feature_columns, target_column):
    """Split a frame into its feature matrix and target series.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    feature_columns : list
        Labels of the columns to use as features.
    target_column : label
        Label of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)``: the selected feature columns and the target column.
    """
    return dataset[feature_columns], dataset[target_column]

def get_X_y_without_duplicates(dataset, feature_columns, target_column):
    """Like ``get_X_y``, but first drops rows that repeat an earlier row.

    Duplicates are detected on the target column plus the feature columns
    only; any other column of ``dataset`` is ignored and discarded.

    Returns
    -------
    tuple
        ``(X, y)`` built from the de-duplicated rows.
    """
    deduplicated = dataset[[target_column] + feature_columns].drop_duplicates()
    return get_X_y(deduplicated, feature_columns, target_column)

In [5]:
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
#from sklearn.svm import SVC

def evaluate_regression(X_train, y_train, X_test, y_test):
    """Fit a ``LinearRegression`` on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target used only for scoring.

    Returns
    -------
    tuple
        ``([model], score)``. ``model`` is the fitted estimator, wrapped in a
        one-element list. ``score`` is a one-row DataFrame with the columns
        "número de instâncias" (training size + test size), "MSE" and "$R^2$".
    """
    model = LinearRegression().fit(X_train, y_train)
    y_predicted = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order. MSE happens to
    # be symmetric, but keeping the documented order matches the r2_score call
    # and stays correct if the metric is ever swapped for an asymmetric one.
    mse = mean_squared_error(y_test, y_predicted)
    r2 = r2_score(y_test, y_predicted)
    score = pd.DataFrame(
        [[len(y_train) + len(y_test), mse, r2]],
        columns=["número de instâncias", "MSE", "$R^2$"],
    )
    return [model], score

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
   
def run_regression(dataset, feature_columns, target_column):
    """Evaluate a regression on a single 70/30 train/test split of all rows.

    Returns whatever ``evaluate_regression`` returns for that split.
    """
    features, target = get_X_y(dataset, feature_columns, target_column)
    # A fixed random_state makes the split identical on every run.
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.3, random_state=0
    )
    return evaluate_regression(train_features, train_target, test_features, test_target)

def run_regression_without_duplicates(dataset, feature_columns, target_column):
    """Evaluate a regression on a 70/30 split of the de-duplicated rows.

    Rows are de-duplicated on the target plus the selected features before
    splitting. Returns whatever ``evaluate_regression`` returns for the split.
    """
    features, target = get_X_y_without_duplicates(dataset, feature_columns, target_column)
    # A fixed random_state makes the split identical on every run.
    train_features, test_features, train_target, test_target = train_test_split(
        features, target, test_size=0.3, random_state=0
    )
    return evaluate_regression(train_features, train_target, test_features, test_target)

def run_regression_with_Kfolds_without_duplicates(
    dataset, feature_columns, target_column, n_splits=5, shuffle=False, random_state=None
):
    """Cross-validate a regression on the de-duplicated rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the feature columns and the target column.
    feature_columns : list
        Labels of the columns to use as features.
    target_column : label
        Label of the column to predict.
    n_splits : int, default 5
        Number of folds, forwarded to ``KFold``.
    shuffle : bool, default False
        Forwarded to ``KFold``. The default keeps the original behaviour, in
        which every fold is a contiguous slice of the rows in their current
        order.
        NOTE(review): if ``dataset`` was assembled by stacking sources with
        different distributions, unshuffled folds may not be comparable to
        each other; consider ``shuffle=True`` with a fixed ``random_state``.
    random_state : int or None, default None
        Forwarded to ``KFold``; only meaningful together with ``shuffle=True``.

    Returns
    -------
    tuple
        ``(models, scores)``. ``models`` has one entry per fold, each being the
        first element returned by ``evaluate_regression`` (a one-element list
        holding the fitted model). ``scores`` is a DataFrame with one row per
        fold and the columns "fold" (1-based), "número de instâncias", "MSE"
        and "$R^2$".
    """
    score_columns = ["fold", "número de instâncias", "MSE", "$R^2$"]
    X, y = get_X_y_without_duplicates(dataset, feature_columns, target_column)

    models = []
    fold_scores = []
    splitter = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    for fold, (train_index, test_index) in enumerate(splitter.split(X), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model, score = evaluate_regression(X_train, y_train, X_test, y_test)

        models.append(model)
        fold_scores.append(score.assign(fold=fold))

    # Concatenate once instead of growing a frame inside the loop: this avoids
    # re-copying the accumulated rows on every fold and avoids concatenating
    # onto an empty frame.
    scores = pd.concat(fold_scores, sort=False)
    # Keep "fold" as the leading column; any unexpected extra column is kept
    # after the known ones.
    extra_columns = [name for name in scores.columns if name not in score_columns]
    return models, scores.reindex(columns=score_columns + extra_columns)


In [7]:
# Playground
def run_all_regressions(run_regression, dataset, feature_columns_set):
    """Run one regression experiment per feature set and stack the score tables.

    Parameters
    ----------
    run_regression : callable
        Called as ``run_regression(dataset, feature_columns, target_column)``;
        must return ``(models, scores)`` where ``scores`` is a DataFrame.
    dataset : pandas.DataFrame
        Data passed unchanged to ``run_regression``.
    feature_columns_set : iterable of (str, list)
        ``(label, feature_columns)`` pairs; ``label`` is written to the
        "features" column of the corresponding score rows.

    Returns
    -------
    pandas.DataFrame
        All score rows, with the columns "fold", "features",
        "número de instâncias", "MSE" and "$R^2$" first. "fold" is NaN for
        runners that do not report it. An empty frame with those columns is
        returned when ``feature_columns_set`` is empty.

    Notes
    -----
    The target is read from the module-level ``target_column`` variable, which
    must be defined before this function is called. The total number of rows
    in ``dataset`` is printed as a side effect.
    """
    result_columns = ["fold", "features", "número de instâncias", "MSE", "$R^2$"]
    print(f"número de instâncias total: {len(dataset)}")

    rows = []
    for feature_set_name, feature_columns in feature_columns_set:
        # The fitted models are not needed here; only the score table is kept.
        _models, row = run_regression(dataset, feature_columns, target_column)
        rows.append(row.assign(features=feature_set_name))

    if not rows:
        # pd.concat raises on an empty list, so build the empty table directly.
        return pd.DataFrame(columns=result_columns)

    # Concatenate once instead of growing a frame inside the loop.
    result = pd.concat(rows, sort=False)
    # Known columns first, in a fixed order; unexpected extras are kept after.
    extra_columns = [name for name in result.columns if name not in result_columns]
    return result.reindex(columns=result_columns + extra_columns)

def run_all_regressions_with_past_class(run_regression):
    """Run ``run_regression`` on the three module-level feature sets, unchanged.

    Uses the module-level ``dataset``, ``feature_columns``,
    ``feature_columns_with_delta`` and ``feature_columns_only_delta``.
    Returns the table produced by ``run_all_regressions``.
    """
    labels = (
        "artigo atual e anterior",
        "artigo atual, anterior e a diferença",
        "diferença",
    )
    column_sets = (
        feature_columns,
        feature_columns_with_delta,
        feature_columns_only_delta,
    )
    # Pair each display label with its list of feature columns.
    return run_all_regressions(run_regression, dataset, list(zip(labels, column_sets)))

def list_except(list, except_itens):
    """Return the items of ``list`` that are not in ``except_itens``.

    Parameters
    ----------
    list : iterable
        Items to filter. Order and repeated items are preserved. (The name
        shadows the built-in inside this function; it is kept so existing
        keyword callers keep working.)
    except_itens : iterable or str
        Items to leave out. A single string is treated as one item rather
        than as a sequence of characters.

    Returns
    -------
    list
        A new list; the input is not modified.
    """
    if isinstance(except_itens, str):
        # Without this, `item not in "abc"` would do a substring test.
        excluded = (except_itens,)
    else:
        excluded = tuple(except_itens)
    return [item for item in list if item not in excluded]

def run_all_regressions_without_past_class(run_regression):
    """Run ``run_regression`` on the three feature sets with "past_category" removed.

    Uses the module-level ``dataset``, ``feature_columns``,
    ``feature_columns_with_delta`` and ``feature_columns_only_delta``.
    Returns the table produced by ``run_all_regressions``.
    """
    past_class = "past_category"

    def without_past_class(columns):
        # Compare by value: `is not` tests object identity, which is not a
        # reliable way to compare strings and would typically filter nothing.
        return [column for column in columns if column != past_class]

    # The exclusion is applied to every feature set so that none of them can
    # still carry the past class; it is a no-op for a set that lacks it.
    params = [
        ("artigo atual e anterior", without_past_class(feature_columns)),
        ("artigo atual, anterior e a diferença", without_past_class(feature_columns_with_delta)),
        ("diferença", without_past_class(feature_columns_only_delta)),
    ]
    return run_all_regressions(run_regression, dataset, params)

In [15]:
# Scratch check: keep only the items that are not in an exclusion list.
oi = list("abc")
print(oi)
ai = [item for item in oi if item not in ("a", "b")]
print(ai)

['a', 'b', 'c']
['c']


In [8]:
# Single 70/30 train/test split on all rows (duplicates kept), for each feature set.
run_all_regressions_with_past_class(run_regression)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,,artigo atual e anterior,54834,0.074269,0.973971
0,,"artigo atual, anterior e a diferença",54834,0.074264,0.973973
0,,diferença,54834,0.076483,0.973195


In [9]:
# Single 70/30 train/test split after dropping rows duplicated on target + features.
run_all_regressions_with_past_class(run_regression_without_duplicates)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,,artigo atual e anterior,47063,0.072858,0.972244
0,,"artigo atual, anterior e a diferença",47063,0.072858,0.972244
0,,diferença,38234,0.108685,0.955649


In [10]:
# 5-fold cross-validation on the de-duplicated rows; one score row per fold and feature set.
run_all_regressions_with_past_class(run_regression_with_Kfolds_without_duplicates)

número de instâncias total: 54834


Unnamed: 0,fold,features,número de instâncias,MSE,$R^2$
0,1,artigo atual e anterior,47063,0.225783,0.86838
0,2,artigo atual e anterior,47063,0.165626,0.92243
0,3,artigo atual e anterior,47063,0.006604,0.997763
0,4,artigo atual e anterior,47063,0.006635,0.997688
0,5,artigo atual e anterior,47063,0.006966,0.99758
0,1,"artigo atual, anterior e a diferença",47063,0.225783,0.86838
0,2,"artigo atual, anterior e a diferença",47063,0.165626,0.92243
0,3,"artigo atual, anterior e a diferença",47063,0.006604,0.997763
0,4,"artigo atual, anterior e a diferença",47063,0.006676,0.997674
0,5,"artigo atual, anterior e a diferença",47063,0.006966,0.99758
