In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import os
import sys

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Put ../src and ../examples at the front of the module search path so the
# project-local imports below can be resolved. The paths are relative, so they
# depend on the kernel's working directory.
# NOTE(review): presumably `augmentation`, `data_generators` and `tools` live in
# one of these two directories — confirm.
sys.path.insert(0, os.path.join("..", "src"))
sys.path.insert(0, os.path.join("..", "examples"))

from augmentation import TransformData
from data_generators import get_data_from_file
from tools import experiment, run_model_for_raw_and_augmented_data, smape

import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, including pandas/sklearn warnings that may point at real problems.
warnings.filterwarnings("ignore")

In [None]:
# Grid search over the N and K arguments of `experiment` with tabgan disabled.
# For every data file, each (N, K) pair is scored and then ranked by the
# augmented-data error; the per-file tables are stacked into `res`.
model = RandomForestRegressor(n_estimators=200, random_state=42)

nums = [
    "678",
    "679",
    "680",
    "710",
    "801",
    "802",
]  # "818", "819", "905": these 3 wells are left out for testing

experiments = []

for num in nums:
    filename = os.path.join("..", "examples", "data", f"df_{num}.csv")
    exp = get_data_from_file(filename)
    experiments.append(exp)

# One row per data file; every loaded item has to supply exactly two fields.
experiments = pd.DataFrame(experiments, columns=["df", "train_test_split"])

N_possible_values = range(6, 30, 2)
K_possible_values = range(6, 22, 2)

res = []
for i, row in experiments.iterrows():
    df, train_test_split = row["df"], row["train_test_split"]
    print(f"processing experiment {i}...")
    pivot_result_table = []
    for n in N_possible_values:
        for k in K_possible_values:
            result_raw_data, result_augmented_data = experiment(
                model=model,
                df=df,
                train_test_split=train_test_split,
                N=n,
                K=k,
                tabgan=False,
            )

            pivot_result_table.append([i, n, k, result_raw_data, result_augmented_data])

    pivot_result_table = pd.DataFrame(
        data=pivot_result_table,
        columns=["experiment", "N", "K", "raw_data_mape", "augmented_data_mape"],
    )

    # Divide each error column by its own mean over the grid, so values are
    # relative to this file's average (presumably to make files comparable).
    pivot_result_table[["raw_data_mape", "augmented_data_mape"]] /= pivot_result_table[
        ["raw_data_mape", "augmented_data_mape"]
    ].mean()

    # Rank the (N, K) pairs within this file: 0 is the lowest augmented-data error.
    pivot_result_table = pivot_result_table.sort_values("augmented_data_mape")
    pivot_result_table["exp rang"] = range(len(pivot_result_table))
    res.append(pivot_result_table)

res = pd.concat(res)
print("done")

processing experiment 0...
processing experiment 1...


In [None]:
# Average every column over the experiments for each (N, K) pair.
# `groupby` moves N and K into the index, so integer copies are first stored
# under lower-case names to keep them available as ordinary columns.
# NOTE: `res` is overwritten here, so this cell cannot be re-run on its own;
# re-run the grid search cell first.
res = (
    res.assign(
        n=res["N"].astype(int),
        k=res["K"].astype(int),
    )
    .groupby(["N", "K"])
    .mean()
)

In [None]:
# Lay the averaged rank out as a grid (rows: n, columns: k) and draw it as an
# annotated heatmap on an explicitly created 12x8 figure.
piv = res.pivot(index="n", columns="k", values="exp rang")
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(piv, annot=True, ax=ax)

In [None]:
# Chosen N and K values, passed to the evaluation run further down.
# NOTE(review): presumably picked by reading the rank heatmap — confirm.
N_best, K_best = 10, 8

In [None]:
# Load the data files for wells 818, 819 and 905 and collect them into a
# two-column table, replacing the previous `experiments`.
data_dir = os.path.join("..", "examples", "data")

experiments = []
for well_id in ("818", "819", "905"):
    loaded = get_data_from_file(os.path.join(data_dir, f"df_{well_id}.csv"))
    # Tag the first element (stored under "df" below) with its well number.
    loaded[0]["num"] = well_id
    experiments.append(loaded)

experiments = pd.DataFrame(experiments, columns=["df", "train_test_split"])

In [None]:
# Display the `experiments` table to inspect what was loaded.
experiments

In [None]:
# Run the model once per experiment with the chosen N_best / K_best and score
# the predictions made from raw and from augmented data with `smape`.
res = []  # one summary row per experiment
rr = []  # per-experiment prediction frames, concatenated below
for i, row in experiments.iterrows():
    print(f"experiment {i} in processing...")
    df, train_test_split = row["df"], row["train_test_split"]

    e = run_model_for_raw_and_augmented_data(
        model, df, train_test_split, N=N_best, K=K_best, tabgan=False
    )

    # Keep only rows whose target is known. The explicit copy detaches the
    # filtered frame, so adding the "exp" column is not an assignment on a slice.
    e = e[~np.isnan(e.y)].copy()
    e["exp"] = i
    rr.append(e)
    result_raw_data = smape(e.y, e.pred_raw)
    result_augmented_data = smape(e.y, e.pred_augm)

    res.append([N_best, K_best, result_raw_data, result_augmented_data])

# NOTE: the columns are named "*_mape", but the values come from `smape`.
res = pd.DataFrame(
    data=res,
    columns=["N", "K", "raw_data_mape", "augmented_data_mape"],
)
rr = pd.concat(rr)
print("done")

In [None]:
# Display the per-experiment score table for the chosen N and K.
res

In [None]:
# One figure per experiment: both prediction series and the target against time.
# Each entry is (column, marker, marker size, legend label).
series_specs = (
    ("pred_raw", "o", 8, "pred"),
    ("pred_augm", "o", 8, "pred_augm"),
    ("y", "v", 12, "y"),
)

for exp_id in rr.exp.unique():
    subset = rr[rr.exp == exp_id]
    fig, ax = plt.subplots(figsize=(15, 6))
    for column, marker, size, label in series_specs:
        ax.plot(subset.time, subset[column], marker, markersize=size, label=label)
    ax.legend()
    plt.show()

## Проведем подбор гиперпараметров для tabgan 

In [None]:
# Grid search over the N and K arguments of `experiment` with tabgan enabled
# and `augm` disabled. For every data file, each (N, K) pair is scored and
# ranked by the augmented-data error; the per-file tables are stacked into `res`.
# The forest is seeded so repeated runs use the same model randomness.
model = RandomForestRegressor(n_estimators=200, random_state=42)

# NOTE(review): this list includes wells 818, 819 and 905, which the earlier
# grid search in this notebook set aside for testing — confirm that tuning on
# them is intended.
nums = ["678", "679", "680", "710", "801", "802", "818", "819", "905"]

experiments = []

for num in nums:
    filename = os.path.join("..", "examples", "data", f"df_{num}.csv")
    exp = get_data_from_file(filename)
    experiments.append(exp)

# One row per data file; every loaded item has to supply exactly two fields.
experiments = pd.DataFrame(experiments, columns=["df", "train_test_split"])

N_possible_values = range(6, 30, 2)
K_possible_values = range(6, 22, 2)

res = []
for i, row in experiments.iterrows():
    df, train_test_split = row["df"], row["train_test_split"]
    print(f"processing experiment {i}...")
    pivot_result_table = []
    for n in N_possible_values:
        for k in K_possible_values:
            result_raw_data, result_augmented_data = experiment(
                model=model,
                df=df,
                train_test_split=train_test_split,
                N=n,
                K=k,
                augm=False,
                tabgan=True,
            )

            pivot_result_table.append([i, n, k, result_raw_data, result_augmented_data])

    pivot_result_table = pd.DataFrame(
        data=pivot_result_table,
        columns=["experiment", "N", "K", "raw_data_mape", "augmented_data_mape"],
    )

    # Divide each error column by its own mean over the grid, so values are
    # relative to this file's average (presumably to make files comparable).
    pivot_result_table[["raw_data_mape", "augmented_data_mape"]] /= pivot_result_table[
        ["raw_data_mape", "augmented_data_mape"]
    ].mean()

    # Rank the (N, K) pairs within this file: 0 is the lowest augmented-data error.
    pivot_result_table = pivot_result_table.sort_values("augmented_data_mape")
    pivot_result_table["exp rang"] = range(len(pivot_result_table))
    res.append(pivot_result_table)

res = pd.concat(res)
print("done")