In [None]:
# Version history:
# 2022-12-13: moved sections in a logical order
# 2022-12-03: merged changes from v4
# 2022-12-03: created from nb102_baseline_model_v2.ipynb
# 

# ===== Part0 - env preparation =====

## System info

In [None]:
# Print system id
# Shell escapes that log the run environment: GPU status, host name, kernel/OS and free space in /tmp.
# NOTE(review): `nvidia-smi` presumably only exists on machines with the NVIDIA driver installed.
!nvidia-smi
!hostname
!uname -a
!df -kh /tmp

In [None]:
# Log the version of the `python` executable found on the shell PATH.
!python -V  # If version < 3.9 then some f-string features may not work

## Mount drive (if required)

In [None]:
# Detect whether the notebook runs on Google Colab and, if so, mount Google Drive.
# TO_USE_COLAB ends up True only when both the import and the mount succeed.
TO_USE_COLAB = None
try:
    from google.colab import drive
    drive.mount("/content/drive")
    TO_USE_COLAB = True
except Exception as colab_err:
    # Not on Colab (ImportError) or the mount failed: fall back to local paths.
    # `Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit still propagate.
    print(f"Google Drive not mounted ({type(colab_err).__name__}); using local paths")
    TO_USE_COLAB = False
TO_USE_COLAB

## Env vars

In [None]:
# Ensure reproducibility. NEW: this seems to be unnecessary
#import os
#os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # NEW 2022-12-05, see https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility

# ===== Part 1: prepare dataset =====

## Imports 

In [None]:
import datetime
import numpy as np
import os
import pandas as pd
import pytz
from sklearn.model_selection import train_test_split
import time

## Paths and settings

In [None]:
def get_ts():
    """Return the current Europe/Minsk time formatted as 'YYYY-MM-DDTHHMMSS'."""
    minsk_tz = pytz.timezone("Europe/Minsk")
    now_in_minsk = datetime.datetime.now(tz=minsk_tz)
    return now_in_minsk.strftime("%Y-%m-%dT%H%M%S")

# Timestamp of this notebook run; it is embedded in the output folder name below.
START_TS = get_ts()
START_TS

In [None]:
# Colab runs work from the project folder on the mounted Drive; local runs use the current directory.
PATH_MAIN_DIR = (
    "/content/drive/MyDrive/_PR_ROOT/_2022/2022-11_NLP-Huawei_Final_project/stocktwits_finsentiment_analysis/notebooks"
    if TO_USE_COLAB
    else "."
)
assert os.path.isdir(PATH_MAIN_DIR)

In [None]:
# Make PATH_MAIN_DIR the working directory so the relative "../data/..." paths used below resolve,
# then print the resulting directory for the run log.
%cd $PATH_MAIN_DIR
!pwd

In [None]:
# Per-run output folder; START_TS in the name keeps separate runs from overwriting each other.
PATH_OUT_DIR = f"../data/interim/050_output__nb200/_out_dir_{START_TS}"
# makedirs(exist_ok=True) creates missing parent folders and keeps the cell re-runnable;
# os.mkdir raised FileNotFoundError / FileExistsError in those two situations.
os.makedirs(PATH_OUT_DIR, exist_ok=True)

In [None]:
# Files and folders

# Folder holding the input CSV files (relative to the working directory set above).
DIR_DATA_SRC = r'../data/interim/040_output__nb010_v1'
# Files to load; the commented-out lines are alternative dataset selections.
#FNAMES = ['VIX_RmSW=0_RmRep=0_1y_top10.csv', 'VIX_RmSW=0_RmRep=0_1y_top10.csv' ]  # Loads in <1 sec
FNAMES = ['AMZN_RmSW=0_RmRep=0_1y.csv.gz', 'NFLX_RmSW=0_RmRep=0_1y.csv.gz', ]  # Loads in <1 sec
#FNAMES = ['AAPL_RmSW=0_RmRep=0_1y.csv.gz', ]  # Loads in 20-30 sec

# Fail fast if the source folder or any of the listed files is missing.
assert os.path.isdir(DIR_DATA_SRC)
for f in FNAMES:
    assert os.path.isfile(os.path.join(DIR_DATA_SRC, f)), f"File not found: {f}"

In [None]:
# Dataset preparation settings

# Rows whose 'Date' is <= this value are removed by drop_old_dates_inplace().
DROP_RECORDS_BEFORE_DATE_INCLUSIVE = '2019-07-20'  # Last date in datasets is 2020-07-21
# Key selecting the labelling rule; it is interpreted by generate_labels_and_pcr_list().
LABEL_GEN_STRATEGY = "d1_C=d1_O=0.5%=2cls"  # This string is a "key", see generate_labels_and_pcr_list() for explanations
# Columns copied from the raw data into the final dataset by do_feature_selection().
COL_FEATURES = ['symbol', 'message', 'datetime', 'user', 'message_id', 'Date']  #, 'Time']
COL_LABEL = 'label'  # Name of the target column
COL_PCR = 'price_change_ratio'  # Name of the column holding the relative price change in percent

# SPLIT_SHUFFLING_SEED = 42  # If None, then no shuffling is done
TEST_SIZE = 0.15  # Fraction of rows passed to train_test_split as the test split
TRAIN_SIZE = 1.0 - TEST_SIZE

## Defs
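Helper functions used by the cells below. Most of them are "pure"; the exception is `drop_old_dates_inplace`, which modifies the DataFrame it receives.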

In [None]:
def print_df_details(df: pd.DataFrame):
    """Print a quick textual overview of `df`: head, tail, info() and describe(include='all')."""
    previews = (("Head", df.head()), ("Tail", df.tail()))
    for title, preview in previews:
        print(f"\n{title}:\n", preview)
    print('\nInfo:')
    df.info()  # info() writes to stdout itself and returns None
    summary = df.describe(include='all')
    print('\nDescribe:\n', summary)

In [None]:
def load_pandas_file(file_path: str, verbose=True):
    """Read a CSV file into a DataFrame, reporting its shape and the load time.

    Args:
        file_path: path of the CSV file; it must exist (checked with an assert).
        verbose: when true, also dump the frame overview via print_df_details().

    Returns:
        The DataFrame produced by pd.read_csv(file_path).
    """
    assert os.path.isfile(file_path), f"Cannot find file: '{file_path}', cur folder: '{os.getcwd()}'"
    print("Loading data from: ", file_path)

    # Time only the read itself
    t0 = time.time()
    loaded = pd.read_csv(file_path)
    elapsed = time.time() - t0
    print(f"Success. Shape: {loaded.shape}, elapsed seconds: {elapsed:.2f}")

    if verbose:
        print_df_details(loaded)
    return loaded

In [None]:
def merge_dfs(df_list: list, verbose=True) -> pd.DataFrame:
    """Concatenate the frames row-wise with a fresh 0..n-1 index.

    With verbose=True the shapes of the parts and of the result are printed
    on one line, e.g. "(2, 1);(1, 1);-> (3, 1)".
    """
    if verbose:
        print(*(f"{part.shape};" for part in df_list), sep='', end='')
    merged = pd.concat(df_list, ignore_index=True)
    if verbose:
        print("->", merged.shape)
    return merged

In [None]:
# NOTE(review): this is an exact duplicate of the get_ts() defined in the "Paths and settings"
# section; running this cell simply rebinds the same name. Consider deleting one of the two.
def get_ts():
  """Return the current Europe/Minsk time formatted as 'YYYY-MM-DDTHHMMSS'."""
  return datetime.datetime.now(tz=pytz.timezone("Europe/Minsk")).strftime("%Y-%m-%dT%H%M%S")
# START_TS = get_ts()
# START_TS

In [None]:
def drop_old_dates_inplace(df: pd.DataFrame, drop_date_inclusive: str, verbose=True) -> None:
    """Remove, in place, every row whose 'Date' is <= `drop_date_inclusive`.

    The comparison is done directly between the 'Date' column and the given string.
    Rows are dropped by index label.

    Args:
        df: frame to trim; it is modified in place.
        drop_date_inclusive: cut-off date as a string (checked with an assert).
        verbose: when true, dump the trimmed frame via print_df_details().

    Returns:
        None. The function works in place, so the annotation now says so
        (it previously promised a DataFrame that was never returned).
    """
    assert isinstance(drop_date_inclusive, str)
    old_shape = df.shape
    is_old = df['Date'] <= drop_date_inclusive
    df.drop(index=df[is_old].index, inplace=True)
    print(f"Old dates dropped. Shape before: {old_shape}, after: {df.shape}")
    if verbose:
        print_df_details(df)

In [None]:
def get_label(ch, threshold=0.5):
    """Map a relative price change (in percent) to a class label.

    Args:
        ch: price change in percent.
        threshold: half-width of the neutral band in percent (default 0.5).

    Returns:
        1 when ch > threshold, -1 when ch < -threshold, otherwise 0.
        NaN maps to 0 because both comparisons are False.
    """
    if ch > threshold:
        return 1
    if ch < -threshold:
        return -1
    return 0


def generate_labels_and_pcr_list(df: pd.DataFrame, strategy_str: str) -> tuple:
    """Compute labels and price-change ratios for every row of `df`.

    Supported strategy key:
        "d1_C=d1_O=0.5%=2cls": the change is (d1_C / d1_O - 1) * 100 percent and is
        mapped to -1 / 0 / 1 by get_label() with the default 0.5 threshold.

    Args:
        df: frame with strictly positive 'd1_O' and 'd1_C' columns (asserted).
        strategy_str: strategy key, see above.

    Returns:
        A tuple (labels, price_change_ratios) of two lists in row order.

    Raises:
        AssertionError: for an unknown strategy key or non-positive prices.
    """
    # price_change_ratio = pcr
    if strategy_str != "d1_C=d1_O=0.5%=2cls":
        # Explicit raise: `assert False` would be stripped under `python -O`
        # and the function would then fail later with an UnboundLocalError.
        raise AssertionError(f"Unexpected strategy_str: {strategy_str!r}")

    assert (df['d1_O'] > 0.0).all()  # Prices must be > 0
    assert (df['d1_C'] > 0.0).all()  # Prices must be > 0
    rel_change_perc = (df['d1_C'] / df['d1_O'] - 1.0) * 100.0
    # Convert from percentages to labels -1, 0, 1
    res_series = rel_change_perc.apply(get_label)
    return res_series.to_list(), rel_change_perc.to_list()

In [None]:
def do_feature_selection(df: pd.DataFrame):
    """Return an independent copy of `df` restricted to the COL_FEATURES columns.

    The selected column index is printed for the run log.
    """
    selected = df[COL_FEATURES].copy()
    print(f"Selected cols: {selected.columns}")
    return selected

In [None]:
def do_label_transformation(df: pd.DataFrame):
    """Turn the three-class labels (-1, 0, 1) into binary labels (0, 1).

    Rows whose COL_LABEL is 0 (neutral) are removed, then label -1 is remapped to 0.
    The input frame is left untouched; a new frame is returned.
    """
    # Boolean mask instead of dropping by index label, so rows that merely share an
    # index label with a neutral row are not removed by accident.
    binary_df = df.loc[df[COL_LABEL] != 0].copy()
    # Assign the result back: `binary_df[COL_LABEL].replace(..., inplace=True)` acts on a
    # temporary column object and no longer updates the frame under pandas Copy-on-Write.
    binary_df[COL_LABEL] = binary_df[COL_LABEL].replace({-1: 0})
    return binary_df

In [None]:
def calc_real_profit_perc(y_pred, pcr_list) -> float:
    """Placeholder for the "real profit" metric; currently always returns NaN.

    The draft below the early return compounds one long (pred == 1) or short
    (pred == 0) position per prediction. It is disabled because predictions must
    first be aggregated by date and ticker (see the TODO).

    Args:
        y_pred: predicted labels (0 or 1).
        pcr_list: price change ratios in percent, one per prediction.

    Returns:
        NaN until the TODO is resolved.
    """
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    return np.nan  # TODO: This function is not correct, as it's necessary to aggregate predictions by date and ticker

    # ---- Unreachable draft, kept for reference until the TODO above is resolved ----
    profit_ratio = 1.0
    assert len(y_pred) == len(pcr_list), f"{len(y_pred)}, {len(pcr_list)}"
    for i, (pred, pcr) in enumerate(zip(y_pred, pcr_list)):
        price_ratio = (pcr / 100.0 + 1.0)  # Convert from percents [-5% .. 5%] -> [-0.05 .. 0.05] -> [0.95 .. 1.05]
        assert 0.0 < price_ratio < np.inf, f"{i}, {price_ratio}"
        if pred == 1:
            # Long
            profit_ratio *= price_ratio
        elif pred == 0:
            # Short
            profit_ratio /= price_ratio
        else:
            assert False, "Unexpected label"
    return (profit_ratio - 1.0) * 100.0  # Profit in percents (0% - nothing changed)

In [None]:
def calc_hash_for_seq(values, hash_len=6):
    """Return a short fingerprint of a sequence: the last `hash_len` characters of str(hash(tuple(values))).

    Used to compare splits between runs: sequences with equal elements in equal
    order give the same fingerprint.

    Args:
        values: a list, numpy array or pandas Series (checked with an assert).
        hash_len: number of trailing characters of the decimal hash to keep.
    """
    assert isinstance(values, (list, np.ndarray, pd.Series))
    h = hash(tuple(values))
    return str(h)[-hash_len:]

# Small unit tests: the three supported container types must give the same fingerprint
# for the same elements. Each value is still printed, as before, but is now also checked.
_expected_hash = calc_hash_for_seq([1, 2, 3])
for _seq in ([1, 2, 3], np.array([1, 2, 3]), pd.Series([1, 2, 3])):
    _seq_hash = calc_hash_for_seq(_seq)
    print(_seq_hash)
    assert _seq_hash == _expected_hash, f"{type(_seq).__name__}: {_seq_hash} != {_expected_hash}"

In [None]:
def make_label_distribution_equal(df: pd.DataFrame, random_state=42) -> pd.DataFrame:
    """Balance the two classes by randomly dropping rows of the majority class.

    Args:
        df: frame with a 'label' column containing exactly the labels 0 and 1.
        random_state: seed for choosing the rows to drop (default 42, the value
            that used to be hard-coded).

    Returns:
        A new frame in which both labels occur equally often; `df` is not modified.

    Raises:
        AssertionError: if the set of labels present is not exactly {0, 1}.
    """
    counts = df.label.value_counts()
    # The old check `assert len(counts == 2)` measured the length of a boolean
    # Series, so it passed for any non-empty input.
    assert set(counts.index) == {0, 1}, f"Expected exactly labels 0 and 1, got: {sorted(counts.index)}"

    n_zero, n_one = int(counts.loc[0]), int(counts.loc[1])
    bigger_label = 0 if n_zero > n_one else 1
    diff = abs(n_zero - n_one)

    # Choose `diff` surplus rows of the majority class and drop them by index label
    surplus_index = df[df.label == bigger_label].sample(n=diff, replace=False, random_state=random_state).index
    return df.drop(index=surplus_index)

## Do prepare datasets

In [None]:
# Load raw data, dropping old dates
# Each file is loaded quietly, trimmed in place to rows with Date > DROP_RECORDS_BEFORE_DATE_INCLUSIVE
# and collected in df_list; the parts are concatenated in the next cell.
df_list = []
for fname in FNAMES:
    full_name = os.path.join(DIR_DATA_SRC, fname)
    assert os.path.isfile(full_name), full_name
    df_temp = load_pandas_file(full_name, verbose=False)
    drop_old_dates_inplace(df_temp, DROP_RECORDS_BEFORE_DATE_INCLUSIVE, verbose=False)
    df_list.append(df_temp)

In [None]:
# Concat loaded parts to one dataframe
# merge_dfs() resets the index (ignore_index=True) and, being verbose by default, prints the shapes.
df_raw = merge_dfs(df_list)

In [None]:
# Choose columns for final dataset
# do_feature_selection() returns a copy, so df_raw itself stays unchanged.
df_final = do_feature_selection(df_raw)

In [None]:
# Append the target column
# Labels and price change ratios are computed from df_raw and attached as plain lists, i.e. by row
# position; this relies on df_final still having the same rows in the same order as df_raw.
labels, pcr_list = generate_labels_and_pcr_list(df_raw, strategy_str=LABEL_GEN_STRATEGY)
df_final[COL_LABEL] = labels
df_final[COL_PCR] = pcr_list

In [None]:
# Drop rows of the neutral class (label 0) and remap label -1 to 0, leaving binary labels 0/1
df_final = do_label_transformation(df_final)

In [None]:
# print_df_details(df_final)

In [None]:
# Class balance before equalisation
df_final[COL_LABEL].value_counts()

In [None]:
# Making labels distribution equal
# Rows of the majority class are dropped at random; the counts displayed below should then match.
df_final = make_label_distribution_equal(df_final)
df_final[COL_LABEL].value_counts()

In [None]:
# Display the final dataset that goes into the train/test split
df_final

# ===== Part 2: Model execution and scoring =====

## Imports (part 2)

In [None]:
# Optuna is installed on the fly only for Colab runs.
# NOTE(review): the version is not pinned; consider `%pip install optuna==<version>` so the install
# targets the kernel's environment and the run stays reproducible.
if TO_USE_COLAB:
    !pip install optuna

In [None]:
import gc
import gensim.downloader
import matplotlib.pyplot as plt
from optuna import create_study
from pprint import pprint
import random
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_sequence

## Defs (part 2)

In [None]:
# def train_model_and_get_predictions__sklearn_classifier(model_tag: str, 
#     X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.Series, seed: int) -> pd.Series:

#     # Initial checks
#     assert COL_PCR not in X_train.columns  # To avoid data leaks
    
#     # Create the model with specified seed
#     if model_tag == "dummy__most_frequent":
#         model = DummyClassifier(strategy="most_frequent", random_state=seed)
#     elif model_tag == "dummy__uniform":
#         model = DummyClassifier(strategy="uniform", random_state=seed)
#     else:
#         assert False, f"Unexpected model tag: {model_tag}"
    
#     # Train the model    
#     model.fit(X_train, y_train)
    
#     # Get predictions
#     y_pred = model.predict(X_test)
    
#     return y_pred    

In [None]:
# References:
# https://pytorch.org/docs/stable/notes/randomness.html
def init_seeds(seed=123):
    """Seed Python's `random`, NumPy and torch (CPU and, when available, CUDA) with `seed`.

    PYTHONHASHSEED is also written to os.environ. With CUDA present, cuDNN
    benchmarking is switched off and deterministic cuDNN mode is switched on.
    """
    # Python and CPU-related entropy
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # torch.use_deterministic_algorithms(True)   # Raises a CUBLAS error on some cases
    # os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # Does not help for the error above

    if not torch.cuda.is_available():
        return

    # GPU-related entropy
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # all GPUs
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [None]:
# From here: https://pytorch.org/docs/stable/notes/randomness.html
def seed_worker(worker_id):
    """DataLoader `worker_init_fn`: seed `random` and NumPy from torch's initial seed.

    The torch seed is reduced to 32 bits before it is used. `worker_id` is
    accepted because DataLoader passes it, but it is not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(derived_seed)

In [None]:
# Local imports
import os
import sys
sys.path.append(os.path.join(PATH_MAIN_DIR, '../src/ant1'))

from data_preprocessing import read_data, read_test, Tokenizer, TextDataset,\
    Vocab  #,train_test_split
from utils import show_example
from model import prepare_emb_matrix, RecurrentClassifier
from trainer import Trainer

# Use the GPU when one is available, otherwise fall back to the CPU. This matches the rule
# used for trainer_config["device"]; it used to be hard-coded to 'cuda'.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#assert torch.cuda.is_available()  # To be sure

# Fraction of the training split that is kept by the "Decimate data" cell (1.0 = keep everything)
DATA_PORTION_TO_USE = 0.1  # May be useful for fast checks
#DATA_PORTION_TO_USE = 0.2  # May be useful for fast checks
#DATA_PORTION_TO_USE = 1.0  # Full training split
#VAL_FOLD_FRAC = 0.2  # 20% for validation data
NUM_WORKERS = 2  # Passed as num_workers to every DataLoader
MAX_VOCAB_SIZE = 30000  # Was hardcoded

#init_seeds(42)


## Optuna - prepare data

In [None]:
# Fill these vars for running Optuna:
# X_train, y_train, X_test, y_test, seed

seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    df_final[COL_FEATURES + [COL_PCR]], df_final[COL_LABEL],
    # stratify=df_final[COL_LABEL], # Note: stratification leads to the same test set (though shuffled)
    shuffle=True,
    random_state = seed, 
    test_size = TEST_SIZE
)
# Note: equal hash means binary equality, equal sum means the same rows but shuffled
print(f"After split: {seed}, {X_train.shape}; {X_test.shape}; {y_train.shape},{calc_hash_for_seq(y_train)},{sum(y_train)};"
      + f" {y_test.shape},{calc_hash_for_seq(y_test)},{sum(y_test)}")

# Separate price_change_ratio from the data
# pcr_train = X_train[COL_PCR]; X_train.drop(COL_PCR, axis=1, inplace=True)
# pcr_test = X_test[COL_PCR]; X_test.drop(COL_PCR, axis=1, inplace=True)

# Remove the price change ratio so it cannot leak into the model inputs. Non-mutating drops:
# the split results are slices of df_final, and in-place edits on them can trigger
# SettingWithCopyWarning.
X_train = X_train.drop(columns=COL_PCR)
X_test = X_test.drop(columns=COL_PCR)

In [None]:
# Decimate data if required
# Only the training split is subsampled; X_test / y_test keep their full size.
# X and y are sampled separately with the same frac and random_state; the second assert verifies
# that both ended up with the same row index.
# NOTE(review): the cell overwrites X_train / y_train, so re-running it shrinks them again.
if DATA_PORTION_TO_USE < 1.0:
    assert all(X_train.index == y_train.index)
    X_train = X_train.sample(frac=DATA_PORTION_TO_USE, random_state=seed)
    y_train = y_train.sample(frac=DATA_PORTION_TO_USE, random_state=seed)
    assert all(X_train.index == y_train.index)
print(X_train.shape)

In [None]:
# Build the vocabulary from the (possibly decimated) training messages only.
# Tokenizer and Vocab are project classes imported from data_preprocessing.
tok = Tokenizer()
tok_texts = [tok.tokenize(t) for t in X_train.message]
# vocab = Vocab(tok_texts, max_vocab_size=30000)
vocab = Vocab(tok_texts, max_vocab_size=MAX_VOCAB_SIZE)

In [None]:
# Split into train + val folds
# NOTE(review): the validation fold is the held-out test split (X_test / y_test), so the Optuna
# search and the final training run are both scored on the same data; confirm this is intended.
#train_texts, val_texts, train_labels, val_labels = train_test_split(X_train.message.to_list(), y_train.to_list(), test_size=VAL_FOLD_FRAC)
train_texts = X_train.message.to_list()
val_texts = X_test.message.to_list()
train_labels = y_train.to_list()
val_labels = y_test.to_list()

# Both datasets share the vocabulary built from the training messages.
train_dataset = TextDataset([tok.tokenize(t) for t in train_texts], train_labels, vocab)
val_dataset = TextDataset([tok.tokenize(t) for t in val_texts], val_labels, vocab)

In [None]:
# will download embeddings or load them from disk
# Loads the "glove-wiki-gigaword-100" vectors through gensim's downloader; %time reports the duration.
%time gensim_model = gensim.downloader.load("glove-wiki-gigaword-100", )

In [None]:
# Build the embedding matrix for `vocab` from the loaded gensim vectors
# (prepare_emb_matrix is the project helper imported from `model`).
%time emb_matrix = prepare_emb_matrix(gensim_model, vocab)

In [None]:
#del gensim_model  # To save memory

## Tuning with Optuna

In [None]:
# Re-seed the Python, NumPy and torch RNGs right before the Optuna search
init_seeds(seed)


In [None]:
# Check required vars: everything objective() reads from the notebook namespace.
# (The former `assert g_train_dl is not None` raised NameError on a fresh kernel, because that
# name is first created much later in the notebook; objective() builds its own generator.)
assert train_dataset is not None
assert val_dataset is not None
assert vocab is not None
assert emb_matrix is not None

# Best validation accuracy seen so far across trials; updated by objective()
BEST_ACC = 0.0

def objective(trial):
    """Optuna objective: train one RecurrentClassifier and return its validation accuracy.

    Hyper-parameters of the model and the trainer are sampled from `trial`. The model is
    trained for a fixed number of epochs, and the accuracy of the last epoch
    (t.history["val_acc"][-1]) is returned. When that accuracy beats BEST_ACC, the trainer
    state is saved to PATH_OUT_DIR/optuna_model.ckpt and BEST_ACC is updated.

    Args:
        trial: the optuna trial used to sample hyper-parameters.

    Returns:
        Validation accuracy after the last epoch.
    """
    global BEST_ACC

    n_hidden_layers = trial.suggest_int("n_hidden_layers", 0, 3)
    hidden_layer_size = trial.suggest_int("hidden_layer_size", 10, 1000)

    # suggest_float(..., log=True) is the supported replacement for the deprecated
    # suggest_loguniform(); the sampled distribution is the same.
    config = {
        #"freeze": True,  # NEW 2022-10
        "freeze": trial.suggest_categorical("freeze", [True, False]),
        "cell_type": trial.suggest_categorical("cell_type", ["RNN", "LSTM", "GRU"]),
        "cell_dropout": trial.suggest_float("cell_dropout", 1e-9, 0.9, log=True),
        "num_layers": trial.suggest_int("num_layers", 1, 3),
        "hidden_size": trial.suggest_int("hidden_size", 10, 1000),
        "out_activation": trial.suggest_categorical("out_activation",
                                                    ["sigmoid", "tanh", "relu", "elu"]),
        "bidirectional": trial.suggest_categorical("bidirectional", [True, False]),
        "out_dropout": trial.suggest_float("out_dropout", 1e-9, 0.9, log=True),
        # n_hidden_layers output layers, all of the same width (empty list when 0)
        "out_sizes": [hidden_layer_size] * n_hidden_layers,
    }

    trainer_config = {
        "lr": trial.suggest_float("lr", 1e-5, 1e-3, log=True),
        "n_epochs": 5,
        #"n_epochs": trial.suggest_int("n_epochs", 5, 20),
        "weight_decay": trial.suggest_float("weight_decay", 1e-9, 1e-1, log=True),
        "batch_size": 128,
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "verbose": False,
    }

    pprint({**config, **trainer_config})

    # Create data loaders with current seed: a generator local to this trial, so every trial
    # shuffles the training data in the same order.
    g_train_dl = torch.Generator()
    g_train_dl.manual_seed(seed)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=trainer_config["batch_size"],
                                  shuffle=True,
                                  num_workers=NUM_WORKERS,
                                  worker_init_fn=seed_worker,  # NEW 2022-12-05
                                  generator=g_train_dl,                 # NEW 2022-12-05
                                  collate_fn=train_dataset.collate_fn)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=trainer_config["batch_size"],
                                shuffle=False,
                                num_workers=NUM_WORKERS,
                                collate_fn=val_dataset.collate_fn)

    clf_model = RecurrentClassifier(config, vocab, emb_matrix)
    t = Trainer(trainer_config)
    t.fit(clf_model, train_dataloader, val_dataloader)
    val_acc =  t.history["val_acc"][-1]
    if val_acc > BEST_ACC:
        BEST_ACC = val_acc
        #t.save("optuna_model.ckpt")
        t.save(f"{PATH_OUT_DIR}/optuna_model.ckpt")

    # Clear GPU memory
#     del t
#     del clf_model
#     torch.cuda.empty_cache()  # NEW 2022-12-05

    return val_acc

In [None]:
# Run the hyper-parameter search; the study maximises the accuracy returned by objective().
# NOTE(review): create_study() gets no seeded sampler here, so presumably the sequence of suggested
# trials differs between runs even though each training run is seeded; confirm.
study = create_study(direction="maximize")
# you can set more trials
study.optimize(objective, n_trials=2)
#study.optimize(objective, n_trials=100)

## Launch and save best model

In [None]:
# Earlier hand-copied hyper-parameter set for RecurrentClassifier / Trainer.
# NOTE(review): both dicts are reassigned from `cfg` in a later cell before the model is built,
# so these values are not the ones used by the final training run.
config = {
    "freeze": False,
    "cell_type": "LSTM",
    "cell_dropout": 4.798608743508714e-08,
    "num_layers": 2,
    "hidden_size": 571,
    "out_activation": "relu",
    "bidirectional": False,
    "out_dropout": 0.00031547702436796987,
    "out_sizes": [374, 374, 374],
}

trainer_config = {
    "lr": 0.0002448300553686371,
    "n_epochs": 5,
    "weight_decay": 3.333872612242237e-06,
    "batch_size": 128,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}
#clf_model = RecurrentClassifier(config, vocab, emb_matrix)

# Validation accuracies recorded for this configuration (one value per epoch):
#[0.5016433596611023, 0.4965958297252655, 0.4965958297252655, 0.4965958297252655, 0.5137926936149597]
#CLB: [0.4965958297252655, 0.5296396017074585, 0.5178424715995789, 0.5468951463699341, 0.5472473502159119]

In [None]:
# cfg = {'batch_size': 128,
#  'bidirectional': True,
#  'cell_dropout': 6.654706392076999e-05,
#  'cell_type': 'LSTM',
#  'device': 'cuda',
#  'freeze': True,
#  'hidden_size': 363,
#  'lr': 0.00014010022035173308,
#  #'n_epochs': 5,
#  'n_epochs': 10,
#  'num_layers': 3,
#  'out_activation': 'tanh',
#  'out_dropout': 3.6227682571088135e-09,
#  'out_sizes': [],
#  'verbose': False,
#  'weight_decay': 5.513482572816943e-08}

# config = {
#     "freeze": cfg['freeze'],
#     "cell_type": cfg['cell_type'],
#     "cell_dropout": cfg['cell_dropout'],
#     "num_layers": cfg['num_layers'],
#     "hidden_size": cfg['hidden_size'],
#     "out_activation": cfg['out_activation'],
#     "bidirectional": cfg['bidirectional'],
#     "out_dropout": cfg['out_dropout'],
#     "out_sizes": cfg['out_sizes'],
# }

# trainer_config = {
#     "lr": cfg['lr'],
#     "n_epochs": cfg['n_epochs'],
#     "weight_decay": cfg['weight_decay'],
#     "batch_size": 128,
#     "device": "cuda" if torch.cuda.is_available() else "cpu"
# }
# clf_model = RecurrentClassifier(config, vocab, emb_matrix)

# [0.5000587105751038, 0.49812182784080505, 0.5076887011528015, 0.5145556926727295, 0.5227138996124268]
# DP=1: [0.5197793245315552, 0.5039323568344116, 0.5282896757125854, 0.5182533264160156, 0.5300504565238953]


In [None]:
# 2022-12-05T1346: from Trial 20 (0.508 .. 0.557)
# Flat parameter dump of the chosen Optuna trial; the model and trainer configs are derived from it.
cfg = dict(
    batch_size=128,
    bidirectional=False,
    cell_dropout=0.00021809889870001576,
    cell_type='LSTM',
    device='cuda',
    freeze=False,
    hidden_size=837,
    lr=0.0005646640795179067,
    #n_epochs=5,
    n_epochs=10,
    num_layers=1,
    out_activation='relu',
    out_dropout=3.26822157581284e-05,
    out_sizes=[165],
    verbose=False,
    weight_decay=1.4724415358265257e-06,
)

# Model hyper-parameters passed to RecurrentClassifier
config = {
    key: cfg[key]
    for key in (
        "freeze", "cell_type", "cell_dropout", "num_layers", "hidden_size",
        "out_activation", "bidirectional", "out_dropout", "out_sizes",
    )
}

# Training hyper-parameters passed to Trainer; the device is chosen at run time
# rather than taken from cfg['device'].
trainer_config = {
    **{key: cfg[key] for key in ("lr", "n_epochs", "weight_decay")},
    "batch_size": 128,
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}

# Seed42:
# val_loss: ['0.693', >>>'0.688', '0.690', '0.698', '0.715', '0.777', '0.837', '0.878', '1.068', '1.069']
# val_acc : ['0.503', '0.540', '0.552', '0.549', '0.552', '0.550', >>>'0.553', '0.552', '0.546', '0.548']

# Seed 43:
# val_loss: ['0.692', '0.691', >>>'0.689', '0.698', '0.725', '0.750', '0.837', '0.962', '1.035', '1.227']
# val_acc : ['0.519', '0.538', '0.541', >>>'0.552', '0.552', '0.551', '0.551', '0.550', '0.548', '0.546']

In [None]:
# Seed all RNGs before the final training run
init_seeds(seed)

In [None]:
# Final training run with the chosen configuration.
# The cell used to build the generator and both DataLoaders twice; the first set was overwritten
# before use, so only the second construction is kept.

# Seed everything right before creating the model and the loaders
#init_seeds(44)
init_seeds(seed)

# Dedicated generator for shuffling the training data. init_seeds() calls
# torch.manual_seed(seed), so torch.initial_seed() returns `seed` here.
g_train_dl = torch.Generator()
g_train_dl.manual_seed(torch.initial_seed())

clf_model = RecurrentClassifier(config, vocab, emb_matrix)

# Create data loaders with current seed
train_dataloader = DataLoader(train_dataset,
                              batch_size=trainer_config["batch_size"],
                              shuffle=True,
                              num_workers=NUM_WORKERS,
                              worker_init_fn=seed_worker,  # NEW 2022-12-05
                              generator=g_train_dl,                 # NEW 2022-12-05
                              collate_fn=train_dataset.collate_fn)
val_dataloader = DataLoader(val_dataset,
                            batch_size=trainer_config["batch_size"],
                            shuffle=False,
                            num_workers=NUM_WORKERS,
                            collate_fn=val_dataset.collate_fn)
t = Trainer(trainer_config)
t.fit(clf_model, train_dataloader, val_dataloader)

In [None]:
# Validation loss / accuracy history of the run above, rounded to 3 decimals
# (same format as the recorded "Seed42" / "Seed 43" result comments).
print("val_loss:", [f"{x:.3f}" for x in t.history['val_loss']])
print("val_acc :", [f"{x:.3f}" for x in t.history['val_acc']])


In [None]:
# Training-loss curve as recorded by the Trainer in t.history['train_loss'].
# Title and axis labels added so the figure can be read on its own.
plt.plot(t.history['train_loss'])
plt.title("Training loss")
plt.xlabel("index in t.history['train_loss']")
plt.ylabel("loss")
plt.grid()

In [None]:
# Validation-loss curve (the recorded results above show one value per epoch).
# Title and axis labels added so the figure can be read on its own.
plt.plot(t.history['val_loss'], "bo-")
plt.title("Validation loss")
plt.xlabel("epoch (0-based)")
plt.ylabel("loss")
plt.grid()

In [None]:
# Validation-accuracy curve (the recorded results above show one value per epoch);
# the raw values are printed as well.
# Title and axis labels added so the figure can be read on its own.
print(t.history['val_acc'])
plt.plot(t.history['val_acc'], "bo-")
plt.title("Validation accuracy")
plt.xlabel("epoch (0-based)")
plt.ylabel("accuracy")
plt.grid()

In [None]:
# Clear GPU memory
# Drop the references to the trainer and the model, collect garbage, then ask torch to release
# its cached CUDA memory.
# NOTE(review): after this cell `t` and `clf_model` no longer exist, so the history/plot cells
# above cannot be re-run without training again.
del t
del clf_model
gc.collect()
torch.cuda.empty_cache()  # NEW 2022-12-05

In [None]:
# Deliberate stop for "Run All": the cells below are the unfinished pre-debug section.
assert False  # Stop here (the Optuna cells are above this point, not below)

## TMP: Pre-debug section

In [None]:
# When True, get_model_score_distribution() stops with an AssertionError right after the first
# split, leaving the split in the global variables for interactive use.
DO_DEBUG_STOP = True

# Launch split-train-predict-metrics cycle for several seeds
def get_model_score_distribution(model_tag: str, df: pd.DataFrame, launch_cnt: int = 5, verbose=True):
    """Repeat split -> train -> predict -> score for seeds 42 .. 42 + launch_cnt - 1.

    Args:
        model_tag: selects the model; tags starting with 'dummy_' and the tag 'ant1' are handled.
        df: dataset containing the COL_FEATURES, COL_PCR and COL_LABEL columns.
        launch_cnt: number of seeds / repetitions.
        verbose: when true, print the confusion matrix and classification report per seed.

    Returns:
        List with one accuracy score per seed.

    Side effects:
        Overwrites the notebook-level X_train, y_train, X_test, y_test and seed.

    NOTE(review): with DO_DEBUG_STOP = True the function always raises after the first split.
    NOTE(review): train_model_and_get_predictions__sklearn_classifier is only present as
    commented-out code and train_model_and_get_predictions__ant1 is not defined in this
    notebook, so the training branches would raise NameError as the notebook stands.
    """

    # Global vars, required for DO_DEBUG_STOP case, to continue writing code on the root notebook level
    global X_train, y_train, X_test, y_test, seed

    result = []
    print("Legend: seed; X_train shape; X_test_shape; y_train shape,hash,sum; y_test shape,hash,sum")
    for seed in range(42, 42 + launch_cnt):
        X_train, X_test, y_train, y_test = train_test_split(
            df[COL_FEATURES + [COL_PCR]], df[COL_LABEL],
            # stratify=df[COL_LABEL], # Note: stratification leads to the same test set (though shuffled)
            shuffle=True,
            random_state = seed, 
            test_size = TEST_SIZE
        )
        # Note: equal hash means binary equality, equal sum means the same rows but shuffled
        print(f"After split: {seed}, {X_train.shape}; {X_test.shape}; {y_train.shape},{calc_hash_for_seq(y_train)},{sum(y_train)};"
              + f" {y_test.shape},{calc_hash_for_seq(y_test)},{sum(y_test)}")

        # Separate price_change_ratio from the data
        # (kept aside in pcr_train / pcr_test and removed from the features so it cannot leak into the model)
        pcr_train = X_train[COL_PCR]; X_train.drop(COL_PCR, axis=1, inplace=True)
        pcr_test = X_test[COL_PCR]; X_test.drop(COL_PCR, axis=1, inplace=True)

        if DO_DEBUG_STOP:
            assert False, "Debug-stop fired. Now you could use the above global vars on any notebook cells."

        # Launch model-specific method
        y_pred = None
        if model_tag.startswith('dummy_'):
            y_pred = train_model_and_get_predictions__sklearn_classifier(model_tag, X_train, y_train, X_test, seed)
        elif model_tag == 'ant1':
            y_pred = train_model_and_get_predictions__ant1(model_tag, X_train, y_train, X_test, seed)
        else:
            assert False, f"Unexpected model tag: {model_tag}"

        # Calc score
        score1 = accuracy_score(y_test, y_pred)
        #score2 = calc_real_profit_perc(y_pred, pcr_test)
        #score3 = calc_real_profit_perc(y_train[:100], pcr_train[:100])
        #result.append(f"{score1:.5f}, {score2:.2f}%, {score3:.2f}%")
        result.append(score1)

        if verbose:
            print(confusion_matrix(y_test, y_pred))
            print(classification_report(y_test, y_pred, digits=3))

    return result

## Launch the model training/estimation

In [None]:
# Baseline: accuracy distribution over the default 5 seeds for the "dummy__most_frequent" tag
model_tag = "dummy__most_frequent"
results = get_model_score_distribution(model_tag, df_final, verbose=False)
print("Sorted results (accuracy):", sorted(results))
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

In [None]:
# Baseline: accuracy distribution over 5 seeds for the "dummy__uniform" tag
model_tag = "dummy__uniform"
results = get_model_score_distribution(model_tag, df_final, launch_cnt = 5, verbose=False)
print("Sorted results (accuracy):", sorted(results))
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

In [None]:
# Accuracy distribution over 5 seeds for the "ant1" tag
model_tag = "ant1"
results = get_model_score_distribution(model_tag, df_final, launch_cnt = 5, verbose=False)
print("Sorted results (accuracy):", sorted(results))
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

In [None]:
# Scratch variable; not referenced by any other visible cell
var2 = 1

In [None]:
def x():
    """Scratch helper: create or overwrite the notebook-level variables var3 (= 3) and var4 (= 4)."""
    globals().update(var3=3, var4=4)

In [None]:
# Scratch: run the helper so that var3 / var4 get created at notebook level
x()

In [None]:
# Scratch: display var4, which x() set through its `global` declaration
var4