In [None]:
# Version history:
# 2022-12-06: created from nb210_model-ant2-NB-colab.ipynb

# ===== Part 0: env preparation =====

## System info

In [None]:
# Print system id
!nvidia-smi
!hostname
!uname -a
!df -kh /tmp

In [None]:
!python -V  # If version < 3.9 then some f-string features may not work

## Mount drive (if required)

In [None]:
# Detect Google Colab and mount Google Drive when it is available.
# TO_USE_COLAB ends up True only if both the import and the mount succeed.
TO_USE_COLAB = None
try:
    from google.colab import drive
    drive.mount("/content/drive")
    TO_USE_COLAB = True
except Exception as colab_err:
    # `except Exception` (not a bare `except:`) so that Ctrl-C / kernel interrupt
    # still propagates; the reason is printed so a failed mount is not silent.
    print(f"Colab drive is not used ({type(colab_err).__name__}: {colab_err})")
    TO_USE_COLAB = False
TO_USE_COLAB

## Env vars

In [None]:
#import os
#os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # NEW 2022-12-05, see https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility

# ===== Part 1: prepare dataset =====

## Imports 

In [None]:
import datetime
import numpy as np
import os
import pandas as pd
import pytz
from sklearn.model_selection import train_test_split
import time

## Paths and settings

In [None]:
def get_ts():
  """Return the current Europe/Minsk time formatted as 'YYYY-MM-DDTHHMMSS'."""
  minsk_tz = pytz.timezone("Europe/Minsk")
  now_local = datetime.datetime.now(tz=minsk_tz)
  return now_local.strftime("%Y-%m-%dT%H%M%S")
# Timestamp taken once, when this cell runs
START_TS = get_ts()
START_TS

In [None]:
# Notebook working directory: the project folder on the mounted drive when on Colab,
# otherwise the current directory.
PATH_MAIN_DIR = (
    "/content/drive/MyDrive/_PR_ROOT/_2022/2022-11_NLP-Huawei_Final_project/stocktwits_finsentiment_analysis/notebooks"
    if TO_USE_COLAB
    else "."
)
assert os.path.isdir(PATH_MAIN_DIR)

In [None]:
%cd $PATH_MAIN_DIR
!pwd

In [None]:
# Per-run output directory, tagged with the session start timestamp
PATH_OUT_DIR = f"../data/interim/050_output__nb200/_out_dir_{START_TS}"
# makedirs(exist_ok=True) creates missing parent folders and lets this cell be
# re-run within the same session (os.mkdir raised in both situations).
os.makedirs(PATH_OUT_DIR, exist_ok=True)

In [None]:
# Files and folders

# Folder that holds the input CSV files checked and loaded below
DIR_DATA_SRC = r'../data/interim/040_output__nb010_v1'
# Alternative file sets are kept commented out; only one FNAMES assignment should be active
#FNAMES = ['VIX_RmSW=0_RmRep=0_1y_top10.csv', 'VIX_RmSW=0_RmRep=0_1y_top10.csv' ]  # Loads in <1 sec
FNAMES = ['AMZN_RmSW=0_RmRep=0_1y.csv.gz', 'NFLX_RmSW=0_RmRep=0_1y.csv.gz', ]  # Loads in <1 sec
#FNAMES = ['AAPL_RmSW=0_RmRep=0_1y.csv.gz', ]  # Loads in 20-30 sec

# Fail fast, before any loading starts, if the folder or any listed file is missing
assert os.path.isdir(DIR_DATA_SRC)
for f in FNAMES:
    assert os.path.isfile(os.path.join(DIR_DATA_SRC, f)), f"File not found: {f}"

In [None]:
# Dataset preparation settings

# Rows with 'Date' <= this value are removed by drop_old_dates_inplace()
DROP_RECORDS_BEFORE_DATE_INCLUSIVE = '2019-07-20'  # Last date in datasets is 2020-07-21
# This string is a "key" interpreted by generate_labels_and_pcr_list()
LABEL_GEN_STRATEGY = "d1_C=d1_O=0.5%=2cls"
# Columns copied from the raw frame by do_feature_selection()
COL_FEATURES = ['symbol', 'message', 'datetime', 'user', 'message_id', 'Date']  #, 'Time']
# Name of the target column
COL_LABEL = 'label'
# Name of the column holding the percent price change; it is removed from the features before training
COL_PCR = 'price_change_ratio'

# SPLIT_SHUFFLING_SEED = 42  # If None, then no shuffling is done
# Fraction of rows passed to train_test_split as test_size
TEST_SIZE = 0.15
# NOTE(review): TRAIN_SIZE is not referenced in the visible part of the notebook
TRAIN_SIZE = 1.0 - TEST_SIZE

## Defs
Here are "pure" functions.

In [None]:
def print_df_details(df: pd.DataFrame):
    """Print head, tail, info() and describe(include='all') of a DataFrame to stdout."""
    for title, part in (("Head", df.head()), ("Tail", df.tail())):
        print(f"\n{title}:\n", part)
    print('\nInfo:')
    df.info()  # info() writes its report itself and returns None
    summary = df.describe(include='all')
    print('\nDescribe:\n', summary)

In [None]:
def load_pandas_file(file_path: str, verbose=True):
    """Read one CSV file with pandas and report its shape and load time.

    Args:
        file_path: path to the CSV file; an AssertionError is raised if it does not exist.
        verbose: when true, also print the frame details via print_df_details().

    Returns:
        The loaded DataFrame.
    """
    # The current working directory is included to help diagnose relative-path mistakes
    assert os.path.isfile(file_path), f"Cannot find file: '{file_path}', cur folder: '{os.getcwd()}'"
    print("Loading data from: ", file_path)

    t0 = time.time()
    loaded = pd.read_csv(file_path)
    elapsed = time.time() - t0
    print(f"Success. Shape: {loaded.shape}, elapsed seconds: {elapsed:.2f}")

    if not verbose:
        return loaded
    print_df_details(loaded)
    return loaded

In [None]:
def merge_dfs(df_list: list, verbose=True) -> pd.DataFrame:
    """Concatenate the given DataFrames row-wise into one frame with a fresh 0..n-1 index.

    With verbose on, prints each input shape followed by the shape of the result.
    """
    if not verbose:
        return pd.concat(df_list, ignore_index=True)

    for part in df_list:
        print(part.shape, end=';')
    merged = pd.concat(df_list, ignore_index=True)
    print("->", merged.shape)
    return merged

In [None]:
def get_ts():
  """Return the current Europe/Minsk time formatted as 'YYYY-MM-DDTHHMMSS'."""
  return datetime.datetime.now(tz=pytz.timezone("Europe/Minsk")).strftime("%Y-%m-%dT%H%M%S")
# NOTE: this cell repeats the get_ts() definition from the "Paths and settings" section.
# START_TS is assigned here only when it does not exist yet: PATH_OUT_DIR has already
# been built from the first START_TS, and overwriting it would make the two disagree.
if "START_TS" not in globals():
  START_TS = get_ts()
START_TS

In [None]:
def drop_old_dates_inplace(df: pd.DataFrame, drop_date_inclusive: str, verbose=True) -> pd.DataFrame:
    """Remove, in place, every row whose 'Date' is less than or equal to the cut-off.

    Args:
        df: frame with a 'Date' column; it is mutated.
        drop_date_inclusive: cut-off date as a string (rows with this date are dropped too).
        verbose: when true, also print the frame details via print_df_details().

    Returns:
        The same `df` object, after mutation (matches the declared return type;
        callers that ignore the result keep working).
    """
    assert isinstance(drop_date_inclusive, str)
    old_shape = df.shape
    # NOTE(review): the comparison is done on the raw column values; this presumably relies on
    # 'Date' holding ISO 'YYYY-MM-DD' strings so that string order equals date order - confirm.
    # Rows are removed by index label, so the index is expected to be unique.
    df.drop(df[df['Date'] <= drop_date_inclusive].index, inplace=True)
    print(f"Old dates dropped. Shape before: {old_shape}, after: {df.shape}")
    if verbose:
        print_df_details(df)
    return df

In [None]:
def get_label(ch):
  """Map a change value to a class: 1 if above 0.5, -1 if below -0.5, otherwise 0."""
  if ch > 0.5:
    return 1
  return -1 if ch < -0.5 else 0


def generate_labels_and_pcr_list(df: pd.DataFrame, strategy_str: str) -> tuple:
    """Build the label list and the price-change-ratio (pcr) list for every row of `df`.

    Supported strategy: "d1_C=d1_O=0.5%=2cls" - the percent change from 'd1_O' to 'd1_C'
    is mapped to 1 (above +0.5), -1 (below -0.5) or 0 (otherwise).

    Args:
        df: frame with strictly positive 'd1_O' and 'd1_C' columns.
        strategy_str: key that selects the labelling rule.

    Returns:
        A tuple (labels, pcr_list): two lists aligned with the rows of `df`;
        labels are ints in {-1, 0, 1}, pcr_list holds the percent changes.

    Raises:
        ValueError: if `strategy_str` is not a supported key.
        AssertionError: if any price is not strictly positive.
    """
    # Reject unknown keys up front, so the result is never left undefined
    if strategy_str != "d1_C=d1_O=0.5%=2cls":
        raise ValueError(f"Unexpected strategy_str: {strategy_str!r}")

    assert (df['d1_O'] > 0.0).all()  # Prices must be > 0
    assert (df['d1_C'] > 0.0).all()  # Prices must be > 0
    rel_change_perc = (df['d1_C'] / df['d1_O'] - 1.0) * 100.0

    # Vectorized form of the per-row rule: (+1 if > 0.5) + (-1 if < -0.5), else 0
    labels = (rel_change_perc > 0.5).astype(int) - (rel_change_perc < -0.5).astype(int)
    return labels.to_list(), rel_change_perc.to_list()

In [None]:
def do_feature_selection(df: pd.DataFrame):
    """Return an independent copy of `df` restricted to the COL_FEATURES columns."""
    selected = df.loc[:, COL_FEATURES]
    print(f"Selected cols: {selected.columns}")
    result = selected.copy()
    return result

In [None]:
def do_label_transformation(df: pd.DataFrame):
    """Return a new frame without neutral rows (label 0) and with label -1 remapped to 0.

    The input frame is left untouched. After the call the COL_LABEL column holds
    0 (former -1) and 1 only, provided the input labels were in {-1, 0, 1}.
    """
    # Keep every row whose label is not 0 (a boolean mask does not depend on index uniqueness)
    temp_df = df[df[COL_LABEL] != 0].copy()
    # Assign the result back instead of `temp_df[col].replace(..., inplace=True)`:
    # the chained in-place form is deprecated and has no effect under pandas Copy-on-Write.
    temp_df[COL_LABEL] = temp_df[COL_LABEL].replace({-1: 0})
    return temp_df

In [None]:
def calc_real_profit_perc(y_pred, pcr_list) -> float:
    """Placeholder for a profit metric: currently always returns NaN.

    The draft implementation below the early return is intentionally unreachable
    until the TODO is resolved; it is kept as a reference only.
    """
    # `np.nan` instead of `np.NaN`: the capitalized alias was removed in NumPy 2.0
    return np.nan  # TODO: This function is not correct, as it's necessary to aggregate predictions by date and ticker

    # ----- Unreachable draft (see TODO above) -----
    profit_ratio = 1.0
    assert len(y_pred) == len(pcr_list), f"{len(y_pred)}, {len(pcr_list)}"
    for i, (pred, pcr) in enumerate(zip(y_pred, pcr_list)):
        price_ratio = (pcr / 100.0 + 1.0)  # Convert from percents [-5% .. 5%] -> [-0.05 .. 0.05] -> [0.95 .. 1.05]
        assert 0.0 < price_ratio < np.inf, f"{i}, {price_ratio}" 
        if pred == 1:
            # Long
            profit_ratio *= price_ratio
        elif pred == 0:
            # Short
            profit_ratio /= price_ratio
        else:
            assert False, "Unexpected label"
    return (profit_ratio - 1.0) * 100.0  # Profit in percents (0% - nothing changed)

In [None]:
def calc_hash_for_seq(values, hash_len=6):
    """Return the last `hash_len` characters of str(hash(tuple(values))).

    `values` must be a list, a numpy array or a pandas Series.
    """
    accepted_types = (list, np.ndarray, pd.Series)
    assert isinstance(values, accepted_types)
    hash_text = str(hash(tuple(values)))
    return hash_text[-hash_len:]

# Smoke checks: print the short hash of the same values given as list, array and Series
print(calc_hash_for_seq([1, 2, 3]))
print(calc_hash_for_seq(np.array([1, 2, 3])))
print(calc_hash_for_seq(pd.Series([1, 2, 3])))

In [None]:
def make_label_distribution_equal(df: pd.DataFrame) -> pd.DataFrame:
    """Downsample the more frequent label so that labels 0 and 1 occur equally often.

    Args:
        df: frame with a 'label' column containing exactly the labels 0 and 1.

    Returns:
        A new frame with the surplus rows of the bigger class removed; the rows to
        remove are sampled with a fixed random_state, so the result is reproducible.
        The input frame is not modified.

    Raises:
        AssertionError: if the set of labels is not exactly {0, 1}.
    """
    counts = df.label.value_counts()
    # The previous check `len(counts == 2)` measured the length of a boolean Series
    # and therefore passed for any non-empty input.
    assert set(counts.index) == {0, 1}, f"Expected labels 0 and 1 only, got: {counts.index.tolist()}"

    bigger_label = 0 if counts[0] > counts[1] else 1
    diff = int(abs(counts[0] - counts[1]))

    # Sample exactly `diff` rows of the bigger class and drop them by index label
    surplus_index = df[df.label == bigger_label].sample(n=diff, replace=False, random_state=42).index
    return df.drop(index=surplus_index)

## Do prepare datasets

In [None]:
# Load raw data, dropping old dates
# Each file is loaded quietly (verbose=False), trimmed in place to rows with
# 'Date' > DROP_RECORDS_BEFORE_DATE_INCLUSIVE, and collected for the concat step.
df_list = []
for fname in FNAMES:
    full_name = os.path.join(DIR_DATA_SRC, fname)
    assert os.path.isfile(full_name), full_name
    df_temp = load_pandas_file(full_name, verbose=False)
    drop_old_dates_inplace(df_temp, DROP_RECORDS_BEFORE_DATE_INCLUSIVE, verbose=False)  # mutates df_temp
    df_list.append(df_temp)

In [None]:
# Concat loaded parts to one dataframe
df_raw = merge_dfs(df_list)

In [None]:
# Choose columns for final dataset
df_final = do_feature_selection(df_raw)

In [None]:
# Append the target column
# Labels and price-change ratios are computed from df_raw and returned as plain lists,
# so they are attached to df_final by position; df_final must still have the same rows
# in the same order as df_raw at this point.
labels, pcr_list = generate_labels_and_pcr_list(df_raw, strategy_str=LABEL_GEN_STRATEGY)
df_final[COL_LABEL] = labels
df_final[COL_PCR] = pcr_list

In [None]:
# Drop labels for neutral class
df_final = do_label_transformation(df_final)

In [None]:
# print_df_details(df_final)

In [None]:
df_final[COL_LABEL].value_counts()

In [None]:
# Making labels distribution equal
df_final = make_label_distribution_equal(df_final)
df_final[COL_LABEL].value_counts()

In [None]:
df_final

# ===== Part 2: Model execution and scoring =====

## Imports (part 2)

In [None]:
#if TO_USE_COLAB:
#    !pip install optuna

In [None]:
# import gc
# import gensim.downloader
# import matplotlib.pyplot as plt
# from optuna import create_study
# from pprint import pprint
# import random
# from sklearn.dummy import DummyClassifier
# from sklearn.metrics import classification_report, f1_score, accuracy_score
# from sklearn.metrics import confusion_matrix
# import torch
# from torch.utils.data import DataLoader
# from torch.nn.utils.rnn import pack_sequence

In [None]:
# Baseline-related imports
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix

# NB-related imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB

## Defs (part 2)

In [None]:
def train_model_and_get_predictions__sklearn_classifier(model_tag: str, 
    X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.Series, seed: int) -> pd.Series:
    """Fit a seeded DummyClassifier selected by `model_tag` and predict labels for X_test.

    Supported tags: "dummy__most_frequent" and "dummy__uniform"; any other tag
    raises an AssertionError. Returns whatever `model.predict(X_test)` returns.
    """
    # The price-change column must already be removed from the features
    assert COL_PCR not in X_train.columns  # To avoid data leaks

    # Model tag -> DummyClassifier strategy
    strategy_by_tag = {
        "dummy__most_frequent": "most_frequent",
        "dummy__uniform": "uniform",
    }
    assert model_tag in strategy_by_tag, f"Unexpected model tag: {model_tag}"
    classifier = DummyClassifier(strategy=strategy_by_tag[model_tag], random_state=seed)

    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return predictions

In [None]:
def train_model_and_get_predictions__NB_classifier(model_tag: str, 
    X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.Series, seed: int) -> pd.Series:
    """Fit TF-IDF + MultinomialNB(alpha=0.1) on the `message` column and predict for X_test.

    `model_tag` selects the word n-gram range of the vectorizer; an unknown tag raises
    an AssertionError. `seed` is accepted for signature parity and is not used here.
    Returns whatever `model.predict(...)` returns for the test features.
    """
    # The price-change column must already be removed from the features
    assert COL_PCR not in X_train.columns  # To avoid data leaks

    # Model tag -> n-gram range; all other vectorizer/model params are shared
    ngram_range_by_tag = {
        "NaiveBayes_words_ng1-3_alhpa0.1": (1, 3),
        "NaiveBayes_words_ng2-2_alhpa0.1": (2, 2),
    }
    assert model_tag in ngram_range_by_tag, f"Unexpected model tag: {model_tag}"
    tf_idf = TfidfVectorizer(use_idf=True, ngram_range=ngram_range_by_tag[model_tag], analyzer='word')
    model = MultinomialNB(alpha=0.1)

    # The vocabulary is fitted on the training messages only (!huge sparse matrix)
    train_features = tf_idf.fit_transform(X_train.message)
    test_features = tf_idf.transform(X_test.message)

    model.fit(train_features, y_train)
    return model.predict(test_features)

In [None]:
DO_DEBUG_STOP = False  # The code below will be stopped after putting data into global vars

# Launch split-train-predict-metrics cycle for several seeds
def get_model_score_distribution(model_tag: str, df: pd.DataFrame, launch_cnt: int = 5, verbose=True):
    """Run split -> train -> predict -> accuracy once per seed and collect the scores.

    Args:
        model_tag: model key; must start with 'dummy_' or 'NaiveBayes_'.
        df: frame containing the COL_FEATURES, COL_PCR and COL_LABEL columns.
        launch_cnt: number of runs; seeds are 42, 43, ..., 42 + launch_cnt - 1.
        verbose: when true, print the confusion matrix and classification report per run.

    Returns:
        List of accuracy scores, one per seed, in seed order.

    Side effects:
        Writes X_train, y_train, X_test, y_test and seed to module globals (on purpose,
        see DO_DEBUG_STOP) and prints one summary line per split.
    """

    # Global vars, required for DO_DEBUG_STOP case, to continue writing code on the root notebook level
    global X_train, y_train, X_test, y_test, seed

    result = []
    print("Legend: seed; X_train shape; X_test_shape; y_train shape,hash,sum; y_test shape,hash,sum")
    for seed in range(42, 42 + launch_cnt):
        X_train, X_test, y_train, y_test = train_test_split(
            df[COL_FEATURES + [COL_PCR]], df[COL_LABEL],
            # stratify=df[COL_LABEL], # Note: stratification leads to the same test set (though shuffled)
            shuffle=True,
            random_state = seed, 
            test_size = TEST_SIZE
        )
        # Note: equal hash means binary equality, equal sum means the same rows but shuffled
        print(f"After split: {seed}, {X_train.shape}; {X_test.shape}; {y_train.shape},{calc_hash_for_seq(y_train)},{sum(y_train)};"
              + f" {y_test.shape},{calc_hash_for_seq(y_test)},{sum(y_test)}")

        # Separate price_change_ratio from the data.
        # Non-inplace drop: the split outputs are derived from `df`, and mutating them
        # in place can trigger pandas' SettingWithCopyWarning.
        # pcr_train / pcr_test are only needed by the commented-out profit scores below.
        pcr_train = X_train[COL_PCR]
        X_train = X_train.drop(columns=COL_PCR)
        pcr_test = X_test[COL_PCR]
        X_test = X_test.drop(columns=COL_PCR)

        if DO_DEBUG_STOP:
            assert False, "Debug-stop fired. Now you could use the above global vars on any notebook cells."

        # Launch model-specific method
        y_pred = None
        if model_tag.startswith('dummy_'):
            y_pred = train_model_and_get_predictions__sklearn_classifier(model_tag, X_train, y_train, X_test, seed)
        elif model_tag.startswith('NaiveBayes_'):
            y_pred = train_model_and_get_predictions__NB_classifier(model_tag, X_train, y_train, X_test, seed)
        else:
            assert False, f"Unexpected model tag: {model_tag}"

        # Calc score
        score1 = accuracy_score(y_test, y_pred)
        #score2 = calc_real_profit_perc(y_pred, pcr_test)
        #score3 = calc_real_profit_perc(y_train[:100], pcr_train[:100])
        #result.append(f"{score1:.5f}, {score2:.2f}%, {score3:.2f}%")
        result.append(score1)
            
        if verbose:
            print(confusion_matrix(y_test, y_pred))
            print(classification_report(y_test, y_pred, digits=3))
                    
    return result

## Launch the model training/estimation

In [None]:
# Baseline: DummyClassifier with strategy="most_frequent"
model_tag = "dummy__most_frequent"
results = get_model_score_distribution(model_tag, df_final, verbose=False)  # launch_cnt keeps its default (5)
print("Sorted results (accuracy):", sorted(results))
# NOTE: np.std uses ddof=0 by default, i.e. the population standard deviation over the runs
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

In [None]:
# Baseline: DummyClassifier with strategy="uniform"
model_tag = "dummy__uniform"
results = get_model_score_distribution(model_tag, df_final, launch_cnt = 5, verbose=False)
print("Sorted results (accuracy):", sorted(results))
# NOTE: np.std uses ddof=0 by default, i.e. the population standard deviation over the runs
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")

In [None]:
# Naive Bayes on TF-IDF word n-grams; the trailing comments record previously observed results
#model_tag = "NaiveBayes_words_ng1-3_alhpa0.1"  # Mean accuracy: 0.568 +- 0.002
model_tag = "NaiveBayes_words_ng2-2_alhpa0.1"  # Mean accuracy: 0.559 +- 0.002
results = get_model_score_distribution(model_tag, df_final, launch_cnt = 5, verbose=False)
print("Sorted results (accuracy):", sorted(results))
# NOTE: np.std uses ddof=0 by default, i.e. the population standard deviation over the runs
print(f"Mean accuracy: {np.mean(results):.3f} +- {np.std(results):.3f}")