# Model LogisticRegression

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, f1_score

from typing import NamedTuple

In [3]:
# Global settings

MODEL_NAME = "LogisticRegression"

# SELECT Data Source file name
dsn: str = 'SOL-USD_1h_2024-01-01-2024-12-31_110_feature' # Select data source name
# ToDo Посмотреть TON, 


select_columns: list | None = None


target = "y_next_trend"   # ['y_next_trend', 'y_next_pct']


test_months: int = 2


SEED = 42
max_iter = 100_000


### Data

In [4]:
# Load the feature table and parse the timestamp column.
data: pd.DataFrame = pd.read_csv(f"data/{dsn}")
data['datetime'] = pd.to_datetime(data['datetime'])

if select_columns:
    # .copy() detaches the column subset from the loaded frame, so the target
    # assignment below writes to an independent frame instead of a derived
    # slice (which triggers pandas' SettingWithCopyWarning).
    data = data[select_columns].copy()

# Binarise the target: values equal to 1 stay 1, every other value becomes 0.
data[target] = np.where(data[target] == 1, 1, 0)


In [5]:
# Feature columns: every column except the "y_"-prefixed ones and the timestamp.
features = [
    column
    for column in data.columns
    if not (column.startswith("y_") or column == "datetime")
]

In [6]:
# Chronological hold-out: rows dated within the last `test_months` calendar
# months (counted back from the latest timestamp) form the test set.
last_timestamp = data['datetime'].max()
test_start_date = last_timestamp - pd.DateOffset(months=test_months)

is_train_row = data['datetime'] < test_start_date
is_test_row = data['datetime'] >= test_start_date

train_data = data[is_train_row]
test_data = data[is_test_row]



In [47]:
class TrainedModel(NamedTuple):
    """Scaler and classifier bundled together as the result of training."""
    # Scaler kept next to the model so the same transform can be applied
    # to data before calling the model.
    scaler: StandardScaler
    # The logistic-regression estimator.
    model: LogisticRegression


def train_model_classifier(train_data_, feature_cols, target_column, solver='saga', penalty=None, C=1.0, l1_ratio=None) -> TrainedModel:
    """Fit a StandardScaler and a LogisticRegression on the training data.

    The scaler is fitted on the training features only, and the classifier is
    trained on the scaled features. ``random_state`` and ``max_iter`` come from
    the notebook-level ``SEED`` and ``max_iter`` settings.

    Args:
        train_data_: Frame holding both the feature columns and the target column.
        feature_cols: Names of the columns used as model inputs.
        target_column: Name of the column used as the label.
        solver: Forwarded to ``LogisticRegression`` (default ``'saga'``).
        penalty: Forwarded to ``LogisticRegression``; ``None`` by default.
        C: Forwarded to ``LogisticRegression``.
        l1_ratio: Forwarded to ``LogisticRegression``.

    Returns:
        TrainedModel with the fitted scaler and the fitted classifier.
    """
    # Split the frame into model inputs and labels.
    train_features = train_data_[feature_cols]
    train_labels = train_data_[target_column]

    # Step 1: learn the scaling on the training features and apply it to them.
    feature_scaler = StandardScaler()
    train_features_scaled = feature_scaler.fit_transform(train_features)

    # Step 2: train the classifier on the scaled training features.
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        l1_ratio=l1_ratio,
        solver=solver,
        max_iter=max_iter,
        random_state=SEED,
    )
    classifier.fit(train_features_scaled, train_labels)

    return TrainedModel(scaler=feature_scaler, model=classifier)


In [48]:


def calculate_metrics_table(y_true, y_pred_prob, thresholds=(0.5, 0.6, 0.7, 0.8, 0.9)):
    """Build a table of classification metrics, one row per probability cutoff.

    For every cutoff, probabilities greater than or equal to it are labelled 1
    and the rest 0; precision, recall, accuracy and F1 are then computed
    against ``y_true``. Precision, recall and F1 are reported as 0 when they
    are undefined (``zero_division=0``).

    Args:
        y_true: True binary labels.
        y_pred_prob: Predicted probabilities of the positive class
            (ndarray, Series or plain sequence).
        thresholds: Iterable of probability cutoffs to evaluate.

    Returns:
        pd.DataFrame with columns Cutoff, Precision, Recall, Accuracy and
        F1-Score; the columns are present even when ``thresholds`` is empty.
    """
    column_order = ['Cutoff', 'Precision', 'Recall', 'Accuracy', 'F1-Score']

    # Convert once so that plain lists work too: the comparison below needs
    # element-wise array semantics.
    probabilities = np.asarray(y_pred_prob)

    rows = []
    for cutoff in thresholds:
        predicted = (probabilities >= cutoff).astype(int)
        rows.append({
            'Cutoff': cutoff,
            'Precision': precision_score(y_true, predicted, zero_division=0),
            'Recall': recall_score(y_true, predicted, zero_division=0),
            'Accuracy': accuracy_score(y_true, predicted),
            'F1-Score': f1_score(y_true, predicted, zero_division=0),
        })

    return pd.DataFrame(rows, columns=column_order)

In [49]:
def check_model(name, model):
    """Print ROC AUC and the per-cutoff metrics table for the train and test sets.

    Reads the notebook-level ``train_data``, ``test_data``, ``features`` and
    ``target``.

    Args:
        name: Label inserted into the printed section headers.
        model: Object exposing a fitted ``scaler`` and a fitted ``model``
            (as returned by ``train_model_classifier``).
    """
    def second_column_probability(frame):
        # Scale with the stored scaler, then take column index 1 of
        # predict_proba (the class-1 column for the 0/1 target used here).
        scaled = model.scaler.transform(frame[features])
        return model.model.predict_proba(scaled)[:, 1]

    train_prob = second_column_probability(train_data)
    test_prob = second_column_probability(test_data)

    # Both AUC values are computed before anything is printed.
    train_auc = roc_auc_score(train_data[target], train_prob)
    test_auc = roc_auc_score(test_data[target], test_prob)

    sections = (
        (f"\n=== Метрики. {name}. TRAIN ===", train_auc, train_data, train_prob),
        (f"\n=== Метрики. {name} TEST ===", test_auc, test_data, test_prob),
    )
    for header, auc_value, frame, probabilities in sections:
        print(header)
        print(f"ROC AUC: {auc_value:.4f}")
        print(calculate_metrics_table(frame[target], probabilities))




In [50]:
# Baseline fit with the function defaults (penalty=None).
trained_model = train_model_classifier(
    train_data_=train_data,
    feature_cols=features,
    target_column=target,
)

In [51]:
# Report train/test metrics for the unregularised model.
check_model(name=f"{MODEL_NAME} NO regularization", model=trained_model)


=== Метрики. LogisticRegression NO regularization. TRAIN ===
ROC AUC: 0.5634
   Cutoff  Precision    Recall  Accuracy  F1-Score
0     0.5   0.540725  0.563527  0.539733  0.551891
1     0.6   0.642623  0.053669  0.509021  0.099065
2     0.7   0.805556  0.007941  0.500069  0.015727
3     0.8   1.000000  0.003012  0.498554  0.006006
4     0.9   1.000000  0.000274  0.497177  0.000547

=== Метрики. LogisticRegression NO regularization TEST ===
ROC AUC: 0.4918
   Cutoff  Precision    Recall  Accuracy  F1-Score
0     0.5   0.504244  0.393899  0.488737  0.442293
1     0.6   0.543103  0.083554  0.492150  0.144828
2     0.7   0.350000  0.009284  0.481229  0.018088
3     0.8   0.500000  0.001326  0.485324  0.002646
4     0.9   0.000000  0.000000  0.485324  0.000000


In [52]:
# Fit with an L2 penalty and C=10.0.
trained_model_l2 = train_model_classifier(
    train_data_=train_data,
    feature_cols=features,
    target_column=target,
    penalty="l2",
    C=10.0,
)

In [53]:
# Report train/test metrics for the L2 model.
check_model(name=f"{MODEL_NAME} with L2", model=trained_model_l2)


=== Метрики. LogisticRegression with L2. TRAIN ===
ROC AUC: 0.5632
   Cutoff  Precision    Recall  Accuracy  F1-Score
0     0.5   0.540163  0.561610  0.539044  0.550678
1     0.6   0.635452  0.052026  0.508194  0.096178
2     0.7   0.800000  0.007667  0.499931  0.015189
3     0.8   1.000000  0.003012  0.498554  0.006006
4     0.9   1.000000  0.000274  0.497177  0.000547

=== Метрики. LogisticRegression with L2 TEST ===
ROC AUC: 0.4935
   Cutoff  Precision    Recall  Accuracy  F1-Score
0     0.5   0.511073  0.397878  0.494198  0.447427
1     0.6   0.557522  0.083554  0.494198  0.145329
2     0.7   0.368421  0.009284  0.481911  0.018111
3     0.8   0.500000  0.001326  0.485324  0.002646
4     0.9   0.000000  0.000000  0.485324  0.000000


In [54]:
# Fit with an L1 penalty and C=10.0.
trained_model_l1 = train_model_classifier(
    train_data_=train_data,
    feature_cols=features,
    target_column=target,
    penalty="l1",
    C=10.0,
)

In [55]:
# Report train/test metrics for the L1 model.
check_model(name=f"{MODEL_NAME} with L1", model=trained_model_l1)


=== Метрики. LogisticRegression with L1. TRAIN ===
ROC AUC: 0.5633
   Cutoff  Precision    Recall  Accuracy  F1-Score
0     0.5   0.540484  0.562979  0.539457  0.551502
1     0.6   0.643098  0.052300  0.508745  0.096733
2     0.7   0.787879  0.007119  0.499656  0.014111
3     0.8   1.000000  0.002464  0.498278  0.004917
4     0.9   0.000000  0.000000  0.497039  0.000000

=== Метрики. LogisticRegression with L1 TEST ===
ROC AUC: 0.4935
   Cutoff  Precision    Recall  Accuracy  F1-Score
0     0.5   0.509275  0.400531  0.492833  0.448404
1     0.6   0.556522  0.084881  0.494198  0.147296
2     0.7   0.350000  0.009284  0.481229  0.018088
3     0.8   0.500000  0.001326  0.485324  0.002646
4     0.9   0.000000  0.000000  0.485324  0.000000


In [58]:
# Fit with an elastic-net penalty, l1_ratio=0.5 (C is left at the function default).
trained_model_elasticnet = train_model_classifier(
    train_data_=train_data,
    feature_cols=features,
    target_column=target,
    penalty="elasticnet",
    l1_ratio=0.5,
)

In [59]:
# Report train/test metrics for the elastic-net model.
check_model(name=f"{MODEL_NAME} with elasticnet", model=trained_model_elasticnet)


=== Метрики. LogisticRegression with elasticnet. TRAIN ===
ROC AUC: 0.5619
   Cutoff  Precision    Recall  Accuracy  F1-Score
0     0.5   0.539625  0.566813  0.538906  0.552885
1     0.6   0.670543  0.047371  0.509159  0.088491
2     0.7   0.818182  0.004929  0.498967  0.009799
3     0.8   1.000000  0.001917  0.498003  0.003826
4     0.9   0.000000  0.000000  0.497039  0.000000

=== Метрики. LogisticRegression with elasticnet TEST ===
ROC AUC: 0.5006
   Cutoff  Precision    Recall  Accuracy  F1-Score
0     0.5   0.515755  0.412467  0.498294  0.458364
1     0.6   0.584746  0.091512  0.498976  0.158257
2     0.7   0.461538  0.007958  0.484642  0.015645
3     0.8   0.500000  0.001326  0.485324  0.002646
4     0.9   0.000000  0.000000  0.485324  0.000000


In [76]:
# Display the training feature matrix (all rows, feature columns only).
train_data.loc[:, features]

Unnamed: 0,close,high,low,open,year,month,day,hour,minute,shift_1,...,obv,MACD,MACDsignal,MACDhist,close_roc,ema_close_up_down,distance_close_ema,rsi_line,cluster,last_n_ticker_cluster
0,116.118713,116.862259,114.584198,114.639999,2024,1,2,9,0,114.718681,...,1.394537e+09,3.091948,2.741690,0.350258,6.050886,1,7.748282,1,0,2
1,115.550720,116.235474,114.777100,116.103531,2024,1,2,10,0,116.118713,...,1.321068e+09,3.117456,2.816843,0.300613,5.069145,1,6.717044,1,2,2
2,113.918266,115.545654,113.386223,115.545654,2024,1,2,11,0,115.550720,...,1.211747e+09,2.971690,2.847813,0.123878,2.426103,1,4.756552,0,1,2
3,113.075485,114.289963,113.060837,113.842278,2024,1,2,12,0,113.918266,...,1.197013e+09,2.756391,2.829528,-0.073138,1.100286,1,3.661270,0,1,2
4,113.493744,114.036896,113.057915,113.057915,2024,1,2,13,0,113.075485,...,1.197986e+09,2.589662,2.781555,-0.191893,1.840779,1,3.816333,0,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7256,175.228424,175.228424,174.580582,175.123611,2024,10,30,17,0,175.119507,...,1.842832e+10,-1.122529,-0.738307,-0.384223,-1.829929,-1,-2.375672,0,0,5
7257,174.239685,175.637756,174.101288,175.222687,2024,10,30,18,0,175.228424,...,1.842832e+10,-1.207928,-0.832231,-0.375697,-2.080807,-1,-3.147352,0,2,5
7258,174.485397,174.637329,173.740509,174.231384,2024,10,30,19,0,174.239685,...,1.842832e+10,-1.241469,-0.914078,-0.327390,-1.957964,-1,-2.714437,0,3,5
7259,176.364243,176.364243,173.476990,174.450211,2024,10,30,20,0,174.485397,...,1.842832e+10,-1.103720,-0.952007,-0.151713,-0.459525,-1,-0.781683,0,0,5


In [80]:
# Print the elastic-net model's predicted probability (column index 1 of
# predict_proba) for every training row, expressed in percent.
train_features_scaled = trained_model_elasticnet.scaler.transform(train_data[features])
train_probabilities = trained_model_elasticnet.model.predict_proba(train_features_scaled)[:, 1]
for probability_pct in train_probabilities * 100:
    print(probability_pct)

37.37949325150502
39.79392751481407
43.37428511674331
51.400061403016494
56.759374349884986
57.43924240697531
62.94791254997685
56.81581671996925
50.79655269480311
60.84120024722024
55.393317139499466
54.66485207826097
61.73933481404962
52.15966444303989
46.46587398972343
52.82083598716875
39.528950482400774
44.735613819535075
51.8512015292238
49.543651453881324
46.40323229143258
44.00596505278471
41.77044401608749
49.81372479807628
45.12151369853564
39.919366063655495
51.53143613151813
31.19455964040263
37.14854356524825
48.74599363567287
54.26756078252426
35.89785729796203
36.4805220331556
84.0194194334062
81.76319102219851
53.46724009007663
50.0993880311414
54.59225738457929
50.97818842267735
47.80193156133108
49.01442981880795
64.04125268211433
52.0406389343988
36.74991535950285
40.44633635438195
46.27849752007522
47.69306119591322
61.55690053075471
60.5363918022696
52.64519742485242
57.08276012469745
55.3778296707365
45.35405596843242
45.80762800854383
44.68528783266718
38.5106643

In [None]:
# TODO: overlay the plot