In [None]:
import glob
from os import listdir
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Data prep

In [None]:
# Collect every per-stock time series into one frame (the jumps at the seams
# between consecutive stocks are removed in a later cell).
directory = 'long_history/*.csv'


def load_all_stocks(pattern, name_len=4):
    """Read every CSV matching ``pattern`` into a single DataFrame.

    Each file's rows get a leading ``Names`` column holding the first
    ``name_len`` characters of the file name without its extension.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the CSV files to load.
    name_len : int, default 4
        Number of leading file-name characters used as the stock name.

    Returns
    -------
    pandas.DataFrame
        All files stacked in sorted path order, each keeping its own row
        labels; an empty frame with only a ``Names`` column if nothing matches.
    """
    frames = []
    # Sorted so the row order (and everything derived from it) is reproducible;
    # glob returns paths in arbitrary order.
    for fname in sorted(glob.glob(pattern)):
        df = pd.read_csv(fname)
        # Take the name from the file name itself rather than from a fixed
        # character offset into the path: this survives a directory change and
        # never picks up part of the extension for short names.
        df['Names'] = Path(fname).stem[:name_len]
        other_columns = [col for col in df.columns if col != 'Names']
        frames.append(df[['Names'] + other_columns])
    if not frames:
        return pd.DataFrame(columns=['Names'])
    # One concat at the end instead of re-copying the growing frame per file.
    return pd.concat(frames)


all_stocks = load_all_stocks(directory)

In [None]:
# Replace the per-file row labels with one continuous 0..n-1 index.
all_stocks = all_stocks.reset_index(drop=True)

In [None]:
# Day-over-day relative change of the closing price, as a fraction of the
# previous close.
prev_close = all_stocks['Close'].shift(1)
all_stocks['close_dif'] = (all_stocks['Close'] - prev_close) / prev_close
all_stocks['close_dif'].plot.hist(bins=500, figsize=(15, 5))

In [None]:
# Summary statistics of the combined frame.
all_stocks.describe()

In [None]:
# Row labels where the change is extreme (|change| > 25%; many of these sit at
# the seam between two stocks), together with a few neighbouring rows, so that
# what happened there can be inspected later.
extreme_rows = all_stocks.loc[all_stocks['close_dif'].abs() > 0.25].index.tolist()
with_prev = extreme_rows + [row - 1 for row in extreme_rows]
with_prev_twice = with_prev + [row - 1 for row in with_prev]
strange_index = with_prev_twice + [row + 1 for row in with_prev_twice]

In [None]:
# Blank out the change on the first row of every stock: there the value
# compares against the last close of the previous stock in the combined frame.
# The assignment goes through a single .loc call; the chained form
# df[col][mask] = ... writes to a temporary object under pandas Copy-on-Write
# and leaves the frame untouched.
stock_changed = all_stocks['Names'] != all_stocks['Names'].shift(1)
all_stocks.loc[stock_changed, 'close_dif'] = np.nan

In [None]:
# Look at the strange outliers. These were checked against the news to see
# whether something extraordinary really happened on those days; it did.
flagged_rows = all_stocks.loc[all_stocks.index.isin(strange_index)]
flagged_rows.iloc[50:100]

In [None]:
# Pick out the plainly large moves (beyond +/- 3%, which is more than the
# standard deviation) and report what share of all rows they make up.
big_move = all_stocks['close_dif'].abs() > 0.03
strange_index1 = all_stocks.loc[big_move].index.tolist()
len(strange_index1) / len(all_stocks)

In [None]:
# How the large moves (beyond one standard deviation) are distributed.
stocks_std = all_stocks['close_dif'].std()
beyond_std = all_stocks['close_dif'].abs() > stocks_std
surges = all_stocks.loc[beyond_std]
median_abs_move = surges['close_dif'].abs().median()
surges['close_dif'].plot.hist(
    bins=500,
    figsize=(25, 3),
    title='Median diff=' + str(median_abs_move),
)

In [None]:
# Show the standard deviation of close_dif.
stocks_std

In [None]:
# Mark the days whose absolute change exceeds one standard deviation, then
# count flagged versus unflagged rows.
all_stocks['surges'] = all_stocks['close_dif'].abs().gt(stocks_std)
all_stocks['surges'].value_counts()

In [None]:
# Drop the rows where the opening price or the traded volume is zero.
has_open = all_stocks['Open'] != 0
has_volume = all_stocks['Volume'] != 0
all_stocks = all_stocks[has_open & has_volume]

In [None]:
# Check that no zeros remain.
all_stocks.describe()

In [None]:
# Remove the columns that are not needed and save the whole data set.
# errors='ignore': no cell in this notebook creates an 'index' column
# (reset_index is called with drop=True), so unless the source CSVs provide
# one the drop would otherwise raise KeyError on a fresh run.
# A new frame is assigned instead of dropping in place, because all_stocks is
# the result of a boolean filter at this point.
all_stocks = all_stocks.drop(columns=['index', 'Dividends', 'Stock Splits'], errors='ignore')
all_stocks.to_csv('all_data_surges.csv')

In [None]:
# Save every stock to its own file.
# to_csv does not create missing directories, so make sure the target exists.
Path('long_history/!surges').mkdir(parents=True, exist_ok=True)

names_list = all_stocks['Names'].unique().tolist()
surges_col = all_stocks.columns.get_loc('surges')
for i in names_list:
    # Work on an explicit copy so the assignment below changes this per-stock
    # frame only and is not silently lost.
    stock_surge = all_stocks[all_stocks['Names'] == i].copy()
    # The first row of a file has no preceding row of the same stock to compare
    # with, so it is never reported as a surge. The flag is written with one
    # positional assignment and as a real boolean; the previous
    # .iloc[0]['surges'] = 'False' modified a temporary row copy and used a
    # string.
    stock_surge.iloc[0, surges_col] = False
    stock_surge.to_csv('long_history/!surges/surges - %s.csv' % (i))

In [None]:
# Spot-check the rows whose index labels lie strictly between 633710 and 633716.
all_stocks[(all_stocks.index>633710) & (all_stocks.index<633716)]

In [None]:
# Count flagged versus unflagged rows in the frame as it stands now.
all_stocks['surges'].value_counts()

### Create X & Y for the model

In [None]:
# Reload the saved data set from disk and show its dimensions.
# NOTE(review): if the file was written together with its index, that index
# presumably comes back as an extra unnamed column - confirm.
df_all = pd.read_csv('all_data_surges.csv')
df_all.shape

In [None]:
# Share of each value of the 'surges' flag (class balance of the target).
df_all['surges'].value_counts(normalize=True)

In [None]:
x_window = 20  # number of consecutive daily changes in one feature window

# Plain NumPy arrays: positional slicing inside the loop is far cheaper than
# slicing a pandas Series once per row of the data set.
close_dif_values = df_all['close_dif'].to_numpy(dtype=float)
surge_values = df_all['surges'].to_numpy()
name_values = df_all['Names'].to_numpy()

X = []
Y = []
for counter in range(1, df_all.shape[0] - x_window - 1):
    # NOTE(review): the window covers rows counter .. counter + x_window - 1,
    # so this target is the second row after the window and the row directly
    # after it is skipped - confirm that this gap is intended.
    target_row = counter + x_window + 1
    window = close_dif_values[counter:counter + x_window]
    # A NaN marks a transition between stocks, so such a window is skipped.
    if np.isnan(window).any():
        continue
    # The NaN check only looks inside the window. The label row can still
    # belong to the next stock, so require the same stock name on both ends.
    if name_values[counter] != name_values[target_row]:
        continue
    X.append(window)
    Y.append(surge_values[target_row])

In [None]:
# Sanity check: the number of feature windows and the number of labels.
len(X), len(Y)

In [None]:
# Turn the list of windows into a frame: one row per window.
X_df = pd.DataFrame(X)
X_df.shape

## Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Hold out the tail of the windows (train_test_split's default test share)
# without shuffling. The windows are built with a stride of one row, so
# neighbouring windows share almost all of their values; a shuffled split
# would put near-duplicates of the test rows into the training set and
# inflate the test scores.
X_train, X_test, Y_train, Y_test = train_test_split(X_df, Y, shuffle=False)

In [None]:
# Sizes of the four pieces produced by the split.
len(X_train), len(X_test), len(Y_train), len(Y_test)

### Logistic regression

In [None]:
# Cross-validated logistic regression with all default settings.
logreg = LogisticRegressionCV()

In [None]:
# Fit the logistic regression on the training part.
logreg.fit(X_train, Y_train)

In [None]:
# F1 uses the hard class predictions. ROC AUC uses the predicted probability
# of the positive class: it measures how well the scores rank the rows, and
# hard labels reduce the curve to a single operating point.
logreg_splits = (("Train", X_train, Y_train), ("Test", X_test, Y_test))
# Predict once per split and reuse the result for every metric.
logreg_labels = {split: logreg.predict(features) for split, features, _ in logreg_splits}
# Column 1 of predict_proba belongs to classes_[1], the larger class label.
logreg_scores = {split: logreg.predict_proba(features)[:, 1] for split, features, _ in logreg_splits}

for split, _, target in logreg_splits:
    print(split + " F1: " + str(f1_score(target, logreg_labels[split])))
for split, _, target in logreg_splits:
    # The second line is now labelled "Test"; it used to read "Train" too.
    print(split + " ROC AUC: " + str(roc_auc_score(target, logreg_scores[split])))

In [None]:
# How many rows the logistic regression assigns to each class, first on the
# training part and then on the test part.
for features in (X_train, X_test):
    predicted = logreg.predict(features)
    print(np.unique(predicted, return_counts=True))

### Decision tree

In [None]:
# Decision tree with default settings and a fixed seed.
dt_c = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the training part.
dt_c.fit(X_train, Y_train)

In [None]:
# F1 and recall use the hard class predictions. ROC AUC uses the predicted
# probability of the positive class: it measures how well the scores rank the
# rows, and hard labels reduce the curve to a single operating point.
dt_splits = (("Train", X_train, Y_train), ("Test", X_test, Y_test))
# Predict once per split and reuse the result for every metric.
dt_labels = {split: dt_c.predict(features) for split, features, _ in dt_splits}
# Column 1 of predict_proba belongs to classes_[1], the larger class label.
dt_scores = {split: dt_c.predict_proba(features)[:, 1] for split, features, _ in dt_splits}

for split, _, target in dt_splits:
    print(split + " F1: " + str(f1_score(target, dt_labels[split])))
for split, _, target in dt_splits:
    print(split + " ROC AUC: " + str(roc_auc_score(target, dt_scores[split])))
for split, _, target in dt_splits:
    print(split + " Recall: " + str(recall_score(target, dt_labels[split])))

In [None]:
# Predicted class probabilities of the decision tree for the test part.
dt_c.predict_proba(X_test)

In [None]:
# Share of each predicted class, first on the training part and then on the
# test part.
for features in (X_train, X_test):
    predicted = pd.Series(dt_c.predict(features), name=0)
    print(predicted.value_counts(normalize=True))

In [None]:
# Confusion matrix of the decision tree on the test part, with rows and
# columns ordered like dt_c.classes_.
test_predictions = dt_c.predict(X_test)
test_matrix = confusion_matrix(Y_test, test_predictions, labels=dt_c.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=test_matrix)
disp.plot();

### Gradient boosting

In [None]:
# Gradient boosting with default settings. The seed is fixed, as for the
# decision tree, so that a re-run reproduces the same model.
gb_cl = GradientBoostingClassifier(random_state=42)

In [None]:
# Fit the gradient boosting model on the training part.
gb_cl.fit(X_train, Y_train)

In [None]:
# F1 and recall use the hard class predictions. ROC AUC uses the predicted
# probability of the positive class: it measures how well the scores rank the
# rows, and hard labels reduce the curve to a single operating point.
gb_splits = (("Train", X_train, Y_train), ("Test", X_test, Y_test))
# Predict once per split and reuse the result for every metric.
gb_labels = {split: gb_cl.predict(features) for split, features, _ in gb_splits}
# Column 1 of predict_proba belongs to classes_[1], the larger class label.
gb_scores = {split: gb_cl.predict_proba(features)[:, 1] for split, features, _ in gb_splits}

for split, _, target in gb_splits:
    print(split + " F1: " + str(f1_score(target, gb_labels[split])))
for split, _, target in gb_splits:
    print(split + " ROC AUC: " + str(roc_auc_score(target, gb_scores[split])))
for split, _, target in gb_splits:
    print(split + " Recall: " + str(recall_score(target, gb_labels[split])))

In [None]:
# How many rows the gradient boosting model assigns to each class, first on
# the training part and then on the test part.
for features in (X_train, X_test):
    predicted = gb_cl.predict(features)
    print(np.unique(predicted, return_counts=True))

### Random forest

In [None]:
# Random forest with default settings. The seed is fixed, as for the decision
# tree: the forest draws random samples and features, so without it every
# re-run gives different scores.
rf_cl = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training part.
rf_cl.fit(X_train, Y_train)

In [None]:
# F1 and recall use the hard class predictions. ROC AUC uses the predicted
# probability of the positive class: it measures how well the scores rank the
# rows, and hard labels reduce the curve to a single operating point.
rf_splits = (("Train", X_train, Y_train), ("Test", X_test, Y_test))
# Predict once per split and reuse the result for every metric.
rf_labels = {split: rf_cl.predict(features) for split, features, _ in rf_splits}
# Column 1 of predict_proba belongs to classes_[1], the larger class label.
rf_scores = {split: rf_cl.predict_proba(features)[:, 1] for split, features, _ in rf_splits}

for split, _, target in rf_splits:
    print(split + " F1: " + str(f1_score(target, rf_labels[split])))
for split, _, target in rf_splits:
    print(split + " ROC AUC: " + str(roc_auc_score(target, rf_scores[split])))
for split, _, target in rf_splits:
    print(split + " Recall: " + str(recall_score(target, rf_labels[split])))

In [None]:
# How many rows the random forest assigns to each class, first on the
# training part and then on the test part.
for features in (X_train, X_test):
    predicted = rf_cl.predict(features)
    print(np.unique(predicted, return_counts=True))

## Garbage

In [None]:
# Scratch cell: read every CSV again and stack the frames under a fresh
# 0..n-1 index. This rebinds all_stocks, and no 'Names' column is added here.
directory = 'long_history/*.csv'
li = [pd.read_csv(fname) for fname in glob.glob(directory)]

all_stocks = pd.concat(li, axis=0, ignore_index=True)