In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import timescaledb_model as tsdb
from analyze import get_files_infos_df, store_file, get_files_infos_windows_df
from tqdm import tqdm 
from utils import multi_read_df_from_paths
from constant import DATA_PATH
import os

Logs of timescaledb_model go to ../data/bourse.log


In [3]:
# files_info_path = os.path.join(DATA_PATH, 'files_infos.pkl')
# files_infos_df = get_files_infos_df()
# files_infos_df = get_files_infos_windows_df()  # for windows
# files_infos_df.to_pickle(files_info_path)

## Files Dataframe - Filtering
Keep one file per hour (and per market) for one randomly sampled day of each month.

In [4]:
# Path of the cached files-info pickle. It is defined here so that this cell runs on a
# fresh kernel: the only other assignment lives in a commented-out cell.
files_info_path = os.path.join(DATA_PATH, 'files_infos.pkl')
files_infos_df = get_files_infos_df(files_info_path)
# files_infos_df = get_files_infos_windows_df(files_info_path) #for windows

In [240]:
# Seed NumPy's global RNG before the per-month draw below.
np.random.seed(42)
# Draw one random row from each "year_month" group, then keep only its date.
one_row_per_month = files_infos_df.groupby([files_infos_df["year_month"]]).apply(
    lambda month_rows: month_rows.sample(n=1)
)
dates_to_take = one_row_per_month.reset_index(drop=True)["date"]
# Keep every file whose date is one of the sampled dates.
is_sampled_date = files_infos_df["date"].isin(dates_to_take)
file_infos_df_filtered = files_infos_df[is_sampled_date]

In [199]:
# Add the hour with assign(), which returns a new frame, instead of writing a column
# into the boolean-filtered slice (that pattern can raise SettingWithCopyWarning).
files_with_hour = file_infos_df_filtered.assign(hour=file_infos_df_filtered.index.hour)
# For each (date, hour, market) group keep the first non-null value of every column,
# then turn the group keys back into regular columns.
file_infos_df_filtered = (
    files_with_hour
    .groupby([files_with_hour["date"], files_with_hour["hour"], files_with_hour["market"]])
    .first()
    .reset_index()
)

In [200]:
# dfs = multi_read_df_from_paths(list(file_infos_df_filtered["path"]))

In [None]:
# prefixes = set()
# for i, df in enumerate(dfs):
#     current_prefixes = get_prefix(df)
#     if not current_prefixes.issubset(prefixes):
#         prefixes = prefixes.union(current_prefixes)
#         # print(current_prefixes, i)
# prefixes

## PEA PME - Symbol NF handling

### Check whether every file from PEA PME has the same prefixes

In [213]:
# Select the paths of the files whose market is "peapme", then read them all.
is_peapme = file_infos_df_filtered["market"] == "peapme"
peapme_paths = list(file_infos_df_filtered[is_peapme]["path"])
dfs = multi_read_df_from_paths(peapme_paths)

100%|██████████| 34804/34804 [02:35<00:00, 224.47it/s]


In [203]:
def get_prefix(df: pd.DataFrame, prefix_function = lambda x : x[0:3]) -> set:
    """Return the set of distinct prefixes found in ``df["symbol"]``.

    Args:
        df: Frame with a ``"symbol"`` column.
        prefix_function: Callable applied to each symbol to extract its prefix.
            Defaults to the first three characters.

    Returns:
        The set of unique values produced by ``prefix_function``.

    The input frame is left untouched: the prefixes are computed on a temporary
    Series instead of being stored in a ``"prefix"`` column of ``df``.
    """
    prefixes = df["symbol"].apply(prefix_function)
    return set(prefixes.unique())


In [205]:
# Per-prefix rules that turn a raw symbol string into its "symbol_nf" value.
# Keys are the first three characters of the symbol.
map_prefix_to_symbol_nf = {
    # Euronext Paris: drop the 3-character prefix and everything from the first "_" on;
    # symbols that are exactly 15 characters long map to NaN.
    "1rP": lambda symbol: np.nan if len(symbol) == 15 else symbol[3:].partition("_")[0],
    # Euronext Amsterdam: drop the 3-character prefix.
    "1rA": lambda symbol: symbol[3:],
    # Euronext Paris Real Time: drop the first four characters.
    "1rE": lambda symbol: symbol[4:],
    # Euronext Brussels: keep the piece that follows the first "_".
    "FF1": lambda symbol: symbol.split("_")[1],
}

In [206]:
def filter_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``"last"`` value is not 0."""
    has_nonzero_last = df["last"].ne(0)
    return df[has_nonzero_last]

def add_symbol_nf_column(df: pd.DataFrame, map_prefix_to_symbol_nf: dict) -> pd.DataFrame:
    """Return a copy of ``df`` with ``"prefix"`` and ``"symbol_nf"`` columns added.

    Args:
        df: Frame with a ``"symbol"`` column of strings.
        map_prefix_to_symbol_nf: Maps a 3-character prefix to a callable that
            receives the full symbol and returns its ``symbol_nf`` value.

    Returns:
        A new frame where ``"prefix"`` is the first three characters of each symbol
        and ``"symbol_nf"`` is the mapped value. Symbols whose prefix has no entry
        in the mapping are kept unchanged.

    The work is done on a copy so the caller's frame (often a filtered slice) is not
    mutated. The columns are built from plain lists instead of a row-wise
    ``DataFrame.apply``, which also makes an empty input frame work.
    """
    result = df.copy()
    symbols = result["symbol"].tolist()
    prefixes = [symbol[0:3] for symbol in symbols]
    result["prefix"] = prefixes
    result["symbol_nf"] = [
        map_prefix_to_symbol_nf[prefix](symbol) if prefix in map_prefix_to_symbol_nf else symbol
        for prefix, symbol in zip(prefixes, symbols)
    ]
    return result

In [207]:
# Work on the last frame in `dfs`: filter it, then add the prefix / symbol_nf columns.
df = add_symbol_nf_column(filter_df(dfs[-1]), map_prefix_to_symbol_nf)

In [208]:
# Length of each symbol_nf value. Series.str.len() yields NaN for missing values,
# whereas apply(len) raises TypeError on the NaN that the "1rP" rule returns for
# 15-character symbols.
df["len"] = df["symbol_nf"].str.len()
df.sort_values("len", ascending=False)

Unnamed: 0_level_0,last,volume,symbol,name,last_suffix,prefix,symbol_nf,len
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1rEPALMIL,23.600,0,1rEPALMIL,1000MERCIS,c,1rE,ALMIL,5
1rEPALMAS,0.020,0,1rEPALMAS,MASTRAD,s,1rE,ALMAS,5
1rPMLCLI,1.980,0,1rPMLCLI,MAISON CLIO,c,1rP,MLCLI,5
1rPMLMAD_OTC,7.500,0,1rPMLMAD_OTC,MADE,c,1rP,MLMAD,5
1rPALMII,3.720,601,1rPALMII,M2I,,1rP,ALMII,5
...,...,...,...,...,...,...,...,...
1rPEQS,57.000,3914,1rPEQS,SRDEQUASENS,,1rP,EQS,3
1rPIPH,2.295,108349,1rPIPH,SRDINNATE PHARMA,,1rP,IPH,3
1rPGV,0.002,0,1rPGV,GENOMIC VISION,s,1rP,GV,2
1rPAB,2.920,38285,1rPAB,SRDAB SCIENCE,,1rP,AB,2


In [215]:
# Concatenate all frames in `dfs` row-wise into a single dataframe.
df_all_days = pd.concat(dfs)

In [222]:
# Discard the original index, then count the non-null values of every remaining column
# for each (symbol, name) pair.
symbol_name_groups = df_all_days.reset_index(drop=True).groupby(["symbol", "name"])
df_all_days_grouped = symbol_name_groups.count()

In [252]:
# Copy the "name" index level into a regular column so it survives the regrouping below.
df_all_days_grouped["name"] = df_all_days_grouped.index.get_level_values("name")
# One row per symbol: the last non-null value of each column within the symbol's group.
df_filtered = df_all_days_grouped.groupby(level="symbol").last()
df_filtered

Unnamed: 0_level_0,last,volume,last_suffix,name
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1rAADUX,34803,34803,5133,ADUX
1rABESI,34803,34803,3795,BESI
1rEPALANT,10844,10844,362,ANTEVENIO
1rEPALAQU,34782,34782,21699,AQUILA
1rEPALBDM,34775,34775,7336,BD MULTI MEDIA
...,...,...,...,...
FF11_EVS,34803,34803,2820,EVS BROADCAST EQU
FF11_GKTX,20058,20058,13762,GENKYOTEX
FF11_KIN,34803,34803,2680,KINEPOLIS GROUP
FF11_MLMAZ,21097,21097,18756,MAZARO


In [None]:
def pea_pme_df_to_companies(dfs: list[pd.DataFrame]):
    """Placeholder: not implemented yet.

    The body is a bare ``...``, so calling this currently does nothing and
    returns ``None``.

    NOTE(review): judging by the name, this is presumably meant to derive the
    companies from the PEA-PME frames. Confirm the intended output.
    """
    #TODO
    ...

## CompA - Symbol NF handling