In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import pandas as pd
import numpy as np
import timescaledb_model as tsdb
from analyze import get_files_infos_df, read_file, get_files_infos_windows_df
from tqdm import tqdm 
from utils import multi_read_df_from_paths
from constant import DATA_PATH
import os

In [5]:
# Location of the pickled file-information DataFrame inside DATA_PATH.
files_info_path = os.path.join(DATA_PATH, 'files_infos.pkl')
# Cache (re)build steps, currently disabled: uncomment one of the two builders
# together with the `to_pickle` line to write the pickle at `files_info_path`.
# NOTE(review): presumably the no-argument builders rescan the raw files — confirm in analyze.py.
# files_infos_df = get_files_infos_df()
# files_infos_df = get_files_infos_windows_df()  # for windows
# files_infos_df.to_pickle(files_info_path)

## Files Dataframe - Filtering
Keep one file per hour and per market, for one randomly sampled day of each month.

In [57]:
# Build the file-information DataFrame, handing the helper the pickle path defined earlier.
# NOTE(review): presumably the helper loads the cached pickle when given a path — confirm in analyze.py.
files_infos_df = get_files_infos_df(files_info_path)
# files_infos_df = get_files_infos_windows_df(files_info_path) #for windows

In [292]:
# Draw one random row per `year_month` group and keep its `date`, i.e. one
# sampled day for every month present in the data.
# `sample` is called without `random_state`, so it draws from NumPy's global
# random state; seeding it first makes the draw repeatable.
np.random.seed(10)
dates_to_take = (
    files_infos_df
    .groupby([files_infos_df["year_month"]])
    .apply(lambda x: x.sample(n=1))
    .reset_index(drop=True)["date"]
)
# Keep every file recorded on one of the sampled days. The explicit `.copy()`
# makes the result an independent frame, so adding columns to it afterwards
# does not write into a selection of `files_infos_df` (SettingWithCopyWarning).
file_infos_df_filtered = files_infos_df[files_infos_df["date"].isin(dates_to_take)].copy()

In [293]:
# Collapse the sampled files to one row per (date, hour, market); `first()`
# keeps the first non-null value of every remaining column in each group.
# The `hour` column is added with `assign` on a new frame rather than written
# into `file_infos_df_filtered`, which originates from a boolean selection of
# `files_infos_df` (in-place column writes there raise SettingWithCopyWarning).
# NOTE: this cell is not idempotent — `reset_index()` replaces the index that
# `.hour` is read from, so re-run the sampling cell before re-running this one.
files_with_hour = file_infos_df_filtered.assign(hour=file_infos_df_filtered.index.hour)
file_infos_df_filtered = (
    files_with_hour
    .groupby([files_with_hour["date"], files_with_hour["hour"], files_with_hour["market"]])
    .first()
    .reset_index()
)

# Utils

In [337]:
import re

def is_possible_ticker_with_number(symbol):
    """Tell whether `symbol` looks like a ticker that may contain digits.

    Args:
        symbol: string to check.

    Returns:
        True when `symbol` consists of 1 to 10 characters taken from the
        uppercase letters A-Z, the digits 0-9 and '-'; False otherwise.
    """
    # `fullmatch` requires the whole string to match; `match` with a `$`
    # anchor would also accept a symbol followed by a trailing newline.
    return re.fullmatch(r'[A-Z0-9\-]{1,10}', symbol) is not None

def is_possible_ticker(symbol):
    """Tell whether `symbol` looks like a letters-only ticker.

    Args:
        symbol: string to check.

    Returns:
        True when `symbol` consists of 1 to 10 uppercase letters A-Z;
        False otherwise.
    """
    # `fullmatch` requires the whole string to match; `match` with a `$`
    # anchor would also accept a symbol followed by a trailing newline.
    return re.fullmatch(r'[A-Z]{1,10}', symbol) is not None

def detection_intrus(df: pd.DataFrame):
    """Return the rows of `df` whose `symbol` does not look like a plain ticker.

    Two boolean columns are added to the returned rows:
    `is_ticker` (result of `is_possible_ticker_with_number`) and
    `is_ticker_just_letters` (result of `is_possible_ticker`).
    A row is returned when at least one of the two checks fails.

    The checks are computed on a copy, so the caller's DataFrame is left
    untouched instead of silently gaining the two helper columns.

    Args:
        df: DataFrame with a `symbol` column of strings.

    Returns:
        A DataFrame holding the flagged rows and the two check columns.
    """
    checked = df.assign(
        is_ticker=df["symbol"].apply(is_possible_ticker_with_number).astype(bool),
        is_ticker_just_letters=df["symbol"].apply(is_possible_ticker).astype(bool),
    )
    return checked[~checked["is_ticker"] | ~checked["is_ticker_just_letters"]]

In [296]:
def get_prefix(df: pd.DataFrame, prefix_function = lambda x : x[0:3]) -> set:
    """Collect the distinct prefixes of the symbols in `df`.

    The prefixes are computed on the fly; `df` is not modified (no `prefix`
    column is added to the caller's DataFrame).

    Args:
        df: DataFrame with a `symbol` column.
        prefix_function: callable mapping one symbol to its prefix; by default
            the first three characters of the symbol.

    Returns:
        The set of distinct values returned by `prefix_function`.
    """
    prefixes = df["symbol"].apply(prefix_function)
    return set(prefixes.unique())

In [297]:
def _symbol_nf_1rP(symbol):
    """Drop the 3-character prefix and keep what precedes the first underscore.

    Symbols that are exactly 15 characters long map to NaN.
    NOTE(review): the reason for excluding 15-character symbols is not visible here.
    """
    if len(symbol) == 15:
        return np.nan
    return symbol[3:].split("_")[0]


def _symbol_nf_1rA(symbol):
    """Drop the 3-character prefix."""
    return symbol[3:]


def _symbol_nf_1rE(symbol):
    """Drop the first 4 characters."""
    return symbol[4:]


def _symbol_nf_FF1(symbol):
    """Keep the second underscore-separated field."""
    return symbol.split("_")[1]


# Symbol prefix (first three characters) -> function turning the raw symbol
# into its `symbol_nf` value.
map_prefix_to_symbol_nf = {
    "1rP": _symbol_nf_1rP,  # EuroNext Paris
    "1rA": _symbol_nf_1rA,  # EuroNext Amsterdam
    "1rE": _symbol_nf_1rE,  # EuroNext Paris
    "FF1": _symbol_nf_FF1,  # EuroNext Brussels
}

def _mid_paris(is_compA):
    """Return 7 when `is_compA` is truthy, 8 otherwise."""
    if is_compA:
        return 7
    return 8


# Symbol prefix (first three characters) -> function of the `is_compA` flag
# returning the `mid` value for that prefix.
map_prefix_to_mid = {
    "1rP": _mid_paris,  # EuroNext Paris
    "1rA": lambda _flag: 6,  # EuroNext Amsterdam
    "1rE": _mid_paris,  # EuroNext Paris
    "FF1": lambda _flag: 10,  # EuroNext Brussels
}


def add_symbol_nf_column(
    df: pd.DataFrame,
) -> pd.DataFrame:
    """Add a `symbol_nf` column derived from each row's `prefix` and `symbol`.

    For every row, the function registered for the row's `prefix` in
    `map_prefix_to_symbol_nf` is applied to the row's `symbol`; rows whose
    prefix has no entry keep their `symbol` unchanged.

    The column is built in one pass over the two columns instead of a
    row-wise `DataFrame.apply`: this avoids creating a Series per row and
    also works on an empty frame, where row-wise `apply` does not return a
    Series that can be assigned to a single column.

    Args:
        df: DataFrame with `prefix` and `symbol` columns; modified in place.

    Returns:
        The same `df` object, with the `symbol_nf` column added.
    """
    def to_symbol_nf(prefix, symbol):
        # Unknown prefixes fall back to the raw symbol.
        extractor = map_prefix_to_symbol_nf.get(prefix)
        return symbol if extractor is None else extractor(symbol)

    df["symbol_nf"] = [
        to_symbol_nf(prefix, symbol)
        for prefix, symbol in zip(df["prefix"], df["symbol"])
    ]
    return df


def add_mid_column(
    df: pd.DataFrame,
    mid_default: int,
    is_compA: bool,
) -> pd.DataFrame:
    """Add a `mid` column derived from each row's `prefix`.

    For every row, the function registered for the row's `prefix` in
    `map_prefix_to_mid` is called with `is_compA`; rows whose prefix has no
    entry receive `mid_default`.

    The column is built in one pass over the `prefix` column instead of a
    row-wise `DataFrame.apply`: this avoids creating a Series per row and
    also works on an empty frame, where row-wise `apply` does not return a
    Series that can be assigned to a single column.

    Args:
        df: DataFrame with a `prefix` column; modified in place.
        mid_default: value used for prefixes missing from `map_prefix_to_mid`.
        is_compA: flag forwarded to the per-prefix functions.

    Returns:
        The same `df` object, with the `mid` column added.
    """
    def to_mid(prefix):
        # Unknown prefixes fall back to the caller-supplied default.
        resolver = map_prefix_to_mid.get(prefix)
        return mid_default if resolver is None else resolver(is_compA)

    df["mid"] = [to_mid(prefix) for prefix in df["prefix"]]
    return df

In [298]:
def dfs_to_companie(dfs: list[pd.DataFrame], default_mid: int, is_compA: bool = False) -> pd.DataFrame:
    """Build one row per symbol from a list of DataFrames.

    The frames are concatenated and reduced to the distinct (symbol, name)
    pairs, then to a single name per symbol. `symbol_nf` and `mid` are
    derived from the first three characters of the symbol.

    NOTE(review): when a symbol appears under several names, the name kept is
    the last one in the sorted order of the (symbol, name) groups, not the
    most recently seen one — confirm this is intended.

    Args:
        dfs: DataFrames that each have `symbol` and `name` columns.
        default_mid: `mid` given to symbols whose prefix is not mapped.
        is_compA: flag forwarded to `add_mid_column`.

    Returns:
        DataFrame with the columns `symbol`, `name`, `symbol_nf` and `mid`.
    """
    # The original index is dropped before grouping on the columns.
    stacked = pd.concat(dfs).reset_index(drop=True)

    # Distinct (symbol, name) pairs, ordered by the groupby keys.
    distinct_pairs = (
        stacked
        .groupby(["symbol", "name"])
        .size()
        .reset_index()[["symbol", "name"]]
    )

    # One row per symbol, keeping the last name of each symbol's group.
    one_per_symbol = distinct_pairs.groupby("symbol", as_index=False).last()

    # `prefix` is only a working column for the two helpers.
    return (
        one_per_symbol
        .assign(prefix=one_per_symbol["symbol"].apply(lambda sym: sym[:3]))
        .pipe(add_symbol_nf_column)
        .pipe(add_mid_column, default_mid, is_compA)
        .drop(columns=["prefix"])
    )

## Amsterdam - Companies handling

In [299]:
# Paths of the selected files of the "amsterdam" market, read into a list of DataFrames.
is_amsterdam = file_infos_df_filtered["market"] == "amsterdam"
dfs_amsterdam = multi_read_df_from_paths(file_infos_df_filtered.loc[is_amsterdam, "path"].tolist())

100%|██████████| 540/540 [00:10<00:00, 51.23it/s]


In [392]:

dfs_amsterdam[0]

Unnamed: 0_level_0,last,volume,symbol,name,last_suffix
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AKTX,1.99,32071,AKTX,AKARI THERP SP ADR,
FLWS,13.00,119867,FLWS,1-800-FLOWERS.COM-A,
YI,9.79,5147,YI,111 SP ADS-A,
PIH,4.30,2641,PIH,1347 PROPT INS H,
PIHPP,19.80,739,PIHPP,1347PTY 8%CPRP RG-A,
...,...,...,...,...,...
ZS,45.01,517056,ZS,ZSCALER,
ZUMZ,22.09,372049,ZUMZ,ZUMIEZ,
ZGEN,9.76,0,ZGEN,ZYMOGENETICS INC LGT ZLP,
ZYNE,3.88,250012,ZYNE,ZYNERBA PHARMA,


Unnamed: 0_level_0,last,volume,symbol,name,last_suffix
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [335]:
# `mid` given to Amsterdam symbols whose prefix has no entry in `map_prefix_to_mid`.
mid = 6  # TODO get mid from db
amsterdam_companies = dfs_to_companie(dfs_amsterdam, default_mid=mid)

In [336]:
amsterdam_companies

Unnamed: 0,symbol,name,symbol_nf,mid
0,124718367,ASTRAZENECA FIN CVR,124718367,6
1,AABA,ALTABA,AABA,6
2,AACG,ATA CRTV GLB SP ADR,AACG,6
3,AACI,ARMADA ACQN I,AACI,6
4,AACIU,ARMADA ACQN I UTS,AACIU,6
...,...,...,...,...
6684,ZWRKU,Z-WORK ACQN UTS 26,ZWRKU,6
6685,ZY,ZYMERGEN,ZY,6
6686,ZYME,ZYMEWORKS,ZYME,6
6687,ZYNE,ZYNERBA PHARMA,ZYNE,6


In [338]:
detection_intrus(amsterdam_companies)

Unnamed: 0,symbol,name,symbol_nf,mid,is_ticker,is_ticker_just_letters
0,124718367,ASTRAZENECA FIN CVR,124718367,6,True,False
290,ALKS_V,ALKERMES-EX,ALKS_V,6,False,False
2410,GASS1,GASS1 PACKAGE,GASS1,6,True,False
2414,GB2814265,MAGMA DESIGN AUTO,GB2814265,6,True,False
2684,GSRM_R,GSR II METEORA RTS,GSRM_R,6,False,False
5375,SGBX_V,SAFE & GREEN RG-EX,SGBX_V,6,False,False
5379,SGDV_V,SAFE AND RG-WI,SGDV_V,6,False,False
5950,TK112803851,FRD FUT INTELL RG-A,TK112803851,6,False,False


## CompA - Companies handling

In [339]:
# Paths of the selected files of the "compA" market, read into a list of DataFrames.
is_compA_file = file_infos_df_filtered["market"] == "compA"
dfs_compA = multi_read_df_from_paths(file_infos_df_filtered.loc[is_compA_file, "path"].tolist())

100%|██████████| 540/540 [00:01<00:00, 277.14it/s]


In [398]:
# Rows of the last compA frame whose `last_suffix` equals "s".
last_compA_df = dfs_compA[-1]
last_compA_df.loc[last_compA_df["last_suffix"] == "s"]

Unnamed: 0_level_0,last,volume,symbol,name,last_suffix
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1rPRE,175.0,0,1rPRE,SRDCOLAS,s


In [340]:
# `mid` given to compA symbols whose prefix has no entry in `map_prefix_to_mid`.
mid = 7  # TODO get mid from db
# Pass `mid` rather than repeating the literal 7, so that changing the value
# above takes effect here, as in the cells of the other markets.
compA_companies = dfs_to_companie(dfs_compA, mid, is_compA=True)

In [343]:
compA_companies

Unnamed: 0,symbol,name,symbol_nf,mid
0,1rAAF,AIR FRANCE - KLM,AF,6
1,1rAAFA,AIR FRANCE - KLM,AFA,6
2,1rAENX,EURONEXT,ENX,6
3,1rAGTO,GEMALTO,GTO,6
4,1rARDSADT,ROYAL D SH DRP 21,RDSADT,6
...,...,...,...,...
404,FF11_ENX,EURONEXT,ENX,10
405,FF11_FP,TOTALENERGIES,FP,10
406,FF11_SEV,SUEZ,SEV,10
407,FF11_SGO,SAINT-GOBAIN,SGO,10


In [344]:
# NOTE(review): the result shown below appears to flag every compA row — the raw
# `symbol` values start with a market prefix containing a lowercase letter
# ("1rA", "1rP") or contain an underscore ("FF11_"), which neither ticker
# pattern accepts. Checking `symbol_nf` would presumably be more informative — confirm intent.
detection_intrus(compA_companies)

Unnamed: 0,symbol,name,symbol_nf,mid,is_ticker,is_ticker_just_letters
0,1rAAF,AIR FRANCE - KLM,AF,6,False,False
1,1rAAFA,AIR FRANCE - KLM,AFA,6,False,False
2,1rAENX,EURONEXT,ENX,6,False,False
3,1rAGTO,GEMALTO,GTO,6,False,False
4,1rARDSADT,ROYAL D SH DRP 21,RDSADT,6,False,False
...,...,...,...,...,...,...
404,FF11_ENX,EURONEXT,ENX,10,False,False
405,FF11_FP,TOTALENERGIES,FP,10,False,False
406,FF11_SEV,SUEZ,SEV,10,False,False
407,FF11_SGO,SAINT-GOBAIN,SGO,10,False,False


In [345]:
get_prefix(compA_companies)

{'1rA', '1rP', 'FF1'}

In [367]:
# Symbols of the compA frame that were tagged with mid 7.
has_mid_7 = compA_companies["mid"] == 7
symbol_paris_compA = set(compA_companies.loc[has_mid_7, "symbol"])

## CompB - Companies handling

In [346]:
# Paths of the selected files of the "compB" market, read into a list of DataFrames.
is_compB_file = file_infos_df_filtered["market"] == "compB"
dfs_compB = multi_read_df_from_paths(file_infos_df_filtered.loc[is_compB_file, "path"].tolist())

100%|██████████| 540/540 [00:02<00:00, 264.06it/s]


In [347]:
# `mid` given to compB symbols whose prefix has no entry in `map_prefix_to_mid`.
mid = 8  # TODO get mid from db
compB_companies = dfs_to_companie(dfs_compB, default_mid=mid, is_compA=False)

In [348]:
compB_companies.value_counts("mid")

mid
8     394
10      2
Name: count, dtype: int64

In [349]:
compB_companies

Unnamed: 0,symbol,name,symbol_nf,mid
0,1rP03227,LATECOERE,03227,8
1,1rP2MX,TERACT R PFD,2MX,8
2,1rP5478,CLARANOVA,5478,8
3,1rPAAA,ALAN ALLMAN ASSOCIATES,AAA,8
4,1rPAAC,ACCOR ACQ COM R PFD,AAC,8
...,...,...,...,...
391,1rPXFAB,X-FAB SILICON,XFAB,8
392,1rPXIL,XILAM ANIMATION,XIL,8
393,1rPYSYT,SALVEPORN01JAN22EX,YSYT,8
394,FF11_AKA,AKKA TECHNOLOGIES,AKA,10


In [351]:
get_prefix(compB_companies)

{'1rP', 'FF1'}

In [365]:
# Symbols of the compB frame that were tagged with mid 8.
has_mid_8 = compB_companies["mid"] == 8
symbol_paris_compB = set(compB_companies.loc[has_mid_8, "symbol"])

## PEA PME - Companies Handling

In [352]:
# Paths of the selected files of the "peapme" market, read into a list of DataFrames.
is_peapme_file = file_infos_df_filtered["market"] == "peapme"
dfs_peapme = multi_read_df_from_paths(file_infos_df_filtered.loc[is_peapme_file, "path"].tolist())

100%|██████████| 324/324 [00:01<00:00, 236.36it/s]


In [353]:
# `mid` given to peapme symbols whose prefix has no entry in `map_prefix_to_mid`.
mid = 99  # TODO get mid from db
peapme_companies = dfs_to_companie(dfs_peapme, default_mid=mid, is_compA=False)

In [354]:
peapme_companies

Unnamed: 0,symbol,name,symbol_nf,mid
0,1rAADUX,ADUX,ADUX,6
1,1rABESI,BESI,BESI,6
2,1rEPALANT,ANTEVENIO,ALANT,8
3,1rEPALAQU,AQUILA,ALAQU,8
4,1rEPALBDM,BD MULTI MEDIA,ALBDM,8
...,...,...,...,...
672,FF11_EVS,EVS BROADCAST EQU,EVS,10
673,FF11_GKTX,GENKYOTEX,GKTX,10
674,FF11_KIN,KINEPOLIS GROUP,KIN,10
675,FF11_MLMAZ,MAZARO,MLMAZ,10


In [355]:
peapme_companies.value_counts("mid")

mid
8     656
10     19
6       2
Name: count, dtype: int64

## View intersection and difference

In [360]:
# Symbols of the peapme frame that were tagged with mid 8.
peapme_has_mid_8 = peapme_companies["mid"] == 8
symbol_paris_peapme = set(peapme_companies.loc[peapme_has_mid_8, "symbol"])

In [370]:
# All symbols collected from the compA and compB frames.
symbol_paris_comp = symbol_paris_compA.union(symbol_paris_compB)

In [384]:
len(symbol_paris_peapme), len(symbol_paris_comp)

(656, 735)

In [381]:
# Number of symbols present in both the peapme and compA sets.
# NOTE(review): any symbol counted here is tagged mid 8 in `peapme_companies`
# but mid 7 in `compA_companies`, because the peapme frame is built with
# is_compA=False — confirm which value is wanted.
len(symbol_paris_peapme & symbol_paris_compA)

25

In [383]:
# Symbols of the peapme set that appear in neither the compA nor the compB set.
symbol_paris_peapme - symbol_paris_comp

{'1rEPALANT',
 '1rEPALAQU',
 '1rEPALBDM',
 '1rEPALCAR',
 '1rEPALCLA',
 '1rEPALCLS',
 '1rEPALDEI',
 '1rEPALDLS',
 '1rEPALEO2',
 '1rEPALFRE',
 '1rEPALGEN',
 '1rEPALI2S',
 '1rEPALINS',
 '1rEPALLP',
 '1rEPALMAS',
 '1rEPALMET',
 '1rEPALMIL',
 '1rEPALNEV',
 '1rEPALNXT',
 '1rEPALOBR',
 '1rEPALOCT',
 '1rEPALODI',
 '1rEPALPRO',
 '1rEPALSAS',
 '1rEPALSPO',
 '1rEPALSTA',
 '1rEPALSTW',
 '1rEPALTRA',
 '1rEPALTVO',
 '1rEPALUCR',
 '1rEPALVDM',
 '1rEPALVXM',
 '1rEPALWEB',
 '1rEPALWED',
 '1rEPCBOT',
 '1rP03902',
 '1rP06083',
 '1rP2CRSI',
 '1rP7665',
 '1rPABNX',
 '1rPABVNV',
 '1rPADI',
 '1rPADOC',
 '1rPADONV',
 '1rPADV',
 '1rPADVIC',
 '1rPAL2SI',
 '1rPALA2M',
 '1rPALACT',
 '1rPALADM',
 '1rPALADO',
 '1rPALADV',
 '1rPALAFY',
 '1rPALAGO',
 '1rPALAGP',
 '1rPALAIR',
 '1rPALALO',
 '1rPALAM',
 '1rPALAMG',
 '1rPALANV',
 '1rPALARF',
 '1rPALAST',
 '1rPALATA',
 '1rPALATI',
 '1rPALAUD',
 '1rPALAUR',
 '1rPALAVI',
 '1rPALAVY',
 '1rPALBFR',
 '1rPALBI',
 '1rPALBIO',
 '1rPALBIZ',
 '1rPALBKK',
 '1rPALBLD',
 '1rPALBLU',
 