In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import timescaledb_model as tsdb
from analyze import get_files_infos_df, read_file, get_files_infos_windows_df
from tqdm import tqdm 
from utils import multi_read_df_from_paths
from constant import DATA_PATH, IS_DOCKER
import os

In [3]:
files_info_path = os.path.join(DATA_PATH, 'files_infos.pkl')
# files_infos_df = get_files_infos_df()
# files_infos_df = get_files_infos_windows_df()  # for windows
# files_infos_df.to_pickle(files_info_path)

In [4]:
files_infos_df = get_files_infos_df(files_info_path)
# files_infos_df = get_files_infos_windows_df(files_info_path) #for windows

In [5]:
np.random.seed(10)
dates_to_take = files_infos_df.groupby([files_infos_df["year_month"]]).apply(lambda x :x.sample(n=1)).reset_index(drop=True)["date"]
file_infos_df_filtered = files_infos_df[files_infos_df["date"].isin(dates_to_take)]
file_infos_df_filtered["hour"] = file_infos_df_filtered.index.hour
file_infos_df_filtered = file_infos_df_filtered.groupby([file_infos_df_filtered["date"], file_infos_df_filtered["hour"], file_infos_df_filtered["market"]]).first().reset_index()

  dates_to_take = files_infos_df.groupby([files_infos_df["year_month"]]).apply(lambda x :x.sample(n=1)).reset_index(drop=True)["date"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_infos_df_filtered["hour"] = file_infos_df_filtered.index.hour


## DB

In [108]:
import timescaledb_model as tsdb
db = (
    tsdb.TimescaleStockMarketModel("bourse", "ricou", "db", "monmdp")
    if IS_DOCKER
    else tsdb.TimescaleStockMarketModel("bourse", "ricou", "localhost", "monmdp")
)

Logs of timescaledb_model go to ../data/bourse.log


# Utils

In [7]:
import re

def is_possible_ticker_with_number(symbol):
    return bool(re.match(r'^[A-Z0-9\-]{1,10}$', symbol))

def detection_intrus(df: pd.DataFrame):
    return df[(df["ticker"].apply(is_possible_ticker_with_number) == False)]

In [8]:
def get_prefix(df: pd.DataFrame, prefix_function = lambda x : x[0:3]) -> set:
    df["prefix"] = df["symbol"].apply(prefix_function)
    return set(df["prefix"].unique())

In [9]:
map_prefix_to_symbol_nf = {
    "1rP": lambda x: x[3:].split("_")[0] if len(x) != 15 else np.nan,  # EuroNext Pars
    "1rA": lambda x: x[3:],  # EuroNext Amsterdam
    "1rE": lambda x: x[4:],  # EuroNext Paris
    "FF1": lambda x: x.split("_")[1][0:],  # EuroNext Brussels
}


def update_ticker_column(
    df: pd.DataFrame,
) -> pd.DataFrame:
    df["ticker"] = df.apply(
        lambda x: map_prefix_to_symbol_nf.get(x["prefix"], lambda _: x["symbol"])(
            x["symbol"]
        ),
        axis=1,
    )
    return df

def update_mid_column(
    df: pd.DataFrame,
    prefix_to_market_id: dict,
    default_mid: int,
) -> pd.DataFrame:
    df["mid"] = df.apply(
        lambda x: prefix_to_market_id.get(x["prefix"],default_mid),
        axis=1,
    )
    return df

In [111]:
def dfs_to_companie(
    dfs: list[pd.DataFrame], prefix_to_market_id: dict, default_mid: int, is_pea: bool = False
) -> pd.DataFrame:
    df_all_days = pd.concat(dfs)
    df_all_days_grouped = (
        df_all_days.reset_index(drop=True).groupby(["symbol", "name"]).last()
    )
    df_all_days_grouped["name"] = df_all_days_grouped.index.get_level_values(1)
    df_companies = df_all_days_grouped.groupby(
        df_all_days_grouped.index.get_level_values(0)
    ).last()
    df_companies = df_companies.reset_index()[["symbol", "name", "timestamp"]]
    df_companies["prefix"] = df_companies["symbol"].apply(lambda x: x[0:3])
    df_companies = update_ticker_column(df_companies)
    df_companies = update_mid_column(df_companies, prefix_to_market_id, default_mid=default_mid)
    df_companies = df_companies.drop(columns=["prefix"])
    df_companies["pea"] = is_pea
    return df_companies

## Amsterdam - Companies handling

In [16]:
dfs_amsterdam = multi_read_df_from_paths(list(file_infos_df_filtered[file_infos_df_filtered["market"] == "amsterdam"] ["path"]))

  0%|          | 0/540 [00:00<?, ?it/s]

100%|██████████| 540/540 [00:12<00:00, 44.93it/s]


In [118]:
amsterdam_companies = dfs_to_companie(dfs_amsterdam, db.prefix_to_market_id, default_mid=db.nasdaq_market_id) 

In [119]:
amsterdam_companies

Unnamed: 0,symbol,name,timestamp,ticker,mid,pea
0,124718367,ASTRAZENECA FIN CVR,2023-03-16 17:02:01,124718367,14,False
1,AABA,ALTABA,2019-10-28 17:02:02,AABA,14,False
2,AACG,ATA CRTV GLB SP ADR,2023-12-21 17:02:02,AACG,14,False
3,AACI,ARMADA ACQN I,2023-12-21 17:02:02,AACI,14,False
4,AACIU,ARMADA ACQN I UTS,2023-12-21 17:02:02,AACIU,14,False
...,...,...,...,...,...,...
6684,ZWRKU,Z-WORK ACQN UTS 26,2022-12-20 17:02:01,ZWRKU,14,False
6685,ZY,ZYMERGEN,2022-11-18 17:02:02,ZY,14,False
6686,ZYME,ZYMEWORKS,2023-12-21 17:02:02,ZYME,14,False
6687,ZYNE,ZYNERBA PHARMA,2023-10-30 17:02:01,ZYNE,14,False


In [156]:
# detection_intrus(amsterdam_companies)

## CompA - Companies handling

In [20]:
dfs_compA = multi_read_df_from_paths(list(file_infos_df_filtered[file_infos_df_filtered["market"] == "compA"] ["path"]))

100%|██████████| 540/540 [00:03<00:00, 152.72it/s]


In [116]:
compA_companies = dfs_to_companie(dfs_compA, prefix_to_market_id=db.prefix_to_market_id ,default_mid=db.prefix_to_market_id["1rP"])

In [117]:
compA_companies

Unnamed: 0,symbol,name,timestamp,ticker,mid,pea
0,1rAAF,AIR FRANCE - KLM,2023-12-21 17:02:02,AF,11,False
1,1rAAFA,AIR FRANCE - KLM,2023-12-21 17:02:02,AFA,11,False
2,1rAENX,EURONEXT,2023-12-21 17:02:02,ENX,11,False
3,1rAGTO,GEMALTO,2019-06-21 17:02:02,GTO,11,False
4,1rARDSADT,ROYAL D SH DRP 21,2021-10-05 17:02:01,RDSADT,11,False
...,...,...,...,...,...,...
404,FF11_ENX,EURONEXT,2023-12-21 17:02:02,ENX,13,False
405,FF11_FP,TOTALENERGIES,2021-08-04 17:02:02,FP,13,False
406,FF11_SEV,SUEZ,2023-12-21 17:02:02,SEV,13,False
407,FF11_SGO,SAINT-GOBAIN,2023-12-21 17:02:02,SGO,13,False


In [161]:
# detection_intrus(compA_companies)

In [119]:
get_prefix(compA_companies)

{'1rA', '1rP', 'FF1'}

## CompB - Companies handling

In [23]:
dfs_compB = multi_read_df_from_paths(list(file_infos_df_filtered[file_infos_df_filtered["market"] == "compB"] ["path"]))

100%|██████████| 540/540 [00:04<00:00, 119.91it/s]


In [114]:
compB_companies = dfs_to_companie(dfs_compB, prefix_to_market_id=db.prefix_to_market_id ,default_mid=db.prefix_to_market_id["1rP"])

In [115]:
compB_companies

Unnamed: 0,symbol,name,timestamp,ticker,mid,pea
0,1rP03227,LATECOERE,2023-12-21 17:02:02,03227,12,False
1,1rP2MX,TERACT R PFD,2023-12-21 17:02:02,2MX,12,False
2,1rP5478,CLARANOVA,2021-10-05 17:02:01,5478,12,False
3,1rPAAA,ALAN ALLMAN ASSOCIATES,2023-12-21 17:02:02,AAA,12,False
4,1rPAAC,ACCOR ACQ COM R PFD,2021-06-01 17:02:01,AAC,12,False
...,...,...,...,...,...,...
391,1rPXFAB,X-FAB SILICON,2023-12-21 17:02:02,XFAB,12,False
392,1rPXIL,XILAM ANIMATION,2023-12-21 17:02:02,XIL,12,False
393,1rPYSYT,SALVEPORN01JAN22EX,2023-12-21 17:02:02,YSYT,12,False
394,FF11_AKA,AKKA TECHNOLOGIES,2022-01-14 17:02:01,AKA,13,False


In [26]:
# get_prefix(compB_companies)

## PEA PME - Companies Handling

In [27]:
dfs_peapme = multi_read_df_from_paths(list(file_infos_df_filtered[file_infos_df_filtered["market"] == "peapme"] ["path"]))

100%|██████████| 324/324 [00:02<00:00, 115.24it/s]


In [112]:
peapme_companies = dfs_to_companie(dfs_peapme, prefix_to_market_id=db.prefix_to_market_id ,default_mid=db.prefix_to_market_id["1rP"], is_pea=True)

In [113]:
peapme_companies

Unnamed: 0,symbol,name,timestamp,ticker,mid,pea
0,1rAADUX,ADUX,2023-12-21 17:02:02,ADUX,11,True
1,1rABESI,BESI,2023-12-21 17:02:02,BESI,11,True
2,1rEPALANT,ANTEVENIO,2021-11-16 17:02:01,ALANT,12,True
3,1rEPALAQU,AQUILA,2023-12-21 17:02:02,ALAQU,12,True
4,1rEPALBDM,BD MULTI MEDIA,2023-12-21 17:02:02,ALBDM,12,True
...,...,...,...,...,...,...
672,FF11_EVS,EVS BROADCAST EQU,2023-12-21 17:02:02,EVS,13,True
673,FF11_GKTX,GENKYOTEX,2022-08-08 17:02:02,GKTX,13,True
674,FF11_KIN,KINEPOLIS GROUP,2023-12-21 17:02:02,KIN,13,True
675,FF11_MLMAZ,MAZARO,2023-12-21 17:02:02,MLMAZ,13,True


In [32]:
get_prefix(peapme_companies)

{'1rA', '1rE', '1rP', 'FF1'}

## Join companies

In [120]:
df_companies = [amsterdam_companies, compA_companies, compB_companies, peapme_companies]

In [121]:
df_companies = pd.concat(df_companies)
df_companies.sort_values(by="timestamp", inplace=True)
df_companies.drop_duplicates(inplace=True)
df_companies.drop_duplicates(subset=["symbol"], keep="last", inplace=True)

### Check no duplicated

In [122]:
df_companies[df_companies.duplicated(subset=["symbol"], keep=False)][:50]

Unnamed: 0,symbol,name,timestamp,ticker,mid,pea


In [123]:
df_companies.drop("timestamp", axis=1, inplace=True)

In [124]:
df_companies.set_index("symbol", inplace=True)

### Update companies table

In [125]:
# db.clean_database()
# db._setup_database()

In [126]:
db.df_write(df_companies, "companies", commit=True)