In [3]:
import pandas as pd

In [8]:
# --- Locations passed to the pipeline as plain string paths ---
models_path = 'models'
score_params_path = 'models/market_mood_params.json'
anomaly_params_path = 'models/anomaly_params.json'
sentiment_path = 'data/articles_with_sentiment.json'

# --- Input tables; the SEANCE column of each CSV is parsed as a datetime ---
_date_cols = ['SEANCE']
historical_df = pd.read_csv(
    'data/historical_data.csv',
    parse_dates=_date_cols,
)
historical_indices_df = pd.read_csv(
    'data/index_historical_data.csv',
    parse_dates=_date_cols,
)
sentiment_df = pd.read_csv(
    'data/sentiment_features.csv',
    parse_dates=_date_cols,
)

In [9]:
# Pre-computed forecast that the pipeline output is compared against further down.
reference_forecast = pd.read_csv(
    'data/forecast_next_5_days.csv',
    parse_dates=['SEANCE'],
)

## Test Setup: Simulate a New Trading Day

Use the **last date** in the historical data as the "new day". Remove that date from both the stock history and the index history so the pipeline processes it as fresh incoming data.

In [10]:
# Simulate a new trading day: the most recent SEANCE in the history is carved out
# and presented to the pipeline as fresh market data.
last_date = historical_df['SEANCE'].max()
is_new_day = historical_df['SEANCE'] == last_date  # computed once, reused below
print(f"Simulated new trading day: {last_date.date()}")
print(f"Number of tickers on that day: {int(is_new_day.sum())}")

# Split: everything before last_date = history, last_date rows = new market data
hist_df = historical_df[historical_df['SEANCE'] < last_date].copy()

# Keep only the raw columns for the new day; the remaining columns of
# historical_df are dropped here.
market_cols = ['SEANCE', 'GROUPE', 'CODE', 'VALEUR', 'OUVERTURE', 'CLOTURE',
               'PLUS_BAS', 'PLUS_HAUT', 'QUANTITE_NEGOCIEE', 'NB_TRANSACTION',
               'CAPITAUX', 'VARIATION']
market_df = historical_df.loc[is_new_day, market_cols].copy()

# Index values for the new day, taken from historical_indices_df
new_day_indices = historical_indices_df[historical_indices_df['SEANCE'] == last_date]

# Guard the merge below: with zero matching rows every index column would silently
# become NaN, and with more than one row each ticker row would be duplicated.
assert len(new_day_indices) == 1, (
    f"Expected exactly one index row for {last_date.date()}, found {len(new_day_indices)}"
)

# Attach the index columns to every ticker row of the new day
market_df = market_df.merge(new_day_indices, on='SEANCE', how='left')

# Also remove last_date from the index history so both histories end on the same day
hist_indices_df = historical_indices_df[historical_indices_df['SEANCE'] < last_date].copy()

print(f"\nHistory shape: {hist_df.shape}")
print(f"Market (new day) shape: {market_df.shape}")
print(f"Historical indices shape: {hist_indices_df.shape}")
print(f"New day indices:\n{new_day_indices.head()}")

Simulated new trading day: 2026-02-06
Number of tickers on that day: 41

History shape: (97070, 32)
Market (new day) shape: (41, 27)
Historical indices shape: (2532, 16)
New day indices:
         SEANCE  TUNBANQ_INDICE_JOUR  TUNFIN_INDICE_JOUR  \
2532 2026-02-06             10602.25            11397.71   

      TUNINDEX_INDICE_JOUR  TUNINDEX20_INDICE_JOUR  TUNSAC_INDICE_JOUR  \
2532              14599.12                 6489.26             5728.44   

      TUNBANQ_INDICE_VEILLE  TUNFIN_INDICE_VEILLE  TUNINDEX_INDICE_VEILLE  \
2532               10550.97              11346.04                14525.18   

      TUNINDEX20_INDICE_VEILLE  TUNSAC_INDICE_VEILLE  \
2532                   6465.96               5596.45   

      TUNBANQ_VARIATION_VEILLE  TUNFIN_VARIATION_VEILLE  \
2532                  0.486022                 0.455401   

      TUNINDEX_VARIATION_VEILLE  TUNINDEX20_VARIATION_VEILLE  \
2532                   0.509047                     0.360349   

      TUNSAC_VARIATION_VEIL

## Run the Pipeline

Call `analyze_new_data` with the simulated inputs.

In [11]:
from real_time_utils import analyze_new_data

# The simulated day as an ISO (YYYY-MM-DD) string
new_date_str = last_date.date().isoformat()

# Collect the pipeline inputs in one place, then call by keyword
pipeline_kwargs = {
    'new_date': new_date_str,
    'market_df': market_df,
    'sentiment_path': sentiment_path,
    'historical_df': hist_df,
    'historical_indices_df': hist_indices_df,
    'models_path': models_path,
    'anomaly_params_path': anomaly_params_path,
    'score_params_path': score_params_path,
}
pipeline_result = analyze_new_data(**pipeline_kwargs)

# The pipeline returns four objects, unpacked in this order
(forecast_output,
 new_historical_output,
 new_indices_output,
 new_sentiment_output) = pipeline_result

print("Pipeline completed successfully!")
print(f"\nForecast output shape: {forecast_output.shape}")
print(f"New historical output shape: {new_historical_output.shape}")
print(f"New indices output shape: {new_indices_output.shape}")
print(f"New sentiment output shape: {new_sentiment_output.shape}")

  combined_df = combined_df.groupby('CODE').apply(compute_financial_features).reset_index(drop=True)


Pipeline completed successfully!

Forecast output shape: (205, 8)
New historical output shape: (41, 32)
New indices output shape: (1, 16)
New sentiment output shape: (12, 5)


## Validate Forecast Against Reference

Compare the pipeline's forecast output with the pre-computed `reference_forecast` to verify correctness.

In [12]:
import numpy as np

# Validate the pipeline forecast against the pre-computed reference.
# Both frames are restricted to the same columns and sorted by (CODE, SEANCE); the
# key-column assertions below then confirm that rows line up position by position
# before any numeric values are compared.
common_cols = ['SEANCE', 'CODE', 'VALEUR', 'CLOTURE', 'VOLUME', 'VAR_CLOTURE', 'VAR_VOLUME', 'PROB_LIQUIDITY']
sort_keys = ['CODE', 'SEANCE']

ref = reference_forecast[common_cols].sort_values(sort_keys).reset_index(drop=True)
out = forecast_output[common_cols].sort_values(sort_keys).reset_index(drop=True)

print("=== Shape Comparison ===")
print(f"Reference: {ref.shape}")
print(f"Pipeline:  {out.shape}")
assert ref.shape == out.shape, f"Shape mismatch! Reference {ref.shape} vs Pipeline {out.shape}"
print("✓ Shapes match\n")

# Check that SEANCE and CODE align exactly
print("=== Key Columns Match ===")
assert (ref['SEANCE'] == out['SEANCE']).all(), "SEANCE mismatch!"
print("✓ SEANCE dates match")
assert (ref['CODE'] == out['CODE']).all(), "CODE mismatch!"
print("✓ CODE values match")
assert (ref['VALEUR'] == out['VALEUR']).all(), "VALEUR mismatch!"
print("✓ VALEUR values match\n")

# Compare numeric columns with tolerance.
# abs_tol is the tolerance advertised in the header. rtol is forced to 0 so the check
# is exactly |ref - out| <= abs_tol; NumPy's default rtol=1e-5 would otherwise silently
# widen the tolerance for large values such as VOLUME.
numeric_cols = ['CLOTURE', 'VOLUME', 'VAR_CLOTURE', 'VAR_VOLUME', 'PROB_LIQUIDITY']
abs_tol = 1e-4
print("=== Numeric Column Comparison (tolerance=1e-4) ===")
all_match = True
for col in numeric_cols:
    ref_vals = ref[col].to_numpy(dtype=float)
    out_vals = out[col].to_numpy(dtype=float)

    # Element-wise verdict; NaN on both sides counts as a match
    within_tol = np.isclose(ref_vals, out_vals, rtol=0.0, atol=abs_tol, equal_nan=True)
    close = bool(within_tol.all())
    rows_off = int((~within_tol).sum())

    # Largest non-NaN absolute difference; NaN when there is nothing to compare
    # (np.nanmax would raise on empty input and warn on all-NaN input)
    abs_diff = np.abs(ref_vals - out_vals)
    comparable = abs_diff[~np.isnan(abs_diff)]
    max_diff = comparable.max() if comparable.size else float('nan')

    status = "✓" if close else "✗"
    print(f"{status} {col:20s} | max_diff={max_diff:.6f} | match={close} | rows_off={rows_off}/{len(within_tol)}")
    if not close:
        all_match = False

print(f"\n{'='*50}")
if all_match:
    print("ALL FORECASTS MATCH THE REFERENCE ✓")
else:
    print("SOME FORECASTS DIFFER — see details above")

=== Shape Comparison ===
Reference: (205, 8)
Pipeline:  (205, 8)
✓ Shapes match

=== Key Columns Match ===
✓ SEANCE dates match
✓ CODE values match
✓ VALEUR values match

=== Numeric Column Comparison (tolerance=1e-4) ===
✗ CLOTURE              | max_diff=0.005000 | match=False
✗ VOLUME               | max_diff=739268.000000 | match=False
✗ VAR_CLOTURE          | max_diff=0.012197 | match=False
✗ VAR_VOLUME           | max_diff=18.355399 | match=False
✗ PROB_LIQUIDITY       | max_diff=0.948400 | match=False

SOME FORECASTS DIFFER — see details above


## Inspect Pipeline Outputs

Quick look at all returned DataFrames.

In [13]:
# Each entry pairs a printed header with the frame shown beneath it.
# Only the indices frame is shown in full; the others are limited to their first 10 rows.
output_sections = [
    ("=== Forecast Output (first 10 rows) ===", forecast_output.head(10)),
    ("\n=== New Historical Output (anomalies & scores for new day) ===", new_historical_output.head(10)),
    ("\n=== New Indices Output ===", new_indices_output),
    ("\n=== New Sentiment Output ===", new_sentiment_output.head(10)),
]

for section_header, section_frame in output_sections:
    print(section_header)
    display(section_frame)

=== Forecast Output (first 10 rows) ===


Unnamed: 0,SEANCE,CODE,VALEUR,CLOTURE,VOLUME,VAR_CLOTURE,VAR_VOLUME,PROB_LIQUIDITY
0,2026-02-09,TN0001000108,MONOPRIX,6.157,16218,-0.006918,323.354886,0.9768
1,2026-02-10,TN0001000108,MONOPRIX,6.128,90691,-0.004728,4.59211,0.9943
2,2026-02-11,TN0001000108,MONOPRIX,6.169,155402,0.006633,0.713525,0.9953
3,2026-02-12,TN0001000108,MONOPRIX,6.158,0,-0.001755,-1.0,0.0
4,2026-02-13,TN0001000108,MONOPRIX,6.137,0,-0.00343,0.0,0.0
5,2026-02-09,TN0001100254,SFBT,13.51,142260,0.005173,7.587478,0.9612
6,2026-02-10,TN0001100254,SFBT,13.399,43942,-0.00816,-0.691112,0.8509
7,2026-02-11,TN0001100254,SFBT,13.414,112090,0.001087,1.55083,0.9537
8,2026-02-12,TN0001100254,SFBT,13.378,48883,-0.002643,-0.56389,0.8735
9,2026-02-13,TN0001100254,SFBT,13.549,43297,0.012723,-0.114281,0.8501



=== New Historical Output (anomalies & scores for new day) ===


Unnamed: 0,SEANCE,GROUPE,CODE,VALEUR,OUVERTURE,CLOTURE,PLUS_BAS,PLUS_HAUT,QUANTITE_NEGOCIEE,NB_TRANSACTION,...,NewsScore,MarketMood,volume_z_score,VOLUME_Anomaly,variation_z_score,VARIATION_ANOMALY,VARIATION_ANOMALY_POST_NEWS,VARIATION_ANOMALY_PRE_NEWS,VOLUME_ANOMALY_POST_NEWS,VOLUME_ANOMALY_PRE_NEWS
2505,2026-02-06,11,TN0001000108,MONOPRIX,6.2,6.2,6.2,6.2,50,1,...,36.348706,72.561382,-0.197746,0,2.065924,0,0,0,0,0
5036,2026-02-06,11,TN0001100254,SFBT,13.39,13.44,13.3,13.5,16566,71,...,36.348706,72.561382,-0.426121,0,0.370878,0,0,0,0,0
7555,2026-02-06,11,TN0001200401,TUNISAIR,0.33,0.33,0.33,0.33,2588,7,...,36.348706,72.561382,-0.498873,0,0.016258,0,0,0,0,0
9782,2026-02-06,12,TN0001400704,SPDIT - SICAF,12.96,12.8,12.8,12.96,6,2,...,36.348706,72.561382,-0.371877,0,-0.911854,0,0,0,0,0
12312,2026-02-06,11,TN0001600154,ATTIJARI BANK,68.7,68.5,67.8,69.0,20320,66,...,36.348706,72.561382,0.116484,0,-0.063904,0,0,0,0,0
14840,2026-02-06,11,TN0001800457,BIAT,129.9,132.0,129.9,132.0,2035,50,...,36.348706,72.561382,-0.853779,0,0.952565,0,0,0,0,0
15959,2026-02-06,11,TN0001900604,BH BANK,10.0,10.25,10.0,10.25,601,2,...,36.348706,72.561382,-0.41909,0,-0.007146,0,0,0,0,0
18346,2026-02-06,11,TN0002100907,TUNISIE LEASING & FACTORING,41.0,41.5,41.0,41.5,2605,32,...,36.348706,72.561382,-0.554642,0,0.745204,0,0,0,0,0
20857,2026-02-06,11,TN0002600955,STB BANK,3.78,3.75,3.75,3.78,21768,63,...,36.348706,72.561382,-0.065867,0,-0.516366,0,0,0,0,0
23129,2026-02-06,12,TN0003200755,ICF,77.05,77.0,76.9,79.0,855,32,...,36.348706,72.561382,-0.248709,0,-0.052958,0,0,0,0,0



=== New Indices Output ===


Unnamed: 0,SEANCE,TUNBANQ_INDICE_JOUR,TUNFIN_INDICE_JOUR,TUNINDEX_INDICE_JOUR,TUNINDEX20_INDICE_JOUR,TUNSAC_INDICE_JOUR,TUNBANQ_INDICE_VEILLE,TUNFIN_INDICE_VEILLE,TUNINDEX_INDICE_VEILLE,TUNINDEX20_INDICE_VEILLE,TUNSAC_INDICE_VEILLE,TUNBANQ_VARIATION_VEILLE,TUNFIN_VARIATION_VEILLE,TUNINDEX_VARIATION_VEILLE,TUNINDEX20_VARIATION_VEILLE,TUNSAC_VARIATION_VEILLE
0,2026-02-06,10602.25,11397.71,14599.12,6489.26,5728.44,10550.97,11346.04,14525.18,6465.96,5596.45,0.486022,0.455401,0.509047,0.360349,2.358459



=== New Sentiment Output ===


Unnamed: 0,VALEUR,SEANCE,Mean_Weighted_Sentiment,Article_Count,Sentiment_Intensity
851,ARAB TUNISIAN BANK,2026-02-06,0.716,1,0.716
1391,ATTIJARI BANK,2026-02-06,-0.321,1,0.321
1862,BANQUE NATIONALE AGRICOLE,2026-02-06,-0.321,1,0.321
2379,BIAT,2026-02-06,0.6995,2,1.399
2626,CARTHAGE CEMENT,2026-02-06,-0.321,1,0.321
3960,POULINA GROUP HOLDING,2026-02-06,-0.321,1,0.321
4703,SOCIETE CHIMIQUE ALKIMIA,2026-02-06,-0.252,1,0.252
4940,SOTUMAG,2026-02-06,0.0,1,0.0
5810,TUNINDEX,2026-02-06,-0.321,1,0.321
5915,TUNIS RE,2026-02-06,-0.321,1,0.321


In [14]:
# Reference vs pipeline forecast, side by side, for the first ticker in `ref`
sample_code = ref['CODE'].iloc[0]
sample_cols = ['SEANCE', 'CLOTURE', 'VOLUME', 'VAR_CLOTURE', 'PROB_LIQUIDITY']


def _rows_for_code(frame, code):
    """Return the `sample_cols` of `frame` for one CODE, re-indexed from 0."""
    return frame.loc[frame['CODE'] == code, sample_cols].reset_index(drop=True)


ref_sample = _rows_for_code(ref, sample_code)
out_sample = _rows_for_code(out, sample_code)

# REF_* columns first, then PIPE_* columns; rows are matched on the fresh 0..n-1 index
comparison = ref_sample.add_prefix('REF_').join(out_sample.add_prefix('PIPE_'))

print(f"Side-by-side for {sample_code}:")
display(comparison)

# Pearson correlation between reference and pipeline values across all rows
from scipy.stats import pearsonr
for col in ('CLOTURE', 'VOLUME', 'VAR_CLOTURE'):
    correlation = pearsonr(ref[col].values, out[col].values)
    r, p = correlation
    print(f"Pearson correlation for {col}: r={r:.6f}, p={p:.2e}")

Side-by-side for TN0001000108:


Unnamed: 0,REF_SEANCE,REF_CLOTURE,REF_VOLUME,REF_VAR_CLOTURE,REF_PROB_LIQUIDITY,PIPE_SEANCE,PIPE_CLOTURE,PIPE_VOLUME,PIPE_VAR_CLOTURE,PIPE_PROB_LIQUIDITY
0,2026-02-09,6.157,16218,-0.006918,0.9769,2026-02-09,6.157,16218,-0.006918,0.9768
1,2026-02-10,6.128,90691,-0.004728,0.9943,2026-02-10,6.128,90691,-0.004728,0.9943
2,2026-02-11,6.169,155402,0.006633,0.9953,2026-02-11,6.169,155402,0.006633,0.9953
3,2026-02-12,6.158,0,-0.001755,0.0,2026-02-12,6.158,0,-0.001755,0.0
4,2026-02-13,6.137,0,-0.00343,0.0,2026-02-13,6.137,0,-0.00343,0.0


Pearson correlation for CLOTURE: r=1.000000, p=0.00e+00
Pearson correlation for VOLUME: r=0.934468, p=5.41e-93
Pearson correlation for VAR_CLOTURE: r=0.996122, p=2.87e-216
