# 1. Setup & Environment


In [None]:
# !pip install setfit datasets pandas

# 1. Force install a compatible version of transformers
#%pip install "transformers<4.48.0" "setfit>=1.1.0" datasets accelerate


import json
import pandas as pd
from datasets import Dataset
from setfit import SetFitModel, SetFitTrainer, TrainingArguments
from sentence_transformers.losses import CosineSimilarityLoss

print("Setup Complete.")

  from .autonotebook import tqdm as notebook_tqdm


Setup Complete.


# 2. Load the Labeled Dataset

Load the fewshot_sentiment_dataset.json file you prepared. This file contains the "Gold Examples" used to teach the model.

In [None]:
# Load your labeled few-shot dataset
with open('../data/reference/fewshot_sentiment_dataset.json', 'r', encoding='utf-8') as f:
    labeled_data = json.load(f)

# Convert to Hugging Face Dataset format
# Expects a list like: [{"text": "...", "label": 1}, ...]
train_dataset = Dataset.from_list(labeled_data)

print(f"Loaded {len(train_dataset)} examples for training.")
print(f"Example: {train_dataset[0]}")

Loaded 50 examples for training.
Example: {'text': 'la bna bank d√©croche trois gold awards aux tunisia digital awards 2026', 'label': 1}


# 3. Initialize the Multilingual Model

Load the pre-trained "Backbone" model. We use paraphrase-multilingual-MiniLM-L12-v2 because it understands both French and Arabic, which is essential for the Tunisian market.

In [3]:
# Load the pre-trained multilingual model
model_id = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

model = SetFitModel.from_pretrained(
    model_id,
    labels=[-1, 0, 1]  # -1: Neg, 0: Neu, 1: Pos
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


# 4. Fine-Tuning (Few-Shot Training)

Train the model using the SetFitTrainer. This step uses Contrastive Learning to help the model distinguish between positive and negative financial signals with very few examples.

In [None]:
# Configure the trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_epochs=3,
    num_iterations=40, # Higher iterations = better learning for few-shot
    column_mapping={"text": "text", "label": "label"}
)

# Start training
trainer.train()

# Save the specialized model
model_path = "models/tunisian_finance_model"
model = SetFitModel.from_pretrained(model_path)
print("Model trained and saved locally.")

  trainer = SetFitTrainer(
Applying column mapping to the training dataset
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 2304.92 examples/s]
***** Running training *****
  Num unique pairs = 4000
  Batch size = 16
  Num epochs = 3
  super().__init__(loader)
  0%|          | 1/750 [00:03<45:55,  3.68s/it]

{'embedding_loss': 0.3732, 'grad_norm': 4.242681503295898, 'learning_rate': 2.666666666666667e-07, 'epoch': 0.0}


  7%|‚ñã         | 50/750 [02:07<29:34,  2.53s/it]

{'embedding_loss': 0.2314, 'grad_norm': 2.920628309249878, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.2}


 13%|‚ñà‚ñé        | 100/750 [04:13<27:40,  2.55s/it]

{'embedding_loss': 0.0835, 'grad_norm': 0.23106688261032104, 'learning_rate': 1.925925925925926e-05, 'epoch': 0.4}


 20%|‚ñà‚ñà        | 150/750 [06:13<21:47,  2.18s/it]

{'embedding_loss': 0.0024, 'grad_norm': 0.053406063467264175, 'learning_rate': 1.7777777777777777e-05, 'epoch': 0.6}


 27%|‚ñà‚ñà‚ñã       | 200/750 [08:03<20:11,  2.20s/it]

{'embedding_loss': 0.001, 'grad_norm': 0.07318763434886932, 'learning_rate': 1.6296296296296297e-05, 'epoch': 0.8}


 33%|‚ñà‚ñà‚ñà‚ñé      | 250/750 [09:53<18:12,  2.18s/it]

{'embedding_loss': 0.0005, 'grad_norm': 0.08175420016050339, 'learning_rate': 1.4814814814814815e-05, 'epoch': 1.0}


 40%|‚ñà‚ñà‚ñà‚ñà      | 300/750 [11:52<18:50,  2.51s/it]

{'embedding_loss': 0.0004, 'grad_norm': 0.07314854860305786, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.2}


 47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 350/750 [13:56<16:38,  2.50s/it]

{'embedding_loss': 0.0004, 'grad_norm': 0.04499372839927673, 'learning_rate': 1.1851851851851852e-05, 'epoch': 1.4}


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 400/750 [15:56<13:59,  2.40s/it]

{'embedding_loss': 0.0004, 'grad_norm': 0.024713490158319473, 'learning_rate': 1.037037037037037e-05, 'epoch': 1.6}


 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 450/750 [17:54<11:48,  2.36s/it]

{'embedding_loss': 0.0003, 'grad_norm': 0.04250386357307434, 'learning_rate': 8.888888888888888e-06, 'epoch': 1.8}


 67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 500/750 [19:53<09:43,  2.34s/it]

{'embedding_loss': 0.0003, 'grad_norm': 0.041771795600652695, 'learning_rate': 7.4074074074074075e-06, 'epoch': 2.0}


  super().__init__(loader)
 73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 550/750 [21:57<07:47,  2.34s/it]

{'embedding_loss': 0.0002, 'grad_norm': 0.030479446053504944, 'learning_rate': 5.925925925925926e-06, 'epoch': 2.2}


 80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 600/750 [24:01<06:44,  2.70s/it]

{'embedding_loss': 0.0002, 'grad_norm': 0.03718612715601921, 'learning_rate': 4.444444444444444e-06, 'epoch': 2.4}


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 650/750 [26:02<04:10,  2.50s/it]

{'embedding_loss': 0.0002, 'grad_norm': 0.03574511036276817, 'learning_rate': 2.962962962962963e-06, 'epoch': 2.6}


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 700/750 [28:09<02:08,  2.57s/it]

{'embedding_loss': 0.0002, 'grad_norm': 0.03477509692311287, 'learning_rate': 1.4814814814814815e-06, 'epoch': 2.8}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 750/750 [30:19<00:00,  2.60s/it]

{'embedding_loss': 0.0002, 'grad_norm': 0.024035964161157608, 'learning_rate': 0.0, 'epoch': 3.0}


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 750/750 [30:25<00:00,  2.43s/it]


{'train_runtime': 1825.6912, 'train_samples_per_second': 6.573, 'train_steps_per_second': 0.411, 'train_loss': 0.021634811465938885, 'epoch': 3.0}
Model trained and saved locally.


# 5. Inference on Scraped Data


In [None]:
# 1. Load the main scraped results
input_file = '../data/processed/global_cleaned.json'
with open(input_file, 'r', encoding='utf-8') as f:
    full_data = json.load(f)

articles = full_data['all_articles']

# 2. Extract texts for the model
# We prepare two lists: one for headlines, one for the full content
headlines = [a.get('headline', '') for a in articles]
contents = [a.get('content', '') for a in articles]

print(f"Total articles to analyze: {len(articles)}")

# 3. Run Predictions
print("ü§ñ Analyzing Headlines (Primary Signal)...")
headline_preds = model.predict(headlines)

print("üîç Analyzing Content (Detailed Context)...")
content_preds = model.predict(contents)

# 4. Apply Weighted Logic
# Strategy: Headline is the boss. If headline is neutral, we check the content for details.
# This prevents "noise" in the content from ruining a clear headline.

for i, article in enumerate(articles):
    h_score = int(headline_preds[i])
    c_score = int(content_preds[i])
    
    # LOGIC:
    # If the headline has a strong sentiment (1 or -1), we trust it.
    # If the headline is neutral (0), we let the content decide.
    if h_score != 0:
        final_sentiment = h_score
    else:
        final_sentiment = c_score
        
    # Store the results back in the article object
    article['sentiment_score'] = final_sentiment
    

print("‚úÖ Inference completed. Each article now has a 'sentiment_score'.")

# Preview the first few results
for a in articles[:2]:
    print(f"\nTicker: {a['tickers']} | Score: {a['sentiment_score']}")
    print(f"Headline: {a['headline'][:50]}...")

Total articles to analyze: 97
ü§ñ Analyzing Headlines (Primary Signal)...
üîç Analyzing Content (Detailed Context)...
‚úÖ Inference completed. Each article now has a 'sentiment_score'.

Ticker: ['SFBT'] | Score: 0
Headline: la sfbt annonce plus de 840 millions de dinars de ...

Ticker: ['BNA'] | Score: 0
Headline: la bna bank d√©croche trois gold awards aux tunisia...


# 6. Export Results for Visualization


In [None]:
# 1. Convert the list of articles into a Pandas DataFrame
df = pd.DataFrame(articles)

# 2. 'Explode' the tickers column
# If an article has tickers ["SFBT", "BNA"], it will create two rows: 
# one for SFBT and one for BNA, both sharing the same sentiment score.
df_exploded = df.explode('tickers')

# 3. Clean up the DataFrame
# We only keep the columns we need for charts to keep the file small
columns_to_keep = ['date', 'tickers', 'sentiment_score', 'headline']
df_final = df_exploded[columns_to_keep]

# 4. Rename 'tickers' to 'ticker' for clarity
df_final = df_final.rename(columns={'tickers': 'ticker'})

# 5. Export to CSV
# 'utf-8-sig' ensures that Arabic characters open correctly in Excel
output_csv = '../exports/final_sentiment_analysis_results.csv'
df_final.to_csv(output_csv, index=False, encoding='utf-8-sig')

print(f"üìä Success! Data formatted for visualization.")
print(f"File saved as: {output_csv}")
print(f"Total rows (after exploding multi-ticker articles): {len(df_final)}")

# Preview the structure
df_final

üìä Success! Data formatted for visualization.
File saved as: final_sentiment_analysis_results.csv
Total rows (after exploding multi-ticker articles): 150


Unnamed: 0,date,ticker,sentiment_score,headline
0,2026-01-31,SFBT,0,la sfbt annonce plus de 840 millions de dinars...
1,2026-01-31,BNA,0,la bna bank d√©croche trois gold awards aux tun...
2,2026-01-29,ATTIJARI BANK,-1,bilan 2025 du secteur du leasing cot√© : r√©sili...
3,2026-01-29,STB,-1,la bourse de tunis renforce l‚Äôaccompagnement d...
4,2026-01-28,STB,-1,le pari r√©ussi de hatem zaara √† la t√™te de la ...
...,...,...,...,...
94,2025-12-11,BIAT,1,bourse de tunis : le tunindex grignote quelque...
94,2025-12-11,CARTHAGE CEMENT,1,bourse de tunis : le tunindex grignote quelque...
94,2025-12-11,SAH,1,bourse de tunis : le tunindex grignote quelque...
95,2025-12-11,BNA,-1,atl : fitch ratings accorde la note √† long ter...


In [None]:
# 6. Export to JSON (Records format)
output_json = '../exports/daily_ticker_sentiment_signals.json'

# force_ascii=False keeps Arabic/French characters readable
# indent=4 makes the file human-readable (good for the hackathon pitch)
df_final.to_json(output_json, orient='records', force_ascii=False, indent=4)

print(f"JSON Exported: {output_json}")

JSON Exported: daily_ticker_sentiment_signals.json
