# Implementing "FX sentiment analysis with large language models" (Ballinari et al.)
This paper can be found at 

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import re
import pandas_datareader.data as web
# import transformers
# import bitsandbytes as bnb
# import accelerate
# import peft
# import trl

## 1. Dataset Preprocessing

### 1.1. Filtering
- Load the datasets
- Drop articles with <20 words
- Remove duplicate articles 
- Convert time to datetime

In [2]:
# Loading the dailyfx news articles dataset
df_news = pd.read_csv('datasets/news articles/dailyfx_articles_012011-062024.csv')

# Drop articles with <20 words
df_news = df_news[df_news['Full Text'].str.split().str.len() > 20]

# Remove duplicate articles
df_news = df_news.drop_duplicates(subset=['Full Text'])
df_news = df_news.drop_duplicates(subset=['Title'])

# Convert time to datetime
df_news['Date'] = pd.to_datetime(df_news['Date'])

# Sort by date
df_news = df_news.sort_values(by='Date')


  df_news['Date'] = pd.to_datetime(df_news['Date'])


### 1.2. Creating mentioned_currency column
- Use regex to capture all the currencies used in an article
- Make use of common synomyms
- Filter articles that don't mention any of the G10 currencies 

In [3]:
# Dictionary mapping ISO codes to the regex patterns (synonyms) from Figure A.1
currency_synonyms = {
    "EUR": [r"EUR", r"Euro"],
    "USD": [r"USD", r"Dollar", r"Dollars", r"US Dollar", r"US-Dollar", r"U\.S\. Dollar", 
            r"US Dollars", r"US-Dollars", r"U\.S\. Dollars", r"Greenback"],
    "JPY": [r"JPY", r"Yen", r"Japanese Yen"],
    "GBP": [r"GBP", r"Pound", r"Pounds", r"Sterling", r"British Pound", r"British Pounds"],
    "AUD": [r"AUD", r"Australian Dollar", r"Australian Dollars", r"Aussie"],
    "CAD": [r"CAD", r"Canadian Dollar", r"Canadian Dollars"],
    "CHF": [r"CHF", r"Swiss Franc", r"Swiss Francs", r"Swissie"],
    "NZD": [r"NZD", r"New Zealand Dollar", r"New Zealand Dollars", r"Kiwi"],
    "NOK": [r"NOK", r"Norwegian Krone", r"Norwegian Kroner"],
    "SEK": [r"SEK", r"Swedish Krona", r"Swedish Kronor"]
}

# Get list of mentioned currencies from text
def get_mentioned_currencies(text):
    mentioned_currencies = list()

    for currency, patterns in currency_synonyms.items():
        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                mentioned_currencies.append(currency)
                break

    return mentioned_currencies

df_news['mentioned_currencies'] = df_news['Full Text'].apply(get_mentioned_currencies)

# Filter articles to keep only those where 'mentioned_currencies' is non empty
df_news = df_news[df_news['mentioned_currencies'].apply(lambda x: len(x) > 0)]

### 1.3. Getting historical prices
Using nominal narrow effective exchange rate (daily) for each country.

Narrow effective exchange rate is a good proxy for the tradable currency index that the authors used.

Allows us to put a number to the currency rather than using a pair as then the currency can be effected by the other in the pair. 

It is done by taking the geometric mean from the exchange rate of various other currencies (narrow means only a small number of industrialised countries so that the average isn't skewed by some other non industrialised country going down).

In [20]:
# All links to get data from for effective exchage rate
urls = {
    "USD": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.US?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv",
    "EUR": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.XM?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv",
    "JPY": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.JP?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv",
    "GBP": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.GB?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv",
    "CAD": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.CA?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv",
    "AUD": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.AU?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv",
    "CHF": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.CH?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv", 
    "SEK": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.SE?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv",
    "NOK": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.NO?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv",
    "NZD": "https://stats.bis.org/api/v2/data/dataflow/BIS/WS_EER/1.0/D.N.N.NZ?startPeriod=2011-01-01&endPeriod=2024-06-01&format=csv"
}

# Initialise an empty DataFrame (EER = effective exchange rate)
df_EER = pd.DataFrame()

for code, url in urls.items():
    # Read only the required columns from the CSV
    df_temp = pd.read_csv(url, usecols=lambda c: c in ["TIME_PERIOD", "OBS_VALUE"])
    
    # Convert OBS_VALUE to float for log calculations later
    df_temp["OBS_VALUE"] = pd.to_numeric(df_temp["OBS_VALUE"], errors="coerce")
    
    # Rename "OBS_VALUE" to currency code
    df_temp = df_temp.rename(columns={
        "OBS_VALUE": code,
        "TIME_PERIOD": "date"
    })
    
    # If the main df is empty, set it to this df
    if df_EER.empty:
        df_EER = df_temp
    else:
        # Join on "date", keep all records (outer join)
        df_EER = pd.merge(df_EER, df_temp, on=["date"], how='outer')


df_EER['date'] = pd.to_datetime(df_EER['date'])
df_EER = df_EER.set_index('date')

# drop all NaNs in the data
df_EER.dropna(inplace=True)

df_EER.head()

Unnamed: 0_level_0,USD,EUR,JPY,GBP,CAD,AUD,CHF,SEK,NOK,NZD
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011-01-03,82.07,103.41,120.99,105.23,129.89,130.38,87.3,116.49,136.0,98.33
2011-01-04,82.05,103.98,119.81,105.84,129.59,128.97,86.05,116.47,135.74,97.42
2011-01-05,82.77,102.87,120.35,106.7,128.98,128.61,86.01,116.37,135.51,97.75
2011-01-06,83.09,102.38,119.32,106.95,130.06,128.83,85.4,116.25,135.96,97.37
2011-01-07,83.46,101.62,119.31,107.42,130.42,128.88,86.29,115.74,136.06,97.72


### 1.4 Calculate log returns

In [21]:
# Calculate daily log returns
df_log_returns = np.log(df_EER / df_EER.shift(1))

df_log_returns.dropna(inplace=True)  # created in the shifting

### 1.5 Calculate cumulative 5 day windows

In [None]:
# Future returns
# At index t, we want the sum of t+1, t+2, t+3, t+4, t+5 returns
df_future_returns = df_log_returns.rolling(window=5, min_periods=5).sum().shift(-5)
df_future_returns.dropna(inplace=True)

# Past returns
# At index t, we want the sum of t-1, t-2, t-3, t-4, t-5 returns
df_past_returns = df_log_returns.rolling(window=5, min_periods=5).sum().shift(1)
df_past_returns.dropna(inplace=True)


# Merge future and past returns DataFrames into df_log_returns, aligning on date index.
df_log_returns = df_log_returns.join(df_future_returns.add_suffix('_future'), how='inner')
df_log_returns = df_log_returns.join(df_past_returns.add_suffix('_past'), how='inner')

Empty DataFrame
Columns: [USD, EUR, JPY, GBP, CAD, AUD, CHF, SEK, NOK, NZD, USD_future, EUR_future, JPY_future, GBP_future, CAD_future, AUD_future, CHF_future, SEK_future, NOK_future, NZD_future, USD_past, EUR_past, JPY_past, GBP_past, CAD_past, AUD_past, CHF_past, SEK_past, NOK_past, NZD_past]
Index: []

[0 rows x 30 columns]


### 1.6 Get sentiment labels

Based of future returns:

For each timestep:
- Top 3 (30%) -> "Appreciation"
- Middle 4 (40%) -> "Unchanged"
- Bottom 3 (30%) -> "Depreciation"

In [None]:
# Get list of currency codes (G10 currencies)
currency_codes = ['USD', 'EUR', 'JPY', 'GBP', 'CAD', 'AUD', 'CHF', 'SEK', 'NOK', 'NZD']

# Initialize label columns for each currency
for currency in currency_codes:
    df_log_returns[f'{currency}_label'] = None

# For each date (row), rank currencies by their future returns and assign labels
for date in df_log_returns.index:
    # Get future returns for this date
    future_returns = {}
    for currency in currency_codes:
        value = df_log_returns.loc[date, f'{currency}_future']
        if pd.notna(value):
            future_returns[currency] = value

    # Get past returns for this date
    past_returns = {}
    for currency in currency_codes:
        value = df_log_returns.loc[date, f'{currency}_past']
        if pd.notna(value):
            past_returns[currency] = value
    
    # Rank currencies by future returns (highest to lowest)
    sorted_currencies_future = sorted(future_returns.items(), key=lambda x: x[1], reverse=True)

    # Rank currencies by past returns (highest to lowest)
    sorted_currencies_past = sorted(past_returns.items(), key=lambda x: x[1], reverse=True)
    
    # Assign labels based on ranking
    # Top 3 (30%) -> "Appreciation"
    # Middle 4 (40%) -> "Unchanged"
    # Bottom 3 (30%) -> "Depreciation"
    for i, (currency, _) in enumerate(sorted_currencies_future):
        if i < 3:  # Top 3 (0, 1, 2)
            df_log_returns.loc[date, f'{currency}_forward_label'] = 'Appreciation'
        elif i >= 7:  # Bottom 3 (7, 8, 9)
            df_log_returns.loc[date, f'{currency}_forward_label'] = 'Depreciation'
        else:  # Middle 4 (3, 4, 5, 6)
            df_log_returns.loc[date, f'{currency}_forward_label'] = 'Unchanged'
    
    for i, (currency, _) in enumerate(sorted_currencies_past):
        if i < 3:  # Top 3 (0, 1, 2)
            df_log_returns.loc[date, f'{currency}_backward_label'] = 'Depreciation'
        elif i >= 7:  # Bottom 3 (7, 8, 9)
            df_log_returns.loc[date, f'{currency}_backward_label'] = 'Appreciation'
        else:  # Middle 4 (3, 4, 5, 6)
            df_log_returns.loc[date, f'{currency}_backward_label'] = 'Unchanged'

# Only keep labels
df_labels = df_log_returns[
    [f'{currency}_forward_label' for currency in currency_codes] + 
    [f'{currency}_backward_label' for currency in currency_codes]
]

df_labels.columns

Nulls [USD]: log_ret=0, future_win=0, past_win=0
Nulls [EUR]: log_ret=0, future_win=0, past_win=0
Nulls [JPY]: log_ret=0, future_win=0, past_win=0
Nulls [GBP]: log_ret=0, future_win=0, past_win=0
Nulls [CAD]: log_ret=0, future_win=0, past_win=0
Nulls [AUD]: log_ret=0, future_win=0, past_win=0
Nulls [CHF]: log_ret=0, future_win=0, past_win=0
Nulls [SEK]: log_ret=0, future_win=0, past_win=0
Nulls [NOK]: log_ret=0, future_win=0, past_win=0
Nulls [NZD]: log_ret=0, future_win=0, past_win=0


Index(['USD_forward_label', 'EUR_forward_label', 'JPY_forward_label',
       'GBP_forward_label', 'CAD_forward_label', 'AUD_forward_label',
       'CHF_forward_label', 'SEK_forward_label', 'NOK_forward_label',
       'NZD_forward_label', 'USD_backward_label', 'EUR_backward_label',
       'JPY_backward_label', 'GBP_backward_label', 'CAD_backward_label',
       'AUD_backward_label', 'CHF_backward_label', 'SEK_backward_label',
       'NOK_backward_label', 'NZD_backward_label'],
      dtype='object')

### 1.7 Assign labels to news articles

In [None]:
# Prepare trading dates DataFrame from df_labels index
# This contains all the trading days where we have labels
df_trading_dates = pd.DataFrame({'trading_date': df_labels.index}).sort_values('trading_date')

# Prepare news articles - ensure sorted and reset index
df_news = df_news.sort_values(by='Date').reset_index(drop=True)

# Create a temporary DataFrame with Date for merge_asof
df_news_temp = df_news[['Date']].copy()

# For FUTURE labels: Use next available trading day if article is on non-trading day
# direction='forward' means: for each article date, find the next trading day
df_news_future = pd.merge_asof(
    df_news_temp.sort_values('Date'),
    df_trading_dates.sort_values('trading_date'),
    left_on='Date',
    right_on='trading_date',
    direction='forward'  # Forward: next available trading day
)

# For PAST labels: Use most recent preceding trading day if article is on non-trading day
# direction='backward' means: for each article date, find the previous trading day
df_news_past = pd.merge_asof(
    df_news_temp.sort_values('Date'),
    df_trading_dates.sort_values('trading_date'),
    left_on='Date',
    right_on='trading_date',
    direction='backward'  # Backward: previous trading day
)

# Merge the aligned trading dates back to df_news using Date as key
df_news = df_news.merge(
    df_news_future[['Date', 'trading_date']].rename(columns={'trading_date': 'trading_date_future'}),
    on='Date',
    how='left'
)
df_news = df_news.merge(
    df_news_past[['Date', 'trading_date']].rename(columns={'trading_date': 'trading_date_past'}),
    on='Date',
    how='left'
)

# Prepare labels DataFrames with trading_date as a column for merging
df_labels_future = df_labels[[col for col in df_labels.columns if col.endswith('_forward_label')]].copy()
df_labels_future['trading_date'] = df_labels_future.index
df_labels_future = df_labels_future.reset_index(drop=True)

df_labels_past = df_labels[[col for col in df_labels.columns if col.endswith('_backward_label')]].copy()
df_labels_past['trading_date'] = df_labels_past.index
df_labels_past = df_labels_past.reset_index(drop=True)

# Merge forward labels using the aligned future trading dates
df_news = df_news.merge(
    df_labels_future,
    left_on='trading_date_future',
    right_on='trading_date',
    how='left',
    suffixes=('', '_future')
)

# Merge backward labels using the aligned past trading dates
df_news = df_news.merge(
    df_labels_past,
    left_on='trading_date_past',
    right_on='trading_date',
    how='left',
    suffixes=('', '_past')
)

# Drop the temporary trading_date columns from merges
df_news = df_news.drop(columns=['trading_date', 'trading_date_future', 'trading_date_past'], errors='ignore')

print(f"Total articles: {len(df_news)}")
print(f"Articles with forward labels: {df_news[[col for col in df_news.columns if col.endswith('_forward_label')]].notna().any(axis=1).sum()}")
print(f"Articles with backward labels: {df_news[[col for col in df_news.columns if col.endswith('_backward_label')]].notna().any(axis=1).sum()}")
print("\nSample of aligned articles:")
label_cols = [col for col in df_news.columns if 'label' in col]
display_cols = ['Date', 'Title'] + label_cols[:6] if len(label_cols) > 6 else ['Date', 'Title'] + label_cols
print(df_news[display_cols].head(10))



Records in df_labels where USD_forward_label is null:
Empty DataFrame
Columns: [USD_forward_label, EUR_forward_label, JPY_forward_label, GBP_forward_label, CAD_forward_label, AUD_forward_label, CHF_forward_label, SEK_forward_label, NOK_forward_label, NZD_forward_label, USD_backward_label, EUR_backward_label, JPY_backward_label, GBP_backward_label, CAD_backward_label, AUD_backward_label, CHF_backward_label, SEK_backward_label, NOK_backward_label, NZD_backward_label]
Index: []


MergeError: Incompatible merge dtype, dtype('<M8[ns]') and dtype('O'), both sides must have numeric dtype