In [1]:
# =============================================================================
# Python Script for VAR-teXt Analysis of Bank of Japan Statements (Gensim Version)
#
# This script replicates the methodology from Ferreira (2021) "Forecasting with
# VAR-teXt and DFM-teXt Models" using Bank of Japan data.
#
# It uses the 'gensim' library for topic modeling, as indicated in the user's
# provided 'lda.ipynb' notebook.
#
# The script performs the following steps:
# 1. Loads the pre-processed text data from 'cleaned_english_corpus.csv'.
# 2. Applies Latent Dirichlet Allocation (LDA) using Gensim to extract topics.
# 3. Loads and prepares the Japanese CPI data from 'cpi.csv'.
# 4. Merges the LDA topics with the CPI data based on document dates.
# 5. Fits a baseline VAR model using only CPI data.
# 6. Fits a text-augmented VAR (VAR-teXt) model using CPI and text topics.
# 7. Generates forecasts from both models and evaluates their performance.
# 8. Visualizes the results.
# =============================================================================

import pandas as pd
import numpy as np
import re
from datetime import datetime

# --- NLP and Machine Learning ---
from gensim import corpora, models

# --- Time Series Analysis ---
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error

# --- Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---
# Out-of-sample window: the last TEST_SIZE monthly rows are held back for
# forecast evaluation.
TEST_SIZE = 12

# Lag order passed to both VAR fits.
VAR_LAGS = 3

# Number of LDA topics (textual factors) extracted from the corpus.
N_TOPICS = 10

def run_lda_analysis(corpus_path: str, n_topics: int):
    """
    Performs LDA using Gensim on the cleaned text corpus.

    The corpus CSV must provide a 'cleaned_tokens' column holding the string
    representation of a list of tokens, and a 'filename' column.

    Args:
        corpus_path (str): Path to the cleaned corpus CSV file.
        n_topics (int): The number of topics for the LDA model (must be >= 1).

    Returns:
        pd.DataFrame: One row per document with columns 'topic_0' ..
        'topic_{n_topics-1}' (topic probabilities) plus 'filename'.
        Returns None if the corpus file does not exist.

    Raises:
        ValueError: If n_topics < 1, if a 'cleaned_tokens' cell is not a
            literal list/tuple, or if the corpus yields an empty vocabulary.
    """
    # Function-scope import keeps the block self-contained (stdlib only).
    import ast

    print("--- Step 1: Running LDA Analysis with Gensim ---")

    if n_topics < 1:
        raise ValueError(f"n_topics must be a positive integer, got {n_topics!r}.")

    try:
        df_corpus = pd.read_csv(corpus_path)
    except FileNotFoundError:
        print(f"Error: The file '{corpus_path}' was not found.")
        print("Please ensure you have run your pre-processing script first.")
        return None

    def _parse_tokens(raw):
        """Turn the stored list literal back into a list without executing code."""
        try:
            tokens = ast.literal_eval(raw)
        except (ValueError, SyntaxError, TypeError) as err:
            raise ValueError(
                f"Could not parse 'cleaned_tokens' entry as a list literal: {raw!r}"
            ) from err
        if not isinstance(tokens, (list, tuple)):
            raise ValueError(
                f"'cleaned_tokens' entry is not a list of tokens: {raw!r}"
            )
        return list(tokens)

    # The pre-processing script saves tokens as a string representation of a list.
    # literal_eval only accepts Python literals, so file contents are never
    # executed (unlike eval).
    processed_docs = df_corpus['cleaned_tokens'].apply(_parse_tokens)

    # Create Gensim Dictionary and Bag-of-Words Corpus
    dictionary = corpora.Dictionary(processed_docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

    # Calculate hyperparameters as per the research paper
    V = len(dictionary)
    if V == 0:
        # Guard the division below; an empty vocabulary cannot be modelled.
        raise ValueError(
            f"The corpus in '{corpus_path}' produced an empty vocabulary."
        )
    alpha = 50 / n_topics
    eta = 200 / V

    print(f"Training LDA with K={n_topics}, alpha={alpha:.4f}, eta={eta:.4f}")

    # Train the LDA model (fixed random_state for reproducible topics)
    lda_model = models.LdaModel(
        corpus=bow_corpus,
        id2word=dictionary,
        num_topics=n_topics,
        alpha=alpha,
        eta=eta,
        passes=15,
        iterations=400,
        random_state=42
    )

    # Print top words for each topic
    print("\n--- Top Words for Each Topic ---")
    for idx, topic in lda_model.print_topics(-1):
        print(f"Topic #{idx}: {topic}")
    print("-" * 30)

    def _topic_vector(doc_bow):
        """Return the dense length-n_topics probability vector for one document."""
        probs = dict(lda_model.get_document_topics(doc_bow, minimum_probability=0))
        return [probs.get(i, 0.0) for i in range(n_topics)]

    # Extract topic probabilities for each document
    topic_distributions = [_topic_vector(doc_bow) for doc_bow in bow_corpus]

    # Create a DataFrame with the topic distributions (textual factors)
    topic_cols = [f'topic_{i}' for i in range(n_topics)]
    df_topics = pd.DataFrame(topic_distributions, columns=topic_cols)
    # Rows are in corpus order, so attach filenames positionally.
    df_topics['filename'] = df_corpus['filename'].values

    print("LDA analysis complete. Textual factors generated.")
    return df_topics

def _month_from_filename(filename):
    """Return 'YYYY-MM' parsed from a 'kYYMMDD...' filename, or None if absent/invalid."""
    if not isinstance(filename, str):
        return None
    match = re.search(r'k(\d{2})(\d{2})(\d{2})', filename)
    if match is None:
        return None
    year_2d, month_2d, _day = match.groups()
    try:
        # %y applies the standard two-digit-year pivot (69-99 -> 19xx,
        # 00-68 -> 20xx) and rejects impossible months such as '13'.
        parsed = datetime.strptime(f"{year_2d}{month_2d}", "%y%m")
    except ValueError:
        return None
    return parsed.strftime("%Y-%m")


def prepare_data(topics_df: pd.DataFrame, cpi_path: str):
    """
    Loads CPI data, prepares it, and merges it with the topic data.

    Topic rows are dated from their 'filename' (e.g. 'k210121a.pdf' ->
    2021-01), averaged per month, left-joined onto the CPI index and
    forward-filled so months without a document reuse the latest topic
    values. Rows that still contain NaN (e.g. before the first document) are
    dropped. The input ``topics_df`` is not modified.

    Args:
        topics_df (pd.DataFrame): DataFrame containing the LDA topic
            distributions ('topic_*' columns) and a 'filename' column.
        cpi_path (str): Path to the CPI data CSV file; it must contain a
            'date' column and a 'cpi' column.

    Returns:
        pd.DataFrame: A merged DataFrame indexed by date with the CPI
        columns, 'cpi_yoy' (12-period percentage change) and the topic
        columns, ready for VAR modeling. Returns None if the CPI file does
        not exist.
    """
    print("\n--- Step 2: Preparing and Merging Data ---")

    # --- Prepare Topic Data ---
    # Work on a copy so the caller's DataFrame does not gain a 'month' column.
    topics = topics_df.copy()
    topics['month'] = topics['filename'].apply(_month_from_filename)
    topics = topics.dropna(subset=['month'])

    # Average topic scores for multiple meetings in the same month
    monthly_topics = topics.groupby('month').mean(numeric_only=True)
    monthly_topics.index = pd.to_datetime(monthly_topics.index, format='%Y-%m')
    monthly_topics.index.name = 'date'

    # --- Prepare CPI Data ---
    try:
        df_cpi = pd.read_csv(cpi_path, parse_dates=['date'])
    except FileNotFoundError:
        print(f"Error: The file '{cpi_path}' was not found.")
        return None

    df_cpi = df_cpi.set_index('date').sort_index()
    df_cpi['cpi_yoy'] = df_cpi['cpi'].pct_change(12) * 100
    df_cpi = df_cpi.dropna()

    # --- Merge DataFrames ---
    # NOTE(review): the join matches on exact timestamps; topic months are
    # stamped on the first day of the month, so this presumably expects CPI
    # dates to be month-start as well - confirm against cpi.csv.
    df_merged = df_cpi.join(monthly_topics, how='left')

    # Forward-fill topic data (DataFrame.ffill replaces the deprecated
    # fillna(method='ffill') form)
    topic_cols = [col for col in df_merged.columns if col.startswith('topic_')]
    df_merged[topic_cols] = df_merged[topic_cols].ffill()
    df_merged = df_merged.dropna()

    print("Data preparation and merging complete.")
    print(f"Final dataset shape: {df_merged.shape}")

    return df_merged

def run_var_analysis(df: pd.DataFrame):
    """
    Fits and compares the Baseline VAR and VAR-teXt models.

    The last TEST_SIZE rows of ``df`` are held out. Both models are fitted on
    the remaining rows with VAR_LAGS lags and forecast TEST_SIZE steps ahead.
    The VAR-teXt forecast is given the held-out rows' topic values as
    ``exog_future``. RMSE for both forecasts is printed, and a comparison
    plot is saved to 'forecast_comparison.png' and shown.

    Args:
        df (pd.DataFrame): The final merged DataFrame for analysis. Must
            contain 'cpi_yoy' and the 'topic_*' columns.

    Raises:
        ValueError: If ``df`` has too few rows to hold out TEST_SIZE
            observations and still fit VAR_LAGS lags.
    """
    print("\n--- Step 3: Running VAR and VAR-teXt Analysis ---")

    # The training slice needs more rows than lags, otherwise the split below
    # leaves nothing to estimate on and the fit fails with an obscure error.
    min_rows = TEST_SIZE + VAR_LAGS + 1
    if len(df) < min_rows:
        raise ValueError(
            f"Need at least {min_rows} rows for TEST_SIZE={TEST_SIZE} and "
            f"VAR_LAGS={VAR_LAGS}, but the dataset has {len(df)}."
        )

    # --- Data Splitting ---
    train_data = df.iloc[:-TEST_SIZE]
    test_data = df.iloc[-TEST_SIZE:]

    # --- Baseline VAR Model ---
    print("\nFitting Baseline VAR model (CPI only)...")
    endog_vars = ['cpi_yoy']
    model_baseline = VAR(train_data[endog_vars])
    results_baseline = model_baseline.fit(VAR_LAGS)

    # --- VAR-teXt Model ---
    print("Fitting VAR-teXt model (CPI + Textual Factors)...")
    # Use K-1 topics as exogenous variables to avoid multicollinearity
    exog_vars = [col for col in df.columns if col.startswith('topic_')][:-1]
    model_text = VAR(train_data[endog_vars], exog=train_data[exog_vars])
    results_text = model_text.fit(VAR_LAGS)

    # --- Forecasting and Evaluation ---
    print("\n--- Step 4: Forecasting and Evaluation ---")

    # Both models are seeded with the same last VAR_LAGS training observations.
    forecast_input = train_data[endog_vars].values[-VAR_LAGS:]

    # Forecast Baseline; flatten the single-column forecast to 1-D.
    forecast_baseline = np.asarray(
        results_baseline.forecast(y=forecast_input, steps=TEST_SIZE)
    ).reshape(-1)

    # Forecast VAR-teXt (uses the test-period topic values as future exog).
    forecast_text = np.asarray(
        results_text.forecast(
            y=forecast_input,
            steps=TEST_SIZE,
            exog_future=test_data[exog_vars],
        )
    ).reshape(-1)

    # Calculate RMSE
    actual = test_data['cpi_yoy']
    rmse_baseline = np.sqrt(mean_squared_error(actual, forecast_baseline))
    rmse_text = np.sqrt(mean_squared_error(actual, forecast_text))

    print(f"\nForecasting Performance (RMSE on test set of {TEST_SIZE} months):")
    print(f"  - Baseline VAR: {rmse_baseline:.4f}")
    print(f"  - VAR-teXt:     {rmse_text:.4f}")

    if rmse_text < rmse_baseline:
        improvement = (rmse_baseline - rmse_text) / rmse_baseline * 100
        print(f"\nThe VAR-teXt model performed better, with an RMSE improvement of {improvement:.2f}%.")
    else:
        print("\nThe Baseline VAR model performed better or equal to the VAR-teXt model.")

    # --- Visualization ---
    # The seaborn styles were renamed with a 'v0_8' infix in newer Matplotlib;
    # fall back to the old name when the new one is not registered.
    style_name = 'seaborn-v0_8-whitegrid'
    if style_name not in plt.style.available:
        style_name = 'seaborn-whitegrid'
    plt.style.use(style_name)
    fig, ax = plt.subplots(figsize=(14, 7))

    ax.plot(df.index, df['cpi_yoy'], label='Actual CPI (Y-o-Y %)', color='black', linewidth=2)
    ax.plot(test_data.index, forecast_baseline, label=f'Baseline VAR Forecast (RMSE: {rmse_baseline:.2f})', linestyle='--', color='blue')
    ax.plot(test_data.index, forecast_text, label=f'VAR-teXt Forecast (RMSE: {rmse_text:.2f})', linestyle='--', color='red')
    # Shade the test window BEFORE building the legend so its label is included.
    ax.axvspan(test_data.index[0], test_data.index[-1], color='gray', alpha=0.2, label='Test Period')

    ax.set_title('CPI Forecast Comparison: Baseline VAR vs. VAR-teXt', fontsize=16)
    ax.set_ylabel('Year-over-Year CPI Change (%)')
    ax.legend(fontsize=12)
    plt.tight_layout()
    plt.savefig("forecast_comparison.png")
    print("\nForecast plot saved as 'forecast_comparison.png'")
    plt.show()

if __name__ == '__main__':
    # Script entry point: run the three pipeline stages in order. Each stage
    # signals failure by returning None, in which case later stages are skipped.
    print("====== Starting VAR-teXt Replication (Gensim Version) ======")

    # Stage 1: topic extraction from the cleaned corpus.
    df_topics = run_lda_analysis("cleaned_english_corpus.csv", n_topics=N_TOPICS)

    # Stage 2 runs only if stage 1 produced topics; stage 3 only if the merge
    # produced a dataset.
    if (
        df_topics is not None
        and (final_data := prepare_data(df_topics, "cpi.csv")) is not None
    ):
        run_var_analysis(df=final_data)

    print("\n====== Analysis Complete ======")


--- Step 1: Running LDA Analysis with Gensim ---
Error: The file 'cleaned_english_corpus.csv' was not found.
Please ensure you have run your pre-processing script first.

