In [2]:
%%bash
set -e

echo "=== Step 1: Create clean Python venv ==="
python3.10 -m venv .venv_mlops_sentiment
source .venv_mlops_sentiment/bin/activate

echo "=== Step 2: Upgrade pip ==="
pip install --upgrade pip

echo "=== Step 3: Install core dependencies ==="
pip install numpy==1.26.4 pandas==2.2.2 scipy==1.11.4
pip install scikit-learn==1.4.2
pip install matplotlib==3.8.4 seaborn==0.13.2
pip install yfinance==0.2.44
pip install nltk==3.9.1

echo "=== Step 4: Install PyCaret (+ all models) ==="
pip install 'pycaret[analysis,models]==3.3.2'

echo "=== Step 5: Download NLTK VADER ==="
python - <<'EOF'
import nltk
nltk.download("vader_lexicon")
EOF

echo "=== Step 6: If macOS, install libomp for LightGBM ==="
if [[ "$OSTYPE" == "darwin"* ]]; then
    brew install libomp || true
    export LDFLAGS="-L/opt/homebrew/opt/libomp/lib"
    export CPPFLAGS="-I/opt/homebrew/opt/libomp/include"
    echo "libomp installed and flags exported."
fi

echo "=== All dependencies installed successfully ==="

=== Step 1: Create clean Python venv ===
=== Step 2: Upgrade pip ===
=== Step 3: Install core dependencies ===
Collecting pandas==2.2.2
  Using cached pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (19 kB)
Using cached pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl (11.3 MB)
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.1.4
    Uninstalling pandas-2.1.4:
      Successfully uninstalled pandas-2.1.4


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pycaret 3.3.2 requires pandas<2.2.0, but you have pandas 2.2.2 which is incompatible.
sktime 0.26.0 requires pandas<2.2.0,>=1.1, but you have pandas 2.2.2 which is incompatible.[0m[31m
[0m

Successfully installed pandas-2.2.2
Collecting matplotlib==3.8.4
  Using cached matplotlib-3.8.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.8 kB)
Using cached matplotlib-3.8.4-cp310-cp310-macosx_11_0_arm64.whl (7.5 MB)
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.7.5
    Uninstalling matplotlib-3.7.5:
      Successfully uninstalled matplotlib-3.7.5


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pycaret 3.3.2 requires matplotlib<3.8.0, but you have matplotlib 3.8.4 which is incompatible.
pycaret 3.3.2 requires pandas<2.2.0, but you have pandas 2.2.2 which is incompatible.[0m[31m
[0m

Successfully installed matplotlib-3.8.4
=== Step 4: Install PyCaret (+ all models) ===
Collecting pandas<2.2.0 (from pycaret==3.3.2->pycaret[analysis,models]==3.3.2)
  Using cached pandas-2.1.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting matplotlib<3.8.0 (from pycaret==3.3.2->pycaret[analysis,models]==3.3.2)
  Using cached matplotlib-3.7.5-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.7 kB)
Using cached matplotlib-3.7.5-cp310-cp310-macosx_11_0_arm64.whl (7.3 MB)
Using cached pandas-2.1.4-cp310-cp310-macosx_11_0_arm64.whl (10.9 MB)
Installing collected packages: pandas, matplotlib
[2K  Attempting uninstall: pandas
[2K    Found existing installation: pandas 2.2.2
[2K    Uninstalling pandas-2.2.2:
[2K      Successfully uninstalled pandas-2.2.2
[2K  Attempting uninstall: matplotlib━━━━━━━━━━━━━━[0m [32m0/2[0m [pandas]━━━━━━━━━━━[0m [32m0/2[0m [pandas]━━━━━━━━━━━[0m [32m0/2[0m [pandas]━━━━━━━━━━━[0m [32m0/2[0m [pandas]━━━━━━━━━━━[0m [32m0/2[0m [pandas]━

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ultraronachart/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


=== Step 6: If macOS, install libomp for LightGBM ===


✔︎ JSON API cask.jws.json
✔︎ JSON API formula.jws.json
To reinstall 21.1.7, run:
  brew reinstall libomp


libomp installed and flags exported.
=== All dependencies installed successfully ===


# Download news data to make a sentiment score and combine to both alternative sentiment score and S&P500 to make a "main database" prepare for run the model.

### Import news text data

In [64]:
# Import libraries
import pandas as pd

### Import News .csv file ###
# Import CNBC news
path_cnbc = "/Users/ultraronachart/Documents/Macine Learning Operations/Final_project/data/raw/cnbc_headlines.csv"
cnbc = pd.read_csv(path_cnbc)

# Import Guardian news
path_gd = "/Users/ultraronachart/Documents/Macine Learning Operations/Final_project/data/raw/guardian_headlines.csv"
guardian = pd.read_csv(path_gd)

# Import Reuters news
path_reuters = "/Users/ultraronachart/Documents/Macine Learning Operations/Final_project/data/raw/reuters_headlines.csv"
reuters = pd.read_csv(path_reuters)

In [65]:
### Prepare each source dataframe to merge later ###
def prepare_source(df, source_name, has_description=True):
    """
    Standardize column names to be lower case, add the source column, and replace description = headline if there is no description. 
    """

    df = df.copy()

    # Rename columns
    rename_map = {
        "Headlines": "headline",
        "Time": "time",
        "Description": "description"
    }
    df = df.rename(columns=rename_map)

    # Replace description = headline if there is no description
    if not has_description:
        df['description'] = df['headline']

    df['source'] = source_name

    # Sort order of the columns
    return df[["time", "headline", "description", "source"]]

In [66]:
cnbc_std = prepare_source(cnbc, "cnbc", has_description=True)
guardian_std = prepare_source(guardian, "guardian", has_description=False)
reuters_std = prepare_source(reuters, "reuters", has_description=True)

# Append (stack) all rows on top of each other
news_all = pd.concat([cnbc_std, guardian_std, reuters_std], ignore_index=True)

In [67]:
### Basic text cleaning ###

# Ensure headline/description are strings and strip whitespace
news_all["headline"] = news_all["headline"].astype(str).str.strip()
news_all["description"] = news_all["description"].astype(str).str.strip()

# Sometimes there are weird line breaks / multiple spaces in Guardian
news_all["headline"] = news_all["headline"].str.replace(r"\s+", " ", regex=True)
news_all["description"] = news_all["description"].str.replace(r"\s+", " ", regex=True)

# Drop rows where headline is empty after stripping
news_all = news_all[news_all["headline"].str.len() > 0]

# --- Time parsing & cleaning ---

# Convert time column to datetime, coerce invalid formats to NaT (missing)
news_all["time"] = pd.to_datetime(news_all["time"], errors="coerce")

# Drop rows where time could not be parsed
news_all = news_all.dropna(subset=["time"])

# Create a pure date column (will be super useful later when we aggregate per day)
news_all["date"] = news_all["time"].dt.date

# --- Handle duplicates ---

# Drop exact duplicate rows (same time, headline, description, source)
news_all = news_all.drop_duplicates(subset=["time", "headline", "description", "source"])

news_all.info()
news_all.head()

<class 'pandas.core.frame.DataFrame'>
Index: 53272 entries, 0 to 53649
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   time         53272 non-null  datetime64[ns]
 1   headline     53272 non-null  object        
 2   description  53272 non-null  object        
 3   source       53272 non-null  object        
 4   date         53272 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 2.4+ MB


Unnamed: 0,time,headline,description,source,date
0,2020-07-17 19:51:00,Jim Cramer: A better way to invest in the Covi...,"""Mad Money"" host Jim Cramer recommended buying...",cnbc,2020-07-17
1,2020-07-17 19:33:00,Cramer's lightning round: I would own Teradyne,"""Mad Money"" host Jim Cramer rings the lightnin...",cnbc,2020-07-17
3,2020-07-17 19:25:00,"Cramer's week ahead: Big week for earnings, ev...","""We'll pay more for the earnings of the non-Co...",cnbc,2020-07-17
4,2020-07-17 16:24:00,IQ Capital CEO Keith Bliss says tech and healt...,"Keith Bliss, IQ Capital CEO, joins ""Closing Be...",cnbc,2020-07-17
5,2020-07-16 19:36:00,Wall Street delivered the 'kind of pullback I'...,"""Look for the stocks of high-quality companies...",cnbc,2020-07-16


### Create the sentiment score using NLTK (1st sentiment analysis approach)

In [68]:
### Create the sentiment of each article ###
!pip install nltk

import nltk
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ultraronachart/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [69]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [70]:
news_all["text"] = (
    news_all["headline"].fillna("") 
    + ". " + 
    news_all["description"].fillna("")
)

In [71]:
def get_sentiment_score(text):
    return sia.polarity_scores(text)

In [72]:
sentiment_df = news_all['text'].apply(get_sentiment_score).apply(pd.Series)

news_all = pd.concat([news_all, sentiment_df], axis=1)

news_all[["time", "date", "source", "headline", "compound", "pos", "neg", "neu"]].head()

Unnamed: 0,time,date,source,headline,compound,pos,neg,neu
0,2020-07-17 19:51:00,2020-07-17,cnbc,Jim Cramer: A better way to invest in the Covi...,0.5267,0.232,0.098,0.671
1,2020-07-17 19:33:00,2020-07-17,cnbc,Cramer's lightning round: I would own Teradyne,-0.2023,0.074,0.098,0.828
3,2020-07-17 19:25:00,2020-07-17,cnbc,"Cramer's week ahead: Big week for earnings, ev...",0.3612,0.078,0.038,0.885
4,2020-07-17 16:24:00,2020-07-17,cnbc,IQ Capital CEO Keith Bliss says tech and healt...,0.8126,0.183,0.0,0.817
5,2020-07-16 19:36:00,2020-07-16,cnbc,Wall Street delivered the 'kind of pullback I'...,-0.6597,0.0,0.134,0.866


In [73]:
daily_sentiment = (
    news_all
    .groupby("date")
    .agg(
        sentiment_compound_mean=("compound", "mean"),
        sentiment_compound_std=("compound", "std"),
        sentiment_compound_min=("compound", "min"),
        sentiment_compound_max=("compound", "max"),
        sentiment_pos_mean=("pos", "mean"),
        sentiment_neg_mean=("neg", "mean"),
        sentiment_neu_mean=("neu", "mean"),
        num_news=("headline", "count")
    )
    .reset_index()
)

# Some days may have only 1 article → std will be NaN
daily_sentiment["sentiment_compound_std"] = daily_sentiment["sentiment_compound_std"].fillna(0.0)

print(daily_sentiment.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 931 entries, 0 to 930
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   date                     931 non-null    object 
 1   sentiment_compound_mean  931 non-null    float64
 2   sentiment_compound_std   931 non-null    float64
 3   sentiment_compound_min   931 non-null    float64
 4   sentiment_compound_max   931 non-null    float64
 5   sentiment_pos_mean       931 non-null    float64
 6   sentiment_neg_mean       931 non-null    float64
 7   sentiment_neu_mean       931 non-null    float64
 8   num_news                 931 non-null    int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 65.6+ KB
None


### Downloading the S&P 500 data

In [74]:
import yfinance as yf
import pandas as pd

In [75]:
sp500 = yf.download("^GSPC", start="2000-01-01", progress=False)

# Flatten column names if they are MultiIndex:
if isinstance(sp500.columns, pd.MultiIndex):
    sp500.columns = [col[0] for col in sp500.columns]  # take the high level

    # Move Date index to a column
sp500 = sp500.reset_index()

# Rename Date to date, and others to lowercase
sp500 = sp500.rename(columns={
    "Date": "date",
    "Close": "close",
    "Open": "open",
    "High": "high",
    "Low": "low",
    "Volume": "volume",
    "Price": "price"
})

# Convert to date only (no time)
sp500["date"] = pd.to_datetime(sp500["date"]).dt.date

sp500.head()

Unnamed: 0,date,close,high,low,open,volume
0,2000-01-03,1455.219971,1478.0,1438.359985,1469.25,931800000
1,2000-01-04,1399.420044,1455.219971,1397.430054,1455.219971,1009000000
2,2000-01-05,1402.109985,1413.27002,1377.680054,1399.420044,1085500000
3,2000-01-06,1403.449951,1411.900024,1392.099976,1402.109985,1092300000
4,2000-01-07,1441.469971,1441.469971,1400.72998,1403.449951,1225200000


In [76]:
# Daily return (%)
sp500["return_t"] = sp500["close"].pct_change() * 100

# Next-day return (%), our target
sp500["return_t_plus_1"] = sp500["return_t"].shift(-1)

# Drop the first and last rows where returns are NaN
sp500 = sp500.dropna(subset=["return_t", "return_t_plus_1"])

sp500.head()

Unnamed: 0,date,close,high,low,open,volume,return_t,return_t_plus_1
1,2000-01-04,1399.420044,1455.219971,1397.430054,1455.219971,1009000000,-3.834467,0.192218
2,2000-01-05,1402.109985,1413.27002,1377.680054,1399.420044,1085500000,0.192218,0.095568
3,2000-01-06,1403.449951,1411.900024,1392.099976,1402.109985,1092300000,0.095568,2.70904
4,2000-01-07,1441.469971,1441.469971,1400.72998,1403.449951,1225200000,2.70904,1.118997
5,2000-01-10,1457.599976,1464.359985,1441.469971,1441.469971,1064800000,1.118997,-1.306251


### Merge VADER sentiment with S&P 500

In [77]:
model_df = pd.merge(
    daily_sentiment,
    sp500[["date", "return_t", "return_t_plus_1"]],
    on="date",
    how="inner"
)

# Sort by date just to be safe
model_df_sorted = model_df.sort_values("date").reset_index(drop=True)

model_df_sorted.head()

Unnamed: 0,date,sentiment_compound_mean,sentiment_compound_std,sentiment_compound_min,sentiment_compound_max,sentiment_pos_mean,sentiment_neg_mean,sentiment_neu_mean,num_news,return_t,return_t_plus_1
0,2017-12-18,-0.11059,0.47669,-0.8807,0.8402,0.0616,0.104625,0.8338,40,0.536281,-0.323027
1,2017-12-19,-0.26177,0.481266,-0.875,0.6759,0.02695,0.12025,0.8528,20,-0.323027,-0.082789
2,2017-12-20,0.031795,0.498608,-0.9698,0.875,0.06895,0.0789,0.85215,20,-0.082789,0.198566
3,2017-12-21,0.07703,0.462032,-0.6808,0.765,0.1139,0.08655,0.79955,20,0.198566,-0.045817
4,2017-12-22,0.037161,0.530048,-0.8481,0.875,0.109478,0.078609,0.811913,23,-0.045817,-0.105842


In [78]:
def label_return_bucket(r):
    if r > 0.5:
        return "up"
    elif r > 0:
        return "slightly_up"
    elif r >= -0.5:
        return "slightly_down"
    else:
        return "down"

model_df_sorted = model_df_sorted.copy()
model_df_sorted["return_bucket"] = model_df_sorted["return_t_plus_1"].apply(label_return_bucket)

# Quick check: counts of each bucket
print(model_df_sorted["return_bucket"].value_counts())
print("\nShares:\n", model_df_sorted["return_bucket"].value_counts(normalize=True))

return_bucket
up               189
slightly_up      172
down             148
slightly_down    138
Name: count, dtype: int64

Shares:
 return_bucket
up               0.292117
slightly_up      0.265842
down             0.228748
slightly_down    0.213292
Name: proportion, dtype: float64


In [79]:
def label_up_down(r):
    return "up" if r > 0 else "down"

model_df_sorted = model_df_sorted.copy()
model_df_sorted['direction_binary'] = model_df_sorted['return_t_plus_1'].apply(label_up_down)

print("Class counts (all data):")
print(model_df_sorted["direction_binary"].value_counts())
print("\nClass shares:")
print(model_df_sorted["direction_binary"].value_counts(normalize=True))

Class counts (all data):
direction_binary
up      361
down    286
Name: count, dtype: int64

Class shares:
direction_binary
up      0.55796
down    0.44204
Name: proportion, dtype: float64


### Download GenAI sentiment data and merge with the Main database (model_df_sorted)

In [80]:
# Import daily sentiment
path_sentiment = "/Users/ultraronachart/Documents/Macine Learning Operations/Final_project/data/raw/daily_sentiment.csv"
daily_sentiment = pd.read_csv(path_sentiment)

In [81]:
daily_sentiment["sentiment_score"] = (daily_sentiment["Positive"] - daily_sentiment["Negative"]) / (
    daily_sentiment["Positive"] + daily_sentiment["Negative"] + daily_sentiment["Neutral"]
)

In [82]:
daily_sentiment["net_sentiment"] = (daily_sentiment["Positive"] - daily_sentiment["Negative"]) / (
    daily_sentiment["Positive"] + daily_sentiment["Negative"]
)

In [83]:
daily_sentiment["date"] = pd.to_datetime(daily_sentiment["date"])
model_df_sorted["date"] = pd.to_datetime(model_df_sorted["date"])

In [84]:
model_df_merged = model_df_sorted.merge(
    daily_sentiment[["date", "sentiment_score", "net_sentiment"]],
    on="date",
    how="left"
)

### Download the sentiment by category data (Hritik file)

In [85]:
# Import News .csv file 
path_data = "/Users/ultraronachart/Documents/Macine Learning Operations/Final_project/data/processed/csv/all_news_final.csv"
all_news = pd.read_csv(path_data)

In [86]:
all_news

Unnamed: 0,id,date,headline,description,text,source,category,category_confidence,sentiment,confidence
0,0,7/17/20,Jim Cramer: A better way to invest in the Covi...,"""Mad Money"" host Jim Cramer recommended buying...",Jim Cramer: A better way to invest in the Covi...,cnbc,Healthcare,0.44,Positive,0.85
1,1,7/17/20,Cramer's lightning round: I would own Teradyne,"""Mad Money"" host Jim Cramer rings the lightnin...",Cramer's lightning round: I would own Teradyne...,cnbc,Technology,0.56,Positive,0.85
2,2,7/17/20,"Cramer's week ahead: Big week for earnings, ev...","""We'll pay more for the earnings of the non-Co...","Cramer's week ahead: Big week for earnings, ev...",cnbc,Healthcare,0.80,Positive,0.85
3,3,7/17/20,IQ Capital CEO Keith Bliss says tech and healt...,"Keith Bliss, IQ Capital CEO, joins ""Closing Be...",IQ Capital CEO Keith Bliss says tech and healt...,cnbc,Technology,0.63,Positive,0.85
4,4,7/16/20,Wall Street delivered the 'kind of pullback I'...,"""Look for the stocks of high-quality companies...",Wall Street delivered the 'kind of pullback I'...,cnbc,Technology,0.80,Positive,0.85
...,...,...,...,...,...,...,...,...,...,...
53267,53267,3/20/18,Malaysia says never hired British data firm at...,The Malaysian government and the ruling party ...,Malaysia says never hired British data firm at...,reuters,Technology,0.60,Neutral,0.60
53268,53268,3/20/18,Prosecutors search Volkswagen headquarters in ...,German prosecutors said on Tuesday they had se...,Prosecutors search Volkswagen headquarters in ...,reuters,Automobile,0.97,Negative,0.85
53269,53269,3/20/18,McDonald's sets greenhouse gas reduction targets,McDonald's Corp on Tuesday announced an approv...,McDonald's sets greenhouse gas reduction targe...,reuters,Energy,0.73,Positive,0.85
53270,53270,3/20/18,Pratt & Whitney to deliver spare A320neo engin...,Pratt & Whitney will soon begin deliveries of ...,Pratt & Whitney to deliver spare A320neo engin...,reuters,Airlines,0.95,Positive,0.85


In [87]:
# make sure date is datetime (optional)
all_news["date"] = pd.to_datetime(all_news["date"])

# --- counts per (date, category, sentiment) ---
cat_counts = (
    all_news
    .groupby(["date", "category", "sentiment"])
    .size()
    .unstack(fill_value=0)        # columns: Positive / Negative / Neutral (if present)
    .reset_index()
)

# ensure columns exist even if some sentiment never appears
for col in ["Positive", "Negative", "Neutral"]:
    if col not in cat_counts.columns:
        cat_counts[col] = 0

# total news per date & category
cat_counts["total_cat_news"] = cat_counts[["Positive", "Negative", "Neutral"]].sum(axis=1)

# avoid division by zero
cat_counts["total_cat_news"] = cat_counts["total_cat_news"].replace(0, pd.NA)

# category-level sentiment index
cat_counts["cat_sentiment"] = (
    (cat_counts["Positive"] - cat_counts["Negative"]) / cat_counts["total_cat_news"]
)

# wide format: one column per category
cat_sent_wide = (
    cat_counts
    .pivot(index="date", columns="category", values="cat_sentiment")
    .add_prefix("sentiment_")      # e.g. sentiment_Healthcare, sentiment_Technology
    .reset_index()
)

cat_sent_wide.head()

category,date,sentiment_Airlines,sentiment_Automobile,sentiment_Corporate,sentiment_Economy,sentiment_Energy,sentiment_Geo-Political,sentiment_Healthcare,sentiment_Technology,sentiment_US Politics
0,2017-12-17,0.0,,-0.333333,,0.333333,-1.0,,0.666667,-0.5
1,2017-12-18,0.0,-1.0,-0.25,-1.0,-0.333333,-1.0,,-1.0,-0.5625
2,2017-12-19,-1.0,-1.0,-0.5,-1.0,1.0,,,0.0,-0.5
3,2017-12-20,,,-1.0,,1.0,-1.0,1.0,0.0,-0.5
4,2017-12-21,1.0,-1.0,0.0,,,0.0,-1.0,1.0,-0.666667


In [88]:
# counts per (date, sentiment)
daily_sent = (
    all_news
    .groupby(["date", "sentiment"])
    .size()
    .unstack(fill_value=0)        # columns: Positive / Negative / Neutral
)

for col in ["Positive", "Negative", "Neutral"]:
    if col not in daily_sent.columns:
        daily_sent[col] = 0

daily_sent["total_news"] = daily_sent[["Positive", "Negative", "Neutral"]].sum(axis=1)
daily_sent["total_news"] = daily_sent["total_news"].replace(0, pd.NA)

daily_sent["overall_sentiment"] = (
    (daily_sent["Positive"] - daily_sent["Negative"]) / daily_sent["total_news"]
)

daily_sent = daily_sent[["overall_sentiment"]].reset_index()

daily_sent.head()

sentiment,date,overall_sentiment
0,2017-12-17,-0.15
1,2017-12-18,-0.5
2,2017-12-19,-0.4
3,2017-12-20,-0.45
4,2017-12-21,-0.35


In [89]:
final_daily = cat_sent_wide.merge(daily_sent, on="date", how="left")

final_daily.head()

Unnamed: 0,date,sentiment_Airlines,sentiment_Automobile,sentiment_Corporate,sentiment_Economy,sentiment_Energy,sentiment_Geo-Political,sentiment_Healthcare,sentiment_Technology,sentiment_US Politics,overall_sentiment
0,2017-12-17,0.0,,-0.333333,,0.333333,-1.0,,0.666667,-0.5,-0.15
1,2017-12-18,0.0,-1.0,-0.25,-1.0,-0.333333,-1.0,,-1.0,-0.5625,-0.5
2,2017-12-19,-1.0,-1.0,-0.5,-1.0,1.0,,,0.0,-0.5,-0.4
3,2017-12-20,,,-1.0,,1.0,-1.0,1.0,0.0,-0.5,-0.45
4,2017-12-21,1.0,-1.0,0.0,,,0.0,-1.0,1.0,-0.666667,-0.35


In [90]:
final_daily_filled = final_daily.copy()

# All sentiment columns (category-level) + overall
sent_cols = [c for c in final_daily_filled.columns if c.startswith("sentiment_")]
sent_cols.append("overall_sentiment")

for col in sent_cols:
    if col in final_daily_filled.columns:
        final_daily_filled[col] = final_daily_filled[col].fillna(0.0)

final_daily_filled.head()

Unnamed: 0,date,sentiment_Airlines,sentiment_Automobile,sentiment_Corporate,sentiment_Economy,sentiment_Energy,sentiment_Geo-Political,sentiment_Healthcare,sentiment_Technology,sentiment_US Politics,overall_sentiment
0,2017-12-17,0.0,0.0,-0.333333,0.0,0.333333,-1.0,0.0,0.666667,-0.5,-0.15
1,2017-12-18,0.0,-1.0,-0.25,-1.0,-0.333333,-1.0,0.0,-1.0,-0.5625,-0.5
2,2017-12-19,-1.0,-1.0,-0.5,-1.0,1.0,0.0,0.0,0.0,-0.5,-0.4
3,2017-12-20,0.0,0.0,-1.0,0.0,1.0,-1.0,1.0,0.0,-0.5,-0.45
4,2017-12-21,1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,1.0,-0.666667,-0.35


In [91]:
final_daily_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 931 entries, 0 to 930
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   date                     931 non-null    datetime64[ns]
 1   sentiment_Airlines       931 non-null    float64       
 2   sentiment_Automobile     931 non-null    float64       
 3   sentiment_Corporate      931 non-null    float64       
 4   sentiment_Economy        931 non-null    float64       
 5   sentiment_Energy         931 non-null    float64       
 6   sentiment_Geo-Political  931 non-null    float64       
 7   sentiment_Healthcare     931 non-null    float64       
 8   sentiment_Technology     931 non-null    float64       
 9   sentiment_US Politics    931 non-null    float64       
 10  overall_sentiment        931 non-null    float64       
dtypes: datetime64[ns](1), float64(10)
memory usage: 80.1 KB


In [92]:
model_df_merged = model_df_merged.merge(
    final_daily_filled,
    on="date",
    how="left"
)

# In case some dates exist in returns but not in news at all:
sent_cols_all = [c for c in model_df_merged.columns if c.startswith("sentiment_")] + ["overall_sentiment"]
for col in sent_cols_all:
    if col in model_df_merged.columns:
        model_df_merged[col] = model_df_merged[col].fillna(0.0)

In [93]:
model_df_merged.head()

Unnamed: 0,date,sentiment_compound_mean,sentiment_compound_std,sentiment_compound_min,sentiment_compound_max,sentiment_pos_mean,sentiment_neg_mean,sentiment_neu_mean,num_news,return_t,...,sentiment_Airlines,sentiment_Automobile,sentiment_Corporate,sentiment_Economy,sentiment_Energy,sentiment_Geo-Political,sentiment_Healthcare,sentiment_Technology,sentiment_US Politics,overall_sentiment
0,2017-12-18,-0.11059,0.47669,-0.8807,0.8402,0.0616,0.104625,0.8338,40,0.536281,...,0.0,-1.0,-0.25,-1.0,-0.333333,-1.0,0.0,-1.0,-0.5625,-0.5
1,2017-12-19,-0.26177,0.481266,-0.875,0.6759,0.02695,0.12025,0.8528,20,-0.323027,...,-1.0,-1.0,-0.5,-1.0,1.0,0.0,0.0,0.0,-0.5,-0.4
2,2017-12-20,0.031795,0.498608,-0.9698,0.875,0.06895,0.0789,0.85215,20,-0.082789,...,0.0,0.0,-1.0,0.0,1.0,-1.0,1.0,0.0,-0.5,-0.45
3,2017-12-21,0.07703,0.462032,-0.6808,0.765,0.1139,0.08655,0.79955,20,0.198566,...,1.0,-1.0,0.0,0.0,0.0,0.0,-1.0,1.0,-0.666667,-0.35
4,2017-12-22,0.037161,0.530048,-0.8481,0.875,0.109478,0.078609,0.811913,23,-0.045817,...,0.0,0.0,-0.555556,-1.0,1.0,0.0,-1.0,0.6,0.6,0.043478


In [94]:
model_df_merged.columns

Index(['date', 'sentiment_compound_mean', 'sentiment_compound_std',
       'sentiment_compound_min', 'sentiment_compound_max',
       'sentiment_pos_mean', 'sentiment_neg_mean', 'sentiment_neu_mean',
       'num_news', 'return_t', 'return_t_plus_1', 'return_bucket',
       'direction_binary', 'sentiment_score', 'net_sentiment',
       'sentiment_Airlines', 'sentiment_Automobile', 'sentiment_Corporate',
       'sentiment_Economy', 'sentiment_Energy', 'sentiment_Geo-Political',
       'sentiment_Healthcare', 'sentiment_Technology', 'sentiment_US Politics',
       'overall_sentiment'],
      dtype='object')

# AutoML to refine the algorithms

In [124]:
# Copy the dataframe
df = model_df_merged.copy()

# Make sure the date is sorted
df = df.sort_values("date")

# drop rows with missing targets if any
targets = ["return_t_plus_1", "return_bucket", "direction_binary"]
df = df.dropna(subset=targets)

In [125]:
# Time-based train/test split (e.g. 70/30)
n = len(df)
split_idx = int(n * 0.7)

train_df = df.iloc[:split_idx].reset_index(drop=True)
test_df  = df.iloc[split_idx:].reset_index(drop=True)

print("Train size:", train_df.shape, "Test size:", test_df.shape)
print("Train period:", model_df_sorted['date'].iloc[0], '->', model_df_sorted['date'].iloc[split_idx-1])
print("Test period:", model_df_sorted["date"].iloc[split_idx], "→", model_df_sorted["date"].iloc[-1])

Train size: (452, 25) Test size: (195, 25)
Train period: 2017-12-18 00:00:00 -> 2019-10-08 00:00:00
Test period: 2019-10-09 00:00:00 → 2020-07-17 00:00:00


In [130]:
### Feature set up
feature_cols_all = [
    "sentiment_compound_mean",
    "sentiment_compound_std",
    "sentiment_compound_min",
    "sentiment_compound_max",
    "sentiment_pos_mean",
    "sentiment_neg_mean",
    "sentiment_neu_mean",
    "sentiment_score",
    "sentiment_Airlines",
    "sentiment_Automobile",
    "sentiment_Corporate",
    "sentiment_Economy",
    "sentiment_Energy",
    "sentiment_Geo-Political", 
    "sentiment_Healthcare",
    "sentiment_Technology",
    "sentiment_US Politics",
    "overall_sentiment",
]

feature_cols_vader = [
    "sentiment_compound_mean",
    "sentiment_compound_std",
    "sentiment_compound_min",
    "sentiment_compound_max",
    "sentiment_pos_mean",
    "sentiment_neg_mean",
    "sentiment_neu_mean",
]

feature_cols_genai = [
    "sentiment_score",
]

feature_cols_sector = [
    "sentiment_Airlines",
    "sentiment_Automobile",
    "sentiment_Corporate",
    "sentiment_Economy",
    "sentiment_Energy",
    "sentiment_Geo-Political", 
    "sentiment_Healthcare",
    "sentiment_Technology",
    "sentiment_US Politics",
    "overall_sentiment",
]

In [134]:
import numpy as np
import pandas as pd

from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    accuracy_score,
    f1_score,
)

from pycaret.regression import (
    setup as reg_setup,
    compare_models as reg_compare,
    finalize_model as reg_finalize,
    predict_model as reg_predict,
)

from pycaret.classification import (
    setup as clf_setup,
    compare_models as clf_compare,
    finalize_model as clf_finalize,
    predict_model as clf_predict,
)

# -------------------------------------------------------------------
# 0. DEFINE FEATURE SETS (plug in your actual lists here)
# -------------------------------------------------------------------
feature_sets = {
    "All sentiment":      feature_cols_all,     # e.g. all sentiment features
    "VADER sentiment":    feature_cols_vader,
    "GenAI sentiment":    feature_cols_genai,
    "Sectoral sentiment": feature_cols_sector,  # category-based indices
}

# -------------------------------------------------------------------
# 1. HELPER: DETECT PREDICTION COLUMN FROM predict_model()
# -------------------------------------------------------------------
def get_prediction_column(pred_df: pd.DataFrame, feature_cols):
    """
    pred_df: DataFrame returned by pycaret.predict_model
    feature_cols: list of feature column names used in X_test

    Returns the name of the column that contains predictions
    (works even if it's not called 'Label').
    """
    extra_cols = [c for c in pred_df.columns if c not in feature_cols]
    # You can print once to inspect:
    # print("Extra prediction columns:", extra_cols)
    pred_col = extra_cols[0]
    return pred_col

# -------------------------------------------------------------------
# 2. GLOBAL RESULTS LIST + LOGGING HELPERS
# -------------------------------------------------------------------
results = []

def log_regression_result(target, feature_set, model, y_true, y_pred):
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2   = r2_score(y_true, y_pred)

    # directional accuracy: sign of return (up vs non-up)
    true_dir = np.where(y_true > 0, 1, 0)
    pred_dir = np.where(y_pred > 0, 1, 0)
    dir_acc  = (true_dir == pred_dir).mean()

    results.append(
        {
            "task":          "regression",
            "target":        target,
            "feature_set":   feature_set,
            "model":         model.__class__.__name__,
            "MAE":           mae,
            "RMSE":          rmse,
            "R2":            r2,
            "DirectionalAcc": dir_acc,
            "Accuracy":      np.nan,
            "F1_macro":      np.nan,
        }
    )

def log_classification_result(task_name, feature_set, model, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred, average="macro")

    results.append(
        {
            "task":          task_name,              # 'bucket_return' or 'direction_binary'
            "target":        task_name,
            "feature_set":   feature_set,
            "model":         model.__class__.__name__,
            "MAE":           np.nan,
            "RMSE":          np.nan,
            "R2":            np.nan,
            "DirectionalAcc": np.nan,
            "Accuracy":      acc,
            "F1_macro":      f1,
        }
    )

# ===================================================================
# 3. REGRESSION: return_t_plus_1  (4 feature sets)
# ===================================================================
for fs_name, fs_cols in feature_sets.items():
    print(f"\n===== REGRESSION – return_t_plus_1 – {fs_name} =====\n")

    reg_train = train_df[fs_cols + ["return_t_plus_1"]].copy()

    reg_setup(
        data=reg_train,
        target="return_t_plus_1",
        session_id=42,
        html=False,
        log_experiment=False,
        n_jobs=1,      # avoid joblib issues on Mac
        verbose=True,
    )

    best_reg = reg_compare(sort="MAE", turbo=True)
    print("Best model:", best_reg)

    final_reg = reg_finalize(best_reg)

    X_test = test_df[fs_cols].copy()
    y_test = test_df["return_t_plus_1"].copy()

    pred_df = reg_predict(final_reg, data=X_test)
    pred_col = get_prediction_column(pred_df, fs_cols)
    y_pred = pred_df[pred_col]

    log_regression_result(
        target="return_t_plus_1",
        feature_set=fs_name,
        model=final_reg,
        y_true=y_test,
        y_pred=y_pred,
    )

# ===================================================================
# 4. CLASSIFICATION: bucket_return  (multi-class, 4 feature sets)
# ===================================================================
for fs_name, fs_cols in feature_sets.items():
    print(f"\n===== CLASSIFICATION – return_bucket – {fs_name} =====\n")

    cls_train = train_df[fs_cols + ["return_bucket"]].copy()

    clf_setup(
        data=cls_train,
        target="return_bucket",
        session_id=42,
        html=False,
        log_experiment=False,
        n_jobs=1,
        verbose=True,
    )

    best_cls = clf_compare(sort="Accuracy", turbo=True)
    print("Best model:", best_cls)

    final_cls = clf_finalize(best_cls)

    X_test = test_df[fs_cols].copy()
    y_test = test_df["return_bucket"].copy()

    pred_df = clf_predict(final_cls, data=X_test)
    pred_col = get_prediction_column(pred_df, fs_cols)
    y_pred = pred_df[pred_col]

    log_classification_result(
        task_name="bucket_return",
        feature_set=fs_name,
        model=final_cls,
        y_true=y_test,
        y_pred=y_pred,
    )

# ===================================================================
# 5. CLASSIFICATION: direction_binary (binary, 4 feature sets)
# ===================================================================
for fs_name, fs_cols in feature_sets.items():
    print(f"\n===== CLASSIFICATION – direction_binary – {fs_name} =====\n")

    bin_train = train_df[fs_cols + ["direction_binary"]].copy()

    clf_setup(
        data=bin_train,
        target="direction_binary",
        session_id=42,
        html=False,
        log_experiment=False,
        n_jobs=1,
        verbose=True,
    )

    best_bin = clf_compare(sort="Accuracy", turbo=True)
    print("Best model:", best_bin)

    final_bin = clf_finalize(best_bin)

    X_test = test_df[fs_cols].copy()
    y_test = test_df["direction_binary"].copy()

    pred_df = clf_predict(final_bin, data=X_test)
    pred_col = get_prediction_column(pred_df, fs_cols)
    y_pred = pred_df[pred_col]

    log_classification_result(
        task_name="direction_binary",
        feature_set=fs_name,
        model=final_bin,
        y_true=y_test,
        y_pred=y_pred,
    )

# ===================================================================
# 6. SUMMARY EVALUATION TABLE
# ===================================================================
summary_df = pd.DataFrame(results).sort_values(
    by=["task", "feature_set"]
).reset_index(drop=True)

display(summary_df)

# Optionally save
summary_df.to_csv("model_summary_results.csv", index=False)


===== REGRESSION – return_t_plus_1 – All sentiment =====

                    Description             Value
0                    Session id                42
1                        Target   return_t_plus_1
2                   Target type        Regression
3           Original data shape         (452, 19)
4        Transformed data shape         (452, 19)
5   Transformed train set shape         (316, 19)
6    Transformed test set shape         (136, 19)
7              Numeric features                18
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12               Fold Generator             KFold
13                  Fold Number                10
14                     CPU Jobs                 1
15                      Use GPU             False
16               Log Experiment             False
17              Experiment Name  reg-defa

                                                           

                                    Model          MAE           MSE  \
lasso                    Lasso Regression       0.7166  1.037300e+00   
llar         Lasso Least Angle Regression       0.7166  1.037300e+00   
dummy                     Dummy Regressor       0.7166  1.037300e+00   
en                            Elastic Net       0.7166  1.037300e+00   
br                         Bayesian Ridge       0.7174  1.038600e+00   
omp           Orthogonal Matching Pursuit       0.7328  1.071900e+00   
huber                     Huber Regressor       0.7501  1.087100e+00   
ridge                    Ridge Regression       0.7603  1.095700e+00   
et                  Extra Trees Regressor       0.7689  1.135800e+00   
rf                Random Forest Regressor       0.7701  1.153700e+00   
lr                      Linear Regression       0.7720  1.104500e+00   
catboost               CatBoost Regressor       0.7961  1.208900e+00   
knn                 K Neighbors Regressor       0.8075  1.201700

                                                           

                                    Model     MAE       MSE    RMSE        R2  \
lasso                    Lasso Regression  0.7166    1.0373  1.0001   -0.0483   
llar         Lasso Least Angle Regression  0.7166    1.0373  1.0001   -0.0483   
dummy                     Dummy Regressor  0.7166    1.0373  1.0001   -0.0483   
en                            Elastic Net  0.7166    1.0373  1.0001   -0.0483   
ridge                    Ridge Regression  0.7184    1.0391  1.0011   -0.0506   
huber                     Huber Regressor  0.7202    1.0314  0.9995   -0.0535   
br                         Bayesian Ridge  0.7219    1.0511  1.0069   -0.0630   
omp           Orthogonal Matching Pursuit  0.7273    1.0540  1.0080   -0.0647   
lr                      Linear Regression  0.7328    1.0419  1.0044   -0.0652   
rf                Random Forest Regressor  0.7670    1.1206  1.0399   -0.1347   
et                  Extra Trees Regressor  0.7685    1.1241  1.0390   -0.1301   
catboost               CatBo

                                                           

                                    Model     MAE     MSE    RMSE      R2  \
huber                     Huber Regressor  0.7159  1.0444  1.0060 -0.0660   
llar         Lasso Least Angle Regression  0.7166  1.0373  1.0001 -0.0483   
dummy                     Dummy Regressor  0.7166  1.0373  1.0001 -0.0483   
lasso                    Lasso Regression  0.7166  1.0373  1.0001 -0.0483   
en                            Elastic Net  0.7166  1.0373  1.0001 -0.0483   
br                         Bayesian Ridge  0.7190  1.0475  1.0057 -0.0611   
ridge                    Ridge Regression  0.7192  1.0444  1.0048 -0.0605   
omp           Orthogonal Matching Pursuit  0.7203  1.0461  1.0059 -0.0633   
lr                      Linear Regression  0.7203  1.0461  1.0059 -0.0633   
lar                Least Angle Regression  0.7203  1.0461  1.0059 -0.0633   
lightgbm  Light Gradient Boosting Machine  0.7583  1.1292  1.0482 -0.1617   
gbr           Gradient Boosting Regressor  0.8046  1.3016  1.1184 -0.3168   

                                                           

                                    Model     MAE     MSE    RMSE      R2  \
lasso                    Lasso Regression  0.7166  1.0373  1.0001 -0.0483   
llar         Lasso Least Angle Regression  0.7166  1.0373  1.0001 -0.0483   
dummy                     Dummy Regressor  0.7166  1.0373  1.0001 -0.0483   
en                            Elastic Net  0.7166  1.0373  1.0001 -0.0483   
br                         Bayesian Ridge  0.7174  1.0387  1.0007 -0.0495   
omp           Orthogonal Matching Pursuit  0.7328  1.0719  1.0173 -0.0856   
huber                     Huber Regressor  0.7346  1.0726  1.0194 -0.0948   
ridge                    Ridge Regression  0.7563  1.0903  1.0270 -0.1096   
lr                      Linear Regression  0.7585  1.0928  1.0283 -0.1127   
lar                Least Angle Regression  0.7585  1.0928  1.0283 -0.1127   
rf                Random Forest Regressor  0.7841  1.1709  1.0668 -0.2075   
et                  Extra Trees Regressor  0.7967  1.1898  1.0743 -0.2245   

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
nb                            Naive Bayes    0.3075  0.5898  0.3075  0.3135   
et                 Extra Trees Classifier    0.2979  0.5552  0.2979  0.2863   
lda          Linear Discriminant Analysis    0.2880  0.0000  0.2880  0.2761   
xgboost         Extreme Gradient Boosting    0.2787  0.5074  0.2787  0.2816   
catboost              CatBoost Classifier    0.2787  0.5148  0.2787  0.2813   
gbc          Gradient Boosting Classifier    0.2784  0.0000  0.2784  0.2808   
rf               Random Forest Classifier    0.2783  0.5373  0.2783  0.2751   
svm                   SVM - Linear Kernel    0.2755  0.0000  0.2755  0.1773   
lightgbm  Light Gradient Boosting Machine    0.2752  0.5111  0.2752  0.2824   
dummy                    Dummy Classifier    0.2752  0.5000  0.2752  0.0759   
ridge                    Ridge Classifier    0.2689  0.0000  0.2689  0.2398   
lr                    Logistic Regression    0.2659 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.3134  0.0000  0.3134  0.1717   
knn                K Neighbors Classifier    0.3068  0.5430  0.3068  0.3091   
lda          Linear Discriminant Analysis    0.3037  0.0000  0.3037  0.3028   
et                 Extra Trees Classifier    0.2921  0.5179  0.2921  0.2853   
ada                  Ada Boost Classifier    0.2911  0.0000  0.2911  0.2941   
lightgbm  Light Gradient Boosting Machine    0.2887  0.5164  0.2887  0.2867   
ridge                    Ridge Classifier    0.2879  0.0000  0.2879  0.1943   
rf               Random Forest Classifier    0.2827  0.5212  0.2827  0.2770   
xgboost         Extreme Gradient Boosting    0.2825  0.5199  0.2825  0.2713   
svm                   SVM - Linear Kernel    0.2821  0.0000  0.2821  0.1534   
dummy                    Dummy Classifier    0.2752  0.5000  0.2752  0.0759   
catboost              CatBoost Classifier    0.2731 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
ridge                    Ridge Classifier    0.3324  0.0000  0.3324  0.1942   
lda          Linear Discriminant Analysis    0.3323  0.0000  0.3323  0.1939   
lr                    Logistic Regression    0.3258  0.0000  0.3258  0.1803   
nb                            Naive Bayes    0.2944  0.5778  0.2944  0.2358   
qda       Quadratic Discriminant Analysis    0.2944  0.0000  0.2944  0.2358   
lightgbm  Light Gradient Boosting Machine    0.2817  0.5343  0.2817  0.2911   
knn                K Neighbors Classifier    0.2789  0.5701  0.2789  0.2752   
dummy                    Dummy Classifier    0.2752  0.5000  0.2752  0.0759   
xgboost         Extreme Gradient Boosting    0.2600  0.5401  0.2600  0.2681   
gbc          Gradient Boosting Classifier    0.2598  0.0000  0.2598  0.2608   
rf               Random Forest Classifier    0.2597  0.5399  0.2597  0.2586   
catboost              CatBoost Classifier    0.2595 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.2973  0.5379  0.2973  0.3141   
gbc          Gradient Boosting Classifier    0.2973  0.0000  0.2973  0.3007   
qda       Quadratic Discriminant Analysis    0.2820  0.0000  0.2820  0.2951   
nb                            Naive Bayes    0.2787  0.5588  0.2787  0.2932   
catboost              CatBoost Classifier    0.2784  0.5051  0.2784  0.2842   
dummy                    Dummy Classifier    0.2752  0.5000  0.2752  0.0759   
lightgbm  Light Gradient Boosting Machine    0.2749  0.5135  0.2749  0.2619   
xgboost         Extreme Gradient Boosting    0.2693  0.5150  0.2693  0.2588   
lr                    Logistic Regression    0.2660  0.0000  0.2660  0.2397   
dt               Decision Tree Classifier    0.2660  0.5086  0.2660  0.2596   
ridge                    Ridge Classifier    0.2660  0.0000  0.2660  0.2390   
lda          Linear Discriminant Analysis    0.2565 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dummy                    Dummy Classifier    0.5412  0.5000  0.5412  0.2930   
lda          Linear Discriminant Analysis    0.5344  0.5591  0.5344  0.5290   
dt               Decision Tree Classifier    0.5217  0.5153  0.5217  0.5222   
svm                   SVM - Linear Kernel    0.5123  0.5227  0.5123  0.4772   
gbc          Gradient Boosting Classifier    0.5034  0.4820  0.5034  0.4980   
qda       Quadratic Discriminant Analysis    0.4992  0.5320  0.4992  0.5129   
ada                  Ada Boost Classifier    0.4905  0.4721  0.4905  0.4895   
ridge                    Ridge Classifier    0.4845  0.5004  0.4845  0.4691   
xgboost         Extreme Gradient Boosting    0.4844  0.4585  0.4844  0.4775   
lightgbm  Light Gradient Boosting Machine    0.4843  0.4775  0.4843  0.4805   
nb                            Naive Bayes    0.4778  0.5020  0.4778  0.5001   
catboost              CatBoost Classifier    0.4777 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.5412  0.4809  0.5412  0.2930   
dummy                    Dummy Classifier    0.5412  0.5000  0.5412  0.2930   
ridge                    Ridge Classifier    0.5319  0.5023  0.5319  0.4823   
lda          Linear Discriminant Analysis    0.5098  0.5308  0.5098  0.4978   
qda       Quadratic Discriminant Analysis    0.5062  0.5291  0.5062  0.5436   
svm                   SVM - Linear Kernel    0.5030  0.4994  0.5030  0.3314   
ada                  Ada Boost Classifier    0.4808  0.4564  0.4808  0.4762   
dt               Decision Tree Classifier    0.4781  0.4751  0.4781  0.4787   
nb                            Naive Bayes    0.4779  0.5179  0.4779  0.5057   
gbc          Gradient Boosting Classifier    0.4751  0.4539  0.4751  0.4699   
rf               Random Forest Classifier    0.4622  0.4347  0.4622  0.4563   
knn                K Neighbors Classifier    0.4531 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.5636  0.5711  0.5636  0.5591   
et                 Extra Trees Classifier    0.5538  0.5429  0.5538  0.5531   
knn                K Neighbors Classifier    0.5508  0.5495  0.5508  0.5478   
ridge                    Ridge Classifier    0.5413  0.5371  0.5413  0.3573   
dummy                    Dummy Classifier    0.5412  0.5000  0.5412  0.2930   
rf               Random Forest Classifier    0.5382  0.5677  0.5382  0.5334   
lr                    Logistic Regression    0.5381  0.5371  0.5381  0.2922   
dt               Decision Tree Classifier    0.5381  0.5482  0.5381  0.5369   
lda          Linear Discriminant Analysis    0.5381  0.5371  0.5381  0.3565   
xgboost         Extreme Gradient Boosting    0.5316  0.5453  0.5316  0.5305   
svm                   SVM - Linear Kernel    0.5253  0.5114  0.5253  0.3044   
ada                  Ada Boost Classifier    0.5220 

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dummy                    Dummy Classifier    0.5412  0.5000  0.5412  0.2930   
xgboost         Extreme Gradient Boosting    0.5161  0.5059  0.5161  0.5148   
rf               Random Forest Classifier    0.5157  0.4859  0.5157  0.5076   
gbc          Gradient Boosting Classifier    0.5128  0.4870  0.5128  0.5146   
ada                  Ada Boost Classifier    0.5041  0.4914  0.5041  0.5019   
svm                   SVM - Linear Kernel    0.5004  0.4564  0.5004  0.4274   
lightgbm  Light Gradient Boosting Machine    0.4939  0.4945  0.4939  0.4924   
catboost              CatBoost Classifier    0.4907  0.5101  0.4907  0.4831   
lr                    Logistic Regression    0.4878  0.4568  0.4878  0.4943   
ridge                    Ridge Classifier    0.4847  0.4571  0.4847  0.4752   
lda          Linear Discriminant Analysis    0.4847  0.4538  0.4847  0.4755   
et                 Extra Trees Classifier    0.4782 

Unnamed: 0,task,target,feature_set,model,MAE,RMSE,R2,DirectionalAcc,Accuracy,F1_macro
0,bucket_return,bucket_return,All sentiment,Pipeline,,,,,0.302564,0.247257
1,bucket_return,bucket_return,GenAI sentiment,Pipeline,,,,,0.353846,0.247107
2,bucket_return,bucket_return,Sectoral sentiment,Pipeline,,,,,0.230769,0.218055
3,bucket_return,bucket_return,VADER sentiment,Pipeline,,,,,0.4,0.247018
4,direction_binary,direction_binary,All sentiment,Pipeline,,,,,0.594872,0.37299
5,direction_binary,direction_binary,GenAI sentiment,Pipeline,,,,,0.512821,0.489121
6,direction_binary,direction_binary,Sectoral sentiment,Pipeline,,,,,0.594872,0.37299
7,direction_binary,direction_binary,VADER sentiment,Pipeline,,,,,0.594872,0.37299
8,regression,return_t_plus_1,All sentiment,Pipeline,1.377527,2.341142,-0.000768,0.594872,,
9,regression,return_t_plus_1,GenAI sentiment,Pipeline,1.36824,2.341494,-0.001069,0.6,,


# OPTIONAL: Auto ML by hand (just for look at the detail result)

### Regression model (Target = return_t_plus_1)

#### Features = All sentiment

In [99]:
all_sentiment = [
    "sentiment_compound_mean",
    "sentiment_compound_std",
    "sentiment_compound_min",
    "sentiment_compound_max",
    "sentiment_pos_mean",
    "sentiment_neg_mean",
    "sentiment_neu_mean",
    "sentiment_score",
    "sentiment_Airlines",
    "sentiment_Automobile",
    "sentiment_Corporate",
    "sentiment_Economy",
    "sentiment_Energy",
    "sentiment_Geo-Political", 
    "sentiment_Healthcare",
    "sentiment_Technology",
    "sentiment_US Politics",
    "overall_sentiment",
]

In [100]:
from pycaret.regression import (
    setup as reg_setup,
    compare_models as reg_compare,
    finalize_model as reg_finalize,
    predict_model as reg_predict,
)

print("\n================ REGRESSION: return_t_plus_1 ================\n")

reg_train = train_df[all_sentiment + ["return_t_plus_1"]].copy()

reg_setup(
    data=reg_train,
    target="return_t_plus_1",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,    # avoid joblib multiprocessing issues on Mac
    verbose=True
)

# 🚀 AutoML over FULL model library (no include=...)
best_reg_all = reg_compare(sort="MAE", turbo=True)
print("Best model (Regression):", best_reg_all)

# ❌ No tune_model (some models have no param grid) → just finalize
final_reg_all = reg_finalize(best_reg_all)

# Evaluate on test set
X_test_reg_all = test_df[all_sentiment].copy()
y_test_reg_all = test_df["return_t_plus_1"].copy()

pred_reg = reg_predict(final_reg_all , data=X_test_reg)
pred_col_reg = get_prediction_column(pred_reg, all_sentiment)
y_pred_reg_all = pred_reg[pred_col_reg]

mae = mean_absolute_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
r2   = r2_score(y_test_reg, y_pred_reg)

# 🔹 Directional accuracy (up/down based on 0 threshold)
true_dir = np.where(y_test_reg > 0, 1, 0)   # 1 = up, 0 = down/flat
pred_dir = np.where(y_pred_reg > 0, 1, 0)

directional_acc = (true_dir == pred_dir).mean()

print("\nRegression – return_t_plus_1 results:")
print("Prediction column:", pred_col_reg)
print("MAE :", mae)
print("RMSE:", rmse)
print("R²  :", r2)
print("Directional Accuracy (up vs down):", directional_acc)



                    Description             Value
0                    Session id                42
1                        Target   return_t_plus_1
2                   Target type        Regression
3           Original data shape         (452, 19)
4        Transformed data shape         (452, 19)
5   Transformed train set shape         (316, 19)
6    Transformed test set shape         (136, 19)
7              Numeric features                18
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12               Fold Generator             KFold
13                  Fold Number                10
14                     CPU Jobs                 1
15                      Use GPU             False
16               Log Experiment             False
17              Experiment Name  reg-default-name
18                          USI              049

                                                           

                                    Model          MAE           MSE  \
lasso                    Lasso Regression       0.7166  1.037300e+00   
llar         Lasso Least Angle Regression       0.7166  1.037300e+00   
dummy                     Dummy Regressor       0.7166  1.037300e+00   
en                            Elastic Net       0.7166  1.037300e+00   
br                         Bayesian Ridge       0.7174  1.038600e+00   
omp           Orthogonal Matching Pursuit       0.7328  1.071900e+00   
huber                     Huber Regressor       0.7501  1.087100e+00   
ridge                    Ridge Regression       0.7603  1.095700e+00   
et                  Extra Trees Regressor       0.7689  1.135800e+00   
rf                Random Forest Regressor       0.7701  1.153700e+00   
lr                      Linear Regression       0.7720  1.104500e+00   
catboost               CatBoost Regressor       0.7961  1.208900e+00   
knn                 K Neighbors Regressor       0.8075  1.201700

#### Features = Only the VADER sentiment

In [101]:
feature_cols = [
    "sentiment_compound_mean",
    "sentiment_compound_std",
    "sentiment_compound_min",
    "sentiment_compound_max",
    "sentiment_pos_mean",
    "sentiment_neg_mean",
    "sentiment_neu_mean",
]

In [102]:
from pycaret.regression import (
    setup as reg_setup,
    compare_models as reg_compare,
    finalize_model as reg_finalize,
    predict_model as reg_predict,
)

print("\n================ REGRESSION: return_t_plus_1 ================\n")

reg_train = train_df[feature_cols + ["return_t_plus_1"]].copy()

reg_setup(
    data=reg_train,
    target="return_t_plus_1",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,    # avoid joblib multiprocessing issues on Mac
    verbose=True
)

# 🚀 AutoML over FULL model library (no include=...)
best_reg = reg_compare(sort="MAE", turbo=True)
print("Best model (Regression):", best_reg)

# ❌ No tune_model (some models have no param grid) → just finalize
final_reg = reg_finalize(best_reg)

# Evaluate on test set
X_test_reg = test_df[feature_cols].copy()
y_test_reg = test_df["return_t_plus_1"].copy()

pred_reg = reg_predict(final_reg, data=X_test_reg)
pred_col_reg = get_prediction_column(pred_reg, feature_cols)
y_pred_reg = pred_reg[pred_col_reg]

mae = mean_absolute_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
r2   = r2_score(y_test_reg, y_pred_reg)

# 🔹 Directional accuracy (up/down based on 0 threshold)
true_dir = np.where(y_test_reg > 0, 1, 0)   # 1 = up, 0 = down/flat
pred_dir = np.where(y_pred_reg > 0, 1, 0)

directional_acc = (true_dir == pred_dir).mean()

print("\nRegression – return_t_plus_1 results:")
print("Prediction column:", pred_col_reg)
print("MAE :", mae)
print("RMSE:", rmse)
print("R²  :", r2)
print("Directional Accuracy (up vs down):", directional_acc)



                    Description             Value
0                    Session id                42
1                        Target   return_t_plus_1
2                   Target type        Regression
3           Original data shape          (452, 8)
4        Transformed data shape          (452, 8)
5   Transformed train set shape          (316, 8)
6    Transformed test set shape          (136, 8)
7              Numeric features                 7
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12               Fold Generator             KFold
13                  Fold Number                10
14                     CPU Jobs                 1
15                      Use GPU             False
16               Log Experiment             False
17              Experiment Name  reg-default-name
18                          USI              d73

                                                           

                                    Model     MAE       MSE    RMSE        R2  \
lasso                    Lasso Regression  0.7166    1.0373  1.0001   -0.0483   
llar         Lasso Least Angle Regression  0.7166    1.0373  1.0001   -0.0483   
dummy                     Dummy Regressor  0.7166    1.0373  1.0001   -0.0483   
en                            Elastic Net  0.7166    1.0373  1.0001   -0.0483   
ridge                    Ridge Regression  0.7184    1.0391  1.0011   -0.0506   
huber                     Huber Regressor  0.7202    1.0314  0.9995   -0.0535   
br                         Bayesian Ridge  0.7219    1.0511  1.0069   -0.0630   
omp           Orthogonal Matching Pursuit  0.7273    1.0540  1.0080   -0.0647   
lr                      Linear Regression  0.7328    1.0419  1.0044   -0.0652   
rf                Random Forest Regressor  0.7670    1.1206  1.0399   -0.1347   
et                  Extra Trees Regressor  0.7685    1.1241  1.0390   -0.1301   
catboost               CatBo

#### Features = only the GenAI sentiment score

In [103]:
feature_cols = [
    "sentiment_score",
]

In [104]:
from pycaret.regression import (
    setup as reg_setup,
    compare_models as reg_compare,
    finalize_model as reg_finalize,
    predict_model as reg_predict,
)

print("\n================ REGRESSION: return_t_plus_1 ================\n")

reg_train = train_df[feature_cols + ["return_t_plus_1"]].copy()

reg_setup(
    data=reg_train,
    target="return_t_plus_1",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,    # avoid joblib multiprocessing issues on Mac
    verbose=True
)

# 🚀 AutoML over FULL model library (no include=...)
best_reg = reg_compare(sort="MAE", turbo=True)
print("Best model (Regression):", best_reg)

# ❌ No tune_model (some models have no param grid) → just finalize
final_reg = reg_finalize(best_reg)

# Evaluate on test set
X_test_reg = test_df[feature_cols].copy()
y_test_reg = test_df["return_t_plus_1"].copy()

pred_reg = reg_predict(final_reg, data=X_test_reg)
pred_col_reg = get_prediction_column(pred_reg, feature_cols)
y_pred_reg = pred_reg[pred_col_reg]

mae = mean_absolute_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
r2   = r2_score(y_test_reg, y_pred_reg)

# 🔹 Directional accuracy (up/down based on 0 threshold)
true_dir = np.where(y_test_reg > 0, 1, 0)   # 1 = up, 0 = down/flat
pred_dir = np.where(y_pred_reg > 0, 1, 0)

directional_acc = (true_dir == pred_dir).mean()

print("\nRegression – return_t_plus_1 results:")
print("Prediction column:", pred_col_reg)
print("MAE :", mae)
print("RMSE:", rmse)
print("R²  :", r2)
print("Directional Accuracy (up vs down):", directional_acc)



                    Description             Value
0                    Session id                42
1                        Target   return_t_plus_1
2                   Target type        Regression
3           Original data shape          (452, 2)
4        Transformed data shape          (452, 2)
5   Transformed train set shape          (316, 2)
6    Transformed test set shape          (136, 2)
7              Numeric features                 1
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12               Fold Generator             KFold
13                  Fold Number                10
14                     CPU Jobs                 1
15                      Use GPU             False
16               Log Experiment             False
17              Experiment Name  reg-default-name
18                          USI              c6b

                                                           

                                    Model     MAE     MSE    RMSE      R2  \
huber                     Huber Regressor  0.7159  1.0444  1.0060 -0.0660   
llar         Lasso Least Angle Regression  0.7166  1.0373  1.0001 -0.0483   
dummy                     Dummy Regressor  0.7166  1.0373  1.0001 -0.0483   
lasso                    Lasso Regression  0.7166  1.0373  1.0001 -0.0483   
en                            Elastic Net  0.7166  1.0373  1.0001 -0.0483   
br                         Bayesian Ridge  0.7190  1.0475  1.0057 -0.0611   
ridge                    Ridge Regression  0.7192  1.0444  1.0048 -0.0605   
omp           Orthogonal Matching Pursuit  0.7203  1.0461  1.0059 -0.0633   
lr                      Linear Regression  0.7203  1.0461  1.0059 -0.0633   
lar                Least Angle Regression  0.7203  1.0461  1.0059 -0.0633   
lightgbm  Light Gradient Boosting Machine  0.7583  1.1292  1.0482 -0.1617   
gbr           Gradient Boosting Regressor  0.8046  1.3016  1.1184 -0.3168   

#### Features = Sectoral sentiment

In [105]:
feature_cols = [
    "sentiment_Airlines",
    "sentiment_Automobile",
    "sentiment_Corporate",
    "sentiment_Economy",
    "sentiment_Energy",
    "sentiment_Geo-Political", 
    "sentiment_Healthcare",
    "sentiment_Technology",
    "sentiment_US Politics",
    "overall_sentiment",
]

In [106]:
from pycaret.regression import (
    setup as reg_setup,
    compare_models as reg_compare,
    finalize_model as reg_finalize,
    predict_model as reg_predict,
)

print("\n================ REGRESSION: return_t_plus_1 ================\n")

reg_train = train_df[feature_cols + ["return_t_plus_1"]].copy()

reg_setup(
    data=reg_train,
    target="return_t_plus_1",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,    # avoid joblib multiprocessing issues on Mac
    verbose=True
)

# 🚀 AutoML over FULL model library (no include=...)
best_reg = reg_compare(sort="MAE", turbo=True)
print("Best model (Regression):", best_reg)

# ❌ No tune_model (some models have no param grid) → just finalize
final_reg = reg_finalize(best_reg)

# Evaluate on test set
X_test_reg = test_df[feature_cols].copy()
y_test_reg = test_df["return_t_plus_1"].copy()

pred_reg = reg_predict(final_reg, data=X_test_reg)
pred_col_reg = get_prediction_column(pred_reg, feature_cols)
y_pred_reg = pred_reg[pred_col_reg]

mae = mean_absolute_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
r2   = r2_score(y_test_reg, y_pred_reg)

# 🔹 Directional accuracy (up/down based on 0 threshold)
true_dir = np.where(y_test_reg > 0, 1, 0)   # 1 = up, 0 = down/flat
pred_dir = np.where(y_pred_reg > 0, 1, 0)

directional_acc = (true_dir == pred_dir).mean()

print("\nRegression – return_t_plus_1 results:")
print("Prediction column:", pred_col_reg)
print("MAE :", mae)
print("RMSE:", rmse)
print("R²  :", r2)
print("Directional Accuracy (up vs down):", directional_acc)



                    Description             Value
0                    Session id                42
1                        Target   return_t_plus_1
2                   Target type        Regression
3           Original data shape         (452, 11)
4        Transformed data shape         (452, 11)
5   Transformed train set shape         (316, 11)
6    Transformed test set shape         (136, 11)
7              Numeric features                10
8                    Preprocess              True
9               Imputation type            simple
10           Numeric imputation              mean
11       Categorical imputation              mode
12               Fold Generator             KFold
13                  Fold Number                10
14                     CPU Jobs                 1
15                      Use GPU             False
16               Log Experiment             False
17              Experiment Name  reg-default-name
18                          USI              8fc

                                                           

                                    Model     MAE     MSE    RMSE      R2  \
lasso                    Lasso Regression  0.7166  1.0373  1.0001 -0.0483   
llar         Lasso Least Angle Regression  0.7166  1.0373  1.0001 -0.0483   
dummy                     Dummy Regressor  0.7166  1.0373  1.0001 -0.0483   
en                            Elastic Net  0.7166  1.0373  1.0001 -0.0483   
br                         Bayesian Ridge  0.7174  1.0387  1.0007 -0.0495   
omp           Orthogonal Matching Pursuit  0.7328  1.0719  1.0173 -0.0856   
huber                     Huber Regressor  0.7346  1.0726  1.0194 -0.0948   
ridge                    Ridge Regression  0.7563  1.0903  1.0270 -0.1096   
lr                      Linear Regression  0.7585  1.0928  1.0283 -0.1127   
lar                Least Angle Regression  0.7585  1.0928  1.0283 -0.1127   
rf                Random Forest Regressor  0.7841  1.1709  1.0668 -0.2075   
et                  Extra Trees Regressor  0.7967  1.1898  1.0743 -0.2245   

### Classification model_1 (Target = return_bucket)

#### Feature = All Sentiments

In [107]:
feature_cols = [
    "sentiment_compound_mean",
    "sentiment_compound_std",
    "sentiment_compound_min",
    "sentiment_compound_max",
    "sentiment_pos_mean",
    "sentiment_neg_mean",
    "sentiment_neu_mean",
    "sentiment_score",
    "sentiment_Airlines",
    "sentiment_Automobile",
    "sentiment_Corporate",
    "sentiment_Economy",
    "sentiment_Energy",
    "sentiment_Geo-Political", 
    "sentiment_Healthcare",
    "sentiment_Technology",
    "sentiment_US Politics",
    "overall_sentiment",
]

In [108]:
from pycaret.classification import (
    setup as clf_setup,
    compare_models as clf_compare,
    finalize_model as clf_finalize,
    predict_model as clf_predict,
)

print("\n================ CLASSIFICATION: return_bucket ================\n")

cls_train_mc = train_df[feature_cols + ["return_bucket"]].copy()

clf_setup(
    data=cls_train_mc,
    target="return_bucket",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,
    verbose=True
)

# AutoML over all available classifiers
best_mc = clf_compare(sort="Accuracy", turbo=True)
print("Best model (return_bucket):", best_mc)

final_mc = clf_finalize(best_mc)

X_test_mc = test_df[feature_cols].copy()
y_test_mc = test_df["return_bucket"].copy()

pred_mc = clf_predict(final_mc, data=X_test_mc)
pred_col_mc = get_prediction_column(pred_mc, feature_cols)
y_pred_mc = pred_mc[pred_col_mc]

print("\nMulti-class – return_bucket results:")
print("Prediction column:", pred_col_mc)
print("Accuracy:", accuracy_score(y_test_mc, y_pred_mc))
print(classification_report(y_test_mc, y_pred_mc))



                    Description  \
0                    Session id   
1                        Target   
2                   Target type   
3                Target mapping   
4           Original data shape   
5        Transformed data shape   
6   Transformed train set shape   
7    Transformed test set shape   
8              Numeric features   
9                    Preprocess   
10              Imputation type   
11           Numeric imputation   
12       Categorical imputation   
13               Fold Generator   
14                  Fold Number   
15                     CPU Jobs   
16                      Use GPU   
17               Log Experiment   
18              Experiment Name   
19                          USI   

                                               Value  
0                                                 42  
1                                      return_bucket  
2                                         Multiclass  
3   down: 0, slightly_down: 1, slightly_up

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
nb                            Naive Bayes    0.3075  0.5898  0.3075  0.3135   
et                 Extra Trees Classifier    0.2979  0.5552  0.2979  0.2863   
lda          Linear Discriminant Analysis    0.2880  0.0000  0.2880  0.2761   
xgboost         Extreme Gradient Boosting    0.2787  0.5074  0.2787  0.2816   
catboost              CatBoost Classifier    0.2787  0.5148  0.2787  0.2813   
gbc          Gradient Boosting Classifier    0.2784  0.0000  0.2784  0.2808   
rf               Random Forest Classifier    0.2783  0.5373  0.2783  0.2751   
svm                   SVM - Linear Kernel    0.2755  0.0000  0.2755  0.1773   
lightgbm  Light Gradient Boosting Machine    0.2752  0.5111  0.2752  0.2824   
dummy                    Dummy Classifier    0.2752  0.5000  0.2752  0.0759   
ridge                    Ridge Classifier    0.2689  0.0000  0.2689  0.2398   
lr                    Logistic Regression    0.2659 

#### Features = Sentiment VADER

In [109]:
feature_cols = [
    "sentiment_compound_mean",
    "sentiment_compound_std",
    "sentiment_compound_min",
    "sentiment_compound_max",
    "sentiment_pos_mean",
    "sentiment_neg_mean",
    "sentiment_neu_mean",
]

In [110]:
from pycaret.classification import (
    setup as clf_setup,
    compare_models as clf_compare,
    finalize_model as clf_finalize,
    predict_model as clf_predict,
)

print("\n================ CLASSIFICATION: return_bucket ================\n")

cls_train_mc = train_df[feature_cols + ["return_bucket"]].copy()

clf_setup(
    data=cls_train_mc,
    target="return_bucket",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,
    verbose=True
)

# AutoML over all available classifiers
best_mc = clf_compare(sort="Accuracy", turbo=True)
print("Best model (return_bucket):", best_mc)

final_mc = clf_finalize(best_mc)

X_test_mc = test_df[feature_cols].copy()
y_test_mc = test_df["return_bucket"].copy()

pred_mc = clf_predict(final_mc, data=X_test_mc)
pred_col_mc = get_prediction_column(pred_mc, feature_cols)
y_pred_mc = pred_mc[pred_col_mc]

print("\nMulti-class – return_bucket results:")
print("Prediction column:", pred_col_mc)
print("Accuracy:", accuracy_score(y_test_mc, y_pred_mc))
print(classification_report(y_test_mc, y_pred_mc))



                    Description  \
0                    Session id   
1                        Target   
2                   Target type   
3                Target mapping   
4           Original data shape   
5        Transformed data shape   
6   Transformed train set shape   
7    Transformed test set shape   
8              Numeric features   
9                    Preprocess   
10              Imputation type   
11           Numeric imputation   
12       Categorical imputation   
13               Fold Generator   
14                  Fold Number   
15                     CPU Jobs   
16                      Use GPU   
17               Log Experiment   
18              Experiment Name   
19                          USI   

                                               Value  
0                                                 42  
1                                      return_bucket  
2                                         Multiclass  
3   down: 0, slightly_down: 1, slightly_up

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.3134  0.0000  0.3134  0.1717   
knn                K Neighbors Classifier    0.3068  0.5430  0.3068  0.3091   
lda          Linear Discriminant Analysis    0.3037  0.0000  0.3037  0.3028   
et                 Extra Trees Classifier    0.2921  0.5179  0.2921  0.2853   
ada                  Ada Boost Classifier    0.2911  0.0000  0.2911  0.2941   
lightgbm  Light Gradient Boosting Machine    0.2887  0.5164  0.2887  0.2867   
ridge                    Ridge Classifier    0.2879  0.0000  0.2879  0.1943   
rf               Random Forest Classifier    0.2827  0.5212  0.2827  0.2770   
xgboost         Extreme Gradient Boosting    0.2825  0.5199  0.2825  0.2713   
svm                   SVM - Linear Kernel    0.2821  0.0000  0.2821  0.1534   
dummy                    Dummy Classifier    0.2752  0.5000  0.2752  0.0759   
catboost              CatBoost Classifier    0.2731 

#### Features = GenAI Sentiment

In [111]:
feature_cols = [
    "sentiment_score",
]

In [112]:
from pycaret.classification import (
    setup as clf_setup,
    compare_models as clf_compare,
    finalize_model as clf_finalize,
    predict_model as clf_predict,
)

print("\n================ CLASSIFICATION: return_bucket ================\n")

cls_train_mc = train_df[feature_cols + ["return_bucket"]].copy()

clf_setup(
    data=cls_train_mc,
    target="return_bucket",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,
    verbose=True
)

# AutoML over all available classifiers
best_mc = clf_compare(sort="Accuracy", turbo=True)
print("Best model (return_bucket):", best_mc)

final_mc = clf_finalize(best_mc)

X_test_mc = test_df[feature_cols].copy()
y_test_mc = test_df["return_bucket"].copy()

pred_mc = clf_predict(final_mc, data=X_test_mc)
pred_col_mc = get_prediction_column(pred_mc, feature_cols)
y_pred_mc = pred_mc[pred_col_mc]

print("\nMulti-class – return_bucket results:")
print("Prediction column:", pred_col_mc)
print("Accuracy:", accuracy_score(y_test_mc, y_pred_mc))
print(classification_report(y_test_mc, y_pred_mc))



                    Description  \
0                    Session id   
1                        Target   
2                   Target type   
3                Target mapping   
4           Original data shape   
5        Transformed data shape   
6   Transformed train set shape   
7    Transformed test set shape   
8              Numeric features   
9                    Preprocess   
10              Imputation type   
11           Numeric imputation   
12       Categorical imputation   
13               Fold Generator   
14                  Fold Number   
15                     CPU Jobs   
16                      Use GPU   
17               Log Experiment   
18              Experiment Name   
19                          USI   

                                               Value  
0                                                 42  
1                                      return_bucket  
2                                         Multiclass  
3   down: 0, slightly_down: 1, slightly_up

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
ridge                    Ridge Classifier    0.3324  0.0000  0.3324  0.1942   
lda          Linear Discriminant Analysis    0.3323  0.0000  0.3323  0.1939   
lr                    Logistic Regression    0.3258  0.0000  0.3258  0.1803   
nb                            Naive Bayes    0.2944  0.5778  0.2944  0.2358   
qda       Quadratic Discriminant Analysis    0.2944  0.0000  0.2944  0.2358   
lightgbm  Light Gradient Boosting Machine    0.2817  0.5343  0.2817  0.2911   
knn                K Neighbors Classifier    0.2789  0.5701  0.2789  0.2752   
dummy                    Dummy Classifier    0.2752  0.5000  0.2752  0.0759   
xgboost         Extreme Gradient Boosting    0.2600  0.5401  0.2600  0.2681   
gbc          Gradient Boosting Classifier    0.2598  0.0000  0.2598  0.2608   
rf               Random Forest Classifier    0.2597  0.5399  0.2597  0.2586   
catboost              CatBoost Classifier    0.2595 

#### Features = Sectoral sentiments

In [113]:
feature_cols = [
    "sentiment_Airlines",
    "sentiment_Automobile",
    "sentiment_Corporate",
    "sentiment_Economy",
    "sentiment_Energy",
    "sentiment_Geo-Political", 
    "sentiment_Healthcare",
    "sentiment_Technology",
    "sentiment_US Politics",
    "overall_sentiment",
]

In [114]:
from pycaret.classification import (
    setup as clf_setup,
    compare_models as clf_compare,
    finalize_model as clf_finalize,
    predict_model as clf_predict,
)

print("\n================ CLASSIFICATION: return_bucket ================\n")

cls_train_mc = train_df[feature_cols + ["return_bucket"]].copy()

clf_setup(
    data=cls_train_mc,
    target="return_bucket",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,
    verbose=True
)

# AutoML over all available classifiers
best_mc = clf_compare(sort="Accuracy", turbo=True)
print("Best model (return_bucket):", best_mc)

final_mc = clf_finalize(best_mc)

X_test_mc = test_df[feature_cols].copy()
y_test_mc = test_df["return_bucket"].copy()

pred_mc = clf_predict(final_mc, data=X_test_mc)
pred_col_mc = get_prediction_column(pred_mc, feature_cols)
y_pred_mc = pred_mc[pred_col_mc]

print("\nMulti-class – return_bucket results:")
print("Prediction column:", pred_col_mc)
print("Accuracy:", accuracy_score(y_test_mc, y_pred_mc))
print(classification_report(y_test_mc, y_pred_mc))



                    Description  \
0                    Session id   
1                        Target   
2                   Target type   
3                Target mapping   
4           Original data shape   
5        Transformed data shape   
6   Transformed train set shape   
7    Transformed test set shape   
8              Numeric features   
9                    Preprocess   
10              Imputation type   
11           Numeric imputation   
12       Categorical imputation   
13               Fold Generator   
14                  Fold Number   
15                     CPU Jobs   
16                      Use GPU   
17               Log Experiment   
18              Experiment Name   
19                          USI   

                                               Value  
0                                                 42  
1                                      return_bucket  
2                                         Multiclass  
3   down: 0, slightly_down: 1, slightly_up

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
rf               Random Forest Classifier    0.2973  0.5379  0.2973  0.3141   
gbc          Gradient Boosting Classifier    0.2973  0.0000  0.2973  0.3007   
qda       Quadratic Discriminant Analysis    0.2820  0.0000  0.2820  0.2951   
nb                            Naive Bayes    0.2787  0.5588  0.2787  0.2932   
catboost              CatBoost Classifier    0.2784  0.5051  0.2784  0.2842   
dummy                    Dummy Classifier    0.2752  0.5000  0.2752  0.0759   
lightgbm  Light Gradient Boosting Machine    0.2749  0.5135  0.2749  0.2619   
xgboost         Extreme Gradient Boosting    0.2693  0.5150  0.2693  0.2588   
lr                    Logistic Regression    0.2660  0.0000  0.2660  0.2397   
dt               Decision Tree Classifier    0.2660  0.5086  0.2660  0.2596   
ridge                    Ridge Classifier    0.2660  0.0000  0.2660  0.2390   
lda          Linear Discriminant Analysis    0.2565 

### Classification model_2 [Target = direction_binary]

#### Features = All sentiments

In [115]:
feature_cols = [
    "sentiment_compound_mean",
    "sentiment_compound_std",
    "sentiment_compound_min",
    "sentiment_compound_max",
    "sentiment_pos_mean",
    "sentiment_neg_mean",
    "sentiment_neu_mean",
    "sentiment_score",
    "sentiment_Airlines",
    "sentiment_Automobile",
    "sentiment_Corporate",
    "sentiment_Economy",
    "sentiment_Energy",
    "sentiment_Geo-Political", 
    "sentiment_Healthcare",
    "sentiment_Technology",
    "sentiment_US Politics",
    "overall_sentiment",
]

In [116]:
print("\n================ BINARY CLASSIFICATION: direction_binary ================\n")

cls_train_bin = train_df[feature_cols + ["direction_binary"]].copy()

clf_setup(
    data=cls_train_bin,
    target="direction_binary",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,
    verbose=True
)

best_bin = clf_compare(sort="Accuracy", turbo=True)
print("Best model (direction_binary):", best_bin)

final_bin = clf_finalize(best_bin)

X_test_bin = test_df[feature_cols].copy()
y_test_bin = test_df["direction_binary"].copy()

pred_bin = clf_predict(final_bin, data=X_test_bin)
pred_col_bin = get_prediction_column(pred_bin, feature_cols)
y_pred_bin = pred_bin[pred_col_bin]

print("\nBinary Classification – direction_binary results:")
print("Prediction column:", pred_col_bin)
print("Accuracy:", accuracy_score(y_test_bin, y_pred_bin))
print(classification_report(y_test_bin, y_pred_bin))



                    Description             Value
0                    Session id                42
1                        Target  direction_binary
2                   Target type            Binary
3                Target mapping    down: 0, up: 1
4           Original data shape         (452, 19)
5        Transformed data shape         (452, 19)
6   Transformed train set shape         (316, 19)
7    Transformed test set shape         (136, 19)
8              Numeric features                18
9                    Preprocess              True
10              Imputation type            simple
11           Numeric imputation              mean
12       Categorical imputation              mode
13               Fold Generator   StratifiedKFold
14                  Fold Number                10
15                     CPU Jobs                 1
16                      Use GPU             False
17               Log Experiment             False
18              Experiment Name  clf-default-nam

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dummy                    Dummy Classifier    0.5412  0.5000  0.5412  0.2930   
lda          Linear Discriminant Analysis    0.5344  0.5591  0.5344  0.5290   
dt               Decision Tree Classifier    0.5217  0.5153  0.5217  0.5222   
svm                   SVM - Linear Kernel    0.5123  0.5227  0.5123  0.4772   
gbc          Gradient Boosting Classifier    0.5034  0.4820  0.5034  0.4980   
qda       Quadratic Discriminant Analysis    0.4992  0.5320  0.4992  0.5129   
ada                  Ada Boost Classifier    0.4905  0.4721  0.4905  0.4895   
ridge                    Ridge Classifier    0.4845  0.5004  0.4845  0.4691   
xgboost         Extreme Gradient Boosting    0.4844  0.4585  0.4844  0.4775   
lightgbm  Light Gradient Boosting Machine    0.4843  0.4775  0.4843  0.4805   
nb                            Naive Bayes    0.4778  0.5020  0.4778  0.5001   
catboost              CatBoost Classifier    0.4777 

#### Features = VADER

In [117]:
feature_cols = [
    "sentiment_compound_mean",
    "sentiment_compound_std",
    "sentiment_compound_min",
    "sentiment_compound_max",
    "sentiment_pos_mean",
    "sentiment_neg_mean",
    "sentiment_neu_mean",
]

In [118]:
print("\n================ BINARY CLASSIFICATION: direction_binary ================\n")

cls_train_bin = train_df[feature_cols + ["direction_binary"]].copy()

clf_setup(
    data=cls_train_bin,
    target="direction_binary",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,
    verbose=True
)

best_bin = clf_compare(sort="Accuracy", turbo=True)
print("Best model (direction_binary):", best_bin)

final_bin = clf_finalize(best_bin)

X_test_bin = test_df[feature_cols].copy()
y_test_bin = test_df["direction_binary"].copy()

pred_bin = clf_predict(final_bin, data=X_test_bin)
pred_col_bin = get_prediction_column(pred_bin, feature_cols)
y_pred_bin = pred_bin[pred_col_bin]

print("\nBinary Classification – direction_binary results:")
print("Prediction column:", pred_col_bin)
print("Accuracy:", accuracy_score(y_test_bin, y_pred_bin))
print(classification_report(y_test_bin, y_pred_bin))



                    Description             Value
0                    Session id                42
1                        Target  direction_binary
2                   Target type            Binary
3                Target mapping    down: 0, up: 1
4           Original data shape          (452, 8)
5        Transformed data shape          (452, 8)
6   Transformed train set shape          (316, 8)
7    Transformed test set shape          (136, 8)
8              Numeric features                 7
9                    Preprocess              True
10              Imputation type            simple
11           Numeric imputation              mean
12       Categorical imputation              mode
13               Fold Generator   StratifiedKFold
14                  Fold Number                10
15                     CPU Jobs                 1
16                      Use GPU             False
17               Log Experiment             False
18              Experiment Name  clf-default-nam

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
lr                    Logistic Regression    0.5412  0.4809  0.5412  0.2930   
dummy                    Dummy Classifier    0.5412  0.5000  0.5412  0.2930   
ridge                    Ridge Classifier    0.5319  0.5023  0.5319  0.4823   
lda          Linear Discriminant Analysis    0.5098  0.5308  0.5098  0.4978   
qda       Quadratic Discriminant Analysis    0.5062  0.5291  0.5062  0.5436   
svm                   SVM - Linear Kernel    0.5030  0.4994  0.5030  0.3314   
ada                  Ada Boost Classifier    0.4808  0.4564  0.4808  0.4762   
dt               Decision Tree Classifier    0.4781  0.4751  0.4781  0.4787   
nb                            Naive Bayes    0.4779  0.5179  0.4779  0.5057   
gbc          Gradient Boosting Classifier    0.4751  0.4539  0.4751  0.4699   
rf               Random Forest Classifier    0.4622  0.4347  0.4622  0.4563   
knn                K Neighbors Classifier    0.4531 

#### Features = GenAI sentiment

In [119]:
feature_cols = [
    "sentiment_score",
]

In [120]:
print("\n================ BINARY CLASSIFICATION: direction_binary ================\n")

cls_train_bin = train_df[feature_cols + ["direction_binary"]].copy()

clf_setup(
    data=cls_train_bin,
    target="direction_binary",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,
    verbose=True
)

best_bin = clf_compare(sort="Accuracy", turbo=True)
print("Best model (direction_binary):", best_bin)

final_bin = clf_finalize(best_bin)

X_test_bin = test_df[feature_cols].copy()
y_test_bin = test_df["direction_binary"].copy()

pred_bin = clf_predict(final_bin, data=X_test_bin)
pred_col_bin = get_prediction_column(pred_bin, feature_cols)
y_pred_bin = pred_bin[pred_col_bin]

print("\nBinary Classification – direction_binary results:")
print("Prediction column:", pred_col_bin)
print("Accuracy:", accuracy_score(y_test_bin, y_pred_bin))
print(classification_report(y_test_bin, y_pred_bin))



                    Description             Value
0                    Session id                42
1                        Target  direction_binary
2                   Target type            Binary
3                Target mapping    down: 0, up: 1
4           Original data shape          (452, 2)
5        Transformed data shape          (452, 2)
6   Transformed train set shape          (316, 2)
7    Transformed test set shape          (136, 2)
8              Numeric features                 1
9                    Preprocess              True
10              Imputation type            simple
11           Numeric imputation              mean
12       Categorical imputation              mode
13               Fold Generator   StratifiedKFold
14                  Fold Number                10
15                     CPU Jobs                 1
16                      Use GPU             False
17               Log Experiment             False
18              Experiment Name  clf-default-nam

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
gbc          Gradient Boosting Classifier    0.5636  0.5711  0.5636  0.5591   
et                 Extra Trees Classifier    0.5538  0.5429  0.5538  0.5531   
knn                K Neighbors Classifier    0.5508  0.5495  0.5508  0.5478   
ridge                    Ridge Classifier    0.5413  0.5371  0.5413  0.3573   
dummy                    Dummy Classifier    0.5412  0.5000  0.5412  0.2930   
rf               Random Forest Classifier    0.5382  0.5677  0.5382  0.5334   
lr                    Logistic Regression    0.5381  0.5371  0.5381  0.2922   
dt               Decision Tree Classifier    0.5381  0.5482  0.5381  0.5369   
lda          Linear Discriminant Analysis    0.5381  0.5371  0.5381  0.3565   
xgboost         Extreme Gradient Boosting    0.5316  0.5453  0.5316  0.5305   
svm                   SVM - Linear Kernel    0.5253  0.5114  0.5253  0.3044   
ada                  Ada Boost Classifier    0.5220 

#### Features = Sectoral Sentiment

In [121]:
feature_cols = [
    "sentiment_Airlines",
    "sentiment_Automobile",
    "sentiment_Corporate",
    "sentiment_Economy",
    "sentiment_Energy",
    "sentiment_Geo-Political", 
    "sentiment_Healthcare",
    "sentiment_Technology",
    "sentiment_US Politics",
    "overall_sentiment",
]

In [122]:
print("\n================ BINARY CLASSIFICATION: direction_binary ================\n")

cls_train_bin = train_df[feature_cols + ["direction_binary"]].copy()

clf_setup(
    data=cls_train_bin,
    target="direction_binary",
    session_id=42,
    html=False,
    log_experiment=False,
    n_jobs=1,
    verbose=True
)

best_bin = clf_compare(sort="Accuracy", turbo=True)
print("Best model (direction_binary):", best_bin)

final_bin = clf_finalize(best_bin)

X_test_bin = test_df[feature_cols].copy()
y_test_bin = test_df["direction_binary"].copy()

pred_bin = clf_predict(final_bin, data=X_test_bin)
pred_col_bin = get_prediction_column(pred_bin, feature_cols)
y_pred_bin = pred_bin[pred_col_bin]

print("\nBinary Classification – direction_binary results:")
print("Prediction column:", pred_col_bin)
print("Accuracy:", accuracy_score(y_test_bin, y_pred_bin))
print(classification_report(y_test_bin, y_pred_bin))



                    Description             Value
0                    Session id                42
1                        Target  direction_binary
2                   Target type            Binary
3                Target mapping    down: 0, up: 1
4           Original data shape         (452, 11)
5        Transformed data shape         (452, 11)
6   Transformed train set shape         (316, 11)
7    Transformed test set shape         (136, 11)
8              Numeric features                10
9                    Preprocess              True
10              Imputation type            simple
11           Numeric imputation              mean
12       Categorical imputation              mode
13               Fold Generator   StratifiedKFold
14                  Fold Number                10
15                     CPU Jobs                 1
16                      Use GPU             False
17               Log Experiment             False
18              Experiment Name  clf-default-nam

                                                           

                                    Model  Accuracy     AUC  Recall   Prec.  \
dummy                    Dummy Classifier    0.5412  0.5000  0.5412  0.2930   
xgboost         Extreme Gradient Boosting    0.5161  0.5059  0.5161  0.5148   
rf               Random Forest Classifier    0.5157  0.4859  0.5157  0.5076   
gbc          Gradient Boosting Classifier    0.5128  0.4870  0.5128  0.5146   
ada                  Ada Boost Classifier    0.5041  0.4914  0.5041  0.5019   
svm                   SVM - Linear Kernel    0.5004  0.4564  0.5004  0.4274   
lightgbm  Light Gradient Boosting Machine    0.4939  0.4945  0.4939  0.4924   
catboost              CatBoost Classifier    0.4907  0.5101  0.4907  0.4831   
lr                    Logistic Regression    0.4878  0.4568  0.4878  0.4943   
ridge                    Ridge Classifier    0.4847  0.4571  0.4847  0.4752   
lda          Linear Discriminant Analysis    0.4847  0.4538  0.4847  0.4755   
et                 Extra Trees Classifier    0.4782 