Do not forget to `pip install` the required packages, as mentioned in the previous chapters.

In [4]:
from transformers import pipeline, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases load
# the 'punkt' resource; recent releases look up 'punkt_tab' instead and raise
# LookupError without it, so fetch both to keep the notebook runnable either way.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
# Sample dataset: each record pairs one ticker's daily OHLCV figures with a news
# headline and a numeric sentiment score.
# NOTE(review): AAPL appears twice with Date "2025-02-27" — confirm whether the
# second record's date is intended.
record_fields = (
    "Date", "Ticker", "Open", "High", "Low", "Close", "Volume",
    "News_Headline", "Sentiment_Score",
)
record_rows = [
    ("2025-02-27", "AAPL", 150.2, 155.3, 148.5, 154.1, 5000000,
     "Apple sees record revenue growth amid strong iPhone sales.", 1.5),
    ("2025-02-14", "GOOGL", 2700.5, 2750.0, 2680.2, 2725.3, 3000000,
     "Google faces antitrust lawsuit, market reacts negatively.", -1.2),
    ("2025-02-16", "MSFT", 320.1, 330.5, 318.2, 328.9, 4000000,
     "Microsoft launches AI-powered cloud service, boosting stock.", 1.2),
    ("2025-02-27", "AAPL", 151.0, 152.7, 149.9, 151.8, 5200000,
     "Market uncertainty rises as inflation concerns dominate.", -0.8),
    ("2025-02-15", "GOOGL", 2695.3, 2705.7, 2678.9, 2682.4, 2800000,
     "Tech stocks slump as interest rates increase.", -1.5),
    ("2025-02-20", "MSFT", 322.8, 326.0, 320.5, 324.2, 3800000,
     "Investors optimistic after Fed signals rate cuts.", 1.8),
]
# Zip every row against the shared field names to get one dict per record.
combined_data = [dict(zip(record_fields, row)) for row in record_rows]


In [None]:
# Convert to DataFrame
news_df = pd.DataFrame(combined_data)

# Data Cleaning and Preprocessing
## Word-level tokenization (NLTK)
news_df['Tokenized_News'] = news_df['News_Headline'].apply(word_tokenize)

## Subword tokenization with the FinBERT tokenizer
# FinBERT is a BERT model, so its tokenizer splits words with WordPiece
# (continuation pieces are prefixed with "##"), not BPE. The
# 'BPE_Tokenized_News' column name is kept unchanged for compatibility.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
news_df['BPE_Tokenized_News'] = news_df['News_Headline'].apply(tokenizer.tokenize)

In [None]:
## Handling Categorical Data (One-Hot Encoding for Ticker)
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so prefer the new keyword and fall back to the
# old one only on releases that predate the rename.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Dense array with one indicator column per distinct ticker.
ticker_encoded = one_hot_encoder.fit_transform(news_df[['Ticker']])
ticker_encoded_df = pd.DataFrame(
    ticker_encoded,
    columns=one_hot_encoder.get_feature_names_out(['Ticker']),
    index=news_df.index,  # join() aligns on index, so reuse the source frame's labels
)

# Swap the raw 'Ticker' column for its indicator columns.
news_df = news_df.drop(columns=['Ticker']).join(ticker_encoded_df)

## Standardizing and Normalizing Numerical Data
numerical_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Sentiment_Score']
scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

# Apply the two scalers one after the other, overwriting the same columns each
# time: the min-max pass runs on the already-standardized values, so only its
# output remains in news_df afterwards.
for fitted_step in (scaler, minmax_scaler):
    news_df[numerical_columns] = fitted_step.fit_transform(news_df[numerical_columns])


In [None]:
# Build a Hugging Face sentiment-analysis pipeline on top of the pre-trained
# ProsusAI/finbert checkpoint.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="ProsusAI/finbert",
)

# Predict sentiment using finance-tuned model
def classify_sentiment(text):
    """Return the 'label' of the first prediction `sentiment_pipeline` gives for `text`."""
    return sentiment_pipeline(text)[0]['label']

# Label every headline with classify_sentiment and store the result in a new column
news_df['Predicted_Sentiment'] = news_df['News_Headline'].map(classify_sentiment)

# Preview the first rows, including the new column
news_df.head()