In [None]:
from transformers import pipeline, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources this notebook relies on, in the same order as before.
# 'punkt' / 'punkt_tab' are presumably what word_tokenize needs; 'wordnet' and
# 'omw-1.4' are not referenced by the cells visible here — TODO confirm they are
# still required further down.
for _nltk_resource in ('punkt', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(_nltk_resource)



In [None]:
# Generate synthetic dataset with 20+ data points
def generate_data(n_rows=20, seed=42, start_date="2025-02-10"):
    """Build a list of synthetic daily stock/news records.

    Parameters
    ----------
    n_rows : int, default 20
        Number of records (one per consecutive calendar day) to generate.
    seed : int, default 42
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* generator, exactly as the original implementation did.
    start_date : str, default "2025-02-10"
        Date of the first record; any value ``pd.date_range`` accepts.

    Returns
    -------
    list[dict]
        One dict per day with the keys ``Date`` (``YYYY-MM-DD`` string),
        ``Ticker``, ``Open``, ``High``, ``Low``, ``Close``, ``Volume``,
        ``News_Headline`` and ``Sentiment_Score``.

    Notes
    -----
    * Dates come from ``pd.date_range`` so they roll over month boundaries
      correctly; the previous f-string arithmetic produced ``2025-02-29``.
    * Four prices are still drawn per row in the original order, but ``High``
      and ``Low`` are now the max/min of those draws so every row satisfies
      ``Low <= Open, Close <= High``. ``Open``, ``Close`` and all other
      fields keep the values the original code produced for the same seed.
    """
    tickers = ["AAPL", "GOOGL", "MSFT", "TSLA", "AMZN"]
    headlines = [
        "Apple announces record profits in Q1.", "Google under investigation for antitrust violations.",
        "Microsoft expands cloud computing capabilities.", "Tesla reveals new AI-powered autopilot system.",
        "Amazon launches drone-based delivery service.", "Stock market crashes amid global recession fears.",
        "Federal Reserve announces interest rate hikes.", "Cryptocurrency markets hit all-time high.",
        "Tech industry layoffs increase amid economic downturn.", "Nasdaq reaches record highs.",
        "Apple plans to invest $100 billion in AI research.", "Google unveils a new quantum computing breakthrough.",
        "Microsoft teams up with OpenAI for advanced AI models.", "Tesla stock soars after strong earnings report.",
        "Amazon struggles with logistics disruptions in supply chain.", "Stock market sees biggest drop in five years.",
        "Economic recovery signals boost in stock prices.", "US government imposes new tech regulations.",
        "AI-driven automation expected to impact job markets.", "Investors optimistic about market recovery."
    ]
    np.random.seed(seed)
    # Real calendar dates, formatted the same way the original strings were.
    dates = pd.date_range(start=start_date, periods=n_rows, freq="D").strftime("%Y-%m-%d")

    data = []
    for day in dates:
        # Draw order (ticker, four prices, volume, headline, sentiment) matches
        # the original so the random stream is consumed identically.
        ticker = np.random.choice(tickers)
        open_px = np.random.uniform(100, 3000)
        high_draw = np.random.uniform(100, 3000)
        low_draw = np.random.uniform(100, 3000)
        close_px = np.random.uniform(100, 3000)
        prices = (open_px, high_draw, low_draw, close_px)
        data.append({
            "Date": day,
            "Ticker": ticker,
            "Open": round(open_px, 2),
            # High/Low bracket all four draws so the OHLC row is coherent.
            "High": round(max(prices), 2),
            "Low": round(min(prices), 2),
            "Close": round(close_px, 2),
            "Volume": np.random.randint(1000000, 50000000),
            "News_Headline": np.random.choice(headlines),
            "Sentiment_Score": round(np.random.uniform(-2, 2), 2)
        })
    return data

# Load the synthetic records into a DataFrame (one row per generated record).
news_df = pd.DataFrame(data=generate_data())


In [None]:
# Data Cleaning and Preprocessing
## Word-level tokenization of each headline with NLTK
news_df['Tokenized_News'] = news_df['News_Headline'].apply(word_tokenize)

## Subword tokenization with the FinBERT tokenizer
## NOTE(review): FinBERT is BERT-based, so this is presumably WordPiece rather
## than BPE; the 'BPE_Tokenized_News' column name is kept so downstream code
## that reads it keeps working.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
news_df['BPE_Tokenized_News'] = news_df['News_Headline'].apply(tokenizer.tokenize)

## Handling Categorical Data (One-Hot Encoding for Ticker)
## Guarded so the cell can be re-run: once 'Ticker' has been replaced by its
## one-hot columns there is nothing left to encode, and the previously fitted
## encoder / encoded frame are left in place.
if 'Ticker' in news_df.columns:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    ticker_encoded = one_hot_encoder.fit_transform(news_df[['Ticker']])
    ticker_encoded_df = pd.DataFrame(
        ticker_encoded,
        columns=one_hot_encoder.get_feature_names_out(['Ticker']),
        # .join aligns on index, so carry news_df's index instead of relying on
        # both frames happening to share a default RangeIndex.
        index=news_df.index,
    )
    news_df = news_df.drop(columns=['Ticker']).join(ticker_encoded_df)

## Standardizing and Normalizing Numerical Data
## NOTE(review): both scalers are fit on the full frame before the
## train/validation split, so validation rows influence the fitted statistics.
## The min-max pass runs last and overwrites the standardized values, so the
## stored columns end up min-max scaled.
scaler = StandardScaler()
numerical_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Sentiment_Score']
news_df[numerical_columns] = scaler.fit_transform(news_df[numerical_columns])

minmax_scaler = MinMaxScaler()
news_df[numerical_columns] = minmax_scaler.fit_transform(news_df[numerical_columns])


In [None]:


# Splitting dataset into training and validation sets (80/20, fixed seed).
# Each partition is copied explicitly so later cells can add columns to
# train_df / val_df without pandas emitting SettingWithCopyWarning, and
# without any ambiguity about whether news_df is affected.
train_df, val_df = (
    part.copy()
    for part in train_test_split(news_df, test_size=0.2, random_state=42)
)



In [None]:
# Build a Hugging Face "sentiment-analysis" pipeline backed by the
# ProsusAI/finbert checkpoint (a finance-tuned model).
sentiment_pipeline = pipeline(task="sentiment-analysis", model="ProsusAI/finbert")

In [None]:
# Predict sentiment using finance-tuned model
def classify_sentiment(text):
    """Return the sentiment label predicted for ``text``.

    Runs ``text`` through the module-level ``sentiment_pipeline`` and returns
    the ``'label'`` entry of the first prediction in its result.
    """
    return sentiment_pipeline(text)[0]['label']

# Apply classification to training and validation sets.
# .assign returns a new frame instead of writing into the split results in
# place, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
train_df = train_df.assign(
    Predicted_Sentiment=train_df['News_Headline'].apply(classify_sentiment)
)
val_df = val_df.assign(
    Predicted_Sentiment=val_df['News_Headline'].apply(classify_sentiment)
)

# Display a frame that actually carries the new column: news_df never receives
# 'Predicted_Sentiment', so showing it would hide the result of this cell.
train_df.head()