In [18]:
import re

import pandas as pd
from sklearn.utils import resample

# Toy corpus: every entry pairs a headline-style sentence with its sentiment
# label. Finance-related and unrelated sentences are deliberately mixed so
# the later filtering step has something to remove.
labelled_examples = [
    ("The stock market rallied today, with equities up 2%.", "bullish"),
    ("The Federal Reserve announced new policies impacting bond yields.", "bullish"),
    ("Company earnings reports were mixed, causing uncertainty in the market.", "bearish"),
    ("Investment banks report increased profits amid robust market conditions.", "bullish"),
    ("Local weather update: sunny and warm.", "neutral"),
    ("Sports update: The local team won their game.", "neutral"),
    ("Global financial markets experienced volatility due to geopolitical tensions.", "bearish"),
    ("New tech stocks are attracting investors despite market downturn.", "bullish"),
    ("Economic forecast indicates growth in the tech sector.", "neutral"),
    ("Non-financial content that is irrelevant.", "neutral"),
]

# Split the pairs into the column-oriented mapping used to build the frame.
texts, labels = zip(*labelled_examples)
data = {"text": list(texts), "sentiment": list(labels)}

# Build the raw frame, then show it together with its label frequencies.
df_raw = pd.DataFrame(data)
print("Original Dataset:", df_raw, sep="\n")
print("\nClass Distribution (Raw):", df_raw["sentiment"].value_counts(), sep="\n")

# Step: remove irrelevant (non-financial) content.
# List of keywords / word stems whose presence marks a text as financial.
# Entries are looked up as substrings of the lower-cased text, so a stem such
# as "econom" covers "economic" and "economy".
# NOTE: "equity" is not a substring of the plural "equities".
finance_keywords = [
    "stock",
    "market",
    "equity",
    "bond",
    "investment",
    "earnings",
    "financial",
    "fed",
    "econom",
    "tech",
]

# Matches explicitly negated terms such as "non-financial" or "nonfinancial".
# They contain a finance keyword but signal the opposite meaning, so they are
# blanked out before keyword matching.
_NEGATED_TERM = re.compile(r"\bnon-?\w+")


def is_financial(text, keywords):
    """Return True if ``text`` mentions at least one finance keyword.

    Matching is case-insensitive and substring based, so a stem such as
    "econom" matches both "economic" and "economy". Words carrying a "non"
    prefix (e.g. "non-financial") are ignored, because they would otherwise
    match the very keyword they negate.

    Args:
        text: The text to inspect. Anything that is not a string (for
            example None or a NaN from a missing cell) counts as not
            financial instead of raising.
        keywords: Iterable of keyword strings or stems to look for.

    Returns:
        bool: True when any keyword occurs in the text, else False.
    """
    if not isinstance(text, str):
        return False
    # Blank out negated terms first so "non-financial" cannot match "financial".
    searchable = _NEGATED_TERM.sub(" ", text.lower())
    # Lower-case the keywords too; the text has already been lower-cased.
    return any(keyword.lower() in searchable for keyword in keywords)

# Keep only the rows whose text mentions at least one finance keyword.
# The boolean mask is built first; the explicit .copy() makes df_financial an
# independent frame, so later in-place edits to it are unambiguous and do not
# raise pandas' SettingWithCopyWarning.
is_relevant = df_raw["text"].apply(is_financial, args=(finance_keywords,))
df_financial = df_raw[is_relevant].copy()
print("\nDataset After Removing Irrelevant Content:")
print(df_financial)
# Label frequencies of the rows that survived the filter.
print("\nAfter Filtering:")
print(df_financial["sentiment"].value_counts())

# Balance the class distribution: downsample every sentiment class to the
# size of the rarest class.
sentiment_col = df_financial["sentiment"]
min_count = sentiment_col.value_counts().min()

# One equally sized subset per class, drawn without replacement. The fixed
# random_state makes the draw reproducible.
balanced_dfs = [
    resample(
        df_financial[sentiment_col == label],
        replace=False,
        n_samples=min_count,
        random_state=42,
    )
    for label in sentiment_col.unique()
]

# Stack the per-class subsets into one frame and report the outcome.
df_balanced = pd.concat(balanced_dfs)
print("\nBalanced Dataset:", df_balanced, sep="\n")
print(
    "\nClass Distribution (Balanced):",
    df_balanced["sentiment"].value_counts(),
    sep="\n",
)



Original Dataset:
                                                text sentiment
0  The stock market rallied today, with equities ...   bullish
1  The Federal Reserve announced new policies imp...   bullish
2  Company earnings reports were mixed, causing u...   bearish
3  Investment banks report increased profits amid...   bullish
4              Local weather update: sunny and warm.   neutral
5      Sports update: The local team won their game.   neutral
6  Global financial markets experienced volatilit...   bearish
7  New tech stocks are attracting investors despi...   bullish
8  Economic forecast indicates growth in the tech...   neutral
9          Non-financial content that is irrelevant.   neutral

Class Distribution (Raw):
sentiment
bullish    4
neutral    4
bearish    2
Name: count, dtype: int64

Dataset After Removing Irrelevant Content:
                                                text sentiment
0  The stock market rallied today, with equities ...   bullish
1  The Federal Re