In [1]:
import pandas as pd
from transformers import pipeline

# Small demo corpus: finance-related sentences interleaved with unrelated ones
sample_texts = [
    "The stock market rallied today with equities up 2%.",
    "Local weather update: sunny and warm.",
    "Investment banks report increased profits amid robust market conditions.",
    "Sports update: The local team won their game.",
    "Economic forecast indicates growth in the tech sector.",
    "Non-financial content: a recipe for apple pie.",
]
data = {"text": sample_texts}

# Load the corpus into a single-column ("text") DataFrame
df = pd.DataFrame(data=data)


# Build the zero-shot classification pipeline backed by the
# facebook/bart-large-mnli checkpoint (fetched on first use).
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
)


def is_financial(text: str, threshold: float = 0.5) -> bool:
    """Use zero-shot classification to decide whether *text* is financial.

    The text is scored against the candidate labels "financial" and
    "non-financial" with the module-level ``classifier`` pipeline. It is
    considered financial when the first (top) label returned is
    "financial" and that label's score is at least ``threshold``.

    Args:
        text: The sentence or passage to classify.
        threshold: Minimum score the top label must reach (inclusive).

    Returns:
        True if the top label is "financial" with a score >= ``threshold``,
        otherwise False.
    """
    candidate_labels = ["financial", "non-financial"]

    # The classifier returns a dict with keys "labels" and "scores";
    # index 0 of each is used here as the top prediction and its score.
    result = classifier(text, candidate_labels)
    top_label = result["labels"][0]
    top_score = result["scores"][0]

    # NOTE(review): with only two candidate labels, if the pipeline
    # normalizes scores to sum to 1 the top score can never fall below 0.5,
    # which would make the default threshold a no-op -- confirm against the
    # pipeline documentation and raise the default if a real cut-off is wanted.
    return top_label == "financial" and top_score >= threshold

# Score every row: True where the zero-shot check labels the text financial
df["is_financial"] = df["text"].apply(is_financial)

# Keep only the rows flagged as financial and renumber them from zero
financial_mask = df["is_financial"]
df_filtered = df.loc[financial_mask].reset_index(drop=True)

# Show just the text column of the surviving rows
print("\nFiltered Dataset ")
print(df_filtered[["text"]])


Original Dataset:
                                                text
0  The stock market rallied today with equities u...
1              Local weather update: sunny and warm.
2  Investment banks report increased profits amid...
3      Sports update: The local team won their game.
4  Economic forecast indicates growth in the tech...
5     Non-financial content: a recipe for apple pie.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0



Filtered Dataset (Only Financial Content):
                                                text
0  The stock market rallied today with equities u...
1  Investment banks report increased profits amid...
