In [1]:
from datasets import load_dataset
import pandas as pd

# Fetch the Twitter financial-news sentiment corpus (from Hugging Face)
dataset = load_dataset('zeroshot/twitter-financial-news-sentiment')
# Show which splits the download provides
print(dataset.keys())

# Pull the two splits out by name
train_dataset, validation_dataset = dataset['train'], dataset['validation']

# Materialise both splits (not just the training one) as pandas DataFrames
df_train, df_validation = (
    pd.DataFrame(split) for split in (train_dataset, validation_dataset)
)


dict_keys(['train', 'validation'])


In [2]:
# Quick look at the training frame: first rows, column names, class balance
print(
    df_train.head(),
    df_train.columns,
    df_train['label'].value_counts(),
    sep='\n',
)

                                                text  label
0  $BYND - JPMorgan reels in expectations on Beyo...      0
1  $CCL $RCL - Nomura points to bookings weakness...      0
2  $CX - Cemex cut at Credit Suisse, J.P. Morgan ...      0
3  $ESS: BTIG Research cuts to Neutral https://t....      0
4  $FNKO - Funko slides after Piper Jaffray PT cu...      0
Index(['text', 'label'], dtype='object')
label
2    6178
1    1923
0    1442
Name: count, dtype: int64


In [3]:
import re
import nltk
from nltk.corpus import stopwords

# The stop-word list ships as a separate NLTK data package. Fetch it on first
# use so the notebook also runs on a machine where it was never downloaded
# (otherwise stopwords.words() raises LookupError).
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

def preprocess_text(text, stopword_set=None):
    """Reduce a tweet to a lowercase, space-separated string of content words.

    Processing steps, in order:
      1. drop URLs (``http...``), ``@mentions`` and ``#hashtags``;
      2. split on whitespace;
      3. drop every token that is a stop word, checked while its apostrophe
         is still present so contractions such as "don't" or "we'll" match
         the stop-word list instead of surviving as "dont" / "well";
      4. strip everything except ASCII letters from the remaining tokens and
         lowercase them; tokens that end up empty are discarded.

    Args:
        text: Raw tweet text.
        stopword_set: Optional collection of lowercase stop words. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text as a single string; an empty string if nothing is left.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Remove URLs, mentions, hashtags
    text = re.sub(r'http\S+|@\S+|#\S+', '', text)
    # Tweets often carry a typographic apostrophe; treat it like the ASCII one
    text = text.replace('\u2019', "'")

    kept_words = []
    for token in text.split():
        # Letters plus apostrophes only, lowercased: the form stop-word lists use
        with_apostrophe = re.sub(r"[^A-Za-z']", '', token).lower()
        # Surrounding quotes ('the') must not hide a stop word
        if with_apostrophe.strip("'") in stopword_set:
            continue
        # Remove the remaining special characters (apostrophes inside the word)
        word = with_apostrophe.replace("'", '')
        if word and word not in stopword_set:
            kept_words.append(word)
    return ' '.join(kept_words)


In [4]:
# Add a cleaned copy of the tweet text to both the training and validation frames
df_train['clean_text'] = df_train['text'].map(preprocess_text)
df_validation['clean_text'] = df_validation['text'].map(preprocess_text)

# Spot-check the first few cleaned training tweets
print(df_train['clean_text'].head())

0         bynd jpmorgan reels expectations beyond meat
1    ccl rcl nomura points bookings weakness carniv...
2    cx cemex cut credit suisse jp morgan weak buil...
3                       ess btig research cuts neutral
4               fnko funko slides piper jaffray pt cut
Name: clean_text, dtype: object


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# The vocabulary and IDF weights are learned from the training text only (first
# element); the validation text is then projected with that fitted vectorizer.
X_train_tfidf, X_validation_tfidf = (
    vectorizer.fit_transform(df_train['clean_text']),
    vectorizer.transform(df_validation['clean_text']),
)


In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
# Fit the encoder on the training labels (first element), then reuse the same
# fitted mapping for the validation labels.
y_train, y_validation = (
    label_encoder.fit_transform(df_train['label']),
    label_encoder.transform(df_validation['label']),
)


In [7]:
from sklearn.svm import SVC
# Support Vector Machine with a linear kernel. Note: for more than two classes,
# scikit-learn's SVC trains one-vs-one classifiers internally.
svm_classifier = SVC(
    kernel='linear',
    class_weight='balanced',  # weight classes inversely to their frequency to offset the class imbalance
)
svm_classifier.fit(X_train_tfidf, y_train)



In [8]:
# Predict the encoded sentiment class of every validation tweet
y_pred = svm_classifier.predict(X_validation_tfidf)

# Metrics
from sklearn.metrics import classification_report, accuracy_score

# Take the class list from the fitted encoder rather than from whichever labels
# happen to occur in y_validation: a class that is missing from the validation
# split, or that shows up only in the predictions, would otherwise leave
# target_names out of step with the classes being reported.
n_classes = len(label_encoder.classes_)
# Use numerical (encoded) labels as strings
target_names = [str(class_id) for class_id in range(n_classes)]

print("Accuracy:", accuracy_score(y_validation, y_pred))
print(classification_report(
    y_validation,
    y_pred,
    labels=list(range(n_classes)),  # pin row i of the report to encoded class i
    target_names=target_names,
))

Accuracy: 0.7948073701842546
              precision    recall  f1-score   support

           0       0.56      0.69      0.62       347
           1       0.69      0.72      0.71       475
           2       0.90      0.84      0.87      1566

    accuracy                           0.79      2388
   macro avg       0.72      0.75      0.73      2388
weighted avg       0.81      0.79      0.80      2388



In [10]:
#CROSS VALIDATION 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import classification_report

# Prepare data
X = df_train['text']  # Feature data: raw (uncleaned) text from training set
y = df_train['label']  # Labels: sentiment classes

# Define the pipeline. The vectorizer lives inside the pipeline so that
# cross_validate re-fits it on the training folds of every split.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Step 1: Convert text to TF-IDF features
    ('svm', SVC(kernel='linear', class_weight='balanced'))  # Step 2: Train a linear SVM classifier
])

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # Stratified 5-fold cross-validation, seeded for repeatable folds

# Cross-validation with detailed metrics
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']  # Evaluation metrics
cv_results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring)  # Perform cross-validation

print("Cross-validation results:")
for metric in scoring:
    scores = cv_results['test_' + metric]
    print(f"{metric}: {scores.mean():.3f} (+/- {scores.std():.3f})")  # Print mean and std for each metric

# Fit the pipeline on the entire training data
pipeline.fit(X, y)  # Train the pipeline on the full training set

# Prepare validation data
# NOTE: this rebinds y_validation (and y_pred below), which the earlier SVM
# evaluation cell also assigns.
X_validation = df_validation['text']  # Feature data: raw text from validation set
y_validation = df_validation['label']  # Labels: sentiment classes from validation set

# Predict on validation data
y_pred = pipeline.predict(X_validation)  # Predict labels for validation data

# Explicit class-id -> display-name mapping, so each report row is tied to its
# class id instead of relying on the positional order of the names.
label_names = {0: 'negative', 1: 'positive', 2: 'neutral'}

# Classification report on validation set
print("Classification report on validation set:")
print(classification_report(
    y_validation,
    y_pred,
    labels=list(label_names),
    target_names=list(label_names.values()),
))  # Print detailed classification metrics


Cross-validation results:
accuracy: 0.785 (+/- 0.006)
precision_macro: 0.707 (+/- 0.005)
recall_macro: 0.730 (+/- 0.002)
f1_macro: 0.717 (+/- 0.003)
Classification report on validation set:
              precision    recall  f1-score   support

    negative       0.58      0.71      0.64       347
    positive       0.68      0.74      0.71       475
     neutral       0.91      0.84      0.88      1566

    accuracy                           0.80      2388
   macro avg       0.73      0.76      0.74      2388
weighted avg       0.82      0.80      0.81      2388

