> The goal of this notebook is to explore three different NLP techniques in order to better predict the tone of our news data. To do so, we selected models ranging from simple baselines to state-of-the-art techniques:
- FinBERT
- Vader
- TF-IDF

# Imports and data loading

In [None]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, recall_score, f1_score, classification_report
from NowcastingEco import NowcastingEco
from tqdm import tqdm

# Read the annotated headlines workbook from the working directory and
# preview the first rows to confirm the columns were parsed.
finance_news = pd.read_excel(io='new_annotated_articles2.xlsx')
finance_news.head()

In [None]:
# Check that every headline is an actual string and count the missing ones.
# A dtype of `object` only says the column stores Python objects: NaN or
# numeric cells also produce an object column, so test each value instead.
headline_strings_only = bool(
    finance_news['headline'].map(lambda value: isinstance(value, str)).all()
)
print("Headlines contains only strings:", headline_strings_only)
print("Nb of empty headlines:", finance_news['headline'].isnull().sum())

# FinBERT

In [None]:
!pip install transformers

In [None]:
# Load data. Rows without a headline or a label are dropped because the
# tokenizer only accepts strings and the loss needs an integer class id.
df = (
    pd.read_excel('new_annotated_articles2.xlsx')
    .dropna(subset=['headline', 'annotated_tone'])
    .assign(
        headline=lambda frame: frame['headline'].astype(str),
        # Replace -1 values with 2 in 'annotated_tone' so every label is a
        # non-negative class index, then force an integer dtype.
        # NOTE(review): the published ProsusAI/finbert config appears to map
        # 0=positive, 1=negative, 2=neutral, which differs from the ids used
        # here - confirm the intended mapping before comparing with the
        # pretrained head. It is left unchanged because the saved model and
        # the later prediction cells rely on these ids.
        annotated_tone=lambda frame: frame['annotated_tone'].replace(-1, 2).astype(int),
    )
)

# Split data into training and testing sets. The seed makes the reported
# accuracy reproducible and matches the split used by the TF-IDF cells.
train_text, test_text, train_labels, test_labels = train_test_split(
    df['headline'], df['annotated_tone'], test_size=0.2, random_state=42
)

# Initialize the finBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Tokenize the data (padded to the longest headline, truncated at 512 tokens)
train_encodings = tokenizer(train_text.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_text.tolist(), truncation=True, padding=True, max_length=512)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Move the model to the selected device
model = model.to(device)

# Create a PyTorch Dataset
class FinSentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence;
    `labels` is a sequence with one entry per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is driven by the labels sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of sample `idx` to a tensor, then
        # append the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Create DataLoader
train_dataset = FinSentimentDataset(train_encodings, train_labels.tolist())
test_dataset = FinSentimentDataset(test_encodings, test_labels.tolist())
train_loader = DataLoader(train_dataset, batch_size=40, shuffle=True)
# Accuracy does not depend on batch order, so the test set is not shuffled.
test_loader = DataLoader(test_dataset, batch_size=40, shuffle=False)

# Initialize optimizer
# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
# `torch.optim.AdamW`; switching also requires updating the import cell.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model
model.train()
for epoch in range(5):
    total_loss = 0
    total_batches = 0
    # The progress bar replaces the per-batch tensor dumps, which flooded the
    # cell output without adding information.
    progress = tqdm(train_loader, desc=f'Epoch {epoch+1}')
    for batch in progress:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing `labels` makes the model return the classification loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        total_batches += 1
        loss.backward()
        optimizer.step()
        progress.set_postfix(batch_loss=loss.item())
    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batch.
    print(f'Average Loss after Epoch {epoch+1}: {total_loss/max(total_batches, 1)}')

# NOTE(review): a later cell reloads the weights from
# "/content/drive/MyDrive/finBERT_pretrained_v1" - confirm the folder is
# copied there, or align the two paths.
model.save_pretrained("/content/finBERT_pretrained_v1")

In [None]:
# Test the model: count how many held-out headlines get the annotated label.
model.eval()
correct = 0
total = 0
# No gradients are needed for evaluation; disabling them for the whole loop
# saves memory and time.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == labels).sum().item()
        total += labels.numel()

print(f'Accuracy: {correct/total}')

**Past results:**

> Epochs: 3 - lr: 1e-5 - Batch size: 16 - **Accuracy: 0.76 / Loss: not recorded**

> Epochs: 3 - lr: 1e-4 - Batch size: 32 - **Accuracy: 0.79 / Loss: 0.25106**

> Epochs: 5 - lr: 5e-5 - Batch size: 32 - **Accuracy: 0.795 / Loss: 0.14585**

> Epochs: 5 - lr: 1e-4 - Batch size: 64 - **Accuracy: 0.77 / Loss: 0.13444**

> Epochs: 5 - lr: 1e-4 - Batch size: 32 - **Accuracy: 0.77 / Loss: 0.1806**

> Epochs: 5 - lr: 5e-5 - Batch size: 32 - **Accuracy: 0.73 / Loss: 0.1089**

> Epochs: 5 - lr: 5e-5 - Batch size: 40 - **Accuracy: 0.82 / Loss: 0.15391**

Apply FinBERT on our news data to predict a new tone:

In [None]:
# Read the raw news headlines CSV and hand it to the project's NowcastingEco
# helper for cleaning.
# NOTE(review): the path is absolute and machine-specific - confirm where the
# CSV should live for other users.
df = pd.read_csv('/Users/amaury/Documents/!DSBA/CRP/headlines_english_arabic_countries.csv')
crash_test = NowcastingEco(df)
crash_test.clean_data()  # presumably restricts/cleans the data for Egypt - TODO confirm against NowcastingEco

# Keep a handle on the helper's frame after cleaning (used for Egypt below)
df_egypt = crash_test.df

In [None]:
# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Folder holding the fine-tuned weights
model_path = "/content/drive/MyDrive/finBERT_pretrained_v1"

# Fetch a fresh copy of the base FinBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "ProsusAI/finbert",
    force_download=True,
    resume_download=False,
)

# Load the fine-tuned classifier and place it on the chosen device
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model = model.to(device)

In [None]:
def predict_tone(text):
    """Return the class index FinBERT assigns to a single piece of text.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Parameters
    ----------
    text : str
        Headline to classify; it is truncated to 512 tokens.

    Returns
    -------
    numpy.int64
        Index of the largest logit. Which tone each index stands for depends
        on the labels the model was fine-tuned with.
    """
    # A single sequence needs no padding: padding it to 512 tokens only makes
    # the forward pass far more expensive, since padded positions are masked.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(device)
    # Inference only: skip building the autograd graph for every headline.
    with torch.no_grad():
        outputs = model(**inputs)
    predicted = np.argmax(outputs.logits.detach().cpu().numpy())
    return predicted


# Apply the prediction function to the 'title' column, with a progress bar.
# tqdm.pandas() registers `progress_apply` on pandas objects; the result is
# written into a new 'tone_prediction' column of df_egypt.
tqdm.pandas()
df_egypt['tone_prediction'] = df_egypt['title'].progress_apply(predict_tone)

In [None]:
# Export data with the new tones
#df['tone_prediction'] = df['tone_prediction'].replace(1, "positive")
#df['tone_prediction'] = df['tone_prediction'].replace(2, "negative")
#df_egypt.to_csv('/content/drive/MyDrive/egypt_tone_pred_v2.csv', index=True)

# Vader

In [None]:
# Build the VADER analyser and load the annotated articles it is scored
# against. The workbook is read from the working directory, as in the other
# cells that load it, rather than from a user-specific absolute path.
sid = SentimentIntensityAnalyzer()

test_set = pd.read_excel('new_annotated_articles2.xlsx')
test_set.head()

In [None]:
# Score each headline with VADER, then discard the rows that got no score.
def _vader_scores(title):
    """Return VADER's polarity scores for `title`, or None when it is missing."""
    if pd.notnull(title):
        return sid.polarity_scores(title)
    return None

test_set['scores'] = test_set['headline'].apply(_vader_scores)
test_set = test_set.dropna(subset=['scores'])
test_set.shape

In [None]:
# Keep only the 'compound' entry of each score dictionary, then turn it into
# a binary label: 1 when the compound score is >= 0, -1 otherwise.
test_set['compound'] = [score_dict['compound'] for score_dict in test_set['scores']]
test_set['comp_score'] = test_set['compound'].ge(0).map({True: 1, False: -1})
test_set.head()

In [None]:
# Annotated tones are the ground truth; the VADER-derived labels are the
# predictions being evaluated.
actual_labels = test_set['annotated_tone'].to_numpy()
predicted_labels = test_set['comp_score'].to_numpy()

# Overall accuracy and the per-class report
accuracy = accuracy_score(actual_labels, predicted_labels)
accuracy_classes = classification_report(actual_labels, predicted_labels)

# Recall and F1, both with scikit-learn's default settings
recall = recall_score(actual_labels, predicted_labels)
f1 = f1_score(actual_labels, predicted_labels)

print(accuracy_classes)
for label, value in (("Accuracy:", accuracy), ("Recall:", recall), ("F1 Score:", f1)):
    print(label, value)

# TF-IDF

In [None]:
# Check that every headline is an actual string and count the missing ones.
# A dtype of `object` only says the column stores Python objects: NaN or
# numeric cells also produce an object column, so test each value instead.
headline_strings_only = bool(
    finance_news['headline'].map(lambda value: isinstance(value, str)).all()
)
print("Headlines contains only strings:", headline_strings_only)
print("Nb of empty headlines:", finance_news['headline'].isnull().sum())

# Show the rows whose headline is missing
finance_news[finance_news['headline'].isnull()]

### Grams

In [None]:
# Step 1: Preprocessing - keep the two columns we need and drop incomplete rows
finance_news = finance_news[['headline', 'annotated_tone']].dropna()

# Step 2: Splitting the data
X = finance_news['headline']
y = finance_news['annotated_tone']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Vectorizing the data with unigrams only.
# Widen ngram_range to (1, 2) or (1, 3) to also include bigrams / trigrams.
# The vectorizer is fitted on the training split only, so no test-set
# vocabulary leaks into training.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Step 4: Training different classifiers
classifiers = [
    LogisticRegression(),
    SVC(),
    MultinomialNB(),
    # Seeded so the reported accuracy is the same on every run
    RandomForestClassifier(random_state=42),
]

for classifier in classifiers:
    classifier.fit(X_train_vectorized, y_train)

    # Step 5: Predicting and evaluating
    y_pred = classifier.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{classifier.__class__.__name__} Accuracy: {accuracy}")

### TF-IDF + Word2Vec

In [None]:
# Step 1: Preprocessing - keep the two columns we need and drop incomplete rows
finance_news = finance_news[['headline', 'annotated_tone']].dropna()
X = finance_news['headline']
y = finance_news['annotated_tone']

# Step 2: TF-IDF Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_tfidf = vectorizer.fit_transform(X)

# Tokenise the headlines exactly like the vectorizer does for single words
# (its own preprocessing and token pattern). A plain `str.split()` keeps
# capital letters and attached punctuation, so those tokens never matched the
# vectorizer's vocabulary and were silently left out of the embeddings.
preprocess = vectorizer.build_preprocessor()
tokenize = vectorizer.build_tokenizer()
tokenized_headlines = [tokenize(preprocess(headline)) for headline in X]

# Step 3: Word Embedding Generation
embedding_size = 100  # Specify the desired size of word embeddings
window_size = 2  # Specify the context window size for Word2Vec
min_word_count = 1  # Specify the minimum word count threshold for Word2Vec
# seed + a single worker make training repeatable between runs
# (gensim additionally requires a fixed PYTHONHASHSEED for full determinism).
word_embeddings = Word2Vec(
    sentences=tokenized_headlines,
    vector_size=embedding_size,
    window=window_size,
    min_count=min_word_count,
    seed=42,
    workers=1,
)

# Step 4: Combine TF-IDF and Word Embeddings
# Each headline becomes the mean of its word vectors, each scaled by the
# word's IDF weight; headlines with no usable word get a zero vector.
combined_embeddings = []
for tokens in tokenized_headlines:
    word_embedding_weights = [
        vectorizer.idf_[vectorizer.vocabulary_[word]] * word_embeddings.wv[word]
        for word in tokens
        if word in word_embeddings.wv and word in vectorizer.vocabulary_
    ]
    if word_embedding_weights:
        headline_embedding = np.mean(word_embedding_weights, axis=0)
    else:
        headline_embedding = np.zeros(embedding_size)
    combined_embeddings.append(headline_embedding)

# Step 5: Train and Evaluate Different Classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    # Seeded so the reported accuracy is the same on every run
    'Random Forest': RandomForestClassifier(random_state=42)
}

# The split does not depend on the classifier, so compute it once.
X_train, X_test, y_train, y_test = train_test_split(combined_embeddings, y, test_size=0.2, random_state=42)

for clf_name, classifier in classifiers.items():
    print(f"Classifier: {clf_name}")
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Predict and Output the Tone for Each News
    # (covers training rows too, so this is for inspection, not a score;
    # the column is overwritten by each classifier in turn)
    predicted_tone = classifier.predict(combined_embeddings)
    finance_news['predicted_tone'] = predicted_tone
    print(finance_news[['headline', 'annotated_tone', 'predicted_tone']].head(10))
    print('-' * 50)