---

# $$ \textbf{Data preprocessing} $$

---

* By: *César Alejandro Villegas Espíndola*.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk 
import re
from nltk.corpus import stopwords

In [4]:
nltk.download('stopwords')
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cesarve/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/cesarve/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/cesarve/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [1]:
!pip install datasets > /dev/null 2>&1

## Load the dataset

In [8]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed")

# Convert the dataset to a Pandas DataFrame
df = pd.DataFrame(dataset['train'])

# Display the DataFrame
df.head()

Unnamed: 0,symbol,publishedDate,title,image,site,text,url,sentiment,sentimentScore
0,HE,2023-10-04T21:54:28.000Z,"HE REMINDER: Kessler Topaz Meltzer & Check, LL...",/next-assets/images/schema-image-default.png,benzinga,"RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) --...",https://www.benzinga.com/pressreleases/23/10/g...,Negative,-0.6417
1,CRS,2023-10-04T21:40:00.000Z,Carpenter Technology Announces Conference Call...,https://ml.globenewswire.com/Resource/Download...,globenewswire,"PHILADELPHIA, Oct. 04, 2023 (GLOBE NEWSWIRE) -...",https://www.globenewswire.com/news-release/202...,Positive,0.4767
2,BB,2023-10-04T21:34:09.000Z,What's Going On With BlackBerry Stock After Ho...,https://cdn.benzinga.com/files/images/story/20...,benzinga,BlackBerry Ltd (NYSE: BB) shares are trading h...,https://www.benzinga.com/news/23/10/35098537/w...,Positive,0.9895
3,TECK,2023-10-04T21:17:00.000Z,Teck to Release Third Quarter 2023 Results on ...,https://ml.globenewswire.com/Resource/Download...,globenewswire,"VANCOUVER, British Columbia, Oct. 04, 2023 (GL...",https://www.globenewswire.com/news-release/202...,Positive,0.83
4,AGF,2023-10-04T21:13:38.000Z,AGF Reports September 2023 Assets Under Manage...,/next-assets/images/schema-image-default.png,benzinga,"TORONTO, Oct. 04, 2023 (GLOBE NEWSWIRE) -- AGF...",https://www.benzinga.com/pressreleases/23/10/g...,Positive,0.9768


In [9]:
df = df[['sentiment', 'text']]
df.head()

Unnamed: 0,sentiment,text
0,Negative,"RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) --..."
1,Positive,"PHILADELPHIA, Oct. 04, 2023 (GLOBE NEWSWIRE) -..."
2,Positive,BlackBerry Ltd (NYSE: BB) shares are trading h...
3,Positive,"VANCOUVER, British Columbia, Oct. 04, 2023 (GL..."
4,Positive,"TORONTO, Oct. 04, 2023 (GLOBE NEWSWIRE) -- AGF..."


Label counts before balancing

In [37]:
total_elements = df.size
print(f"Total values in the df: {total_elements}")

# Verify the count of every label
label_counts = df['sentiment'].value_counts()
print(f"Label counts before balancing:\n {label_counts}")

Total values in the df: 284000
Label counts before balancing:
 sentiment
Positive    125630
Negative     14521
Neutral       1849
Name: count, dtype: int64


### Preprocessing

In [44]:
X = balanced_df['text']
y = balanced_df['sentiment']

In [50]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False)
y_one_hot = onehot_encoder.fit_transform(y.values.reshape(-1, 1))

#### Lemmatization and Stopwords Removal

In [51]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove non-alphabet characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Remove stopwords and lemmatize
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    all_stopwords.remove('no')
    text = [lemmatizer.lemmatize(word) for word in text if word not in set(all_stopwords)]
    return ' '.join(text)

corpus = [preprocess_text(sentence) for sentence in X]

In [52]:
print(len(corpus))
corpus[:5]

4998


['somerset n j july globe newswire carecloud inc nasdaq ccld ccldp ccldo leader healthcare technology solution medical practice health system nationwide today announced recognized healthcare technology report list power healthcare technology company addition company recognition carecloud president chief executive officer hadi chaudhry selected one top healthcare technology ceo',
 'oil price rose tuesday amid tight supply speculation oil could economy west texas intermediate cl f settled per barrel tuesday brent also closed fractionally higher per barrel oil price steadily climbed since late june amid output cut imposed opec unilateral production curb saudi arabia export restriction russia',
 'wednesday traveler heard notice air mission notam',
 'recent activity long dormant bitcoin crypto btc wallet caught attention crypto community sparking speculation motivation behind move wallet owned called whale holder large amount cryptocurrency whose action significantly impact token price one 

In [57]:
y_one_hot = np.array(y_one_hot)

def find_index_for_label(y_one_hot, label):
    for index, y in enumerate(y_one_hot):
        if np.array_equal(y, label):
            return index
    return None

# Labels
positive_label = [1, 0, 0]
negative_label = [0, 1, 0]
neutral_label = [0, 0, 1]

# Search the index based on the sentiment
positive_index = find_index_for_label(y_one_hot, positive_label)
negative_index = find_index_for_label(y_one_hot, negative_label)
neutral_index = find_index_for_label(y_one_hot, neutral_label)

# Show the news and it´s corresponding label
if positive_index is not None:
    print(f'Positive news text (index {positive_index}):')
    print(X[positive_index])
    print('Label:', y_one_hot[positive_index])
else:
    print('No positive news found.')

if negative_index is not None:
    print(f'\nNegative news text (index {negative_index}):')
    print(X[negative_index])
    print('Label:', y_one_hot[negative_index])
else:
    print('No negative news found.')

if neutral_index is not None:
    print(f'\nNeutral news text (index {neutral_index}):')
    print(X[neutral_index])
    print('Label:', y_one_hot[neutral_index])
else:
    print('No neutral news found.')

Positive news text (index 1):
Oil prices rose on Tuesday amid tight supplies and speculation over what $100 oil could do to the economy. West Texas Intermediate (CL=F) settled at $90.39 per barrel on Tuesday while Brent also closed fractionally higher at $93.96 per barrel. Oil prices have steadily climbed since late June amid output cuts imposed by OPEC+ and unilateral production curbs from Saudi Arabia and export restrictions from Russia.
Label: [1. 0. 0.]

Negative news text (index 3):
Recent activity in long-dormant Bitcoin (CRYPTO: BTC) wallets has caught the attention of the crypto community, sparking speculation about the motivations behind these moves. These wallets are owned by so-called "whales," or holders of large amounts of cryptocurrency whose actions can significantly impact token prices. One whale, whose wallet was last active in 2012, moved over 400 Bitcoin worth $11 million over the weekend. Some 360 Bitcoin going to one wallet and 40 Bitcoin to other wallets, a...
Lab

In [54]:
positive_sample = X[positive_index]
negative_sample = X[negative_index]
neutral_sample = X[neutral_index]

In [55]:
### TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_new = tfidf.fit_transform(corpus)

# Guardar el Tfidf Vectorizer
import pickle
with open('/Users/cesarve/Documents/GitHub/stock-news-sentiment-analysis/model/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [56]:
# Save the y_one_hot array to a file
np.save('/Users/cesarve/Documents/GitHub/stock-news-sentiment-analysis/data/processed/y.npy', y_one_hot)