---

# $$ \textbf{Data preprocessing} $$

---

* By: *César Alejandro Villegas Espíndola*.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk 
import re
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
nltk.download("wordnet")
nltk.download("omw-1.4")

In [4]:
!pip install datasets > /dev/null 2>&1

## Load the dataset

In [5]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("NickyNicky/finance-financialmodelingprep-stock-news-sentiments-rss-feed")

# Convert the dataset to a Pandas DataFrame
df = pd.DataFrame(dataset['train'])

# Display the DataFrame
df.head()

Unnamed: 0,symbol,publishedDate,title,image,site,text,url,sentiment,sentimentScore
0,HE,2023-10-04T21:54:28.000Z,"HE REMINDER: Kessler Topaz Meltzer & Check, LL...",/next-assets/images/schema-image-default.png,benzinga,"RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) --...",https://www.benzinga.com/pressreleases/23/10/g...,Negative,-0.6417
1,CRS,2023-10-04T21:40:00.000Z,Carpenter Technology Announces Conference Call...,https://ml.globenewswire.com/Resource/Download...,globenewswire,"PHILADELPHIA, Oct. 04, 2023 (GLOBE NEWSWIRE) -...",https://www.globenewswire.com/news-release/202...,Positive,0.4767
2,BB,2023-10-04T21:34:09.000Z,What's Going On With BlackBerry Stock After Ho...,https://cdn.benzinga.com/files/images/story/20...,benzinga,BlackBerry Ltd (NYSE: BB) shares are trading h...,https://www.benzinga.com/news/23/10/35098537/w...,Positive,0.9895
3,TECK,2023-10-04T21:17:00.000Z,Teck to Release Third Quarter 2023 Results on ...,https://ml.globenewswire.com/Resource/Download...,globenewswire,"VANCOUVER, British Columbia, Oct. 04, 2023 (GL...",https://www.globenewswire.com/news-release/202...,Positive,0.83
4,AGF,2023-10-04T21:13:38.000Z,AGF Reports September 2023 Assets Under Manage...,/next-assets/images/schema-image-default.png,benzinga,"TORONTO, Oct. 04, 2023 (GLOBE NEWSWIRE) -- AGF...",https://www.benzinga.com/pressreleases/23/10/g...,Positive,0.9768


In [9]:
df = df[['sentiment', 'text']].dropna().reset_index(drop=True)
df.head()

Unnamed: 0,sentiment,text
0,Negative,"RADNOR, Pa., Oct. 04, 2023 (GLOBE NEWSWIRE) --..."
1,Positive,"PHILADELPHIA, Oct. 04, 2023 (GLOBE NEWSWIRE) -..."
2,Positive,BlackBerry Ltd (NYSE: BB) shares are trading h...
3,Positive,"VANCOUVER, British Columbia, Oct. 04, 2023 (GL..."
4,Positive,"TORONTO, Oct. 04, 2023 (GLOBE NEWSWIRE) -- AGF..."


In [12]:
total_elements = df.size
print(f"Total values in the df: {total_elements}")

# Verify the count of every label
label_counts = df['sentiment'].value_counts()
print(f"Label counts before balancing:\n {label_counts}")

Total values in the df: 284000
Label counts before balancing:
 sentiment
Positive    125630
Negative     14521
Neutral       1849
Name: count, dtype: int64


In [19]:
desired_label_counts = 7000 // len(label_counts)
print(f"Desired count per label: {desired_label_counts}")

Desired count per label: 2333


In [23]:
# Balanced dataframe
balanced_df = pd.DataFrame()

for label in label_counts.index:
    label_df = df[df['sentiment'] == label]

    if len(label_df) > desired_label_counts:
        label_df = label_df.sample(desired_label_counts, random_state=42)
    elif len(label_df) < desired_label_counts:
        label_df = label_df.sample(desired_label_counts, replace=True, random_state=42)

    balanced_df = pd.concat([balanced_df, label_df])
    
balanced_label_counts = balanced_df['sentiment'].value_counts()
print("Label counts after balancing:\n", balanced_label_counts)

Label counts after balancing:
 sentiment
Positive    2333
Negative    2333
Neutral     2333
Name: count, dtype: int64


In [25]:
# Total elements of the balanced df
print(f"Total elements after balancing: {balanced_df.size}")

balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)  # Mezclar el DataFrame
balanced_df.head()

Total elements after balancing: 13998


Unnamed: 0,sentiment,text
0,Negative,The value of Bed Bath & Beyond (NASDAQ:BBBY) s...
1,Positive,Forian Inc. (NASDAQ: FORA) released financial ...
2,Negative,The CNN Money Fear and Greed index showed some...
3,Positive,Canada’s Federal Court of Appeal has dismissed...
4,Negative,Regulators continue to scrutinize Microsoft's ...


### Preprocessing

In [50]:
X = balanced_df['text']
y = balanced_df['sentiment']

In [63]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False)
y_one_hot = onehot_encoder.fit_transform(y.values.reshape(-1, 1))

#### Lemmatization and Stopwords Removal

In [64]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove non-alphabet characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Remove stopwords and lemmatize
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    all_stopwords.remove('no')
    text = [lemmatizer.lemmatize(word) for word in text if word not in set(all_stopwords)]
    return ' '.join(text)

corpus = [preprocess_text(sentence) for sentence in X]

In [65]:
print(len(corpus))
corpus[:5]

6999


['value bed bath beyond nasdaq bbby share tripled since jan threat bankruptcy created short squeeze bbby stock short interest nearly half float bbby stock jumped jan another overnight opening today per share called meme stock rose sympathy amc entertainment nyse amc jan gamestop nyse gme rose meme investorplace writing bed bath beyon',
 'forian inc nasdaq forum released financial result quarter ended september revealing revenue million increase compared million q sequentially second quarter q financial highlight net loss quarter million per share compared million per share prior year adjusted ebitda quarter negative million compared negative million p',
 'cnn money fear greed index showed easing fear level among u investor thursday dow jones snapped five day losing streak following credit suisse nyse c announcement loan swiss national bank first republic bank nyse frc also announced deal billion deposit major bank trader also assessed economic report showed u initial jobless claim drop

In [66]:
y_one_hot = np.array(y_one_hot)

def find_index_for_label(y_one_hot, label):
    for index, y in enumerate(y_one_hot):
        if np.array_equal(y, label):
            return index
    return None

# Labels
positive_label = [1, 0, 0]
negative_label = [0, 1, 0]
neutral_label = [0, 0, 1]

# Search the index based on the sentiment
positive_index = find_index_for_label(y_one_hot, positive_label)
negative_index = find_index_for_label(y_one_hot, negative_label)
neutral_index = find_index_for_label(y_one_hot, neutral_label)

# Show the news and it´s corresponding label
if positive_index is not None:
    print(f'Positive news text (index {positive_index}):')
    print(X[positive_index])
    print('Label:', y_one_hot[positive_index])
else:
    print('No positive news found.')

if negative_index is not None:
    print(f'\nNegative news text (index {negative_index}):')
    print(X[negative_index])
    print('Label:', y_one_hot[negative_index])
else:
    print('No negative news found.')

if neutral_index is not None:
    print(f'\nNeutral news text (index {neutral_index}):')
    print(X[neutral_index])
    print('Label:', y_one_hot[neutral_index])
else:
    print('No neutral news found.')

Positive news text (index 0):
The value of Bed Bath & Beyond (NASDAQ:BBBY) shares has tripled since Jan. 6. The threat of bankruptcy created a short squeeze in BBBY stock. With short interest at nearly half its float, BBBY stock jumped 68% on Jan. 11 and another 20% overnight, opening today at $4.22 per share. Other so-called “meme stocks” from 2021 rose in sympathy. AMC Entertainment (NYSE:AMC) was up 21% on Jan. 11 and Gamestop (NYSE:GME) rose 7%. What Does This Meme? InvestorPlace has been writing about Bed Bath & Beyon...
Label: [1. 0. 0.]

Negative news text (index 9):
Nokia OyjPörssitiedote29.08.2023 klo 21.00...
Label: [0. 1. 0.]

Neutral news text (index 1):
Forian Inc. (NASDAQ: FORA) released financial results for the quarter ended September 30, 2022, revealing revenue of $7.2 million, a 45% increase compared to $5 million in the Q3 2021 and 10% sequentially over the second quarter of 2022. Q3 2022 Financial Highlights Net loss for the quarter was $5.1 million, or $0.16 per sh

In [68]:
positive_sample = X[0]
negative_sample = X[9]
neutral_sample = X[1]

In [70]:
### TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_new = tfidf.fit_transform(corpus)

# Guardar el Tfidf Vectorizer
import pickle
with open('/Users/cesarve/Documents/GitHub/stock-news-sentiment-analysis/model/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)