<h1> Financial Sentiment Analysis </h1>

In [102]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [103]:
# Read the labelled sentences from the local CSV (decoded as latin1) and preview the first rows.
df = pd.read_csv(
    "D:\\Datasets\\financial.csv",
    encoding='latin1',
)
df.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [104]:
# Short aliases used by the rest of the notebook: X is the sentence column, y the sentiment column.
X, y = df["Sentence"], df["Sentiment"]

<h2> Preprocessing </h2>

<h3> Label Encoding </h3>

In [105]:
# Inspect the target before encoding: the distinct sentiment labels, then how many there are.
print(y.unique(), y.nunique(), sep="\n")

['positive' 'negative' 'neutral']
3


In [106]:
# Manual label encoding of the target: each sentiment string gets a fixed integer code.
target = dict(positive=0, negative=1, neutral=2)

# Series.map looks each label up in `target`; a label absent from the dict would map to NaN.
y_encoded = y.map(target)

<h3> Removing Special Characters</h3>

In [107]:
# Probe the raw sentences for characters that would need cleaning.
# Patterns, in order: carriage return, newline, any non-word/non-space character, literal '$'.
carriage_rets, newline, special_char, special_char1 = (
    X.str.contains(pattern, na=False).any()
    for pattern in (r'\r', r'\n', r'[^\w\s]', r'\$')
)

print(
    f"Carriage returns:{carriage_rets}",
    f"Newlines: {newline}",
    f"special char: {special_char}",
    f"$: {special_char1}",
    sep="\n",
)

Carriage returns:False
Newlines: False
special char: True
$: True


In [108]:
# Clean the sentences in three regex passes; the raw Series `X` is left untouched.
X_cleaned = (
    X.str.replace(r'[\r\n]', ' ', regex=True)   # newlines / carriage returns -> a space
     .str.replace(r'[^\w\s]', '', regex=True)   # drop anything that is not a word char or whitespace
     .str.replace(r'\$', '', regex=True)        # explicit '$' pass ('$' already matches the class above)
)

In [109]:
# Re-run the same four probes on the cleaned sentences to confirm the cleaning worked.
# Patterns, in order: carriage return, newline, any non-word/non-space character, literal '$'.
carriage_rets, newline, special_char, special_char1 = (
    X_cleaned.str.contains(pattern, na=False).any()
    for pattern in (r'\r', r'\n', r'[^\w\s]', r'\$')
)

print(
    f"Carriage returns:{carriage_rets}",
    f"Newlines: {newline}",
    f"special char: {special_char}",
    f"$: {special_char1}",
    sep="\n",
)

Carriage returns:False
Newlines: False
special char: False
$: False


<h3> Lemmatization </h3>

In [110]:
# Use NLTK's WordNet lemmatizer (not a Porter stemmer) to normalise words.
# NOTE: the preprocessing cell that follows builds a stop-word set from
# stopwords.words('english'), so the 'stopwords' corpus is needed as well as 'wordnet'.
import nltk
nltk.download('wordnet')
nltk.download('stopwords')  # without this corpus, stopwords.words('english') raises LookupError
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abrah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [111]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Build the training corpus: split each cleaned sentence on whitespace, drop tokens that
# are in `stop_words`, lemmatize what remains and re-join with single spaces.
# NOTE(review): the membership test is case-sensitive; if the stop-word list is all
# lowercase, capitalised tokens such as "The" are presumably kept — confirm this is intended.
corpus = [
    ' '.join(
        lemmatizer.lemmatize(token)
        for token in sentence.split()
        if token not in stop_words
    )
    for sentence in X_cleaned
]


<h3> Vectorization </h3>

In [112]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from `corpus` and count term occurrences.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(corpus)
# Densify the document-term matrix; every model below is trained on this dense array.
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the train/test split.
X_vectorized = term_counts.toarray()

<h2> Split Data </h2>

In [113]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on the label — confirm that is acceptable
# given the class imbalance visible in the reports below.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

<h2> Build Model </h2>

<h3> Naive Bayes </h3>

In [114]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the count features.
# fit() hands back the fitted estimator, so construction and training are chained.
nb_model = MultinomialNB().fit(X_train, y_train)

In [115]:
# Predict encoded sentiment labels for the held-out test features with the Naive Bayes model.
nb_model_pred = nb_model.predict(X_test)

<h4> Evaluation </h4>

In [116]:
from sklearn.metrics import accuracy_score, classification_report

# Score the Naive Bayes predictions: overall accuracy plus the per-class report.
nb_ac, nb_cr = (
    accuracy_score(y_test, nb_model_pred),
    classification_report(y_test, nb_model_pred),
)

print(f"Naive Bayes Model Accuracy: {nb_ac}", nb_cr, sep="\n")

Naive Bayes Model Accuracy: 0.6937553464499572
              precision    recall  f1-score   support

           0       0.72      0.67      0.69       372
           1       0.40      0.35      0.37       175
           2       0.75      0.81      0.78       622

    accuracy                           0.69      1169
   macro avg       0.62      0.61      0.61      1169
weighted avg       0.69      0.69      0.69      1169



<h3> Logistic Regression </h3>

In [117]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression with the SAGA solver; class_weight='balanced'
# re-weights the classes, and max_iter is raised to 1000 iterations.
# NOTE(review): recent scikit-learn releases deprecate `multi_class` — confirm against
# the installed version.
logreg_model = LogisticRegression(
    multi_class='ovr',
    max_iter=1000,
    solver='saga',
    class_weight='balanced',
).fit(X_train, y_train)

In [118]:
# Predict encoded sentiment labels for the held-out test features with logistic regression.
logreg_model_pred = logreg_model.predict(X_test)

<h4> Evaluation </h4>

In [119]:
# Score the logistic-regression predictions: overall accuracy plus the per-class report.
logreg_ac, logreg_cr = (
    accuracy_score(y_test, logreg_model_pred),
    classification_report(y_test, logreg_model_pred),
)

print(f"Model Accuracy: {logreg_ac}", logreg_cr, sep="\n")

Model Accuracy: 0.6869118905047049
              precision    recall  f1-score   support

           0       0.75      0.73      0.74       372
           1       0.36      0.38      0.37       175
           2       0.75      0.75      0.75       622

    accuracy                           0.69      1169
   macro avg       0.62      0.62      0.62      1169
weighted avg       0.69      0.69      0.69      1169



<h3> XGBoost </h3>

In [120]:
from xgboost import XGBClassifier

In [121]:
# Gradient-boosted trees for the three sentiment classes: 200 estimators, softmax
# objective, fixed seed. fit() hands back the fitted estimator, so the calls are chained.
xgb_model = XGBClassifier(
    objective='multi:softmax',
    n_estimators=200,
    num_class=3,
    random_state=42,
).fit(X_train, y_train)

In [122]:
# Predict encoded sentiment labels for the held-out test features with the XGBoost model.
xgb_model_pred = xgb_model.predict(X_test)

<h4> Evaluation </h4>

In [123]:
# Score the XGBoost predictions: overall accuracy, then the per-class report.
xg_ac = accuracy_score(y_test, xgb_model_pred)

print(
    f"XGBoost Model Accuracy: {xg_ac}",
    classification_report(y_test, xgb_model_pred),
    sep="\n",
)

XGBoost Model Accuracy: 0.6843455945252352
              precision    recall  f1-score   support

           0       0.75      0.67      0.71       372
           1       0.36      0.21      0.27       175
           2       0.70      0.83      0.76       622

    accuracy                           0.68      1169
   macro avg       0.60      0.57      0.58      1169
weighted avg       0.67      0.68      0.67      1169



<h3> SVC </h3>

In [124]:
from sklearn.svm import SVC

In [125]:
# Support-vector classifier with the library's default settings.
# fit() hands back the fitted estimator, so construction and training are chained.
svc_model = SVC().fit(X_train, y_train)

In [126]:
# Predict encoded sentiment labels for the held-out test features with the SVC model.
svc_model_pred = svc_model.predict(X_test)

In [127]:
# Score the SVC predictions: overall accuracy plus the per-class report.
svc_ac, svc_cr = (
    accuracy_score(y_test, svc_model_pred),
    classification_report(y_test, svc_model_pred),
)

print(f"Model Accuracy: {svc_ac}", svc_cr, sep="\n")

Model Accuracy: 0.6826347305389222
              precision    recall  f1-score   support

           0       0.78      0.56      0.65       372
           1       0.34      0.07      0.11       175
           2       0.67      0.93      0.78       622

    accuracy                           0.68      1169
   macro avg       0.60      0.52      0.51      1169
weighted avg       0.65      0.68      0.64      1169



<h1> Prediction of Input Data </h1>

In [128]:
import re

# Classify one user-supplied sentence with the Naive Bayes model.
raw_text = input("Input text here: ")

# Apply the SAME preprocessing the training corpus went through, otherwise the
# features fed to the model differ from the ones it was fitted on:
#   1. newlines / carriage returns -> space
#   2. strip everything that is not a word character or whitespace
#   3. drop stop words (same case-sensitive membership test as in training)
#   4. lemmatize and re-join with single spaces
cleaned_text = re.sub(r'[\r\n]', ' ', raw_text)
cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
tokens = [
    lemmatizer.lemmatize(word)
    for word in cleaned_text.split()
    if word not in stop_words
]

input_text = vectorizer.transform([' '.join(tokens)])

#-predict---
predicted = nb_model.predict(input_text)

# predict() returns a one-element array; read the scalar label explicitly instead of
# relying on the truth value of an array comparison.
label = int(predicted[0])

# Codes follow the manual encoding: 0 = positive, 1 = negative, anything else = neutral.
messages = {
    0: "Result: The sentiment is Positive :)",
    1: "Result: The sentiment is Negative :(",
}
print(messages.get(label, "Result: The sentiment is just Neutral :|"))
    


Input text here:  increased


Result: The sentiment is Positive :)


In [129]:
import re

# Classify one user-supplied sentence with the logistic-regression model.
raw_text = input("Input text here: ")

# Apply the SAME preprocessing the training corpus went through, otherwise the
# features fed to the model differ from the ones it was fitted on:
#   1. newlines / carriage returns -> space
#   2. strip everything that is not a word character or whitespace
#   3. drop stop words (same case-sensitive membership test as in training)
#   4. lemmatize and re-join with single spaces
cleaned_text = re.sub(r'[\r\n]', ' ', raw_text)
cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
tokens = [
    lemmatizer.lemmatize(word)
    for word in cleaned_text.split()
    if word not in stop_words
]

input_text = vectorizer.transform([' '.join(tokens)])

#-predict---
predicted = logreg_model.predict(input_text)

# predict() returns a one-element array; read the scalar label explicitly instead of
# relying on the truth value of an array comparison.
label = int(predicted[0])

# Codes follow the manual encoding: 0 = positive, 1 = negative, anything else = neutral.
messages = {
    0: "Result: The sentiment is Positive :)",
    1: "Result: The sentiment is Negative :(",
}
print(messages.get(label, "Result: The sentiment is just Neutral :|"))
    


Input text here:  increased


Result: The sentiment is Positive :)


In [130]:
import re

# Classify one user-supplied sentence with the XGBoost model.
raw_text = input("Input text here: ")

# Apply the SAME preprocessing the training corpus went through, otherwise the
# features fed to the model differ from the ones it was fitted on:
#   1. newlines / carriage returns -> space
#   2. strip everything that is not a word character or whitespace
#   3. drop stop words (same case-sensitive membership test as in training)
#   4. lemmatize and re-join with single spaces
cleaned_text = re.sub(r'[\r\n]', ' ', raw_text)
cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
tokens = [
    lemmatizer.lemmatize(word)
    for word in cleaned_text.split()
    if word not in stop_words
]

# The model was trained on a dense array (X_vectorized came from .toarray()).
# XGBoost's tree booster treats entries that are not stored in a sparse matrix as
# *missing* rather than 0, so densify the input to keep it consistent with training.
input_text = vectorizer.transform([' '.join(tokens)]).toarray()

#-predict---
predicted = xgb_model.predict(input_text)

# predict() returns a one-element array; read the scalar label explicitly instead of
# relying on the truth value of an array comparison.
label = int(predicted[0])

# Codes follow the manual encoding: 0 = positive, 1 = negative, anything else = neutral.
messages = {
    0: "Result: The sentiment is Positive :)",
    1: "Result: The sentiment is Negative :(",
}
print(messages.get(label, "Result: The sentiment is just Neutral :|"))
    


Input text here:  increased


Result: The sentiment is Positive :)


<h1> Model Local Deployment </h1>

In [132]:
import pickle

In [133]:
# Save the trained Naive Bayes model to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open('nb_model.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)

# Load the Naive Bayes model back from the file just written (round-trip check).
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('nb_model.pkl', 'rb') as model_file:
    nb_model = pickle.load(model_file)


In [134]:
# Persist the preprocessing objects next to the model: the fitted CountVectorizer
# and the WordNet lemmatizer, one pickle file each.
for artifact, filename in (
    (vectorizer, 'vectorizer.pkl'),
    (lemmatizer, 'lemmatizer.pkl'),
):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

print("Vectorizer and lemmatizer saved successfully!")

Vectorizer and lemmatizer saved successfully!


In [41]:
# Show the DataFrame as loaded (all three columns).
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [53]:
# Pull the full, untruncated text of the sentence at row position 5837.
df["Sentence"].iat[5837]

'RISING costs have forced packaging producer Huhtamaki to axe 90 jobs at its Hampshire manufacturing plant .'