# Financial News Sentiment Analyzer

Goal: Build an NLP-based system that classifies financial news headlines as positive,
negative, or neutral.  

## Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import random
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

## Loading the Financial PhraseBank Dataset

In [None]:
# Read the labelled headlines into a DataFrame and preview the first ten rows.
DATA_PATH = '/content/data.csv'
df = pd.read_csv(DATA_PATH)
df.head(n=10)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
5,$SPY wouldn't be surprised to see a green close,positive
6,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
7,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative
8,Kone 's net sales rose by some 14 % year-on-ye...,positive
9,The Stockmann department store will have a tot...,neutral


In [None]:
df.shape  # (n_rows, n_columns); the 2 columns are Sentence and Sentiment

(5842, 2)

In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
Sentence,0
Sentiment,0


In [None]:
# Class distribution across the positive, negative and neutral sentiment labels.
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
neutral,3130
positive,1852
negative,860


## Text Preprocessing for the TF-IDF + Logistic Regression Pipeline

In [None]:
import re
import nltk


# Fetch the NLTK stop-word corpus; this has to run before the corpus is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words held in a set; clean_text tests every token for membership in it.
stop_words = set(stopwords.words('english'))


# Matches anything that is not an ASCII letter or whitespace (digits, punctuation, symbols).
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def clean_text(text, stop_word_set=None):
    """Normalise one headline into a lowercase, stop-word-free token string.

    Non-letter characters are replaced with a space rather than deleted, so
    hyphen- or slash-joined words remain separate tokens
    ("year-on-year" -> "year on year") and contractions split into fragments
    ("wouldn't" -> "wouldn", "t") that a stop-word list can match, instead of
    surviving as a fused token such as "wouldnt".

    Args:
        text: The raw headline. Non-string values (e.g. None or NaN) are
            treated as an empty headline.
        stop_word_set: Optional collection of lowercase words to drop.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The remaining tokens joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    if stop_word_set is None:
        stop_word_set = stop_words
    # Strip non-letters first, then lowercase, then split on any whitespace run.
    tokens = _NON_LETTER_RE.sub(' ', text).lower().split()
    return ' '.join(word for word in tokens if word not in stop_word_set)

# Store the cleaned version of every headline in a new column.
raw_headlines = df['Sentence']
df['cleaned_text'] = raw_headlines.apply(clean_text)


from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the sentiment strings, then add their integer codes as a
# new column; `le` is kept so the original class names can be recovered later.
le = LabelEncoder()
le.fit(df['Sentiment'])
df['Sentiment_encoded'] = le.transform(df['Sentiment'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Features (X) are the cleaned headlines; labels (y) are the encoded sentiments.
X, y = df['cleaned_text'], df['Sentiment_encoded']

# Hold out 20% of the rows for testing. The split is seeded and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


## Creating a pipeline to train the Logistic Regression Model

Pipelines can help to optimise NLP workflows and make model deployment easier.

In [None]:
# Step 1: TF-IDF over unigrams and bigrams, capped at 1000 features.
tfidf_step = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=1000)
# Step 2: logistic regression with balanced class weights.
classifier_step = LogisticRegression(max_iter=1000, class_weight='balanced')

# Chain both steps so they are fitted and applied together.
model = make_pipeline(tfidf_step, classifier_step)
model.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: only the regularisation strength C is searched.
# max_iter is a solver iteration cap rather than a model hyperparameter, so
# searching over it only multiplies the number of fits; the pipeline's own
# max_iter setting is used instead.
param_grid = {
    'logisticregression__C': [0.1, 1, 10],  # Inverse regularisation strength
}

# 5-fold grid search. Candidates are ranked by macro-averaged F1 instead of the
# default accuracy, because the sentiment classes are imbalanced and the
# pipeline already uses class_weight='balanced'.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train, y_train)

# Evaluate the refitted best candidate on the held-out test set.
y_pred_grid = grid_search.predict(X_test)
print(f"Best Parameters: {grid_search.best_params_}")
print(classification_report(y_test, y_pred_grid, target_names=le.classes_))


Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__max_iter': 500}
              precision    recall  f1-score   support

    negative       0.35      0.55      0.43       172
     neutral       0.74      0.70      0.72       626
    positive       0.66      0.56      0.60       371

    accuracy                           0.63      1169
   macro avg       0.58      0.60      0.58      1169
weighted avg       0.66      0.63      0.64      1169



### Evaluate the model

In [None]:
# Score the pipeline held in `model` (not `grid_search`) on the test split.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred, target_names=le.classes_)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", test_report)

Accuracy: 0.6313088109495295
Classification Report:
               precision    recall  f1-score   support

    negative       0.33      0.52      0.40       172
     neutral       0.76      0.67      0.71       626
    positive       0.67      0.62      0.65       371

    accuracy                           0.63      1169
   macro avg       0.59      0.60      0.59      1169
weighted avg       0.67      0.63      0.64      1169

