In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
# Read the analyst-ratings CSV into a DataFrame and preview its first rows.
DATA_PATH = '/content/analyst_ratings_processed.csv'
dataset = pd.read_csv(DATA_PATH)
dataset.head()

In [None]:
# Show the (rows, columns) size of the freshly loaded frame.
dataset.shape

In [None]:
# Parse the 'date' column into datetimes; values that cannot be parsed become
# NaT (errors='coerce') instead of raising.
dataset['date']=pd.to_datetime(dataset['date'],errors='coerce')

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

In [None]:
# Drop every row that has a missing value in any column. This also removes
# rows whose 'date' was coerced to NaT when the column was parsed.
dataset.dropna(inplace=True)

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved with the CSV --
# TODO confirm). errors='ignore' keeps the cell safe to re-run: once the column
# is gone, a second execution is a no-op instead of raising KeyError.
dataset = dataset.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Lower-case the headlines so all later text processing is case-insensitive.
dataset['title']=dataset['title'].str.lower()

Removing Special Characters

In [None]:
# Build the cleaned headline in one chain:
#   1. remove every character that is neither a word character nor whitespace,
#   2. collapse each run of whitespace into a single space,
#   3. trim leading and trailing whitespace.
dataset['title_clean'] = (
    dataset['title']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

Tokenization

In [None]:
# Split each cleaned headline on whitespace into a list of word tokens.
dataset['tokens'] = dataset['title_clean'].str.split()

Removing Stop Words


In [None]:
# Hand-picked stop words that are removed from every tokenized headline.
my_stopwords = {
    "is", "am", "are",
    "the", "a", "an",
    "and", "or", "but", "if",
    "of", "at", "by", "for", "with", "about", "into", "through",
    "during", "before", "after",
    "to", "in", "on", "from",
    "this", "that",
}


def filter_stopwords(word_list):
    """Return the tokens of ``word_list`` that are not in ``my_stopwords``.

    Membership is checked on the lower-cased token, but every surviving token
    is returned unchanged and in its original position.
    """
    return [token for token in word_list if token.lower() not in my_stopwords]

# Remove stop words from every headline's token list.
dataset['tokens_clean'] = dataset['tokens'].apply(filter_stopwords)

Lemmatization

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')

# One lemmatizer instance shared by every call below.
lemmatizer = WordNetLemmatizer()


def apply_lemmatization(token_list):
    """Lemmatize every token in ``token_list`` with the shared lemmatizer.

    Returns a new list with one lemma per input token, in the same order.
    """
    lemmas = []
    for token in token_list:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


dataset['lemmatized_tokens'] = dataset['tokens_clean'].apply(apply_lemmatization)

Vectorization


In [None]:
# Re-assemble each lemmatized token list into one space-separated string,
# which is the input format the vectorizer expects.
dataset['final_text'] = dataset['lemmatized_tokens'].apply(" ".join)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with the vocabulary capped at 20,000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
# X has one row per row of `dataset` as it exists when this cell runs, so any
# later row filtering of `dataset` is NOT reflected in X.
X = tfidf.fit_transform(dataset['final_text'])


Labeling

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the VADER lexicon needed by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')
# Shared analyzer instance used to score headlines.
vader = SentimentIntensityAnalyzer()

In [None]:

# Finance-specific valence overrides: negative numbers mark bearish words,
# positive numbers mark bullish words.
new_words = {
    'lower': -2.5,
    'higher': 2.5,
    'rise': 1.5,
    'fall': -1.5,
    'drop': -2.0,
    'surge': 2.0,
    'crash': -3.5,
    'growth': 2.0,
    'loss': -2.5,
    'profit': 2.5
}
# Merge the overrides into the analyzer's lexicon (existing entries for the
# same words are replaced).
vader.lexicon.update(new_words)
# Keyword stems that decide the label outright. They are matched only at the
# START of a word: inflections such as "falling" or "rises" still count, but
# unrelated words that merely contain a stem ("flower", "followers",
# "windfall", "surprise", "enterprise") no longer force a label.
_NEGATIVE_KEYWORDS = re.compile(r'\b(?:lower|down|fall)')
_POSITIVE_KEYWORDS = re.compile(r'\b(?:higher|rise|surge)')


def get_sentiment_label_v2(text):
    """Return a sentiment label for one pre-processed headline.

    Parameters
    ----------
    text : str
        Headline text. Matching is case-sensitive, so callers are expected to
        pass lower-cased text.

    Returns
    -------
    int
        -1 (negative), 1 (positive) or 0 (neutral).

    Rules are applied in this order:
      1. a word starting with "lower", "down" or "fall"  -> -1
      2. a word starting with "higher", "rise" or "surge" -> 1
      3. otherwise the VADER compound score decides:
         >= 0.3 -> 1, <= -0.3 -> -1, anything in between -> 0.
    """
    # Negative keywords are checked first, so a headline containing both a
    # negative and a positive keyword is labelled negative.
    if _NEGATIVE_KEYWORDS.search(text):
        return -1

    if _POSITIVE_KEYWORDS.search(text):
        return 1

    # No keyword hit: fall back to the lexicon-based compound score.
    score = vader.polarity_scores(text)['compound']
    if score >= 0.3:
        return 1
    if score <= -0.3:
        return -1
    return 0
# Label every headline with -1 / 0 / 1.
dataset['sentiment_label'] = dataset['final_text'].apply(get_sentiment_label_v2)


# Independent copy of the rows whose label is not neutral.
dataset_filtered = dataset[dataset['sentiment_label'] != 0].copy()
# NOTE(review): y is taken from the UNFILTERED frame, so it still contains
# neutral (0) labels. It is row-aligned with X only because X was built from
# the same unfiltered `dataset`.
y=dataset['sentiment_label']

In [None]:

# Re-score every headline, keep only the rows with a non-neutral label, and
# show how many positive / negative rows remain.
# Note: `dataset` is rebound to the filtered frame here; X and y were created
# earlier and are not affected by this filter.
dataset['sentiment_label'] = dataset['final_text'].apply(get_sentiment_label_v2)
dataset = dataset.loc[dataset['sentiment_label'].ne(0)]
print(dataset['sentiment_label'].value_counts())

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# X and y must describe the same rows before a shared boolean mask is applied.
if X.shape[0] != len(y):
    raise ValueError(
        f"X has {X.shape[0]} rows but y has {len(y)} labels; "
        "re-run the vectorization and labeling cells in order."
    )

# The model is meant to separate positive from negative headlines only, so
# neutral (0) rows are removed from the features and the labels with the SAME
# mask. Without this step the neutral class is resampled and trained on too,
# which contradicts the binary confusion matrix and the binary widget output.
non_neutral = (y != 0).to_numpy()
X_binary = X[non_neutral]
y_binary = y[non_neutral]

# Undersample the majority class so both remaining classes are equally sized.
rus = RandomUnderSampler(random_state=42)
X_balanced, y_balanced = rus.fit_resample(X_binary, y_binary)


print("New Balanced Counts:")
print(y_balanced.value_counts())

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced samples for evaluation; the fixed seed makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    random_state=42,
    test_size=0.2,
)

# Fit a logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)


In [None]:
# Mean accuracy on the held-out split, as a percentage.
model.score(X_test,y_test)*100

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [None]:
# Weighted-average F1 on the held-out split, as a percentage.
f1 = f1_score(y_test, model.predict(X_test), average='weighted') * 100
f1

In [None]:
# Weighted-average precision on the held-out split, as a percentage.
precision_score(y_test, model.predict(X_test), average='weighted')*100

In [None]:
# Weighted-average recall on the held-out split, as a percentage.
recall_score(y_test, model.predict(X_test), average='weighted')*100

In [None]:
# Accuracy on the held-out split, as a percentage.
accuracy_score(y_test,model.predict(X_test))*100

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

# Rows are true labels and columns are predicted labels, both in the order
# negative (-1) then positive (1).
cm = confusion_matrix(y_test, y_pred, labels=[-1, 1])

class_names = ['Negative (-1)', 'Positive (1)']

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)

ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix: Financial Sentiment Model')
plt.show()

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output


# Free-text box where the user types the headline to classify.
text_input = widgets.Textarea(
    value='',
    placeholder='Type a financial headline here...',
    description='Headline:',
    layout={'width': '500px', 'height': '100px'}
)

# Button that triggers the prediction.
button = widgets.Button(
    description='Analyze Sentiment',
    button_style='success',
    tooltip='Click to see confidence'
)

# Output area that receives everything printed by the click handler.
output = widgets.Output()


def on_button_clicked(b):
    """Classify the headline typed into ``text_input`` and print the result.

    Parameters
    ----------
    b : widgets.Button
        The clicked button, as passed by ipywidgets. It is not used.

    Everything is printed into the ``output`` widget, which is cleared first.
    An empty or whitespace-only headline prints a prompt and does nothing else.
    """
    with output:
        clear_output()
        user_text = text_input.value

        if not user_text.strip():
            print("Please enter a headline to analyze.")
            return

        # Run the headline through the same steps as the training text
        # (lower-case, strip punctuation, drop stop words, lemmatize) so the
        # vectorizer sees tokens in the form it was fitted on.
        cleaned = re.sub(r'[^\w\s]', '', user_text.lower())
        tokens = apply_lemmatization(filter_stopwords(cleaned.split()))
        test_vector = tfidf.transform([" ".join(tokens)])

        prediction = model.predict(test_vector)[0]
        probabilities = model.predict_proba(test_vector)[0]

        # predict_proba columns follow model.classes_, so the column for the
        # predicted class is looked up rather than assumed to be 0 or 1.
        class_index = list(model.classes_).index(prediction)
        confidence = probabilities[class_index] * 100

        print("-" * 50)
        if prediction == 1:
            print("RESULT: POSITIVE (1) 🚀")
        elif prediction == -1:
            print("RESULT: NEGATIVE (-1) 📉")
        else:
            # Only reachable when the model was trained with a neutral class.
            print(f"RESULT: NEUTRAL ({prediction})")
        print(f"CONFIDENCE: {confidence:.2f}%")
        print("-" * 50)


# Register the click handler, then render the input box, button and output area.
button.on_click(on_button_clicked)
display(text_input, button, output)