In [1]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Set custom NLTK path and download necessary resources
nltk.data.path.insert(0, '/Users/bhanuprakash/Documents/trainings/RAG/pre-analysis/nltk/')
nltk.download('stopwords', download_dir='/Users/bhanuprakash/Documents/trainings/RAG/pre-analysis/nltk/')
nltk.download('punkt', download_dir='/Users/bhanuprakash/Documents/trainings/RAG/pre-analysis/nltk/')
nltk.download('wordnet', download_dir='/Users/bhanuprakash/Documents/trainings/RAG/pre-analysis/nltk/')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhanuprakash/Documents/trainings/RAG/pre-
[nltk_data]     analysis/nltk/...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bhanuprakash/Documents/trainings/RAG/pre-
[nltk_data]     analysis/nltk/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bhanuprakash/Documents/trainings/RAG/pre-
[nltk_data]     analysis/nltk/...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Initialize Sentiment Analyzer and load data
analyzer = SentimentIntensityAnalyzer()
news_df = pd.read_csv('../../pre-analysis/datasets/yahoo_finance_news1.csv')

In [4]:
print("News Data Sample:")
print(news_df.head())
print(news_df.info())

News Data Sample:
  ticker                                              title  \
0    AAL  American Airlines just flew its longest flight...   
1    AAL  Zacks.com featured highlights include American...   
2    AAL  American Airlines tests boarding technology th...   
3    AAL  Is American Airlines Group Inc. (AAL) the Best...   
4    AAL  How American Airlines Is Fighting 'Gate Lice' ...   

                  publisher  \
0                    Quartz   
1                     Zacks   
2  Associated Press Finance   
3            Insider Monkey   
4              Investopedia   

                                                link        date  
0  https://finance.yahoo.com/m/4b53d028-e5c9-337d...  1730127060  
1  https://finance.yahoo.com/news/zacks-com-featu...  1730096760  
2  https://finance.yahoo.com/news/american-airlin...  1729960165  
3  https://finance.yahoo.com/news/american-airlin...  1729938583  
4  https://finance.yahoo.com/m/f9f3b3be-c05f-366b...  1729933200  
<class 'pandas

In [5]:
# Check and handle missing values
print("\nMissing values in News Data:", news_df.isnull().sum())
news_df.dropna(inplace=True)


Missing values in News Data: ticker       0
title        0
publisher    0
link         0
date         0
dtype: int64


In [6]:
# Define stop words and initialize lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
# Text pre-processing function
def clean_text(text):
    # Lowercasing
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [8]:
# Clean titles
news_df['cleaned_title'] = news_df['title'].apply(clean_text)
print("\nCleaned Titles Sample:")
print(news_df[['title', 'cleaned_title']].head())


Cleaned Titles Sample:
                                               title  \
0  American Airlines just flew its longest flight...   
1  Zacks.com featured highlights include American...   
2  American Airlines tests boarding technology th...   
3  Is American Airlines Group Inc. (AAL) the Best...   
4  How American Airlines Is Fighting 'Gate Lice' ...   

                                       cleaned_title  
0          american airline flew longest flight ever  
1  zackscom featured highlight include american a...  
2  american airline test boarding technology audi...  
3  american airline group inc aal best airline st...  
4  american airline fighting gate louse boarding ...  


In [9]:
def get_sentiment_score(text):
    sentiment = analyzer.polarity_scores(text)
    if sentiment['compound'] >= 0.05:
        return 'positive'
    elif sentiment['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

In [10]:
# Apply sentiment analysis to cleaned titles
news_df['sentiment'] = news_df['cleaned_title'].apply(get_sentiment_score)
print("\nSentiment Analysis Sample:")
print(news_df[['title', 'sentiment']].head())


Sentiment Analysis Sample:
                                               title sentiment
0  American Airlines just flew its longest flight...   neutral
1  Zacks.com featured highlights include American...  positive
2  American Airlines tests boarding technology th...  negative
3  Is American Airlines Group Inc. (AAL) the Best...  positive
4  How American Airlines Is Fighting 'Gate Lice' ...  negative


In [11]:
# Save the data with sentiment scores to a new CSV
output_path = '../../pre-analysis/datasets/news_with_sentiment.csv'
news_df.to_csv(output_path, index=False)
print(f"\nSentiment analysis results saved to {output_path}")


Sentiment analysis results saved to ../../pre-analysis/datasets/news_with_sentiment.csv
