In [10]:
# ===== Cell 1: Imports & NLTK Setup =====
import pandas as pd
import re
import nltk
import os

# Keep NLTK resources in a per-user folder and make sure NLTK searches it.
nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
# Guard the append so that re-running this cell does not keep adding the
# same directory to NLTK's search path.
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Download the required packages into that folder
# (quiet=True suppresses the downloader's progress output).
nltk.download('stopwords', download_dir=nltk_data_dir, quiet=True)
nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)

# Imported after the downloads so the resources are in place before first use.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))


In [11]:
# Location of the raw analyst-ratings CSV, relative to this notebook
csv_path = "../data/raw_analyst_ratings.csv"  # <-- Update if needed

# Fail fast with a readable message when the file is missing
if not os.path.exists(csv_path):
    raise FileNotFoundError(f"{csv_path} not found. Please check the file path.")

# Read the file and discard the leftover index column when it is present
df = pd.read_csv(csv_path).drop(columns=["Unnamed: 0"], errors="ignore")

# Preview the first rows
df.head()


Unnamed: 0,headline,url,publisher,date,stock
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


In [12]:
# Function to clean text
def clean_text(text):
    """Normalize a headline for downstream text processing.

    Lowercases the input, deletes every character that is not an ASCII
    letter, digit, or whitespace, and collapses whitespace runs into single
    spaces.

    Parameters
    ----------
    text : Any
        Raw headline. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) produce an empty string.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing.
    """
    # Without this guard a missing headline would be stringified into the
    # literal token "none" / "nan" and treated as a real word later on.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()                     # lowercase
    # Punctuation is deleted, not replaced by a space: "52-week" -> "52week"
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()    # collapse/trim whitespace
    return text

# Build the cleaned column from the raw headlines, then preview raw vs. cleaned
df["clean_headline"] = df["headline"].apply(clean_text)
df.loc[:, ["headline", "clean_headline"]].head()


Unnamed: 0,headline,clean_headline
0,Stocks That Hit 52-Week Highs On Friday,stocks that hit 52week highs on friday
1,Stocks That Hit 52-Week Highs On Wednesday,stocks that hit 52week highs on wednesday
2,71 Biggest Movers From Friday,71 biggest movers from friday
3,46 Stocks Moving In Friday's Mid-Day Session,46 stocks moving in fridays midday session
4,B of A Securities Maintains Neutral on Agilent...,b of a securities maintains neutral on agilent...


In [13]:
# Function to remove stopwords
def remove_stopwords(text):
    """Return `text` with every token found in `stop_words` removed.

    The input is split on whitespace; the surviving tokens are joined back
    together with single spaces, in their original order.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue  # drop stop words
        kept.append(token)
    return " ".join(kept)

# Strip stop words from the cleaned headlines, then preview before vs. after
df["clean_no_stopwords"] = df["clean_headline"].apply(remove_stopwords)
df.loc[:, ["clean_headline", "clean_no_stopwords"]].head()


Unnamed: 0,clean_headline,clean_no_stopwords
0,stocks that hit 52week highs on friday,stocks hit 52week highs friday
1,stocks that hit 52week highs on wednesday,stocks hit 52week highs wednesday
2,71 biggest movers from friday,71 biggest movers friday
3,46 stocks moving in fridays midday session,46 stocks moving fridays midday session
4,b of a securities maintains neutral on agilent...,b securities maintains neutral agilent technol...


In [14]:
# Simple tokenizer using Python split (no NLTK needed)
def simple_tokenize(text):
    """Split `text` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens

# Turn each stopword-free headline into a list of tokens
df["tokens"] = df["clean_no_stopwords"].apply(simple_tokenize)

# Preview the text next to its token list
df.loc[:, ["clean_no_stopwords", "tokens"]].head()


Unnamed: 0,clean_no_stopwords,tokens
0,stocks hit 52week highs friday,"[stocks, hit, 52week, highs, friday]"
1,stocks hit 52week highs wednesday,"[stocks, hit, 52week, highs, wednesday]"
2,71 biggest movers friday,"[71, biggest, movers, friday]"
3,46 stocks moving fridays midday session,"[46, stocks, moving, fridays, midday, session]"
4,b securities maintains neutral agilent technol...,"[b, securities, maintains, neutral, agilent, t..."


In [17]:
# Save preprocessed data to the data folder.
# The folder is given relative to the notebook (the same "../data" directory
# the raw CSV is read from) so the cell also works on other machines.
data_folder = os.path.join("..", "data")
os.makedirs(data_folder, exist_ok=True)

# Save the CSV.
# NOTE: the "tokens" column holds Python lists; CSV stores them as text, so
# they come back as strings when the file is read again.
preprocessed_path = os.path.join(data_folder, "preprocessed_sentiment.csv")
df.to_csv(preprocessed_path, index=False)

# Report the absolute location so the message is unambiguous
print(f"Preprocessed data saved successfully at:\n{os.path.abspath(preprocessed_path)}")


Preprocessed data saved successfully at:
C:\10Academy\week-1\financial-news-sentiment\data\preprocessed_sentiment.csv
