In [None]:
# ===== Cell 1: Imports & NLTK Setup =====
import pandas as pd
import re
import nltk
import os

# Keep NLTK resources in a per-user "nltk_data" folder and register that
# folder on NLTK's data search path.
nltk_data_dir = os.path.join(os.path.expanduser("~"), "nltk_data")
os.makedirs(nltk_data_dir, exist_ok=True)
nltk.data.path.append(nltk_data_dir)

# Fetch each required resource straight into that folder, without log output.
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name, download_dir=nltk_data_dir, quiet=True)

# NOTE(review): word_tokenize is imported but not referenced in the cells
# visible here; tokenisation below relies on str.split() instead.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words as a set, so membership checks are constant-time.
stop_words = set(stopwords.words('english'))


In [None]:
# Location of the raw analyst-ratings CSV (relative path).
csv_path = "../data/raw_analyst_ratings.csv"  # <-- Update if needed

if os.path.exists(csv_path):
    # Read the file and discard the "Unnamed: 0" column when it is present;
    # errors="ignore" keeps this a no-op if the column is absent.
    df = pd.read_csv(csv_path).drop(columns=["Unnamed: 0"], errors="ignore")
else:
    # Fail early with an explicit message instead of a pandas traceback.
    raise FileNotFoundError(f"{csv_path} not found. Please check the file path.")

# Preview the first rows
df.head()


In [None]:
# Function to clean text
def clean_text(text):
    """Normalise a headline: lowercase, strip punctuation, collapse whitespace.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            Missing scalars (``None``, ``NaN``, ``pd.NA``) are treated as
            empty text.

    Returns:
        str: Lowercased text made up of ASCII letters, digits and single
        spaces, with no leading or trailing whitespace. ``""`` when the
        input is missing.
    """
    # Without this guard str(NaN) yields the literal token "nan", which would
    # then appear in the cleaned columns as if it were real headline text.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""
    text = str(text).lower()                     # lowercase
    # Punctuation is deleted (not replaced by a space), so "S&P" becomes "sp".
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()    # remove extra spaces
    return text

# Normalise every headline and store the result next to the original column
raw_headlines = df["headline"]
df["clean_headline"] = raw_headlines.apply(clean_text)

# Side-by-side preview of raw vs. cleaned text
headline_preview_cols = ["headline", "clean_headline"]
df[headline_preview_cols].head()


In [None]:
# Function to remove stopwords
def remove_stopwords(text, custom_stop_words=None):
    """Drop stop words from a whitespace-separated string.

    Args:
        text (str): Text to filter; tokens are obtained with ``str.split()``.
        custom_stop_words: Optional collection of words to remove. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        str: The remaining tokens, in their original order, joined by single
        spaces. Matching is exact and case-sensitive.
    """
    # The global is only looked up when no explicit collection is supplied, so
    # callers passing their own list do not depend on earlier notebook cells.
    words_to_drop = stop_words if custom_stop_words is None else custom_stop_words
    return " ".join(token for token in text.split() if token not in words_to_drop)

# Filter stop words out of every cleaned headline
cleaned_headlines = df["clean_headline"]
df["clean_no_stopwords"] = cleaned_headlines.apply(remove_stopwords)

# Preview the text before and after stop-word removal
stopword_preview_cols = ["clean_headline", "clean_no_stopwords"]
df[stopword_preview_cols].head()


In [None]:
# Whitespace tokenizer built on str.split (no NLTK needed)
def simple_tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list.

    An empty or whitespace-only string yields an empty list.
    """
    tokens = text.split()
    return tokens

# Turn each stop-word-free headline into a list of tokens
stopword_free_headlines = df["clean_no_stopwords"]
df["tokens"] = stopword_free_headlines.apply(simple_tokenize)

# Inspect result
token_preview_cols = ["clean_no_stopwords", "tokens"]
df[token_preview_cols].head()


In [None]:
# Save preprocessed data to the data folder
# Use the same relative "../data" folder the raw CSV is loaded from instead of
# a machine-specific absolute path, so the notebook runs on any checkout/OS.
# (`os` is already imported in the first cell.)
data_folder = os.path.join("..", "data")
os.makedirs(data_folder, exist_ok=True)

# Save the CSV
# NOTE: the "tokens" column holds Python lists; CSV stores them as text, so
# they need to be re-parsed (or re-tokenised) after loading this file.
preprocessed_path = os.path.join(data_folder, "preprocessed_sentiment.csv")
df.to_csv(preprocessed_path, index=False)

print(f"Preprocessed data saved successfully at:\n{preprocessed_path}")
