# NLP Project: Stock Market Prediction

## 1. Data Loading & Initial Setup

In [9]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Load prices
try:
    prices = pd.read_csv('data/DJIA_table.csv', parse_dates=["Date"])
    prices = prices.sort_values("Date").reset_index(drop=True)
    prices.rename(columns=str.lower, inplace=True)

    # Load news
    news = pd.read_csv("data/RedditNews.csv", encoding="utf-8")
    news.rename(columns=str.lower, inplace=True)
    news["date"] = pd.to_datetime(news["date"], errors='coerce')

    #filter prices to match news date range
    min_news_date = news["date"].min()
    prices = prices[prices["date"] >= min_news_date].reset_index(drop=True)

    # Combine headlines
    headline_cols = [c for c in news.columns if c != "date"]
    news["text"] = news[headline_cols].astype(str).apply(lambda row: " ".join(row.values), axis=1)
    news = news[["date", "text"]]

    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"ERROR: Data file not found. {e}")

Data loaded successfully.


## 2. NLP Preprocessing

In [10]:
stop_words = set(stopwords.words('english'))
lemm = WordNetLemmatizer()

def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+|\s+www\S+", "", t)
    t = re.sub(r"[^a-z0-9\s]", " ", t)
    tokens = nltk.word_tokenize(t, preserve_line=True)
    tokens = [lemm.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 2]
    return " ".join(tokens)

tqdm.pandas(desc="Cleaning text")
news["clean"] = news["text"].progress_apply(clean_text)

print("Preprocessing finished.")
news.head()

Cleaning text: 100%|██████████| 73608/73608 [00:06<00:00, 10635.19it/s]

Preprocessing finished.





Unnamed: 0,date,text,clean
0,2016-07-01,A 117-year-old woman in Mexico City finally re...,117 year old woman mexico city finally receive...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host,imf chief back athens permanent olympic host
2,2016-07-01,"The president of France says if Brexit won, so...",president france say brexit donald trump
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...,british man must give police hour notice sex t...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...,100 nobel laureate urge greenpeace stop opposi...


## 3. Merge Data & Create Target Variable

In [11]:
df = prices.merge(news, on="date", how="left")

# Fill missing news with empty strings
if 'clean' in df.columns:
    df["clean"] = df["clean"].fillna("")
else:
    df["clean"] = ""

# FIX: Convert 'close' column to numeric to avoid TypeError
df['close'] = pd.to_numeric(df['close'].astype(str).astype(str).str.replace(',', ''), errors='coerce')

# Create target variable
df["close_next"] = df["close"].shift(-1)
df["return_next"] = df["close_next"] / df["close"] - 1
df.dropna(subset=["close_next"], inplace=True)
df["target"] = (df["return_next"] > 0).astype(int)

print("Merge finished.")
df.head()

Merge finished.


Unnamed: 0,date,close,high,low,open,volume,text,clean,close_next,return_next,target
0,2008-06-09,12280.320312,12331.8603515625,12195.3203125,12210.1298828125,266350000,b'United States quits Human Rights Council',united state quits human right council,12280.320312,0.0,0
1,2008-06-09,12280.320312,12331.8603515625,12195.3203125,12210.1298828125,266350000,"b""Pentagon blocked Cheney's attack on Iran""",pentagon blocked cheney attack iran,12280.320312,0.0,0
2,2008-06-09,12280.320312,12331.8603515625,12195.3203125,12210.1298828125,266350000,"b""'J Street,' a new liberal Jewish organizatio...",street new liberal jewish organization hope ch...,12280.320312,0.0,0
3,2008-06-09,12280.320312,12331.8603515625,12195.3203125,12210.1298828125,266350000,"b'Former Ambassador Joseph Wilson: ""[The U.S. ...",former ambassador joseph wilson military prose...,12280.320312,0.0,0
4,2008-06-09,12280.320312,12331.8603515625,12195.3203125,12210.1298828125,266350000,b'EU leaders anxiously await Irish verdict on ...,leader anxiously await irish verdict lisbon tr...,12280.320312,0.0,0


## 4. Model Training & Evaluation

In [12]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=1000)
X_text = tfidf.fit_transform(df['clean'])

X = X_text
y = df['target']

# Time-based split (80% train, 20% test)
train_size = int(len(df) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Train the model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("--- Model Evaluation Results ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

--- Model Evaluation Results ---
Accuracy: 0.8629
F1 Score: 0.0000
ROC AUC Score: 0.5000

Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.93      9174
           1       0.00      0.00      0.00      1458

    accuracy                           0.86     10632
   macro avg       0.43      0.50      0.46     10632
weighted avg       0.74      0.86      0.80     10632



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
