## Import common libraries

In [7]:
import pandas as pd
import re
from pathlib import Path
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer

In [8]:
# --------------------------------------
# 1. DOWNLOAD NLTK RESOURCES
# --------------------------------------
# Fetch every NLTK data package this notebook relies on, in a fixed order.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "vader_lexicon"):
    nltk.download(resource)

# Shared NLP helpers reused by the preprocessing cells further down:
# the English stop-word set, a WordNet lemmatizer and a VADER analyzer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Load cleaned dataset

In [9]:
from pathlib import Path
import pandas as pd

# Resolve the working directory to an absolute path.
# NOTE(review): this assumes the kernel's working directory is notebooks/ — confirm.
notebook_dir = Path().resolve()

# The parent folder (machine-learning/) holds the data/ tree.
ml_dir = notebook_dir.parent

# Location of the cleaned CSV: <ml_dir>/data/cleaned/<file>.
data_path = ml_dir.joinpath("data", "cleaned", "mental_health_journal_cleaned.csv")

# Read the CSV, then report the source path and preview the first rows.
df = pd.read_csv(data_path)
print(f"Loaded data from: {data_path}")
print(df.head())

Loaded data from: D:\project Github\web dev + machine learning\mental-health-journal\machine-learning\data\cleaned\mental_health_journal_cleaned.csv
                                                text     mood  mood_score  \
0     Today I practiced mindfulness and felt calmer.    happy           8   
1  I felt low today. Missed some important calls ...      sad           3   
2  I was worried about the upcoming presentation,...  anxious           4   
3  Today was a great day! I went jogging and felt...    happy           9   
4  A normal day, went through routine tasks witho...  neutral           6   

                                   tags   category                  created_at  
0               ["mindfulness", "calm"]  Self-care  2025-09-22 19:46:41.691538  
1                ["work", "motivation"]       Work  2025-09-22 21:56:26.714902  
2            ["presentation", "stress"]     Career  2025-09-22 21:56:56.809605  
3  ["exercise", "wellness", "outdoors"]     Health  2025-09-22 2

## Data Preprocessing pipeline

### Text Normalization


We perform text normalization: lowercasing, removing punctuation and digits (every character other than a–z and whitespace), and collapsing extra whitespace.

In [10]:
def normalize_text(text):
    """Lowercase ``text`` and reduce it to the letters a-z separated by single spaces.

    Parameters
    ----------
    text : str or any
        Raw journal entry. Any non-string value (for example ``None`` or the
        float ``NaN`` that pandas uses for an empty CSV cell) is treated as
        empty text instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The normalized text; ``""`` when nothing remains or the input was not
        a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # lowercase
    # Characters outside a-z/whitespace are deleted, not replaced by a space,
    # so "couldn't" becomes "couldnt" rather than "couldn t".
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse whitespace runs (including newlines) and trim both ends.
    return re.sub(r'\s+', ' ', text).strip()

# Build the normalized column by running normalize_text over every raw entry.
raw_entries = df["text"]
df["clean_text"] = raw_entries.apply(normalize_text)

In [None]:
# --------------------------------------
# 2. TOKENIZATION
# --------------------------------------
# Split each normalized entry into a list of word tokens.
df["tokens"] = df["clean_text"].apply(lambda entry: nltk.word_tokenize(entry))

# --------------------------------------
# 3. REMOVE STOPWORDS
# --------------------------------------
# NOTE(review): clean_text no longer contains apostrophes (normalize_text
# deletes every character outside a-z/whitespace), so contractions reach this
# step as e.g. "couldnt" and will not match any stop-word entry that contains
# an apostrophe.
df["tokens_no_stop"] = df["tokens"].apply(
    lambda words: [word for word in words if word not in stop_words]
)

# --------------------------------------
# 4. LEMMATIZATION
# --------------------------------------
# lemmatize() is called without a pos argument, i.e. with NLTK's default
# part of speech.
df["lemmatized"] = df["tokens_no_stop"].apply(
    lambda words: list(map(lemmatizer.lemmatize, words))
)
# Re-join the lemmas into one space-separated string per entry.
df["lemmatized_text"] = df["lemmatized"].apply(" ".join)

# --------------------------------------
# 5. SENTIMENT LABELS
# --------------------------------------
def get_sentiment_label(text):
    """Turn the VADER compound score of ``text`` into a sentiment label.

    Returns ``"positive"`` for a compound score of at least 0.05,
    ``"negative"`` for a score of at most -0.05 and ``"neutral"`` for anything
    in between. Scoring uses the module-level ``sia`` analyzer.
    """
    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

# VADER is a rule-based scorer that treats punctuation, capitalisation and
# emoticons as intensity cues, so score the original entry instead of
# clean_text, where those cues have been stripped. Missing entries are scored
# as empty text.
df["sentiment_label"] = (
    df["text"].fillna("").astype(str).apply(get_sentiment_label)
)

# --------------------------------------
# 6. VECTORIZATION (TF-IDF)
# --------------------------------------
# Fit a TF-IDF vocabulary (capped at 3000 terms) on the lemmatized text and
# transform the same text into a document-term matrix.
vectorizer = TfidfVectorizer(max_features=3000)
tfidf_matrix = vectorizer.fit_transform(df["lemmatized_text"])

In [None]:
# --------------------------------------
#  TEXT NORMALIZATION
# --------------------------------------
def normalize_text(text):
    """Lowercase ``text`` and reduce it to the letters a-z separated by single spaces.

    NOTE: a function with this name is also defined in an earlier cell of this
    notebook; whichever definition is executed last is the one in effect.

    Parameters
    ----------
    text : str or any
        Raw journal entry. Any non-string value (for example ``None`` or the
        float ``NaN`` that pandas uses for an empty CSV cell) is treated as
        empty text instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The normalized text; ``""`` when nothing remains or the input was not
        a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # lowercase
    # Characters outside a-z/whitespace are deleted, not replaced by a space,
    # so "couldn't" becomes "couldnt" rather than "couldn t".
    text = re.sub(r'[^a-z\s]', '', text)
    # Collapse whitespace runs (including newlines) and trim both ends.
    return re.sub(r'\s+', ' ', text).strip()

# Build the normalized column from the raw entries, then print a summary of
# the frame (columns, non-null counts, dtypes).
raw_entries = df["text"]
df["clean_text"] = raw_entries.apply(normalize_text)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        10 non-null     object
 1   mood        10 non-null     object
 2   mood_score  10 non-null     int64 
 3   tags        10 non-null     object
 4   category    10 non-null     object
 5   created_at  10 non-null     object
 6   clean_text  10 non-null     object
dtypes: int64(1), object(6)
memory usage: 692.0+ bytes


### Tokenization

Split text into words

In [None]:
# --------------------------------------
#  TOKENIZATION
# --------------------------------------
# Split each normalized entry into a list of word tokens.
df["tokens"] = df["clean_text"].apply(lambda entry: nltk.word_tokenize(entry))

### Stopword Removal

Remove frequent but meaningless words ("the", "and", "is"...).

In [None]:
# --------------------------------------
#  REMOVE STOPWORDS
# --------------------------------------
# Keep only the tokens that are not in the stop_words set.
# NOTE(review): clean_text no longer contains apostrophes (normalize_text
# deletes every character outside a-z/whitespace), so contractions reach this
# step as e.g. "couldnt" and will not match any stop-word entry that contains
# an apostrophe.
df["tokens_no_stop"] = df["tokens"].apply(
    lambda words: [word for word in words if word not in stop_words]
)

### Lemmatization

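Convert words to their dictionary (lemma) form. The lemmatizer is called here without a part-of-speech tag, so it uses NLTK's default (noun): plurals are reduced (e.g., calls → call), while verb forms such as running → run are only reduced when a verb POS tag is supplied.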

In [None]:
# --------------------------------------
#  LEMMATIZATION
# --------------------------------------
# lemmatize() is called without a pos argument, i.e. with NLTK's default
# part of speech.
df["lemmatized"] = df["tokens_no_stop"].apply(
    lambda words: list(map(lemmatizer.lemmatize, words))
)
# Re-join the lemmas into one space-separated string per entry.
df["lemmatized_text"] = df["lemmatized"].apply(" ".join)


### Sentiment Labels 

We compute a VADER sentiment score for each entry and convert its compound value to a label: positive (compound ≥ 0.05), negative (compound ≤ −0.05), or neutral otherwise.

In [None]:
# --------------------------------------
# SENTIMENT LABELS
# --------------------------------------
def get_sentiment_label(text):
    """Turn the VADER compound score of ``text`` into a sentiment label.

    Returns ``"positive"`` for a compound score of at least 0.05,
    ``"negative"`` for a score of at most -0.05 and ``"neutral"`` for anything
    in between. Scoring uses the module-level ``sia`` analyzer.
    """
    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

# VADER is a rule-based scorer that treats punctuation, capitalisation and
# emoticons as intensity cues, so score the original entry instead of
# clean_text, where those cues have been stripped. Missing entries are scored
# as empty text.
df["sentiment_label"] = (
    df["text"].fillna("").astype(str).apply(get_sentiment_label)
)

### Vectorization

Transform the text into a numeric representation. Two common options are TF-IDF and Transformer embeddings; the cell below uses TF-IDF.

In [None]:
# --------------------------------------
#  VECTORIZATION (TF-IDF)
# --------------------------------------
# Fit a TF-IDF vocabulary (capped at 3000 terms) on the lemmatized text and
# transform the same text into a document-term matrix.
lemmatized_corpus = df["lemmatized_text"]
vectorizer = TfidfVectorizer(max_features=3000)
tfidf_matrix = vectorizer.fit_transform(lemmatized_corpus)

# Show the learned vocabulary.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['anything' 'balancing' 'better' 'call' 'calmer' 'chatting' 'couldnt'
 'day' 'done' 'energized' 'family' 'feeling' 'felt' 'find' 'focus' 'get'
 'good' 'great' 'happening' 'important' 'ive' 'jogging' 'life' 'low'
 'mindfulness' 'missed' 'much' 'need' 'nice' 'normal' 'overwhelmed'
 'personal' 'practiced' 'presentation' 'productive' 'progress' 'project'
 'really' 'relaxing' 'routine' 'sleep' 'special' 'spent' 'struggled'
 'struggling' 'task' 'time' 'today' 'unproductive' 'upcoming' 'well'
 'went' 'without' 'work' 'working' 'worried']


In [None]:
# Save processed dataset

# Base directory is the parent of the working directory.
# NOTE(review): this assumes the kernel's working directory is notebooks/ — confirm.
base_dir = Path().resolve().parent

# Construct full save path
processed_path = base_dir / "data" / "processed" / "mental_health_journal_cleaned.csv"

# to_csv does not create missing folders, so make sure data/processed/ exists
# before writing (a no-op when it is already there).
processed_path.parent.mkdir(parents=True, exist_ok=True)

# Save file without the index column. The list-valued columns (tokens,
# tokens_no_stop, lemmatized) are written as their string representation.
df.to_csv(processed_path, index=False)
print(f"✅ Saved processed dataset to: {processed_path}")