## Import common libraries

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score
import mlflow
import mlflow.sklearn
import joblib
import os

## Load processed dataset

In [2]:
# -----------------------------
# Load Cleaned Dataset
# -----------------------------
# Current directory is notebooks/
notebook_dir = Path().resolve()

# Move up one level to machine-learning/
ml_dir = notebook_dir.parent

# Build path to the processed CSV
data_path = ml_dir / "data" / "processed" / "mental_health_journal_cleaned.csv"

# Load the data
df = pd.read_csv(data_path)
print(f"Loaded data from: {data_path}")


Loaded data from: D:\project Github\web dev + machine learning\mental-health-journal\machine-learning\data\processed\mental_health_journal_cleaned.csv


In [3]:
X = df["lemmatized_text"]      # journal entries
# Use the actual target column name from the dataframe
y = df["sentiment_label"]  # target label: sentiment (positive/negative/neutral) 
print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())  

X shape: (10,)
y distribution:
 sentiment_label
positive    5
negative    4
neutral     1
Name: count, dtype: int64


In [4]:
#Setup MLflow tracking URI
# Current directory is notebooks/
notebook_dir = Path().resolve()

# Move up one level to machine-learning/
ml_dir = notebook_dir.parent

# Store MLflow runs locally
mlflow_dir = ml_dir / "mlruns"
mlflow.set_tracking_uri(f"file:///{mlflow_dir.as_posix()}")

mlflow.set_experiment("mental_health_journal_sentiment")

  return FileStore(store_uri, store_uri)


<Experiment: artifact_location=('file:///D:/project Github/web dev + machine '
 'learning/mental-health-journal/machine-learning/mlruns/437235503741645059'), creation_time=1765895457911, experiment_id='437235503741645059', last_update_time=1765895457911, lifecycle_stage='active', name='mental_health_journal_sentiment', tags={}>

In [5]:
X = df["text"]      # journal entries
y = df["sentiment_label"]     # target label: mood category, sentiment, risk level, etc.

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

label_encoder.classes_
# -----------------------------
# Train-Test Split
# -----------------------------
# Ensure stratified split is only used when each class has at least 2 samples
class_counts = pd.Series(y_encoded).value_counts()
if class_counts.min() < 2:
    print(f"Class counts:\n{class_counts}\nWarning: at least one class has fewer than 2 samples. Proceeding without stratification.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=None
    )
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
# -----------------------------
# ML Pipeline
# -----------------------------
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=5000)),
    ("clf", LogisticRegression(max_iter=200))
])

# -----------------------------
# MLflow Tracking
# -----------------------------
with mlflow.start_run():

    mlflow.log_param("vectorizer", "TF-IDF")
    mlflow.log_param("model", "Logistic Regression")

    # Train model
    pipeline.fit(X_train, y_train)

    # Evaluate on training set
    train_pred = pipeline.predict(X_train)
    mlflow.log_metric("train_accuracy", (train_pred == y_train).mean())
 
 # Save model
    joblib.dump(pipeline, "../models/mental_health_model.pkl")
    mlflow.sklearn.log_model(pipeline, "mental_health_model")

print("Training Completed!")


Class counts:
2    5
0    4
1    1
Name: count, dtype: int64




Training Completed!
