# Stress Level Prediction from Journal Entries
This notebook implements a machine learning model to predict stress levels from journal entries.

In [25]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import joblib

## Data Preparation
Create a dataset of journal entries with their corresponding stress levels

In [26]:
def generate_synthetic_data(n_samples=100):
    # Key phrases for each stress level
    high_stress_phrases = [
        "couldn't", "exhausted", "anxious", "overwhelming", "stressed", 
        "sleepless", "racing mind", "too much", "terrible", "on edge",
        "worried", "panic", "insomnia", "nightmare", "restless"
    ]
    
    low_stress_phrases = [
        "great workout", "positive", "decent sleep", "peaceful", "relaxed", 
        "calm", "refreshed", "full 8 hours", "enjoyed", "good rest",
        "happy", "energetic", "content", "well-rested", "mindful"
    ]
    
    medium_stress_phrases = [
        "bit tired", "managing okay", "busy", "nothing too stressful", 
        "a bit restless", "somewhat tired", "slightly anxious", "okay",
        "could be better", "handling it", "neutral", "moderate", "coping"
    ]
    
    journal_entries = []
    stress_levels = []
    
    for _ in range(n_samples):
        stress_class = np.random.choice(['High', 'Medium', 'Low'])
        
        # Generate a synthetic journal entry based on the stress level
        if stress_class == 'High':
            phrases = np.random.choice(high_stress_phrases, size=np.random.randint(2, 5), replace=False)
            fillers = ["I'm feeling", "Today was", "I am", "My day has been", "I've been"]
            entry = f"{np.random.choice(fillers)} {' and '.join(phrases)}. Sleep has been poor."
        
        elif stress_class == 'Low':
            phrases = np.random.choice(low_stress_phrases, size=np.random.randint(2, 5), replace=False)
            fillers = ["I'm feeling", "Today was", "I had", "Enjoyed my day", "I've been"]
            entry = f"{np.random.choice(fillers)} {' and '.join(phrases)}. Slept well."
        
        else:  # Medium
            phrases = np.random.choice(medium_stress_phrases, size=np.random.randint(2, 5), replace=False)
            fillers = ["I'm feeling", "Today was", "I am", "My day has been", "It's been"]
            entry = f"{np.random.choice(fillers)} {' and '.join(phrases)}. Sleep was okay."
        
        journal_entries.append(entry)
        stress_levels.append(stress_class)
    
    # Include the original examples
    original_data = [
        ["I couldn't sleep well last night. Feeling exhausted and anxious. Work is overwhelming.", "High"],
        ["Had a great workout today! Feeling pumped and positive. Got decent sleep too.", "Low"],
        ["Didn't get much rest. Kids were up all night. I'm a bit tired but managing okay.", "Medium"],
        ["Feeling really stressed about upcoming deadlines. My sleep was all over the place this week.", "High"],
        ["Had a peaceful day. Managed to catch up on sleep. Feeling relaxed.", "Low"],
        ["I'm okay, just a bit tired. Work was busy but nothing too stressful.", "Medium"],
        ["Another sleepless night. My mind won't stop racing. Everything feels too much right now.", "High"],
        ["Enjoyed a walk in the park. Got a full 8 hours of sleep. Feeling calm and refreshed.", "Low"],
        ["Busy day. Got some sleep but still a bit restless. Managing stress okay.", "Medium"],
        ["Too many things to handle today. My sleep has been terrible, and I'm feeling on edge.", "High"]
    ]
    
    for entry, level in original_data:
        journal_entries.append(entry)
        stress_levels.append(level)
    
    return pd.DataFrame({'JournalEntry': journal_entries, 'StressLevel': stress_levels})

# Generate dataset with 100 synthetic samples + 10 original samples
df = generate_synthetic_data(100)
df.head()


Unnamed: 0,JournalEntry,StressLevel
0,I am bit tired and okay. Sleep was okay.,Medium
1,My day has been restless and insomnia. Sleep h...,High
2,I'm feeling relaxed and happy and positive and...,Low
3,My day has been worried and sleepless and terr...,High
4,I'm feeling busy and handling it and managing ...,Medium


## Text Preprocessing
Define and apply text preprocessing function

In [27]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)  
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['CleanedText'] = df['JournalEntry'].apply(preprocess_text)
df.head()

Unnamed: 0,JournalEntry,StressLevel,CleanedText
0,I am bit tired and okay. Sleep was okay.,Medium,i am bit tired and okay sleep was okay
1,My day has been restless and insomnia. Sleep h...,High,my day has been restless and insomnia sleep ha...
2,I'm feeling relaxed and happy and positive and...,Low,i m feeling relaxed and happy and positive and...
3,My day has been worried and sleepless and terr...,High,my day has been worried and sleepless and terr...
4,I'm feeling busy and handling it and managing ...,Medium,i m feeling busy and handling it and managing ...


## Model Training Setup
Split data and create the pipeline

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    df['CleanedText'], df['StressLevel'], test_size=0.2, random_state=42, stratify=df['StressLevel']
)

# Create a pipeline with fewer features and a simpler model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100, ngram_range=(1, 2))),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

## Model Training and Evaluation

In [29]:
pipeline.fit(X_train, y_train)

# Evaluate with cross-validation
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print(f"Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Evaluate on test set
y_pred = pipeline.predict(X_test)
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Cross-validation accuracy: 0.9882 ± 0.0235
Test accuracy: 0.9545

Classification Report:
               precision    recall  f1-score   support

        High       1.00      0.83      0.91         6
         Low       0.90      1.00      0.95         9
      Medium       1.00      1.00      1.00         7

    accuracy                           0.95        22
   macro avg       0.97      0.94      0.95        22
weighted avg       0.96      0.95      0.95        22



## Save Model

In [30]:
joblib.dump(pipeline, 'stress_predictor_improved.joblib')
print("Model saved as 'stress_predictor_improved.joblib'")

Model saved as 'stress_predictor_improved.joblib'


## Test Predictions

In [31]:
def predict_stress_level(text, model=pipeline):
    cleaned_text = preprocess_text(text)
    prediction = model.predict([cleaned_text])[0]
    
    # Get prediction probabilities for more insights
    proba = model.predict_proba([cleaned_text])[0]
    classes = model.classes_
    
    # Create a dictionary of class:probability pairs
    confidence = {class_name: f"{prob:.2f}" for class_name, prob in zip(classes, proba)}
    
    return prediction, confidence

In [32]:
print("Stress Level Predictor")
print("-" * 50)
print("Enter your journal entry (press Enter twice to finish):")

# Collect multi-line input
lines = []
while True:
    line = input()
    if line:
        lines.append(line)
    else:
        break

journal_entry = " ".join(lines)

if journal_entry.strip():
    prediction, confidence = predict_stress_level(journal_entry)
    print("\nYour Entry:", journal_entry)
    print("Predicted Stress Level:", prediction)
    print("Confidence scores:", confidence)
else:
    print("No input provided.")

Stress Level Predictor
--------------------------------------------------
Enter your journal entry (press Enter twice to finish):

Your Entry: decent sleep
Predicted Stress Level: Low
Confidence scores: {'High': '0.19', 'Low': '0.61', 'Medium': '0.20'}
