# Mental Health Conversations Data Exploration

This notebook explores the Kaggle mental health conversations dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: apply matplotlib's ggplot sheet, then seaborn's whitegrid style
plt.style.use('ggplot')
sns.set(style="whitegrid")

# Read the raw conversations CSV and report how many rows were loaded
DATA_PATH = '../data/raw/mental_health_conversations.csv'
df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df)} conversations")

In [None]:
# Preview the first five rows of the loaded frame
df.head(5)

In [None]:
# Structural summary of the frame (printed by DataFrame.info itself)
print("Dataset information:")
df.info()

# Per-column count of missing entries; shown as the cell's output
print("\nMissing values:")
df.isna().sum()

In [None]:
# One entry per speaker: (text column, derived length column, panel title)
length_specs = (
    ('patient_message', 'patient_msg_length', 'Patient Message Length'),
    ('therapist_response', 'therapist_resp_length', 'Therapist Response Length'),
)

# Derive the character-count columns
for text_col, length_col, _ in length_specs:
    df[length_col] = df[text_col].str.len()

# Side-by-side histograms (with KDE overlay) of the two length columns
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
for panel, (_, length_col, panel_title) in zip(ax, length_specs):
    sns.histplot(df[length_col], kde=True, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel('Characters')

plt.tight_layout()
plt.show()

In [None]:
# Add a simple response type classification
def categorize_response(text):
    """Label a therapist response with a coarse, keyword-based response type.

    This is a heuristic: the first matching rule wins, checked in this order.

    * ``'direct_advice'`` - a word starting with suggest / recommend / try /
      should
    * ``'reflection'``    - a word starting with feel / seem, or the phrase
      "sounds like"
    * ``'question'``      - a question mark, or the phrases "tell me more" /
      "could you"
    * ``'other'``         - none of the above

    Keywords are matched case-insensitively and must begin at a word
    boundary, so inflections such as "trying" or "feeling" still match while
    unrelated words that merely contain a keyword ("country", "industry")
    do not.

    Args:
        text: The response text. Non-string values (for example the NaN
            pandas uses for a missing cell, or None) are accepted.

    Returns:
        One of ``'direct_advice'``, ``'reflection'``, ``'question'`` or
        ``'other'``. Non-string input is always ``'other'``.
    """
    # Imported locally because the notebook's own `import re` lives in a
    # later cell than this function.
    import re

    # Missing cells arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return 'other'

    lowered = text.lower()
    if re.search(r'\b(?:suggest|recommend|try|should)', lowered):
        return 'direct_advice'
    if re.search(r'\b(?:feel|seem|sounds like)', lowered):
        return 'reflection'
    if '?' in lowered or re.search(r'\b(?:tell me more|could you)', lowered):
        return 'question'
    return 'other'

# Classify every therapist response and store the label as a new column
response_labels = df['therapist_response'].apply(categorize_response)
df['response_type'] = response_labels

# Horizontal count plot, bars ordered from most to least frequent type
type_order = df['response_type'].value_counts().index
plt.figure(figsize=(10, 6))
count_ax = sns.countplot(data=df, y='response_type', order=type_order)
count_ax.set_title('Distribution of Response Types')
plt.tight_layout()
plt.show()

In [None]:
# Horizontal box plot: therapist response length, one box per response type
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=df, y='response_type', x='therapist_resp_length', orient='h')
box_ax.set_title('Response Length by Response Type')
box_ax.set_xlabel('Characters')
plt.tight_layout()
plt.show()

In [None]:
# Basic text analysis - most common words in patient messages
from collections import Counter
import re

def get_top_words(texts, n=20, min_length=3):
    """Return the ``n`` most frequent words across a collection of texts.

    Each text is lower-cased and split into runs of word characters
    (``\\w+``), so punctuation is dropped and an apostrophe splits a word
    ("don't" -> "don", "t"). Words shorter than ``min_length`` are ignored.

    Args:
        texts: Iterable of strings (for example a pandas Series). Entries
            that are not strings, such as NaN or None for missing messages,
            are skipped instead of raising.
        n: Maximum number of (word, count) pairs to return.
        min_length: Minimum number of characters a word needs to be counted.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent; empty when no word qualifies.
    """
    word_counts = Counter()
    for text in texts:
        # Missing cells show up as float NaN / None; they carry no words.
        if not isinstance(text, str):
            continue
        # Count message by message rather than joining everything into one
        # large string first.
        word_counts.update(
            word
            for word in re.findall(r'\b\w+\b', text.lower())
            if len(word) >= min_length
        )
    return word_counts.most_common(n)

# Most frequent words across all patient messages, as (word, count) pairs
top_patient_words = get_top_words(df['patient_message'])

# Horizontal bar chart of those counts
plt.figure(figsize=(12, 6))
words, counts = zip(*top_patient_words)  # split the pairs into two parallel tuples
bar_ax = sns.barplot(x=list(counts), y=list(words))
bar_ax.set_title('Most Common Words in Patient Messages')
bar_ax.set_xlabel('Count')
plt.tight_layout()
plt.show()