In [None]:
# 1. Imports
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt tokenizer data. NLTK >= 3.8.2 loads it from
# the 'punkt_tab' resource, while older releases load the pickled 'punkt'
# resource; fetching both keeps the notebook runnable on either version.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

# The cells below create many figures; a threshold of 0 disables matplotlib's
# "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})

In [None]:
# 2. Paths & load
# Location of the raw CSV and of the folder that receives every EDA artefact.
DATA_PATH = 'dreaddit_StressAnalysis - Sheet1.csv'
OUT_DIR = Path('dreaddit_eda_outputs')
OUT_DIR.mkdir(exist_ok=True)  # no error when the folder already exists

df = pd.read_csv(DATA_PATH)

# Echo the frame's dimensions and column names as a quick load check.
print('rows, cols:', df.shape)
print('columns:', list(df.columns))

In [None]:
# 3. Select required columns
# Feature families are recognised by column-name prefix; a few columns are
# matched by their exact name. Column order follows df.columns.
_feature_prefixes = ('lex_liwc_', 'lex_dal_', 'syntax_', 'social_')
_exact_names = ('text', 'label', 'sentiment')

keep_cols = []
for col in df.columns:
    if col in _exact_names or col.startswith(_feature_prefixes):
        keep_cols.append(col)

# Optional extra columns: appended only when present and not already selected.
expected_extra = ['confidence', 'post_id', 'id', 'subreddit', 'social_karma']
for col in expected_extra:
    if col not in df.columns or col in keep_cols:
        continue
    keep_cols.append(col)

# Work on an independent copy so later column additions never touch df.
eda_df = df.loc[:, keep_cols].copy()

In [None]:
# 4. Basic statistics
# Descriptive statistics for every column, transposed so that each row of the
# saved CSV describes one column of eda_df.
summary = eda_df.describe(include='all').T
summary.to_csv(OUT_DIR / 'basic_summary.csv')

# Row count per label value; missing labels are counted as their own bucket.
class_counts = eda_df['label'].value_counts(dropna=False)
class_counts.to_csv(OUT_DIR / 'label_counts.csv')

In [None]:
# 5. Class balance
# Bar chart of the number of rows per label, written to OUT_DIR.
# seaborn is already imported as `sns` in the imports cell, so it is not
# re-imported here; the explicit fig/ax pair avoids relying on pyplot's
# implicit "current figure" state.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(x=class_counts.index, y=class_counts.values, ax=ax)
ax.set_title('Class distribution')
ax.set_xlabel('label')
ax.set_ylabel('count')
fig.tight_layout()
fig.savefig(OUT_DIR / 'class_distribution.png')
plt.close(fig)

In [None]:
# 6. Text length analysis
# Missing posts are treated as empty strings. A bare astype(str) would turn a
# NaN into the literal 'nan' and report it as a 3-character, 1-token post.
_text = eda_df['text'].fillna('').astype(str)
eda_df['char_len'] = _text.apply(len)
eda_df['token_list'] = _text.apply(word_tokenize)
eda_df['token_len'] = eda_df['token_list'].apply(len)

eda_df[['char_len', 'token_len']].describe().to_csv(OUT_DIR / 'text_length_summary.csv')

# Side-by-side histograms of character and token counts.
fig, (ax_chars, ax_tokens) = plt.subplots(1, 2, figsize=(8, 4))
sns.histplot(eda_df['char_len'], bins=50, kde=True, ax=ax_chars)
ax_chars.set_title('Character length')
sns.histplot(eda_df['token_len'], bins=50, kde=True, ax=ax_tokens)
ax_tokens.set_title('Token length')
fig.tight_layout()
fig.savefig(OUT_DIR / 'text_length_histograms.png')
plt.close(fig)

# Token count per label, to see whether post length differs between classes.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(x='label', y='token_len', data=eda_df, ax=ax)
ax.set_title('Token length by class')
fig.tight_layout()
fig.savefig(OUT_DIR / 'token_length_by_class.png')
plt.close(fig)

In [None]:
# 7. LIWC distributions
liwc_cols = [c for c in eda_df.columns if c.startswith('lex_liwc_')]
# Emotion-related LIWC features to inspect, restricted to those actually present.
interesting = [
    c for c in ['lex_liwc_posemo', 'lex_liwc_negemo', 'lex_liwc_anx', 'lex_liwc_anger', 'lex_liwc_sad']
    if c in liwc_cols
]

# One histogram file per selected feature.
for c in interesting:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(eda_df[c].dropna(), bins=40, kde=True, ax=ax)
    ax.set_title(f'Distribution: {c}')
    fig.tight_layout()
    fig.savefig(OUT_DIR / f'{c}_dist.png')
    plt.close(fig)

# The per-class violin plot needs at least one selected feature; skip it
# otherwise instead of plotting an empty melted frame.
if interesting:
    sub = eda_df.melt(id_vars=['label'], value_vars=interesting, var_name='liwc', value_name='value')
    # Split violins are only defined for exactly two hue levels, so fall back
    # to side-by-side violins for any other number of labels.
    split_violins = eda_df['label'].nunique() == 2

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(x='liwc', y='value', hue='label', data=sub, split=split_violins, ax=ax)
    ax.set_title('Selected LIWC features by class')
    fig.tight_layout()
    fig.savefig(OUT_DIR / 'liwc_by_class_violin.png')
    plt.close(fig)

In [None]:
# 8. DAL distribution
# One histogram panel per 'lex_dal_' column, all in a single row of one figure.
dal_cols = [name for name in eda_df.columns if name.startswith('lex_dal_')]
if dal_cols:
    fig, axes = plt.subplots(1, len(dal_cols), figsize=(12, 4), squeeze=False)
    for ax, name in zip(axes.ravel(), dal_cols):
        sns.histplot(eda_df[name].dropna(), bins=40, kde=True, ax=ax)
        ax.set_title(name)
    fig.tight_layout()
    fig.savefig(OUT_DIR / 'dal_distributions.png')
    plt.close(fig)

In [None]:
# 9. Sentiment
# Histogram when the column is numeric, otherwise a count per category.
if 'sentiment' in eda_df.columns:
    sentiment = eda_df['sentiment']
    fig, ax = plt.subplots(figsize=(6, 4))
    if not pd.api.types.is_numeric_dtype(sentiment):
        sns.countplot(x='sentiment', data=eda_df, ax=ax)
    else:
        sns.histplot(sentiment.dropna(), bins=40, kde=True, ax=ax)
    fig.tight_layout()
    fig.savefig(OUT_DIR / 'sentiment_distribution.png')
    plt.close(fig)

In [None]:
# 10. Syntax complexity
# Every 'syntax_' column gets its own histogram file. The explicitly named
# 'syntax_fk_grade' and 'syntax_ari' columns already match this prefix.
syntax_cols = [name for name in eda_df.columns if name.startswith('syntax_')]
for name in syntax_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(eda_df[name].dropna(), bins=40, kde=True, ax=ax)
    ax.set_title(name)
    fig.tight_layout()
    fig.savefig(OUT_DIR / f'{name}_dist.png')
    plt.close(fig)

In [None]:
# 11. Social metadata
# Summary table plus one histogram panel per 'social_' column.
social_cols = [name for name in eda_df.columns if name.startswith('social_')]
if social_cols:
    social_summary = eda_df[social_cols].describe()
    social_summary.to_csv(OUT_DIR / 'social_summary.csv')

    fig, axes = plt.subplots(1, len(social_cols), figsize=(12, 4), squeeze=False)
    for ax, name in zip(axes.ravel(), social_cols):
        sns.histplot(eda_df[name].dropna(), bins=40, kde=True, ax=ax)
        ax.set_title(name)
    fig.tight_layout()
    fig.savefig(OUT_DIR / 'social_distributions.png')
    plt.close(fig)

In [None]:
# 12. Correlation heatmap
# Numeric feature columns: the prefixed feature families plus 'sentiment'.
_corr_prefixes = ('lex_liwc_', 'lex_dal_', 'syntax_', 'social_')
corr_cols = [
    c for c in eda_df.columns
    if (c == 'sentiment' or c.startswith(_corr_prefixes))
    and pd.api.types.is_numeric_dtype(eda_df[c])
]

# Correlations are computed on rows that are complete across all selected columns.
corr_df = eda_df[corr_cols].dropna()
pearson_corr = corr_df.corr(method='pearson')
spearman_corr = corr_df.corr(method='spearman')

_corr_by_method = {'pearson': pearson_corr, 'spearman': spearman_corr}

# One heatmap image per correlation method, colour scale centred on zero.
for method, matrix in _corr_by_method.items():
    fig, ax = plt.subplots(figsize=(14, 12))
    sns.heatmap(matrix, cmap='vlag', center=0, ax=ax)
    fig.tight_layout()
    fig.savefig(OUT_DIR / f'{method}_correlation_heatmap.png')
    plt.close(fig)

# Raw correlation matrices as CSV.
for method, matrix in _corr_by_method.items():
    matrix.to_csv(OUT_DIR / f'{method}_corr.csv')

In [None]:
# 13. Sample posts
def _sample_texts(texts):
    """Return up to 10 randomly drawn posts from one label group (fixed seed 42)."""
    n_take = min(10, len(texts))
    return texts.sample(n=n_take, random_state=42)


_texts_by_label = eda_df.groupby('label')['text']
sample_by_class = _texts_by_label.apply(_sample_texts).reset_index()
sample_by_class.to_csv(OUT_DIR / 'sample_posts_by_class.csv', index=False)

In [None]:
# 14. Noise & outliers
# Rows whose 'confidence' value is below 0.6 are exported for manual review.
if 'confidence' in eda_df.columns:
    low_conf = eda_df.loc[eda_df['confidence'] < 0.6]
    low_conf.to_csv(OUT_DIR / 'low_confidence_labels.csv', index=False)

# Length outliers: token counts more than 1.5 inter-quartile ranges outside
# the first/third quartile.
token_len = eda_df['token_len']
q1 = token_len.quantile(0.25)
q3 = token_len.quantile(0.75)
irq = q3 - q1  # inter-quartile range
lower_fence = q1 - 1.5 * irq
upper_fence = q3 + 1.5 * irq

is_length_outlier = (token_len < lower_fence) | (token_len > upper_fence)
outliers_len = eda_df[is_length_outlier]
outliers_len.to_csv(OUT_DIR / 'outliers_by_length.csv', index=False)

In [None]:
# 15. Numeric matrix
# Correlation feature columns plus the two text-length features, exported as
# a standalone CSV.
numeric_cols = [*corr_cols, 'token_len', 'char_len']
X_numeric = eda_df.loc[:, numeric_cols].copy()
X_numeric.to_csv(OUT_DIR / 'numeric_matrix_for_modeling.csv', index=False)

In [None]:
# 16. Report summary
# Writes the (currently placeholder) text report into OUT_DIR.
report_path = OUT_DIR / 'eda_report_summary.txt'
with report_path.open('w') as f:
    f.write('summary file')

In [None]:
# 17. Train/Test + CV
# Baseline classifier on the numeric features: 5-fold stratified
# cross-validation on the training split, then a final fit evaluated on the
# held-out test split.
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score


def build_model():
    """Return a fresh, unfitted StandardScaler + LogisticRegression pipeline."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=500))
    ])


target = eda_df['label']
# Missing feature values are replaced by 0 before splitting.
X = eda_df[numeric_cols].fillna(0)

X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, stratify=target, random_state=42
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = []

# `fold` is a 1-based counter used only for reporting.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # The scaler lives inside the pipeline, so it is fitted on the training
    # part of each fold only.
    pipe = build_model()
    pipe.fit(X_tr, y_tr)
    preds = pipe.predict(X_val)

    fold_accuracy = accuracy_score(y_val, preds)
    cv_results.append(fold_accuracy)
    print(f'fold {fold}: accuracy = {fold_accuracy:.4f}')

print(f'CV accuracy: {np.mean(cv_results):.4f} +/- {np.std(cv_results):.4f}')

# Final model: refit on the whole training split, evaluate once on the test split.
final_model = build_model()
final_model.fit(X_train, y_train)
preds_test = final_model.predict(X_test)

print(f'test accuracy: {accuracy_score(y_test, preds_test):.4f}')
print(classification_report(y_test, preds_test))