# Exploratory Data Analysis: Text Emotion Dataset

This notebook explores the distribution and characteristics of the `dair-ai/emotion` dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Read the training split from the processed-data directory.
train_path = '../data/processed/train.csv'
train_df = pd.read_csv(train_path)

# Report the number of rows, then preview the first rows as the cell's output.
print(f"Total training samples: {len(train_df)}")
train_df.head()

## 1. Class Distribution
Let's see if the dataset is balanced.

In [None]:
# Order the bars from the most to the least frequent emotion.
emotion_order = train_df['label_name'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so colour the
# bars by the same column that defines them. `hue_order` keeps the colours
# aligned with the bar order and `dodge=False` keeps each bar centred on its tick.
sns.countplot(
    data=train_df,
    x='label_name',
    hue='label_name',
    order=emotion_order,
    hue_order=emotion_order,
    dodge=False,
    palette='viridis',
    ax=ax,
)
# A hue legend would only repeat the x tick labels; remove it if seaborn added one.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Emotion Distribution in Training Set')
ax.set_xlabel('Emotion')
ax.set_ylabel('Count')
plt.show()

## 2. Text Length Analysis
Does the length of the text vary by emotion?

In [None]:
# Word count per sample, split on whitespace. Missing text (NaN after read_csv)
# is counted as 0 words instead of raising on `.split()`.
train_df['text_len'] = (
    train_df['text']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)

fig, ax = plt.subplots(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so colour the
# boxes by the same column that defines them; `dodge=False` keeps each box
# centred on its tick.
sns.boxplot(
    data=train_df,
    x='label_name',
    y='text_len',
    hue='label_name',
    dodge=False,
    palette='magma',
    ax=ax,
)
# A hue legend would only repeat the x tick labels; remove it if seaborn added one.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Text Length (Word Count) by Emotion')
ax.set_xlabel('Emotion')
ax.set_ylabel('Number of Words')
plt.show()

## 3. Word Clouds
The word clouds below show the most frequent words in the texts of each emotion.

In [None]:
# Emotions present in the data; rows without a label are ignored.
emotions = train_df['label_name'].dropna().unique()

# Size the grid from the number of emotions instead of assuming exactly six:
# three panels per row, and at least one row so the figure can always be built.
n_cols = 3
n_rows = max(1, -(-len(emotions) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, emotion in zip(axes, emotions):
    # Concatenate every text of this emotion, skipping missing entries.
    emotion_texts = train_df.loc[train_df['label_name'] == emotion, 'text'].dropna().astype(str)
    text = " ".join(emotion_texts)
    # WordCloud.generate raises on input without words, so leave the panel blank then.
    if text.strip():
        wordcloud = WordCloud(width=400, height=200, background_color='white').generate(text)
        ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(f'Word Cloud: {str(emotion).upper()}')
    ax.axis('off')

# Hide any panels left over when the emotion count is not a multiple of n_cols.
for ax in axes[len(emotions):]:
    ax.axis('off')

plt.tight_layout()
plt.show()