# Customer Feedback Classification - Exploratory Data Analysis

This notebook explores the customer feedback dataset before model training.

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import config

# Global plot appearance for every figure below: apply the matplotlib style
# sheet first, then set the seaborn colour palette on top of it.
# NOTE(review): the 'seaborn-v0_8-*' style names are only shipped with newer
# matplotlib releases (3.6+, as far as I know) — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# Render figures inline in the notebook output.
%matplotlib inline

## 1. Load Data

In [None]:
# Load the raw feedback data; the file location comes from the project config module.
df = pd.read_csv(config.RAW_DATA_PATH)

# Fail fast with a readable message: every later section indexes these two
# columns, and a missing one would otherwise surface as a bare KeyError far
# from the actual cause.
required_columns = {'feedback_text', 'label'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"Missing expected columns: {sorted(missing_columns)}"

print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Column dtypes and non-null counts — a quick check for missing values
# in the columns used by the sections below.
df.info()

## 2. Label Distribution

In [None]:
# Number of rows per label, most frequent first.
label_counts = df['label'].value_counts()

# Ratio of the rarest class to the most common one (1.0 == perfectly balanced).
rarest, most_common = label_counts.min(), label_counts.max()
balance_ratio = rarest / most_common

print("Label Distribution:")
print(label_counts)
print(f"\nClass balance: {balance_ratio:.2f}")

In [None]:
# Two views of the same counts: absolute (bar chart) and relative (pie chart).
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 5))

label_counts.plot.bar(ax=bar_ax, color='steelblue', alpha=0.8)
bar_ax.set_title('Label Distribution (Count)', fontsize=14, fontweight='bold')
bar_ax.set(xlabel='Label', ylabel='Count')
bar_ax.tick_params(axis='x', rotation=45)

label_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Label Distribution (Percentage)', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')  # clear the default y-axis label on the pie

plt.tight_layout()
plt.show()

## 3. Text Length Analysis

In [None]:
# Derive two size measures per feedback entry: characters and
# whitespace-separated words.
feedback = df['feedback_text']
df['text_length'] = feedback.str.len()
df['word_count'] = feedback.str.split().str.len()

length_stats = df[['text_length', 'word_count']].describe()
print("Text Length Statistics:")
print(length_stats)

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top row: overall distributions of the two length measures.
df['text_length'].hist(bins=30, ax=axes[0, 0], color='skyblue', edgecolor='black')
axes[0, 0].set_title('Distribution of Text Length (Characters)', fontweight='bold')
axes[0, 0].set_xlabel('Character Count')
axes[0, 0].set_ylabel('Frequency')

df['word_count'].hist(bins=20, ax=axes[0, 1], color='lightcoral', edgecolor='black')
axes[0, 1].set_title('Distribution of Word Count', fontweight='bold')
axes[0, 1].set_xlabel('Word Count')
axes[0, 1].set_ylabel('Frequency')

# Bottom row: the same measures broken down by label. Both panels share the
# same recipe, so they are driven from one spec list instead of copy-paste.
boxplot_specs = [
    (axes[1, 0], 'text_length', 'Text Length by Label', 'Character Count'),
    (axes[1, 1], 'word_count', 'Word Count by Label', 'Word Count'),
]
for ax, column, title, ylabel in boxplot_specs:
    df.boxplot(column=column, by='label', ax=ax)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Label')
    ax.set_ylabel(ylabel)
    # Rotate via the Axes object rather than the pyplot "current axes" state.
    ax.tick_params(axis='x', rotation=45)

# DataFrame.boxplot(by=...) stamps a figure-level "Boxplot grouped by label"
# suptitle; on a mixed 2x2 grid it mislabels the histograms, so clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

## 4. Common Words Analysis

In [None]:
# Common function words excluded from the frequency counts. Built once at
# definition time instead of on every call.
_STOPWORDS = frozenset({
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'is', 'was', 'are', 'been', 'be', 'have', 'has', 'had',
    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'can', 'may',
    'i', 'you', 'we', 'they', 'it', 'this', 'that', 'these', 'those', 'my',
})


def get_top_words(texts, n=20):
    """Return the ``n`` most frequent non-stopword tokens in ``texts``.

    Each text is lower-cased and split into ``\\w+`` tokens; tokens of one or
    two characters and tokens in the stopword list are discarded.

    Args:
        texts: Iterable of feedback strings (e.g. a pandas Series). Entries
            that are not strings — such as NaN/None from missing values —
            are skipped instead of raising.
        n: Maximum number of (word, count) pairs to return.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count; ties
        keep first-seen order.
    """
    word_counts = Counter()
    for text in texts:
        # Missing values arrive as float NaN / None, which have no .lower().
        if not isinstance(text, str):
            continue
        tokens = re.findall(r'\b\w+\b', text.lower())
        word_counts.update(
            token for token in tokens
            if len(token) > 2 and token not in _STOPWORDS
        )
    return word_counts.most_common(n)

# Corpus-wide vocabulary: rank words over every feedback entry (default n=20).
print("Top 20 words across all feedback:")
top_words = get_top_words(df['feedback_text'])
for ranked_entry in top_words:
    word, count = ranked_entry
    print(f"  {word}: {count}")

In [None]:
# Horizontal bar chart of the corpus-wide ranking computed above.
words, counts = zip(*top_words)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(words, counts, color='teal', alpha=0.8)
ax.set_xlabel('Frequency', fontsize=12)
ax.set_title('Top 20 Most Common Words', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # most frequent word at the top
fig.tight_layout()
plt.show()

## 5. Label-Specific Word Analysis

In [None]:
# One panel of top words per label. The grid is sized from the number of
# labels actually present, so the cell keeps working if categories are
# added or removed (four labels still give the original 2x2, 16x12 figure).
labels = df['label'].dropna().unique()
n_cols = 2
n_rows = max(1, -(-len(labels) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, label in zip(axes, labels):
    label_texts = df.loc[df['label'] == label, 'feedback_text']
    # Label-scoped names: do not overwrite the corpus-wide `top_words`,
    # `words` and `counts` that the Section 4 plot relies on.
    label_top_words = get_top_words(label_texts, n=10)

    ax.set_xlabel('Frequency')
    ax.set_title(f'Top Words: {label}', fontweight='bold')
    if not label_top_words:
        # Nothing survived filtering; zip(*[]) would fail to unpack.
        continue

    label_words, label_freqs = zip(*label_top_words)
    ax.barh(label_words, label_freqs, alpha=0.8)
    ax.invert_yaxis()

# Hide any panels left over when the label count does not fill the grid.
for unused_ax in axes[len(labels):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Sample Feedback by Category

In [None]:
# Fixed seed so the printed examples are identical on every
# Restart Kernel -> Run All.
SAMPLE_SEED = 42
SAMPLES_PER_LABEL = 3

for label in df['label'].dropna().unique():
    # Filter once and reuse for both the size check and the draw.
    label_rows = df[df['label'] == label]

    print(f"\n{'='*60}")
    # str() keeps this working if labels are not plain strings.
    print(f"Sample feedback for: {str(label).upper()}")
    print('='*60)

    samples = label_rows.sample(
        min(SAMPLES_PER_LABEL, len(label_rows)),
        random_state=SAMPLE_SEED,
    )
    for text in samples['feedback_text']:
        print(f"\n{text}")
    print()

## 7. Summary Statistics

In [None]:
# Per-label row counts plus spread statistics for both length measures.
spread_stats = ['mean', 'std', 'min', 'max']
per_label = df.groupby('label')
summary = per_label.agg({
    'feedback_text': 'count',
    'text_length': spread_stats,
    'word_count': spread_stats,
}).round(2)

# Flatten the two-level (column, statistic) header into "column_statistic".
flat_names = []
for name_parts in summary.columns.values:
    flat_names.append('_'.join(name_parts).strip())
summary.columns = flat_names

summary = summary.rename(columns={'feedback_text_count': 'count'})
print("\nSummary Statistics by Label:")
print(summary)

## Conclusions

Key findings from the EDA (each refers to the section whose output supports it):
1. The dataset is relatively balanced across 4 categories (Section 2)
2. Text lengths vary but are generally consistent within categories (Sections 3 and 7)
3. Each category has distinctive vocabulary patterns (Section 5)
4. Bug reports tend to be more technical (Sections 5 and 6)
5. Praise messages are typically shorter and more emotional (Sections 3, 5 and 7)
6. Feature requests often contain specific action verbs (Section 5)
7. Cancellation-risk feedback shows dissatisfaction keywords (Sections 5 and 6)