# Exploratory Data Analysis - TikNep Dataset

This notebook performs comprehensive EDA on the clean_annotated_4k.csv dataset containing Nepali comments.

## Dataset Structure
The dataset contains **4 different annotation tasks** on the same text corpus:

1. **Sentiment Analysis (SEN)** - Multi-class (Neutral=0, Negative=1, Positive=2)
2. **Hate and Offense Detection (HAO)** - Binary (Not Offensive=0, Offensive=1)
3. **Political Instance Detection (POL)** - Binary (Non-Political=0, Political=1)
4. **Multi-label Topic Classification** - 9 topics (PGS, FT, EYE, HBF, FR, EMP, BF, SW, DO)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket filter also hides deprecation notices from
# pandas/matplotlib — consider narrowing it to specific categories.
warnings.filterwarnings(action='ignore')

# Global plotting defaults: white-grid theme and a 12x6 inch default figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Shared colour scheme so every chart in the notebook draws from the same hues.
COLOR_PALETTE = dict(
    # Sentiment task: one colour per class (blue, red, green)
    sentiment=['#5DADE2', '#E74C3C', '#52BE80'],

    # Two-class tasks — the same pair is reused for HAO and POL
    binary_negative='#5DADE2',  # blue: negative / non-offensive / non-political
    binary_positive='#E74C3C',  # red: positive / offensive / political

    # General-purpose colours for multi-label and miscellaneous charts
    primary='#5DADE2',    # blue
    secondary='#E74C3C',  # red
    tertiary='#52BE80',   # green
    accent='#F39C12',     # orange

    # One colour per topic (9 topics)
    topics=[
        '#5DADE2', '#3498DB', '#2E86C1',
        '#E74C3C', '#C0392B', '#52BE80',
        '#27AE60', '#F39C12', '#D68910',
    ],
)

## 1. Data Loading and Initial Inspection

In [None]:
# Read the annotated comment corpus (path is relative to the notebook's directory).
df = pd.read_csv('../data/processed/clean_annotated_4k.csv')

# Report the frame's dimensions: the full shape, then rows and columns separately.
n_rows, n_cols = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Total Rows: {n_rows}")
print(f"Total Columns: {n_cols}")

In [None]:
# Preview the leading records; the bare last expression is rendered as a table.
print("First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# List the column labels, then let pandas summarise dtypes and non-null counts.
column_names = df.columns.tolist()
print("Column Information:", "\nColumn Names:", sep="\n")
print(column_names)
print("\nData Types:")
df.info()

In [None]:
# Per-column null counts, followed by the grand total and its share of all cells.
print("Missing Values:")
missing_values = df.isna().sum()
print(missing_values)

total_missing = missing_values.sum()
total_cells = df.shape[0] * df.shape[1]
missing_share = total_missing / total_cells * 100
print(f"\nTotal missing values: {total_missing}")
print(f"Percentage of missing values: {missing_share:.2f}%")

In [None]:
# Count duplicates at three granularities: whole rows, the 'Text ID' column,
# and the 'Text' column. A column of None means "check the entire row".
for scope, column in (("rows", None), ("Text ID", 'Text ID'), ("Text", 'Text')):
    target = df if column is None else df[column]
    print(f"Duplicate {scope}: {target.duplicated().sum()}")

In [None]:
# Derive two size features per comment and store them on df:
# character count and whitespace-delimited token count.
text_accessor = df['Text'].str
df['text_length'] = text_accessor.len()
df['word_count'] = text_accessor.split().str.len()

# Descriptive statistics for each derived feature.
for heading, column in (("Text Length Statistics:", 'text_length'),
                        ("\nWord Count Statistics:", 'word_count')):
    print(heading)
    print(df[column].describe())

## 2. Sentiment Analysis (SEN)

**Task Type:** Multi-class Classification  
**Classes:** 0 = Neutral, 1 = Negative, 2 = Positive

In [None]:
# Banner and task description
print("=" * 80, "TASK 1: SENTIMENT ANALYSIS (SEN)", "=" * 80, sep="\n")
print("\nTask Type: Multi-class Classification",
      "Classes: 0 = Neutral, 1 = Negative, 2 = Positive",
      "-" * 80, sep="\n")

# Samples per class ordered by class code, plus a code -> name lookup whose
# readable names are also stored on df as a new 'sentiment_label' column.
sentiment_counts = df['SEN'].value_counts().sort_index()
sentiment_map = {0: 'Neutral', 1: 'Negative', 2: 'Positive'}
df['sentiment_label'] = df['SEN'].map(sentiment_map)

print("\nClass Distribution:")
total_comments = len(df)
for code, name in sentiment_map.items():
    n_samples = sentiment_counts[code]
    share = (n_samples / total_comments) * 100
    print(f"  Class {code} ({name:>8}): {n_samples:4d} samples ({share:5.2f}%)")

# Ratio of the largest class to the smallest one
print("\nClass Balance Analysis:")
majority_class = sentiment_counts.max()
minority_class = sentiment_counts.min()
imbalance_ratio = majority_class / minority_class
most_common, least_common = sentiment_counts.idxmax(), sentiment_counts.idxmin()
print(f"  Most frequent class:  {most_common} ({sentiment_map[most_common]}) - {majority_class} samples")
print(f"  Least frequent class: {least_common} ({sentiment_map[least_common]}) - {minority_class} samples")
print(f"  Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("=" * 80)

In [None]:
# Two side-by-side panels: absolute counts (bars) and relative shares (pie).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_bar, ax_pie = axes

labels = ['Neutral', 'Negative', 'Positive']
sentiment_colors = COLOR_PALETTE['sentiment']

# Left panel: bar chart with the count printed above each bar
sentiment_counts.plot.bar(ax=ax_bar, color=sentiment_colors)
ax_bar.set_title('Sentiment Distribution', fontsize=16, fontweight='bold')
ax_bar.set_xlabel('Sentiment (0=Neutral, 1=Negative, 2=Positive)', fontsize=12)
ax_bar.set_ylabel('Count', fontsize=12)
ax_bar.set_xticklabels(labels, rotation=0)
ax_bar.grid(axis='y', alpha=0.3)
for position, height in enumerate(sentiment_counts):
    # +30 lifts the label just above the bar top (offset is in data units)
    ax_bar.text(position, height + 30, str(height), ha='center', fontweight='bold')

# Right panel: pie chart of the same counts
ax_pie.pie(sentiment_counts, labels=labels, colors=sentiment_colors, autopct='%1.1f%%',
           startangle=90, textprops=dict(fontsize=12, fontweight='bold'))
ax_pie.set_title('Sentiment Distribution (%)', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Print the first five comments of each sentiment class as qualitative examples.
print("", "=" * 80, "SAMPLE COMMENTS BY CLASS", "=" * 80, sep="\n")

for code, name in sentiment_map.items():
    print(f"\nClass {code}: {name.upper()}")
    print("-" * 80)
    first_five = df.loc[df['SEN'] == code, 'Text'].head(5)
    for rank, comment in enumerate(first_five, start=1):
        print(f"  {rank}. {comment}")
print("=" * 80)

## 3. Hate and Offense Detection (HAO)

**Task Type:** Binary Classification  
**Classes:** 0 = Not Offensive, 1 = Offensive

In [None]:
# Banner and task description
print("=" * 80, "TASK 2: HATE AND OFFENSE DETECTION (HAO)", "=" * 80, sep="\n")
print("\nTask Type: Binary Classification",
      "Classes: 0 = Not Offensive, 1 = Offensive",
      "-" * 80, sep="\n")

# Samples per class ordered by class code, plus a code -> name lookup
hao_counts = df['HAO'].value_counts().sort_index()
hao_map = {0: 'Not Offensive', 1: 'Offensive'}

print("\nClass Distribution:")
total_comments = len(df)
for code, name in hao_map.items():
    n_samples = hao_counts[code]
    share = (n_samples / total_comments) * 100
    print(f"  Class {code} ({name:>14}): {n_samples:4d} samples ({share:5.2f}%)")

# Ratio of the larger class to the smaller one
print("\nClass Balance Analysis:")
majority_class = hao_counts.max()
minority_class = hao_counts.min()
imbalance_ratio = majority_class / minority_class
most_common, least_common = hao_counts.idxmax(), hao_counts.idxmin()
print(f"  Most frequent class:  {most_common} ({hao_map[most_common]}) - {majority_class} samples")
print(f"  Least frequent class: {least_common} ({hao_map[least_common]}) - {minority_class} samples")
print(f"  Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("=" * 80)

In [None]:
# Two side-by-side panels: absolute counts (bars) and relative shares (pie).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_bar, ax_pie = axes

labels = ['Not Offensive', 'Offensive']
colors_hao = [COLOR_PALETTE[key] for key in ('binary_negative', 'binary_positive')]

# Left panel: bar chart with the count printed above each bar
hao_counts.plot.bar(ax=ax_bar, color=colors_hao)
ax_bar.set_title('Hate and Offense Distribution', fontsize=16, fontweight='bold')
ax_bar.set_xlabel('Class (0=Not Offensive, 1=Offensive)', fontsize=12)
ax_bar.set_ylabel('Count', fontsize=12)
ax_bar.set_xticklabels(labels, rotation=0)
ax_bar.grid(axis='y', alpha=0.3)
for position, height in enumerate(hao_counts):
    # +50 lifts the label just above the bar top (offset is in data units)
    ax_bar.text(position, height + 50, str(height), ha='center', fontweight='bold')

# Right panel: pie chart of the same counts
ax_pie.pie(hao_counts, labels=labels, colors=colors_hao, autopct='%1.1f%%',
           startangle=90, textprops=dict(fontsize=12, fontweight='bold'))
ax_pie.set_title('Hate and Offense Distribution (%)', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Print the first five comments of each HAO class as qualitative examples.
print("", "=" * 80, "SAMPLE COMMENTS BY CLASS", "=" * 80, sep="\n")

for code, name in hao_map.items():
    print(f"\nClass {code}: {name.upper()}")
    print("-" * 80)
    first_five = df.loc[df['HAO'] == code, 'Text'].head(5)
    for rank, comment in enumerate(first_five, start=1):
        print(f"  {rank}. {comment}")
print("=" * 80)

## 4. Political Instance Detection (POL)

**Task Type:** Binary Classification  
**Classes:** 0 = Non-Political, 1 = Political

In [None]:
# Banner and task description
print("=" * 80, "TASK 3: POLITICAL INSTANCE DETECTION (POL)", "=" * 80, sep="\n")
print("\nTask Type: Binary Classification",
      "Classes: 0 = Non-Political, 1 = Political",
      "-" * 80, sep="\n")

# Samples per class ordered by class code, plus a code -> name lookup
pol_counts = df['POL'].value_counts().sort_index()
pol_map = {0: 'Non-Political', 1: 'Political'}

print("\nClass Distribution:")
total_comments = len(df)
for code, name in pol_map.items():
    n_samples = pol_counts[code]
    share = (n_samples / total_comments) * 100
    print(f"  Class {code} ({name:>14}): {n_samples:4d} samples ({share:5.2f}%)")

# Ratio of the larger class to the smaller one
print("\nClass Balance Analysis:")
majority_class = pol_counts.max()
minority_class = pol_counts.min()
imbalance_ratio = majority_class / minority_class
most_common, least_common = pol_counts.idxmax(), pol_counts.idxmin()
print(f"  Most frequent class:  {most_common} ({pol_map[most_common]}) - {majority_class} samples")
print(f"  Least frequent class: {least_common} ({pol_map[least_common]}) - {minority_class} samples")
print(f"  Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("=" * 80)

In [None]:
# Two side-by-side panels: absolute counts (bars) and relative shares (pie).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
ax_bar, ax_pie = axes

labels = ['Non-Political', 'Political']
colors_pol = [COLOR_PALETTE[key] for key in ('binary_negative', 'binary_positive')]

# Left panel: bar chart with the count printed above each bar
pol_counts.plot.bar(ax=ax_bar, color=colors_pol)
ax_bar.set_title('Political Instance Distribution', fontsize=16, fontweight='bold')
ax_bar.set_xlabel('Class (0=Non-Political, 1=Political)', fontsize=12)
ax_bar.set_ylabel('Count', fontsize=12)
ax_bar.set_xticklabels(labels, rotation=0)
ax_bar.grid(axis='y', alpha=0.3)
for position, height in enumerate(pol_counts):
    # +50 lifts the label just above the bar top (offset is in data units)
    ax_bar.text(position, height + 50, str(height), ha='center', fontweight='bold')

# Right panel: pie chart of the same counts
ax_pie.pie(pol_counts, labels=labels, colors=colors_pol, autopct='%1.1f%%',
           startangle=90, textprops=dict(fontsize=12, fontweight='bold'))
ax_pie.set_title('Political Instance Distribution (%)', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Print the first five comments of each POL class as qualitative examples.
print("", "=" * 80, "SAMPLE COMMENTS BY CLASS", "=" * 80, sep="\n")

for code, name in pol_map.items():
    print(f"\nClass {code}: {name.upper()}")
    print("-" * 80)
    first_five = df.loc[df['POL'] == code, 'Text'].head(5)
    for rank, comment in enumerate(first_five, start=1):
        print(f"  {rank}. {comment}")
print("=" * 80)

## 5. Multi-label Topic Classification

**Task Type:** Multi-label Classification  
**Number of Topics:** 9

This section performs detailed exploratory data analysis on the **Multi-label Topic Classification** task from the TikNep dataset.

### Topic Categories:

1. **PGS**: Politics, Governance and Social Concern
2. **FT**: Food and Travel
3. **EYE**: Education, Youth and Employment
4. **HBF**: Health, Beauty and Fitness
5. **FR**: Family and Relationship
6. **EMP**: Entertainment, Music and Pop Culture
7. **BF**: Business and Finance
8. **SW**: Spirituality and Well-being
9. **DO**: Daily Diaries and Other Hobbies

Each comment can have **multiple topic labels** (multi-label classification).

In [None]:
# Define topic columns
topic_columns = {
    'PGS': 'Politics, Governance and Social Concern',
    'FT': 'Food and Travel',
    'EYE': 'Education, Youth and Employment',
    'HBF': 'Health, Beauty and Fitness',
    'FR': 'Family and Relationship',
    'EMP': 'Entertainment, Music and Pop Culture',
    'BF': 'Business and Finance',
    'SW': 'Spirituality and Well-being',
    'DO': 'Daily Diaries and Other Hobbies'
}

topic_col_list = list(topic_columns.keys())

print("Topic Categories:")
print("=" * 80)
for abbr, full_name in topic_columns.items():
    print(f"{abbr:5} : {full_name}")

In [None]:
# Multi-label statistics
# Calculate number of topics per comment
df['num_topics'] = df[topic_col_list].sum(axis=1)

print("="*80)
print("MULTI-LABEL STATISTICS")
print("="*80)

print("\nTopic Distribution Summary:")
print(f"  Average topics per comment: {df['num_topics'].mean():.2f}")
print(f"  Median topics per comment:  {df['num_topics'].median():.0f}")
print(f"  Standard deviation:         {df['num_topics'].std():.2f}")
print(f"  Min topics per comment:     {df['num_topics'].min()}")
print(f"  Max topics per comment:     {df['num_topics'].max()}")

print("\nDistribution by Number of Topics:")
print("-"*80)
num_topics_dist = df['num_topics'].value_counts().sort_index()
for num_topics, count in num_topics_dist.items():
    percentage = (count / len(df)) * 100
    print(f"  {int(num_topics)} topic(s): {count:4d} samples ({percentage:5.2f}%)")

print("\nMulti-label Analysis:")
print("-"*80)
no_topic = (df['num_topics'] == 0).sum()
single_topic = (df['num_topics'] == 1).sum()
multi_topic = (df['num_topics'] >= 2).sum()

print(f"  Comments with 0 topics:  {no_topic:4d} samples ({(no_topic/len(df)*100):5.2f}%)")
print(f"  Comments with 1 topic:   {single_topic:4d} samples ({(single_topic/len(df)*100):5.2f}%)")
print(f"  Comments with 2+ topics: {multi_topic:4d} samples ({(multi_topic/len(df)*100):5.2f}%)")
print("="*80)

In [None]:
# Visualize distribution of number of topics per comment
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart
num_topics_dist.plot(kind='bar', ax=axes[0], color=COLOR_PALETTE['primary'], edgecolor='black')
axes[0].set_title('Distribution of Number of Topics per Comment', fontsize=16, fontweight='bold')
axes[0].set_xlabel('Number of Topics', fontsize=12)
axes[0].set_ylabel('Number of Comments', fontsize=12)
axes[0].grid(axis='y', alpha=0.3)
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=0)

for i, v in enumerate(num_topics_dist):
    axes[0].text(i, v + 20, str(v), ha='center', fontweight='bold')

# Pie chart
labels_pie = ['No topics', '1 topic', '2+ topics']
sizes_pie = [no_topic, single_topic, multi_topic]
colors_pie = [COLOR_PALETTE['primary'], COLOR_PALETTE['secondary'], COLOR_PALETTE['tertiary']]
axes[1].pie(sizes_pie, labels=labels_pie, colors=colors_pie, autopct='%1.1f%%',
           startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Multi-label Distribution', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Topic frequency distribution
topic_counts = df[topic_col_list].sum().sort_values(ascending=False)

print("="*80)
print("INDIVIDUAL TOPIC DISTRIBUTION")
print("="*80)
print(f"\n{'Rank':<6} {'Topic':<6} {'Full Name':<50} {'Count':<10} {'Percentage'}")
print("-"*80)

for rank, (topic_abbr, count) in enumerate(topic_counts.items(), 1):
    percentage = (count / len(df)) * 100
    full_name = topic_columns[topic_abbr]
    print(f"{rank:<6} {topic_abbr:<6} {full_name:<50} {count:<10} {percentage:5.2f}%")

# Calculate class imbalance
max_count = topic_counts.max()
min_count = topic_counts.min()
imbalance_ratio = max_count / min_count

print("\nClass Balance Analysis:")
print("-"*80)
print(f"  Most frequent topic:  {topic_counts.index[0]} ({topic_columns[topic_counts.index[0]]}) - {topic_counts.iloc[0]} samples")
print(f"  Least frequent topic: {topic_counts.index[-1]} ({topic_columns[topic_counts.index[-1]]}) - {topic_counts.iloc[-1]} samples")
print(f"  Imbalance Ratio: {imbalance_ratio:.2f}:1")
print("="*80)

## 6. Overall Summary Statistics

In [None]:
# Comprehensive summary
print("="*80)
print("COMPREHENSIVE EDA SUMMARY - TikNep Dataset")
print("="*80)

print("\n1. DATASET OVERVIEW")
print("-"*80)
print(f"   Total Comments:      {len(df):,}")
print(f"   Total Features:      {df.shape[1]}")
print(f"   Missing Values:      {df.isnull().sum().sum()}")
print(f"   Duplicate Comments:  {df['Text'].duplicated().sum()}")

print("\n2. SENTIMENT ANALYSIS (SEN)")
print("-"*80)
sentiment_counts_summary = df['SEN'].value_counts().sort_index()
for sentiment in [0, 1, 2]:
    count = sentiment_counts_summary[sentiment]
    percentage = (count / len(df)) * 100
    print(f"   Class {sentiment} ({sentiment_map[sentiment]:>8}): {count:,} samples ({percentage:.1f}%)")
sentiment_imbalance = sentiment_counts_summary.max() / sentiment_counts_summary.min()
print(f"   Imbalance Ratio: {sentiment_imbalance:.2f}:1")

print("\n3.  HATE AND OFFENSE DETECTION (HAO)")
print("-"*80)
hao_counts_summary = df['HAO'].value_counts().sort_index()
for hao_class in [0, 1]:
    label = hao_map[hao_class]
    count = hao_counts_summary[hao_class]
    percentage = (count / len(df)) * 100
    print(f"   Class {hao_class} ({label:>14}): {count:,} samples ({percentage:.1f}%)")
hao_imbalance = hao_counts_summary.max() / hao_counts_summary.min()
print(f"   Imbalance Ratio: {hao_imbalance:.2f}:1")

print("\n4. POLITICAL INSTANCE DETECTION (POL)")
print("-"*80)
pol_counts_summary = df['POL'].value_counts().sort_index()
for pol_class in [0, 1]:
    label = pol_map[pol_class]
    count = pol_counts_summary[pol_class]
    percentage = (count / len(df)) * 100
    print(f"   Class {pol_class} ({label:>14}): {count:,} samples ({percentage:.1f}%)")
pol_imbalance = pol_counts_summary.max() / pol_counts_summary.min()
print(f"   Imbalance Ratio: {pol_imbalance:.2f}:1")

print("\n5. MULTI-LABEL TOPIC CLASSIFICATION")
print("-"*80)
print(f"   Average topics per comment: {df['num_topics'].mean():.2f}")
print(f"   Comments with 0 topics:     {(df['num_topics'] == 0).sum():,} samples")
print(f"   Comments with 1 topic:      {(df['num_topics'] == 1).sum():,} samples")
print(f"   Comments with 2+ topics:    {(df['num_topics'] >= 2).sum():,} samples")
print(f"   Max topics on a comment:    {df['num_topics'].max()}")
print("\n   Top 3 Topics:")
top_3_topics = df[topic_col_list].sum().sort_values(ascending=False).head(3)
for topic_abbr, count in top_3_topics.items():
    percentage = (count / len(df)) * 100
    print(f"     {topic_abbr}: {count:,} samples ({percentage:.1f}%)")

print("\n6. TEXT CHARACTERISTICS")
print("-"*80)
print(f"   Average text length:  {df['text_length'].mean():.1f} characters")
print(f"   Average word count:   {df['word_count'].mean():.1f} words")
print(f"   Shortest comment:     {df['text_length'].min()} characters")
print(f"   Longest comment:      {df['text_length'].max()} characters")

print("\n" + "="*80)