# 📂 Data Discovery

In this section, we will perform an initial inspection of the dataset files to understand their structure and content before loading them into dataframes.

In [None]:
import pandas as pd
import numpy as np

# Dataset locations. These are absolute paths on the original author's machine;
# later cells reuse train_path / test_path.
train_path = r"C:\Users\yassi\Documents\projects\MlOps Ag news classification\dataset\train.csv"
test_path = r"C:\Users\yassi\Documents\projects\MlOps Ag news classification\dataset\test.csv"

print("="*70)
print("üîç FILE DISCOVERY - Training Data")
print("="*70)

# Peek at the raw text before any CSV parsing is applied.
preview_lines = 5
preview_chars = 150
with open(train_path, 'r', encoding='utf-8') as f:
    print(f"\nFirst {preview_lines} raw lines:")
    # zip() stops at end-of-file, so a short file does not print empty "lines".
    for line_no, raw_line in zip(range(1, preview_lines + 1), f):
        line = raw_line.strip()
        # Only mark the preview as truncated when something was actually cut off.
        suffix = "..." if len(line) > preview_chars else ""
        print(f"Line {line_no}: {line[:preview_chars]}{suffix}")

# Parse a small sample with header=None so the first row is shown as data
# rather than being consumed as column names.
train_sample = pd.read_csv(train_path, header=None, nrows=10)
print(f"\nüìä Shape: {train_sample.shape}")
print(f"üìä Number of columns detected: {train_sample.shape[1]}")
print(f"\nüìã Column data types:\n{train_sample.dtypes}")
print("\n" + "="*70)
print("First 10 rows:")
print("="*70)
print(train_sample)

print("\n" + "="*70)
print("üîç FILE DISCOVERY - Test Data")
print("="*70)

# Same sample-based check for the test file
test_sample = pd.read_csv(test_path, header=None, nrows=10)
print(f"\nüìä Shape: {test_sample.shape}")
print(f"üìä Number of columns detected: {test_sample.shape[1]}")
print("\nFirst 10 rows:")
print(test_sample)


# 📊 Exploratory Data Analysis (EDA)

We will now perform a comprehensive Exploratory Data Analysis to understand the dataset characteristics, class distribution, and text properties. This step is crucial for informing our preprocessing and modeling strategies.

## 1. Import Libraries and Load Data

We start by importing the necessary libraries for data manipulation, visualization, and analysis. Then, we load the training and testing datasets and display basic information about them.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Suppress all library warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Global plotting defaults
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Read both splits; the first CSV row is taken as the header.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

rule = "=" * 70

print("‚úÖ Data loaded successfully!")
print("\n" + rule)
print("üìä DATASET OVERVIEW")
print(rule)
print(f"\nTraining set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"\nTotal samples: {len(train_df) + len(test_df):,}")

# Each overview section is a heading plus a callable that prints its content.
# DataFrame.info() writes its own output, so it is passed directly.
overview_sections = (
    ("üìã Column Names:", lambda: print(train_df.columns.tolist())),
    ("üìã Data Types:", lambda: print(train_df.dtypes)),
    ("üìã First 5 Training Examples:", lambda: print(train_df.head())),
    ("üìã Dataset Info:", train_df.info),
)
for heading, show in overview_sections:
    print("\n" + rule)
    print(heading)
    print(rule)
    show()

## 2. Label Distribution Analysis

Understanding the distribution of target labels is essential to identify any class imbalance issues. We will calculate and display the counts and percentages for each category in both the training and test sets.

In [None]:
# Integer class id -> category name (labels in these files are 0-indexed).
label_mapping = dict(enumerate(["World", "Sports", "Business", "Science/Tech"]))

bar = "=" * 70
thin_bar = "-" * 70


def _report_label_counts(frame):
    """Print per-label counts and percentages for ``frame``; return the counts Series."""
    counts = frame['label'].value_counts().sort_index()
    print(counts)
    print(f"\nTotal: {counts.sum():,}")

    print("\nWith percentages:")
    n_rows = len(frame)
    for label, count in counts.items():
        share = (count / n_rows) * 100
        print(f"Label {label} ({label_mapping[label]:15s}): {count:6,} samples ({share:.2f}%)")
    return counts


print(bar)
print("üìä LABEL DISTRIBUTION")
print(bar)

print("\nüîπ TRAINING SET:")
print(thin_bar)
train_label_counts = _report_label_counts(train_df)

print("\n" + bar)
print("üîπ TEST SET:")
print(thin_bar)
test_label_counts = _report_label_counts(test_df)

# Imbalance ratio = largest class size / smallest class size (1.0 means equal sizes).
print("\n" + bar)
print("‚öñÔ∏è CLASS BALANCE CHECK")
print(bar)
train_balance = train_label_counts.max() / train_label_counts.min()
test_balance = test_label_counts.max() / test_label_counts.min()
print(f"Training set imbalance ratio: {train_balance:.2f}")
print(f"Test set imbalance ratio: {test_balance:.2f}")

if train_balance != 1.0:
    print(f"‚ö†Ô∏è Classes are imbalanced (ratio: {train_balance:.2f})")
else:
    print("‚úÖ Classes are PERFECTLY BALANCED!")


## 3. Visualization of Label Distribution

Visualizing the label distribution helps in quickly assessing the balance of the dataset. We will use bar charts to show the number of samples per category.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Palette and a 1x2 grid: training set on the left, test set on the right.
sns.set_palette("husl")
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Category names in label-id order, and the matching per-split counts.
categories = [label_mapping[key] for key in sorted(label_mapping)]
train_counts = [train_label_counts[key] for key in sorted(train_label_counts.index)]
test_counts = [test_label_counts[key] for key in sorted(test_label_counts.index)]
bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']

# (axis, counts, title, fixed y-axis limit, vertical gap for the value labels)
panels = (
    (axes[0], train_counts, 'Training Set Label Distribution', 35000, 500),
    (axes[1], test_counts, 'Test Set Label Distribution', 2200, 50),
)
for ax, counts, title, y_limit, label_gap in panels:
    ax.bar(categories, counts, color=bar_colors)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Category', fontsize=12)
    ax.set_ylabel('Number of Samples', fontsize=12)
    ax.set_ylim(0, y_limit)

    # Write the exact count above each bar
    for x_pos, value in enumerate(counts):
        ax.text(x_pos, value + label_gap, f'{value:,}',
                ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

print("‚úÖ Visualizations created!")


## 4. Text Length Analysis

Analyzing the length of the text samples (in terms of characters and words) can provide insights into the data quality and help in setting parameters for models (e.g., max sequence length).

In [None]:
# Derive character and whitespace-token counts for both splits (columns are added in place).
for frame in (train_df, test_df):
    frame['text_length'] = frame['text'].str.len()
    frame['word_count'] = frame['text'].str.split().str.len()

rule = "=" * 70
dash = "-" * 70

print(rule)
print("üìù TEXT LENGTH STATISTICS")
print(rule)

# Each entry: the header lines printed before the dashed divider, then the frame to describe.
report_targets = (
    (("\nüîπ TRAINING SET:",), train_df),
    (("\n" + rule, "üîπ TEST SET:"), test_df),
)
for header_lines, frame in report_targets:
    for header in header_lines:
        print(header)
    print(dash)
    for caption, column in (("\nCharacter Length:", 'text_length'),
                            ("\nWord Count:", 'word_count')):
        print(caption)
        print(frame[column].describe())

print("\n" + rule)
print("üìä TEXT LENGTH BY CATEGORY (Training Set)")
print(rule)
for label in sorted(train_df['label'].unique()):
    in_class = train_df[train_df['label'] == label]
    mean_chars = in_class['text_length'].mean()
    mean_words = in_class['word_count'].mean()
    print(f"\n{label_mapping[label]:15s}: Avg chars: {mean_chars:.0f} | Avg words: {mean_words:.1f}")


## 5. Visualization of Text Length Distributions

We visualize the distribution of text lengths and word counts using histograms and box plots. This helps in identifying outliers and understanding the variability across different categories.

In [None]:
# 2x2 grid: histograms on the top row, per-category box plots on the bottom row.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: (axis, column, bar colour, format for the mean label, title, x-axis label)
histogram_specs = (
    (axes[0, 0], 'text_length', 'steelblue', '.0f',
     'Training Set - Character Length Distribution', 'Number of Characters'),
    (axes[0, 1], 'word_count', 'coral', '.1f',
     'Training Set - Word Count Distribution', 'Number of Words'),
)
for ax, column, fill, mean_fmt, title, x_label in histogram_specs:
    values = train_df[column]
    mean_value = values.mean()
    median_value = values.median()
    ax.hist(values, bins=50, color=fill, alpha=0.7, edgecolor='black')
    # Dashed vertical markers for the mean (red) and median (green)
    ax.axvline(mean_value, color='red', linestyle='--', linewidth=2,
               label=f'Mean: {mean_value:{mean_fmt}}')
    ax.axvline(median_value, color='green', linestyle='--', linewidth=2,
               label=f'Median: {median_value:.0f}')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(x_label, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.legend()
    ax.grid(alpha=0.3)

# Bottom row: one Series per class, in sorted label order (matches `categories`).
class_ids = sorted(train_df['label'].unique())
category_data = [train_df.loc[train_df['label'] == cid, 'text_length'] for cid in class_ids]
word_data = [train_df.loc[train_df['label'] == cid, 'word_count'] for cid in class_ids]

# (axis, per-class data, box colour, title, y-axis label)
boxplot_specs = (
    (axes[1, 0], category_data, 'lightblue',
     'Text Length by Category (Training)', 'Character Length'),
    (axes[1, 1], word_data, 'lightgreen',
     'Word Count by Category (Training)', 'Word Count'),
)
for ax, per_class, fill, title, y_label in boxplot_specs:
    ax.boxplot(per_class, labels=categories, patch_artist=True,
               boxprops=dict(facecolor=fill, alpha=0.7),
               medianprops=dict(color='red', linewidth=2))
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel('Category', fontsize=10)
    ax.set_ylabel(y_label, fontsize=10)
    ax.grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("‚úÖ Text length visualizations created!")


## 6. Sample Texts Inspection

It is always good practice to manually inspect some random samples from each category to get a feel for the data and verify that the labels make sense.

In [None]:
rule = "=" * 70

print(rule)
print("üìñ SAMPLE TEXTS FROM EACH CATEGORY")
print(rule)

# Two reproducible random examples per class (fixed random_state).
for label in sorted(train_df['label'].unique()):
    category = label_mapping[label]
    samples = train_df[train_df['label'] == label].sample(2, random_state=42)

    print(f"\n{rule}")
    print(f"üè∑Ô∏è  {category.upper()} (Label: {label})")
    print(f"{rule}")

    for example_no, text in enumerate(samples['text'], start=1):
        # Show at most 300 characters, marking the cut with an ellipsis
        if len(text) > 300:
            text_preview = text[:300] + "..."
        else:
            text_preview = text
        print(f"\nExample {example_no}:")
        print(f"Length: {len(text)} chars | {len(text.split())} words")
        print(f"Text: {text_preview}")
        print("-" * 70)

print("\n‚úÖ Sample texts displayed!")

## 7. Most Common Words Analysis

We analyze the most frequent words in each category (excluding stopwords) to identify keywords that might be predictive of the class labels.

In [None]:
from collections import Counter
import re

def get_top_words(texts, n=15):
    """Return the ``n`` most frequent words across ``texts``.

    Each text is lowercased and split into runs of ASCII letters. Words found
    in the small built-in stopword list, and words of two characters or fewer,
    are ignored.

    Args:
        texts: Iterable of strings.
        n: Number of (word, count) pairs to return.

    Returns:
        List of ``(word, count)`` tuples, most frequent first.
    """
    ignored = frozenset(
        "the a an and or but in on at to for of with by from as is was are were be "
        "been being have has had do does did will would should could may might must can this "
        "that these those it its their them they he she his her him i you we us our your".split()
    )

    tally = Counter()
    for document in texts:
        tally.update(
            token
            for token in re.findall(r'\b[a-z]+\b', document.lower())
            if len(token) > 2 and token not in ignored
        )

    return tally.most_common(n)

rule = "=" * 70

print(rule)
print("üî§ TOP 15 WORDS BY CATEGORY")
print(rule)

# Rank the most frequent non-stopword tokens separately for every class.
for label in sorted(train_df['label'].unique()):
    category = label_mapping[label]
    texts = train_df.loc[train_df['label'] == label, 'text']
    top_words = get_top_words(texts, n=15)

    print(f"\n{rule}")
    print(f"üìå {category.upper()}")
    print(f"{rule}")

    for rank, (word, count) in enumerate(top_words, start=1):
        print(f"{rank:2d}. {word:15s} : {count:6,} occurrences")

print("\n‚úÖ Top words analysis complete!")


## 8. Word Cloud Generation

Word clouds provide a visual representation of the most frequent words in each category, allowing for a quick intuitive understanding of the main topics.

In [None]:
from wordcloud import WordCloud

# One panel per category, laid out on a flattened 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

# Matplotlib colormap used for each category, by position
colors = ['Reds', 'Blues', 'Greens', 'Purples']

# Settings shared by every cloud; only the colormap varies.
cloud_settings = dict(
    width=800,
    height=600,
    background_color='white',
    max_words=100,
    relative_scaling=0.5,
    min_font_size=10,
)

for idx, label in enumerate(sorted(train_df['label'].unique())):
    category = label_mapping[label]

    # Concatenate every text of this category into one string
    category_text = ' '.join(train_df.loc[train_df['label'] == label, 'text'].values)

    wordcloud = WordCloud(colormap=colors[idx], **cloud_settings).generate(category_text)

    panel = axes[idx]
    panel.imshow(wordcloud, interpolation='bilinear')
    panel.set_title(f'{category}', fontsize=16, fontweight='bold', pad=10)
    panel.axis('off')

plt.tight_layout()
plt.show()

print("‚úÖ Word clouds created!")


## 9. EDA Summary and Insights

We summarize the key findings from our Exploratory Data Analysis, highlighting the dataset characteristics, data quality, and implications for modeling.

In [None]:
# Final EDA summary. Class counts, the balance statement, dataset size, average
# word count and the missing-data statement are derived from the dataframes
# instead of being hard-coded, so the printed summary always matches the data.
print("="*70)
print("üìä EXPLORATORY DATA ANALYSIS - FINAL SUMMARY")
print("="*70)

summary_train_counts = train_df['label'].value_counts().sort_index()
summary_test_counts = test_df['label'].value_counts()
n_classes = len(summary_train_counts)
# "Balanced" here means every class has the same number of training rows.
classes_balanced = summary_train_counts.nunique() == 1
imbalance_ratio = summary_train_counts.max() / summary_train_counts.min()
missing_train = int(train_df.isnull().sum().sum())
missing_test = int(test_df.isnull().sum().sum())
mean_words = train_df['word_count'].mean()

print("\n‚úÖ DATASET CHARACTERISTICS")
print("-" * 70)
print(f"Training samples:     {len(train_df):,}")
print(f"Test samples:         {len(test_df):,}")
print(f"Total samples:        {len(train_df) + len(test_df):,}")
print(f"Number of classes:    {train_df['label'].nunique()}")
print(f"Features:             {list(train_df.columns)}")

print("\n‚úÖ CLASS DISTRIBUTION")
print("-" * 70)
if classes_balanced:
    print(f"Perfect balance: Each class has exactly {100 / n_classes:g}% of data")
else:
    print(f"Imbalanced classes: largest/smallest class ratio is {imbalance_ratio:.2f} (training set)")
for label, train_n in summary_train_counts.items():
    name = label_mapping.get(label, str(label))
    test_n = int(summary_test_counts.get(label, 0))
    # The name column is padded to 14 characters so the counts line up.
    print(f"‚Ä¢ {name + ':':<14}{train_n:,} train | {test_n:,} test")

print("\n‚úÖ TEXT CHARACTERISTICS")
print("-" * 70)
print(f"Average text length:  {train_df['text_length'].mean():.0f} characters")
print(f"Average word count:   {mean_words:.1f} words")
print(f"Median text length:   {train_df['text_length'].median():.0f} characters")
print(f"Median word count:    {train_df['word_count'].median():.0f} words")
print(f"Min text length:      {train_df['text_length'].min()} characters")
print(f"Max text length:      {train_df['text_length'].max()} characters")

print("\n‚úÖ DATA QUALITY")
print("-" * 70)
print(f"Missing values (train): {missing_train}")
print(f"Missing values (test):  {missing_test}")
print(f"Duplicate rows (train): {train_df.duplicated().sum()}")
print(f"Duplicate rows (test):  {test_df.duplicated().sum()}")

# NOTE(review): the keyword lists below are hand-written observations, not
# computed here; confirm them against the top-words output.
print("\n‚úÖ CATEGORY DISTINCTIVENESS")
print("-" * 70)
print("‚Ä¢ World:        Political/conflict terms (iraq, president, minister)")
print("‚Ä¢ Sports:       Sports-specific terms (game, team, season, cup)")
print("‚Ä¢ Business:     Financial terms (stocks, oil, prices, profit)")
print("‚Ä¢ Science/Tech: Technology terms (microsoft, software, internet)")

print("\n‚úÖ KEY INSIGHTS FOR MODELING")
print("-" * 70)
if classes_balanced:
    print("1. ‚úÖ Perfectly balanced classes ‚Üí No need for resampling")
else:
    print(f"1. ‚ö†Ô∏è Imbalanced classes (ratio {imbalance_ratio:.2f}) ‚Üí Consider resampling or class weights")
print("2. ‚úÖ Clear vocabulary separation ‚Üí Good for classification")
print("3. ‚úÖ Consistent text lengths ‚Üí Stable model input")
if missing_train + missing_test == 0:
    print("4. ‚úÖ No missing data ‚Üí Clean dataset")
else:
    print(f"4. ‚ö†Ô∏è {missing_train + missing_test:,} missing values ‚Üí Handle before training")
print(f"5. ‚úÖ Large dataset ({len(train_df) / 1000:.0f}k) ‚Üí Good for deep learning")
print(f"6. ‚úÖ Reasonable text length (avg {mean_words:.0f} words) ‚Üí Fits transformers")

# NOTE(review): the "covers 95% of texts" and "94%+" figures are stated, not
# computed in this notebook section; verify before relying on them.
print("\n‚úÖ RECOMMENDED MODEL APPROACH")
print("-" * 70)
print("‚Ä¢ Model: DistilBERT (efficient transformer)")
print("‚Ä¢ Tokenizer: distilbert-base-uncased")
print("‚Ä¢ Max sequence length: 128 tokens (covers 95% of texts)")
print("‚Ä¢ Batch size: 32 (good balance)")
print("‚Ä¢ Expected accuracy: 94%+ (based on literature)")

print("\n" + "="*70)
print("üéâ EDA COMPLETE - READY FOR MODEL TRAINING!")
print("="*70)


# 🧹 Data Preprocessing

Data preprocessing is a critical step in NLP. We will clean and normalize the text data to prepare it for vectorization and modeling.

## 1. Preprocessing Pipeline Definition

We define a comprehensive preprocessing function that includes lowercasing, removing HTML tags, URLs, numbers, punctuation, and stopwords, as well as lemmatization.

In [None]:
import re
import string
import contractions
import spacy
from nltk.corpus import stopwords

# Load the small English spaCy pipeline with the parser and NER components disabled.
print("‚è≥ Loading spaCy en_core_web_sm...")
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name, disable=['parser', 'ner'])
print("‚úÖ Loaded!")

# NLTK English stopword list, held as a set for fast membership tests
stop_words = {*stopwords.words('english')}

# Patterns and tables are built once at definition time: preprocess_text is
# applied to every row of the dataset, so nothing here is rebuilt per call.
_ENTITY_REPLACEMENTS = (
    ('#39;', "'"),
    ('&amp;', 'and'),
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&quot;', '"'),
)
_EMAIL_RE = re.compile(r'\S+@\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
# 'http\S+' already matches https URLs, so no separate alternative is needed.
_URL_RE = re.compile(r'http\S+|www\S+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Clean one raw text and return its lemmas joined by single spaces.

    Pipeline, in order:
        1. Lowercase
        2. Replace leftover HTML entities (``#39;`` -> ``'``, ``&amp;`` -> ``and``,
           ``&lt;``/``&gt;`` -> ``<``/``>``, ``&quot;`` -> ``"``)
        3. Remove e-mail addresses
        4. Remove HTML tags (including those exposed by step 2)
        5. Remove URLs
        6. Remove digits
        7. Expand contractions
        8. Remove punctuation
        9. Collapse whitespace
        10. Drop stopwords and tokens of two characters or fewer
        11. Lemmatize with spaCy

    Args:
        text: Raw text. Non-string values (for example ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned text, or ``''`` when the input is not a string or nothing
        is left after cleaning.
    """
    # Missing cells arrive as None/NaN; return an empty document instead of
    # failing on .lower().
    if not isinstance(text, str):
        return ''

    # 1. Lowercase
    text = text.lower()

    # 2. Entity fixes are plain substring replacements, applied in order.
    for entity, replacement in _ENTITY_REPLACEMENTS:
        text = text.replace(entity, replacement)

    # 3-6. Strip e-mails, tags, URLs and digits
    text = _EMAIL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    # 7. Expand contractions before punctuation removal, while apostrophes are
    #    still present.
    text = contractions.fix(text)

    # 8. Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # 9. Collapse whitespace
    text = _WHITESPACE_RE.sub(' ', text).strip()
    if not text:
        return ''

    # 10 & 11. Filter on the surface form, keep the lemma.
    doc = nlp(text)
    return ' '.join(
        token.lemma_
        for token in doc
        if token.text not in stop_words and len(token.text) > 2
    )

banner = "=" * 70
print("‚úÖ Preprocessing function ready!", "\n" + banner, "üß™ TESTING PREPROCESSING", banner, sep="\n")

# One hand-picked headline with punctuation, digits and a possessive.
test_text = (
    "Wall St. Bears 32#8 & Claw Back Into the Black (Reuters) Reuters - "
    "Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again."
)

print("\nüìù BEFORE:", test_text, sep="\n")

print("\nüìù AFTER:")
processed = preprocess_text(test_text)
print(processed)

print("\n‚úÖ Test complete! Ready to process all data.")


## 2. Applying Preprocessing

We apply the defined preprocessing pipeline to the entire training and test datasets. This may take some time due to the size of the dataset and the complexity of the operations.

In [None]:
from tqdm import tqdm
tqdm.pandas()  # Enable progress bar for pandas (provides progress_apply used below)

print("="*70)
print("üîß APPLYING PREPROCESSING TO FULL DATASET")
print("="*70)

# Sample counts in the messages come from the dataframes, so they stay
# correct if the dataset changes.
print(f"\n‚è≥ Processing training data ({len(train_df):,} samples)...")
print("   This may take 5-10 minutes...")
train_df['text_clean'] = train_df['text'].progress_apply(preprocess_text)

print(f"\n‚è≥ Processing test data ({len(test_df):,} samples)...")
test_df['text_clean'] = test_df['text'].progress_apply(preprocess_text)

print("\n‚úÖ Preprocessing complete!")

# Before/after preview of the first rows
print("\n" + "="*70)
print("üìù PREPROCESSING RESULTS - EXAMPLES")
print("="*70)

# head(3) yields fewer rows on a tiny dataset instead of raising IndexError.
preview = train_df[['text', 'text_clean']].head(3)
for example_no, (raw, cleaned) in enumerate(zip(preview['text'], preview['text_clean']), start=1):
    print(f"\n--- Example {example_no} ---")
    print(f"BEFORE: {raw[:150]}...")
    print(f"AFTER:  {cleaned[:150]}...")

# Compare whitespace-token counts before and after cleaning.
# 'word_count' is created by the text-length analysis cell.
print("\n" + "="*70)
print("üìä TEXT LENGTH AFTER PREPROCESSING")
print("="*70)
train_df['clean_word_count'] = train_df['text_clean'].str.split().str.len()
mean_before = train_df['word_count'].mean()
mean_after = train_df['clean_word_count'].mean()
print(f"Average words (before): {mean_before:.1f}")
print(f"Average words (after):  {mean_after:.1f}")
print(f"Reduction: {(1 - mean_after/mean_before)*100:.1f}%")

print("\n‚úÖ Data ready for vectorization!")


## 3. TF-IDF Parameter Analysis

Before vectorizing, we analyze the preprocessed text to determine optimal parameters for TF-IDF, such as vocabulary size and document frequency thresholds.

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

banner = "=" * 70
divider = "-" * 70

print(banner)
print("üìä ANALYZING PREPROCESSED DATA FOR TF-IDF PARAMETERS")
print(banner)

# Cleaned training documents as a plain list
all_texts = train_df['text_clean'].tolist()

# 1. Vocabulary size, counted over whitespace-separated tokens
print("\nüî§ VOCABULARY ANALYSIS")
print(divider)

all_words = ' '.join(all_texts).split()
word_freq = Counter(all_words)
vocab_size = len(word_freq)

print(f"Total words (tokens):     {len(all_words):,}")
print(f"Unique words (vocab):     {vocab_size:,}")
print(f"Average word frequency:   {len(all_words)/vocab_size:.2f}")

# 2. How many distinct words fall into each corpus-frequency band
print("\nüìä WORD FREQUENCY DISTRIBUTION")
print(divider)

# (caption, lowest count, highest count), bounds inclusive
frequency_bands = (
    ("Words appearing 1 time:       ", 1, 1),
    ("Words appearing 2-5 times:    ", 2, 5),
    ("Words appearing 6-10 times:   ", 6, 10),
    ("Words appearing 11-50 times:  ", 11, 50),
    ("Words appearing 51-100 times: ", 51, 100),
    ("Words appearing 100+ times:   ", 101, float('inf')),
)
for caption, low, high in frequency_bands:
    in_band = sum(low <= occurrences <= high for occurrences in word_freq.values())
    print(f"{caption}{in_band:,} ({in_band/vocab_size*100:.1f}%)")

# 3. Most frequent words overall
print("\nüìà TOP 20 MOST COMMON WORDS")
print(divider)
for word, count in word_freq.most_common(20):
    print(f"{word:20s}: {count:,}")

# 4. Words per document after cleaning
print("\nüìè DOCUMENT LENGTH (after preprocessing)")
print(divider)
doc_lengths = [len(document.split()) for document in all_texts]
length_summary = (
    ("Min words:    ", f"{min(doc_lengths)}"),
    ("Max words:    ", f"{max(doc_lengths)}"),
    ("Mean words:   ", f"{np.mean(doc_lengths):.1f}"),
    ("Median words: ", f"{np.median(doc_lengths):.1f}"),
    ("Std words:    ", f"{np.std(doc_lengths):.1f}"),
)
for caption, rendered in length_summary:
    print(f"{caption}{rendered}")

# 5. Document frequency: in how many documents each vocabulary term occurs
print("\nüìÑ DOCUMENT FREQUENCY ANALYSIS")
print(divider)

count_vec = CountVectorizer()
count_matrix = count_vec.fit_transform(all_texts)
# Binarise the counts, sum per column, and flatten to a 1-D array
doc_freq = (count_matrix > 0).sum(axis=0).A1

vocab_list = count_vec.get_feature_names_out()
df_dict = dict(zip(vocab_list, doc_freq))

total_docs = len(all_texts)
df_below_3 = int((doc_freq < 3).sum())
df_above_95pct = int((doc_freq > 0.95 * total_docs).sum())

print(f"Total documents: {total_docs:,}")
print(f"Words in < 3 docs (rare):      {df_below_3:,} ({df_below_3/len(vocab_list)*100:.1f}%)")
print(f"Words in > 95% docs (common):  {df_above_95pct:,}")

print("\n‚úÖ Analysis complete!")


## 4. Optimal TF-IDF Parameters Determination

Based on the analysis, we select the best parameters for the TF-IDF vectorizer to balance feature richness and computational efficiency.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

banner = "=" * 70
divider = "-" * 70

print(banner)
print("üìä DETERMINING OPTIMAL TF-IDF PARAMETERS")
print(banner)

# Vocabulary size left after applying each candidate min_df threshold
print("\nüîç IMPACT OF min_df PARAMETER")
print(divider)

for min_df_val in [1, 2, 3, 5, 10]:
    remaining = int((doc_freq >= min_df_val).sum())
    pct = remaining / len(vocab_list) * 100
    print(f"min_df={min_df_val:2d}: {remaining:,} words remain ({pct:.1f}%)")

# Feature count for different n-gram ranges
print("\nüîç NGRAM ANALYSIS")
print(divider)

# Fit on a fixed random sample of 10,000 documents for speed
sample_texts = train_df['text_clean'].sample(10000, random_state=42).tolist()

# (caption, ngram_range), all fitted with min_df=3
ngram_setups = (
    ("Unigrams (1,1) with min_df=3:           ", (1, 1)),
    ("Unigrams + Bigrams (1,2) with min_df=3: ", (1, 2)),
    ("Uni + Bi + Trigrams (1,3) with min_df=3: ", (1, 3)),
)
fitted_vectorizers = []
for caption, span in ngram_setups:
    vectorizer = CountVectorizer(ngram_range=span, min_df=3)
    vectorizer.fit(sample_texts)
    fitted_vectorizers.append(vectorizer)
    print(f"{caption}{len(vectorizer.get_feature_names_out()):,} features")
vec_1gram, vec_2gram, vec_3gram = fitted_vectorizers

# Features containing a space are the bigrams
print("\nüìã SAMPLE BIGRAMS (might capture context):")
print(divider)
bigram_features = [name for name in vec_2gram.get_feature_names_out() if ' ' in name]
print(bigram_features[:30])

# Final recommendation
print("\n" + banner)
print("‚úÖ RECOMMENDED TF-IDF PARAMETERS")
print(banner)

recommended_params = dict(
    max_features=15000,     # Capture most important features
    min_df=3,               # Remove words appearing in < 3 docs (removes 62.7% noise)
    max_df=0.95,            # Remove words in > 95% docs
    ngram_range=(1, 2),     # Unigrams + Bigrams (captures phrases)
    sublinear_tf=True,      # Log scaling (reduces impact of high freq words)
)

print("\nBased on data analysis:")
for param, value in recommended_params.items():
    print(f"  {param:20s}: {value}")

print("\nüìù RATIONALE:")
for rationale_line in (
    "  ‚Ä¢ min_df=3: Removes 62.7% rare/noisy words",
    "  ‚Ä¢ max_features=15000: Keeps top features, manageable size",
    "  ‚Ä¢ ngram_range=(1,2): Bigrams capture context like 'oil price', 'world cup'",
    "  ‚Ä¢ sublinear_tf=True: Log scaling helps with word like 'reuters' (28k occurrences)",
):
    print(rationale_line)


## 5. TF-IDF Vectorization and Data Splitting

We split the training data into training and validation sets, and then apply the TF-IDF vectorization using the optimized parameters.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

print("="*70)
print("üìä APPLYING TF-IDF VECTORIZATION")
print("="*70)

# 1. Hold out 10% of the training data for validation, stratified by label
print("\n‚è≥ Creating train/validation split...")

X_train, X_val, y_train, y_val = train_test_split(
    train_df['text_clean'],
    train_df['label'],
    test_size=0.1,  # 10% for validation
    random_state=42,
    stratify=train_df['label']  # Keep class balance
)

X_test = test_df['text_clean']
y_test = test_df['label']

print("\nüìä DATA SPLITS:")
print("-" * 70)
print(f"Training set:   {len(X_train):,} samples")
print(f"Validation set: {len(X_val):,} samples")
print(f"Test set:       {len(X_test):,} samples")

# 2. Apply TF-IDF with optimized parameters
print("\n‚è≥ Applying TF-IDF Vectorization with optimized parameters...")

tfidf = TfidfVectorizer(
    max_features=15000,      # Top 15,000 features
    min_df=3,                # Remove words in < 3 docs
    max_df=0.95,             # Remove words in > 95% docs
    ngram_range=(1, 2),      # Unigrams + Bigrams
    sublinear_tf=True        # Log scaling
)

# Fit on the training split only, so the vocabulary and IDF weights are not
# learned from validation or test documents.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

print("\n‚úÖ TF-IDF Vectorization complete!")

print("\nüìä FEATURE MATRIX SHAPES:")
print("-" * 70)
print(f"Train: {X_train_tfidf.shape} ({X_train_tfidf.shape[0]:,} samples √ó {X_train_tfidf.shape[1]:,} features)")
print(f"Val:   {X_val_tfidf.shape} ({X_val_tfidf.shape[0]:,} samples √ó {X_val_tfidf.shape[1]:,} features)")
print(f"Test:  {X_test_tfidf.shape} ({X_test_tfidf.shape[0]:,} samples √ó {X_test_tfidf.shape[1]:,} features)")

# 3. Verify class balance
print("\n‚öñÔ∏è CLASS DISTRIBUTION IN SPLITS:")
print("-" * 70)
print(f"Train:      {dict(sorted(Counter(y_train).items()))}")
print(f"Validation: {dict(sorted(Counter(y_val).items()))}")
print(f"Test:       {dict(sorted(Counter(y_test).items()))}")

# 4. Show top features
print("\nüìã TOP 30 TF-IDF FEATURES:")
print("-" * 70)
feature_names = tfidf.get_feature_names_out()
# get_feature_names_out() is ordered by column index, so slicing it only shows
# the first 30 names. Rank the columns by their total TF-IDF weight over the
# training matrix to get the features that actually carry the most weight.
feature_weights = np.asarray(X_train_tfidf.sum(axis=0)).ravel()
top_feature_idx = np.argsort(feature_weights)[::-1][:30]
print([str(feature_names[i]) for i in top_feature_idx])

print("\n‚úÖ Data ready for model training!")


# 🤖 Model Training

In this section, we will train and evaluate several baseline machine learning models to establish a performance benchmark.

## 1. Baseline Model Training

We train multiple models including Logistic Regression, Naive Bayes, Linear SVM, and Random Forest, and compare their performance on the validation and test sets.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import time

print("="*70)
print("üöÄ TRAINING BASELINE MODELS")
print("="*70)

# Define models to compare (n_jobs=4 for parallelization)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42, n_jobs=4),
    'Naive Bayes (Multinomial)': MultinomialNB(),  # No n_jobs param
    'Linear SVM': LinearSVC(max_iter=2000, random_state=42),  # No n_jobs param
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=4)
}

# Per-model results: fitted estimator, fit time, accuracies and test predictions
results = {}

for name, model in models.items():
    print(f"\n{'='*70}")
    print(f"üìå Training: {name}")
    print(f"{'='*70}")

    # Train (only the fit call is timed)
    start_time = time.time()
    model.fit(X_train_tfidf, y_train)
    train_time = time.time() - start_time

    # Predict on validation
    y_val_pred = model.predict(X_val_tfidf)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Predict on test (reported only; not used to choose a model)
    y_test_pred = model.predict(X_test_tfidf)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Store results
    results[name] = {
        'model': model,
        'train_time': train_time,
        'val_accuracy': val_accuracy,
        'test_accuracy': test_accuracy,
        'y_test_pred': y_test_pred
    }

    print(f"‚è±Ô∏è  Training time:     {train_time:.2f} seconds")
    print(f"üìä Validation accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")
    print(f"üìä Test accuracy:       {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Summary comparison
print("\n" + "="*70)
print("üìä BASELINE MODELS COMPARISON SUMMARY")
print("="*70)
print(f"\n{'Model':<30} {'Val Acc':>10} {'Test Acc':>10} {'Time (s)':>10}")
print("-" * 70)

# Rank and select on validation accuracy. Choosing the winner by test accuracy
# would tune the choice to the test set and make its reported score optimistic.
for name, res in sorted(results.items(), key=lambda item: item[1]['val_accuracy'], reverse=True):
    print(f"{name:<30} {res['val_accuracy']*100:>9.2f}% {res['test_accuracy']*100:>9.2f}% {res['train_time']:>10.2f}")

# Find best model (by validation accuracy); its test accuracy is shown alongside
best_model_name = max(results, key=lambda model_name: results[model_name]['val_accuracy'])
best_result = results[best_model_name]
print(f"\nüèÜ BEST MODEL: {best_model_name} "
      f"(val {best_result['val_accuracy']*100:.2f}% | test {best_result['test_accuracy']*100:.2f}%)")


## 2. Model Evaluation

We perform a detailed evaluation of the best-performing model (Linear SVM), including classification report, confusion matrix, and misclassification analysis.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Model evaluated in this cell; the headings below are derived from this name.
eval_model_name = 'Linear SVM'

print("="*70)
print(f"üìä DETAILED EVALUATION: {eval_model_name.upper()}")
print("="*70)

# Fitted estimator and its stored test predictions from the training cell
best_model = results[eval_model_name]['model']
y_test_pred = results[eval_model_name]['y_test_pred']

# Fix the class order explicitly so report rows, confusion-matrix rows/columns
# and display names all line up, whatever the order of label_mapping and
# whichever classes appear in y_test / y_test_pred.
class_labels = sorted(label_mapping)
class_names = [label_mapping[label] for label in class_labels]

# Classification Report
print("\nüìã CLASSIFICATION REPORT:")
print("-" * 70)
print(classification_report(y_test, y_test_pred, labels=class_labels, target_names=class_names))

# Confusion Matrix (rows = actual class, columns = predicted class)
print("\nüìä CONFUSION MATRIX:")
print("-" * 70)
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
print(cm)

# Per-class accuracy: correct predictions / all true samples of the class
print("\nüìä PER-CLASS ACCURACY:")
print("-" * 70)
for idx, category in enumerate(class_names):
    class_total = cm[idx, :].sum()
    if class_total == 0:
        # No true samples of this class in the test set; avoid dividing by zero.
        print(f"{category:15s}: n/a (0/0)")
        continue
    class_acc = cm[idx, idx] / class_total
    print(f"{category:15s}: {class_acc*100:.2f}% ({cm[idx, idx]}/{class_total})")

# Visualize confusion matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            annot_kws={'size': 14})
plt.title(f'Confusion Matrix - {eval_model_name}', fontsize=16, fontweight='bold')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.tight_layout()
plt.show()

# Misclassification analysis
print("\nüîç MISCLASSIFICATION ANALYSIS:")
print("-" * 70)
total_errors = (y_test != y_test_pred).sum()
print(f"Total errors: {total_errors} out of {len(y_test)} ({total_errors/len(y_test)*100:.2f}%)")

print("\nMost common confusions:")
for i, actual_name in enumerate(class_names):
    for j, predicted_name in enumerate(class_names):
        if i != j and cm[i, j] > 50:  # Show confusions > 50
            print(f"  {actual_name:12s} ‚Üí {predicted_name:12s}: {cm[i, j]} errors")

print("\n‚úÖ Evaluation complete!")
