# Data cleaning and feature creation

## 1. Imports and config


In [13]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import re
from loguru import logger
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Import your existing config
from src.config import DATA_FILES, PROCESSED_DATA_DIR, FIGURES_DIR

# Set up plotting style: apply the 'seaborn-v0_8' matplotlib style sheet and make
# seaborn's "husl" palette the default colors for plots drawn later.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs matplotlib >= 3.6 — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Load raw data

In [2]:
logger.info("Loading raw datasets...")

# Read both raw splits from the paths registered in the project config.
train_df, test_df = (
    pd.read_csv(DATA_FILES[key]) for key in ("train_raw", "test_raw")
)

# Shapes and schema first, so loading the wrong file is obvious immediately.
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{split_name} shape: {frame.shape}")
print(f"Train columns: {list(train_df.columns)}")

# Peek at the first few training rows.
print("\nSample training data:")
print(train_df.head())

# Label balance; the header is always printed, the counts only when the
# target column is present.
print("\nClass distribution:")
if 'Label' in train_df.columns:
    class_dist = train_df['Label'].value_counts()
    print(class_dist)
    majority, minority = class_dist.max(), class_dist.min()
    print(f"Class balance ratio: {majority / minority:.2f}:1")

[32m2025-08-09 21:58:12.840[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mLoading raw datasets...[0m
Train shape: (10240, 2)
Test shape: (2551, 2)
Train columns: ['Statement', 'Label']

Sample training data:
                                           Statement  Label
0  Says the Annies List political group supports ...  False
1  When did the decline of coal start? It started...   True
2  Hillary Clinton agrees with John McCain "by vo...   True
3  Health care reform legislation is likely to ma...  False
4  The economic turnaround started at the end of ...   True

Class distribution:
Label
True     5752
False    4488
Name: count, dtype: int64
Class balance ratio: 1.28:1


## 3. Data quality analysis

In [5]:
print("\nData Quality Analysis")
print("-" * 30)

n_rows = len(train_df)

# --- Missing values: list only the columns that actually have gaps ---
missing_train = train_df.isnull().sum()
print("Missing values in training data:")
for col, missing in missing_train[missing_train > 0].items():
    print(f"  {col}: {missing} ({missing/n_rows*100:.1f}%)")

if not missing_train.any():
    print("No missing values found")

# --- Exact duplicate statements (the first occurrence is not counted) ---
duplicates = train_df['Statement'].duplicated().sum()
print(f"\nDuplicate statements: {duplicates} ({duplicates/n_rows*100:.1f}%)")

# --- Character length of the raw statement ---
# The lengths are stored on train_df as 'original_length'.
train_df['original_length'] = train_df['Statement'].astype(str).str.len()
lengths = train_df['original_length']
print("\nText length statistics:")
print(f"  Mean: {lengths.mean():.1f} characters")
print(f"  Median: {lengths.median():.1f} characters")
print(f"  Min: {lengths.min()}")
print(f"  Max: {lengths.max()}")

# --- Share of very short statements ---
short_threshold = 50
short_texts = (lengths < short_threshold).sum()
print(f"  Texts < {short_threshold} chars: {short_texts} ({short_texts/n_rows*100:.1f}%)")


Data Quality Analysis
------------------------------
Missing values in training data:
No missing values found

Duplicate statements: 17 (0.2%)

Text length statistics:
  Mean: 106.9 characters
  Median: 99.0 characters
  Min: 11
  Max: 3145
  Texts < 50 chars: 667 (6.5%)


## 4. Text cleaning

In [6]:
# Section banner: title followed by a 30-character rule.
print("Enhanced Text Cleaning", "-" * 30, sep="\n")

def advanced_text_cleaning(text):
    """Normalize a raw statement into lowercase text with basic punctuation kept.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs (any run of non-whitespace beginning at ``http`` or ``www``);
      3. remove e-mail-like tokens (non-whitespace on both sides of an ``@``);
      4. remove HTML-style ``<...>`` tags;
      5. replace every character that is not a word character, whitespace, or
         one of ``! ? . , ; :`` with a space (so apostrophes, quotes and hyphens
         split words, e.g. ``it's`` -> ``it s``);
      6. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw statement. Any non-string value (``None``, ``NaN``,
            numbers, lists, ...) is treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # A plain isinstance check already covers None/NaN/pd.NA and, unlike
    # pd.isna(), never produces an array whose truth value is ambiguous when a
    # list-like value sneaks into the column.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs ("http\S+" also covers https, so no separate alternative is needed)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<[^<]+?>', '', text)

    # Replace special characters with a space, keeping basic punctuation for now
    text = re.sub(r'[^\w\s!?.,;:]', ' ', text)

    # Collapse extra whitespace
    return re.sub(r'\s+', ' ', text).strip()

def create_modeling_text(text, min_word_length=3):
    """Reduce cleaned text to the bare word sequence used as model input.

    Removes every character that is neither a word character nor whitespace,
    then removes all digit runs, and finally keeps only the whitespace-separated
    words that are at least ``min_word_length`` characters long.

    Args:
        text: Text to reduce, typically the output of the first cleaning pass.
            Any non-string value (``None``, ``NaN``, lists, ...) is treated as
            missing.
        min_word_length: Shortest word (in characters) that is kept. Defaults
            to 3, the value that was previously hard-coded.

    Returns:
        str: The surviving words joined by single spaces, or ``""`` when
        ``text`` is not a string or no word survives.
    """
    # isinstance alone covers None/NaN and avoids the ambiguous truth value
    # that pd.isna() produces for list-like input.
    if not isinstance(text, str):
        return ""

    # Remove punctuation and numbers for TF-IDF
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    # split() also normalizes whitespace; short words are dropped here
    kept_words = (
        word for word in without_digits.split() if len(word) >= min_word_length
    )
    return ' '.join(kept_words)

# Run both cleaning passes on each split
print("Applying advanced text cleaning...")
for frame in (train_df, test_df):
    # Pass 1: lowercase, normalized text that still carries basic punctuation
    frame['statement_cleaned'] = frame['Statement'].apply(advanced_text_cleaning)
    # Pass 2: final modeling text derived from the pass-1 output
    frame['cleaned_text'] = frame['statement_cleaned'].apply(create_modeling_text)

print("Text cleaning complete")

Enhanced Text Cleaning
------------------------------
Applying advanced text cleaning...
✅ Text cleaning complete


## 5. Feature engineering

In [8]:
# Section banner: title followed by a 30-character rule.
print("Feature Engineering", "-" * 30, sep="\n")

def create_text_features(df, text_col='Statement'):
    """Add surface-level text statistics computed from ``df[text_col]``.

    The input frame is not modified; a copy with twelve extra columns is
    returned: ``text_length``, ``word_count``, ``avg_word_length``,
    ``exclamation_count``, ``question_count``, ``period_count``,
    ``caps_count``, ``caps_ratio``, ``sentence_count``,
    ``avg_sentence_length``, ``digit_count`` and ``digit_ratio``.

    All ratio features divide by ``denominator + 1`` so that empty text never
    causes a division by zero; the values are therefore slightly smaller than
    the exact ratios.

    Args:
        df: Frame containing the text column.
        text_col: Name of the column holding the raw text.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the feature columns added
        (existing columns of the same name are overwritten in the copy).
    """
    df_enhanced = df.copy()

    # Convert once. Missing values become empty text so they yield zero counts
    # instead of being measured as the literal strings "nan" / "None".
    text = df[text_col].fillna("").astype(str)

    # Basic length features
    text_length = text.str.len()
    word_count = text.str.split().str.len()
    df_enhanced['text_length'] = text_length
    df_enhanced['word_count'] = word_count
    df_enhanced['avg_word_length'] = text_length / (word_count + 1)

    # Punctuation features
    df_enhanced['exclamation_count'] = text.str.count(r'!')
    df_enhanced['question_count'] = text.str.count(r'\?')
    df_enhanced['period_count'] = text.str.count(r'\.')

    # Capitalization features (str.isupper, so non-ASCII capitals count too)
    caps_count = text.apply(lambda s: sum(1 for ch in s if ch.isupper()))
    df_enhanced['caps_count'] = caps_count
    df_enhanced['caps_ratio'] = caps_count / (text_length + 1)

    # Sentence-level features: each run of ., ! or ? counts as one sentence
    # break, which means decimal points and abbreviations are counted as well.
    sentence_count = text.str.count(r'[.!?]+')
    df_enhanced['sentence_count'] = sentence_count
    df_enhanced['avg_sentence_length'] = word_count / (sentence_count + 1)

    # Digit features
    digit_count = text.str.count(r'\d')
    df_enhanced['digit_count'] = digit_count
    df_enhanced['digit_ratio'] = digit_count / (text_length + 1)

    return df_enhanced

# Build the engineered features for both splits
print("Creating text features...")
train_df, test_df = (create_text_features(frame) for frame in (train_df, test_df))

# Names of the columns added by create_text_features, in creation order.
feature_columns = [
    # length
    'text_length',
    'word_count',
    'avg_word_length',
    # punctuation
    'exclamation_count',
    'question_count',
    'period_count',
    # capitalization
    'caps_count',
    'caps_ratio',
    # sentence level
    'sentence_count',
    'avg_sentence_length',
    # digits
    'digit_count',
    'digit_ratio',
]

print(f"Created {len(feature_columns)} new features")

Feature Engineering
------------------------------
Creating text features...
Created 12 new features


## 6. Data cleaning and filtering

In [11]:
print("Data Filtering")
print("-" * 30)

# NOTE: this cell overwrites train_df / test_df, so it is not idempotent —
# running it a second time reports 0 removals. Re-run from the loading cell
# to get meaningful counts.

# Remove duplicate statements from the training data (first occurrence is kept)
initial_train_size = len(train_df)
train_df = train_df.drop_duplicates(subset=['Statement'])
print(f"Removed {initial_train_size - len(train_df)} duplicate statements")

# Filter very short texts, measured on the final modeling text.
# The threshold is floored at 1 so that empty texts are always removed; this
# replaces the separate "remove empty texts" pass, which could never remove
# anything once the minimum-length filter had run.
min_length = 20
length_threshold = max(min_length, 1)

initial_size = len(train_df)
train_df = train_df[train_df['cleaned_text'].str.len() >= length_threshold].copy()
print(f"Removed {initial_size - len(train_df)} texts shorter than {min_length} characters")

# Apply the same length filter to the test set and report it as well, so the
# dropped test rows are visible instead of disappearing silently.
# NOTE(review): dropping test rows changes which examples get evaluated —
# confirm this is intended.
initial_test_size = len(test_df)
test_df = test_df[test_df['cleaned_text'].str.len() >= length_threshold].copy()
print(f"Removed {initial_test_size - len(test_df)} test texts shorter than {min_length} characters")

print(f"Final training samples: {len(train_df)}")
print(f"Final test samples: {len(test_df)}")

Data Filtering
------------------------------
Removed 0 duplicate statements
Removed 0 texts shorter than 20 characters
Final training samples: 10178
Final test samples: 2536


## 7. Train/validation split

In [14]:
print("Creating Train/Validation Split")
print("-" * 30)

has_label = 'Label' in train_df.columns

# Columns carried into the final dataset: modeling text, engineered features
# and, when available, the target as the last entry.
final_columns = ['cleaned_text', *feature_columns]
if has_label:
    final_columns.append('Label')

# NOTE(review): X keeps the 'Label' column when it exists, so X_train / X_val
# contain the target as well — drop it before fitting a model.
X = train_df[final_columns]
y = train_df['Label'] if has_label else None

if y is None:
    # Nothing to stratify on: keep everything as training data.
    X_train, X_val = X, None
    print("No labels found - using all data for training")
else:
    # 80/20 split, stratified on the label, with a fixed seed.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    print(f"Training set: {len(X_train)} samples")
    print(f"Validation set: {len(X_val)} samples")
    for tag, labels in (("Train", y_train), ("Val", y_val)):
        print(f"{tag} class distribution: {labels.value_counts().to_dict()}")

Creating Train/Validation Split
------------------------------
Training set: 8142 samples
Validation set: 2036 samples
Train class distribution: {True: 4580, False: 3562}
Val class distribution: {True: 1145, False: 891}


## 8. Visualization

## 9. Summary

In [27]:
# Test features: every selected column except the target. The label is removed
# by name rather than by position, so no real feature is dropped when
# final_columns does not end with 'Label' (i.e. when the data has no labels).
model_input_columns = [col for col in final_columns if col != 'Label']
test_final = test_df[model_input_columns]

print("ENHANCED PREPROCESSING COMPLETE")
print("=" * 60)

print("Final Dataset Summary:")
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val) if X_val is not None else 0}")
print(f"Test samples: {len(test_final)}")
# Counts the modeling text plus the engineered features, never the label.
print(f"Features per sample: {len(model_input_columns)}")

print("Features created:")
for i, feature in enumerate(feature_columns, 1):
    print(f"  {i:2d}. {feature}")

logger.success("Enhanced preprocessing notebook execution complete.")

ENHANCED PREPROCESSING COMPLETE
Final Dataset Summary:
Training samples: 8142
Validation samples: 2036
Test samples: 2536
Features per sample: 13
Features created:
   1. text_length
   2. word_count
   3. avg_word_length
   4. exclamation_count
   5. question_count
   6. period_count
   7. caps_count
   8. caps_ratio
   9. sentence_count
  10. avg_sentence_length
  11. digit_count
  12. digit_ratio

📈 Key improvements:
   ✅ Advanced text cleaning (URLs, HTML, emails removed)
   ✅ 12 engineered features
   ✅ Duplicate removal
   ✅ Quality filtering
   ✅ Proper train/validation split

🚀 Next steps:
   1. Run training: make train
   2. The enhanced modeling.py will automatically use these features
   3. Expect significant performance improvement!
[32m2025-08-09 22:36:48.271[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mEnhanced preprocessing notebook execution complete! 🎯[0m
