In [10]:
"""
Sports vs Politics Text Classifier
Label Convention: 1 = Sports, 0 = Politics
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, classification_report, confusion_matrix)
import pickle
import warnings
import os
from collections import Counter
import re

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised later in
# the run are hidden as well -- confirm that is intended.
warnings.filterwarnings('ignore')

# Startup banner, emitted with a single newline-separated print call.
print(
    "=" * 80,
    "SPORTS VS POLITICS TEXT CLASSIFIER",
    "Label Convention: 1 = Sports, 0 = Politics",
    "=" * 80,
    sep="\n",
)

def create_data_files_from_dataframes(sports_df, politics_df, output_dir="data"):
    """
    Create sports.txt and politics.txt from DataFrames

    Every article is written on exactly one line of the output file. All line
    breaks inside an article (LF, CR and CRLF) are replaced by a single space,
    because a stray carriage return would be read back as a line break in text
    mode and split one article into several documents. Missing cells
    (None / NaN) are skipped, and other non-string cells are converted with
    str() before writing.

    Args:
        sports_df: DataFrame with 'content' column containing sports articles
        politics_df: DataFrame with 'content' column containing politics articles
        output_dir: Directory to save files (default: "data")

    Raises:
        KeyError: If either DataFrame has no 'content' column.

    Example:
        sports_df = pd.DataFrame({'content': ['article 1', 'article 2']})
        politics_df = pd.DataFrame({'content': ['article 1', 'article 2']})
        create_data_files_from_dataframes(sports_df, politics_df)
    """
    # CRLF must come first so a Windows line ending becomes one space, not two.
    line_breaks = re.compile(r"\r\n|\r|\n")

    def _one_line_articles(frame):
        """Return the 'content' cells of `frame` as single-line strings."""
        rows = []
        for value in frame["content"].tolist():
            if not isinstance(value, str):
                # Skip missing cells instead of crashing on float('nan').replace
                if pd.api.types.is_scalar(value) and pd.isna(value):
                    continue
                value = str(value)
            rows.append(line_breaks.sub(" ", value))
        return rows

    def _write_articles(filename, rows):
        """Write `rows` one per line into output_dir/filename and report it."""
        path = os.path.join(output_dir, filename)
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(row + "\n" for row in rows)
        print(f"✓ Created {path} with {len(rows)} articles")

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Extract texts
    sports_texts = _one_line_articles(sports_df)
    politics_texts = _one_line_articles(politics_df)

    # Write both files in the same format
    _write_articles("sports.txt", sports_texts)
    _write_articles("politics.txt", politics_texts)

    print(f"\n Files created successfully in '{output_dir}/' directory")


def create_sample_data_files(output_dir="data"):
    """
    Create sample data files for testing
    Call this if you don't have DataFrames ready

    Builds two DataFrames (30 sports and 30 politics sentences, each in a
    'content' column) and writes them to disk through
    create_data_files_from_dataframes.

    Args:
        output_dir: Directory the two text files are written to
            (default: "data", the location used when no argument is given).

    Returns:
        Tuple (sports_df, politics_df) of the DataFrames that were written.
    """

    # Sample sports articles
    sports_articles = [
        "Manchester United defeated Liverpool 2-1 in a thrilling Premier League match at Old Trafford.",
        "The Olympic Games opening ceremony showcased spectacular performances from athletes worldwide.",
        "Tennis star won the Grand Slam tournament after a five-set battle on center court.",
        "Basketball team secured championship victory with last-second three-pointer in overtime.",
        "Formula One driver breaks track record in qualifying session for upcoming Grand Prix.",
        "Cricket World Cup final draws massive global audience as teams compete for trophy.",
        "NFL quarterback throws record-breaking touchdown passes in playoff game victory.",
        "Soccer fans celebrate as national team advances to tournament semifinals with penalty shootout win.",
        "Swimmer sets new world record in Olympic trials breaking previous mark by two seconds.",
        "Golf tournament leader maintains strong position after third round at Augusta National.",
        "Boxing champion successfully defends title with unanimous decision victory in title fight.",
        "Hockey team wins Stanley Cup after intense seven-game series against rivals.",
        "Marathon runner finishes race in record time despite challenging weather conditions.",
        "Baseball team clinches division title with walk-off home run in bottom of ninth inning.",
        "Rugby match features incredible try-scoring performance from breakout star player.",
        "Cycling champion wins stage race after grueling mountain climb in Tour de France.",
        "Volleyball team earns gold medal at international championship with straight-set victory.",
        "Figure skating competition showcases breathtaking performances from talented athletes worldwide.",
        "Wrestling championship crowns new champion after dramatic final match performance.",
        "Track and field athlete breaks long-standing record in 100-meter sprint event.",
        "The football striker scored hat-trick in crucial derby match helping team maintain top position.",
        "Basketball playoffs feature intense competition with multiple overtime games thrilling fans.",
        "Tennis tournament reaches exciting conclusion with underdog defeating top seed in final.",
        "Hockey goalie makes incredible saves throughout game leading team to shutout victory.",
        "Baseball pitcher throws perfect game striking out fifteen batters in historic performance.",
        "Soccer team wins continental championship defeating rivals in penalty shootout after final.",
        "Olympic sprinter wins gold medal breaking championship record in 200-meter race.",
        "Golf major championship features dramatic finish with playoff deciding winner on final hole.",
        "Swimming relay team sets world record winning gold medal at international competition.",
        "Boxing match ends in knockout victory as challenger defeats champion in stunning upset.",
    ]

    # Sample politics articles
    politics_articles = [
        "Prime Minister announces new economic policy aimed at reducing unemployment and inflation rates.",
        "Senate votes on controversial healthcare reform bill after weeks of intense debate.",
        "President delivers State of the Union address outlining administration priorities for coming year.",
        "Parliament debates climate change legislation with cross-party support for environmental measures.",
        "Election results show surprising victory for opposition party in key swing districts.",
        "Congressional hearing examines government spending and budget allocation priorities.",
        "Political leaders meet for summit to discuss international trade agreements and tariffs.",
        "Supreme Court issues landmark ruling on constitutional rights impacting federal law.",
        "Governor signs executive order addressing state infrastructure and transportation funding.",
        "Political campaign announces policy platform focused on education and healthcare reform.",
        "Lawmakers negotiate bipartisan compromise on immigration reform legislation in committee.",
        "Presidential candidate leads polls ahead of primary elections in crucial battleground states.",
        "Cabinet reshuffle announced following resignation of senior government minister official.",
        "International diplomacy efforts intensify as nations negotiate peace treaty and sanctions.",
        "Political party convention nominates candidate for upcoming national election campaign.",
        "Congressional committee investigates allegations of corruption in government contracting process.",
        "United Nations assembly votes on resolution addressing humanitarian crisis and aid distribution.",
        "State legislature passes controversial bill on voting rights and election security measures.",
        "Political analyst predicts electoral outcomes based on recent polling data and trends.",
        "Government announces stimulus package to support economy during financial crisis period.",
        "Parliamentary session discusses proposed constitutional amendments affecting citizen rights.",
        "Foreign minister announces new diplomatic initiative to strengthen international relations.",
        "Senate majority leader calls for bipartisan cooperation on infrastructure spending bill.",
        "Political protest draws thousands demanding policy changes on social justice issues.",
        "Presidential administration unveils comprehensive plan to address national security concerns.",
        "Congress debates military spending authorization bill for defense budget allocation.",
        "Governor vetoes legislation citing concerns about fiscal impact on state budget.",
        "International summit addresses global trade disputes with negotiations between powers.",
        "Political scandal investigation leads to resignation of cabinet official amid pressure.",
        "Legislative committee holds hearing on environmental regulations affecting emissions.",
    ]

    # Create DataFrames
    sports_df = pd.DataFrame({'content': sports_articles})
    politics_df = pd.DataFrame({'content': politics_articles})

    # Create files in the requested directory
    create_data_files_from_dataframes(sports_df, politics_df, output_dir=output_dir)

    return sports_df, politics_df


# STEP 1: LOAD DATA FROM FILES

def load_data_from_files(sports_file="data/sports.txt", politics_file="data/politics.txt"):
    """
    Read the two one-document-per-line text files into a shuffled DataFrame.

    Label convention: 1 = Sports, 0 = Politics

    Blank lines are dropped and surrounding whitespace is stripped from every
    document. A file that does not exist only produces a warning line; the
    other file is still loaded.

    Returns:
        DataFrame with columns 'text' and 'label', shuffled with a fixed seed
        and re-indexed from 0.
    """
    print("\n[1/7] LOADING DATA FROM FILES")
    print("-" * 80)

    texts = []
    targets = []

    # Sports is read before politics so the pre-shuffle row order is fixed.
    sources = (
        (sports_file, 1, "sports"),
        (politics_file, 0, "politics"),
    )
    for path, target, topic in sources:
        if not os.path.exists(path):
            print(f" Warning: {path} not found")
            continue

        with open(path, "r", encoding="utf-8") as handle:
            kept = [stripped for stripped in map(str.strip, handle) if stripped]

        texts.extend(kept)
        targets.extend([target] * len(kept))
        print(f" Loaded {topic} data from {path}")

    print("\n Dataset Summary:")
    print(f"   Total documents: {len(texts)}")
    print(f"   Sports (label=1): {targets.count(1)}")
    print(f"   Politics (label=0): {targets.count(0)}")

    frame = pd.DataFrame({'text': texts, 'label': targets})

    # Seeded shuffle keeps runs reproducible.
    return frame.sample(frac=1, random_state=42).reset_index(drop=True)


# STEP 2: EXPLORATORY DATA ANALYSIS


def perform_eda(df):
    """Print per-class text statistics, save EDA plots and list top words.

    Adds the columns 'text_length' (characters) and 'word_count'
    (whitespace-separated tokens) to ``df`` in place, saves a 2x2 figure to
    results/eda_analysis.png and prints the ten most frequent words per class.

    Args:
        df: DataFrame with a 'text' column of strings and a 'label' column
            following the convention 1 = Sports, 0 = Politics.

    Returns:
        The same DataFrame object, now carrying the two extra columns.
    """
    print("\n[2/7] EXPLORATORY DATA ANALYSIS")
    print("-" * 80)

    # Add text statistics (mutates the caller's frame; it is also returned)
    df['text_length'] = df['text'].str.len()
    df['word_count'] = df['text'].str.split().str.len()

    # Statistics by category
    print("\n Text Statistics by Category:")
    print("-" * 80)

    for label, name in [(1, "Sports"), (0, "Politics")]:
        subset = df[df['label'] == label]
        print(f"\n{name} (label={label}):")
        print(f"  Count: {len(subset)}")
        print(f"  Avg length: {subset['text_length'].mean():.1f} characters")
        print(f"  Avg words: {subset['word_count'].mean():.1f} words")
        print(f"  Min words: {subset['word_count'].min()}")
        print(f"  Max words: {subset['word_count'].max()}")

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    class_names = ['Politics (0)', 'Sports (1)']
    colors = ['#FF6B6B', '#4ECDC4']

    # Class distribution
    ax1 = axes[0, 0]
    # reindex guarantees both classes are present (count 0 when absent), so a
    # single-class dataset no longer raises KeyError on the lookup below.
    label_counts = df['label'].value_counts().reindex([0, 1], fill_value=0)
    bars = ax1.bar(class_names,
                   [label_counts.loc[0], label_counts.loc[1]],
                   color=colors, edgecolor='black')
    ax1.set_title('Class Distribution', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Number of Documents')
    for bar in bars:
        height = bar.get_height()
        ax1.text(bar.get_x() + bar.get_width()/2., height,
                 f'{int(height)}',
                 ha='center', va='bottom', fontweight='bold')

    # Text length distribution
    ax2 = axes[0, 1]
    for label, name, color in [(1, 'Sports', '#4ECDC4'), (0, 'Politics', '#FF6B6B')]:
        data = df[df['label'] == label]['text_length']
        ax2.hist(data, alpha=0.6, label=name, bins=20, color=color, edgecolor='black')
    ax2.set_title('Text Length Distribution', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Text Length (characters)')
    ax2.set_ylabel('Frequency')
    ax2.legend()

    # Word count distribution
    ax3 = axes[1, 0]
    for label, name, color in [(1, 'Sports', '#4ECDC4'), (0, 'Politics', '#FF6B6B')]:
        data = df[df['label'] == label]['word_count']
        ax3.hist(data, alpha=0.6, label=name, bins=20, color=color, edgecolor='black')
    ax3.set_title('Word Count Distribution', fontsize=14, fontweight='bold')
    ax3.set_xlabel('Number of Words')
    ax3.set_ylabel('Frequency')
    ax3.legend()

    # Box plot
    ax4 = axes[1, 1]
    box_data = [df[df['label'] == 0]['word_count'], df[df['label'] == 1]['word_count']]
    bp = ax4.boxplot(box_data, patch_artist=True)
    # Tick labels are set explicitly instead of through boxplot's `labels`
    # keyword, which newer matplotlib releases deprecate; boxes sit at x=1, 2.
    ax4.set_xticks([1, 2])
    ax4.set_xticklabels(class_names)
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
    ax4.set_title('Word Count by Category', fontsize=14, fontweight='bold')
    ax4.set_ylabel('Word Count')

    plt.tight_layout()

    # Save plot
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/eda_analysis.png', dpi=300, bbox_inches='tight')
    print("\n✓ Saved visualization: results/eda_analysis.png")
    plt.close()

    # Top words analysis
    print("\n Top Words by Category:")
    print("-" * 80)

    stop_words = {'the', 'and', 'for', 'with', 'from', 'that', 'this',
                  'was', 'were', 'are', 'has', 'had', 'but', 'not', 'after',
                  'before', 'when', 'where', 'who', 'what', 'which', 'how'}

    for label, name in [(1, "Sports"), (0, "Politics")]:
        text = ' '.join(df[df['label'] == label]['text'].values)
        # Lower-case alphabetic tokens of at least three letters
        words = re.findall(r'\b[a-z]{3,}\b', text.lower())
        words = [w for w in words if w not in stop_words]

        print(f"\n{name} (label={label}):")
        for word, freq in Counter(words).most_common(10):
            print(f"  {word:.<20} {freq:>4}")

    return df


# STEP 3: TRAIN-TEST SPLIT


def prepare_train_test_split(df):
    """Make a stratified 80/20 train/test split of the text and label columns.

    Args:
        df: DataFrame with 'text' and 'label' columns.

    Returns:
        Tuple (X_train, X_test, y_train, y_test) as produced by
        train_test_split with a fixed random_state of 42.
    """
    print("\n[3/7] PREPARING TRAIN-TEST SPLIT")
    print("-" * 80)

    texts, targets = df['text'].values, df['label'].values

    # stratify keeps the class ratio the same in both partitions
    X_train, X_test, y_train, y_test = train_test_split(
        texts, targets, test_size=0.2, random_state=42, stratify=targets
    )

    # The test-set heading carries the blank line that separates the two reports.
    partitions = (
        ("Training set", X_train, y_train),
        ("\nTest set", X_test, y_test),
    )
    for heading, docs, y_part in partitions:
        print(f"{heading}: {len(docs)} documents")
        print(f"  Sports (1): {sum(y_part == 1)}")
        print(f"  Politics (0): {sum(y_part == 0)}")

    return X_train, X_test, y_train, y_test


# STEP 4: TRAIN MULTIPLE MODELS


def train_all_models(X_train, X_test, y_train, y_test):
    """Train every feature/model combination and report the best by test F1.

    For each configuration a fresh vectorizer is fitted on the training texts,
    a classifier is trained on the resulting features and scored on the test
    set (accuracy plus weighted precision/recall/F1). A 5-fold cross-validated
    accuracy is computed on the training texts with the vectorizer inside the
    cross-validation pipeline, so each fold's vocabulary is learned from that
    fold's training part only.

    Side effects: prints progress and a summary table, and writes
    results/experiment_results.csv (the directory is created if needed).

    Args:
        X_train: Training documents (array-like of strings).
        X_test: Test documents (array-like of strings).
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        Tuple (results_df, best_model, best_vectorizer, best_config) where
        best_config is (feature_name, model_name) of the highest test F1.
    """
    # Local imports: only this function needs them.
    from sklearn.base import clone
    from sklearn.pipeline import make_pipeline

    print("\n[4/7] TRAINING MULTIPLE MODELS")
    print("-" * 80)

    # Model configurations:
    # (feature type, n-gram range, feature label, model type, model label)
    configs = [
        ('bow', (1, 1), 'Bag of Words', 'nb', 'Naive Bayes'),
        ('bow', (1, 1), 'Bag of Words', 'svm', 'SVM (Linear)'),
        ('bow', (1, 1), 'Bag of Words', 'lr', 'Logistic Regression'),
        ('bow', (1, 1), 'Bag of Words', 'rf', 'Random Forest'),
        ('tfidf', (1, 1), 'TF-IDF (1-gram)', 'nb', 'Naive Bayes'),
        ('tfidf', (1, 1), 'TF-IDF (1-gram)', 'svm', 'SVM (Linear)'),
        ('tfidf', (1, 1), 'TF-IDF (1-gram)', 'lr', 'Logistic Regression'),
        ('tfidf', (1, 1), 'TF-IDF (1-gram)', 'rf', 'Random Forest'),
        ('tfidf', (1, 2), 'TF-IDF (1-2-gram)', 'nb', 'Naive Bayes'),
        ('tfidf', (1, 2), 'TF-IDF (1-2-gram)', 'svm', 'SVM (Linear)'),
        ('tfidf', (1, 2), 'TF-IDF (1-2-gram)', 'lr', 'Logistic Regression'),
        ('tfidf', (1, 2), 'TF-IDF (1-2-gram)', 'rf', 'Random Forest'),
    ]

    results = []
    # Start below the minimum possible F1 (0.0) so the first configuration is
    # always recorded and best_config can never stay None.
    best_score = -1.0
    best_model = None
    best_vectorizer = None
    best_config = None

    print(f"\nRunning {len(configs)} experiments...\n")

    for feat_type, ngram, feat_name, model_type, model_name in configs:
        print(f"Training: {model_name} + {feat_name}")

        # Create vectorizer (same settings for both feature types)
        vectorizer_cls = CountVectorizer if feat_type == 'bow' else TfidfVectorizer
        vectorizer = vectorizer_cls(
            max_features=5000,
            ngram_range=ngram,
            stop_words='english'
        )

        # Vectorize
        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        # Create model
        if model_type == 'nb':
            model = MultinomialNB()
        elif model_type == 'svm':
            model = LinearSVC(random_state=42, max_iter=2000)
        elif model_type == 'lr':
            model = LogisticRegression(random_state=42, max_iter=1000)
        else:
            model = RandomForestClassifier(n_estimators=100, random_state=42)

        # Unfitted copies for cross-validation; the vectorizer is refitted
        # inside every fold instead of seeing the validation documents.
        cv_pipeline = make_pipeline(clone(vectorizer), clone(model))

        # Train
        model.fit(X_train_vec, y_train)

        # Evaluate
        y_pred = model.predict(X_test_vec)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted')
        rec = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Cross-validation on raw texts through the pipeline
        cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')

        results.append({
            'Feature': feat_name,
            'Model': model_name,
            'Accuracy': acc,
            'Precision': prec,
            'Recall': rec,
            'F1-Score': f1,
            'CV Mean': cv_scores.mean(),
            'CV Std': cv_scores.std()
        })

        print(f"  Acc: {acc:.3f} | F1: {f1:.3f} | CV: {cv_scores.mean():.3f}")

        # Track best model.
        # NOTE(review): selection uses the test-set F1, so the reported best
        # score is optimistically biased -- consider selecting on CV instead.
        if f1 > best_score:
            best_score = f1
            best_model = model
            best_vectorizer = vectorizer
            best_config = (feat_name, model_name)

    results_df = pd.DataFrame(results)

    print("\n" + "="*80)
    print("RESULTS SUMMARY")
    print("="*80)
    print(results_df.to_string(index=False))

    # Save results (make sure the target directory exists first)
    os.makedirs('results', exist_ok=True)
    results_df.to_csv('results/experiment_results.csv', index=False)
    print("\n Saved results to: results/experiment_results.csv")

    print(f"\n BEST MODEL: {best_config[1]} + {best_config[0]}")
    print(f"   F1-Score: {best_score:.4f}")

    return results_df, best_model, best_vectorizer, best_config


# STEP 5: VISUALIZE RESULTS


def visualize_results(results_df):
    """Plot the experiment results and save them as one PNG.

    Draws a grouped bar chart of accuracy and a heatmap of F1-score, both
    arranged as model (rows) by feature type (columns), and writes the figure
    to results/model_comparison.png. The directory is created if it does not
    exist, so the function also works when called on its own.

    Args:
        results_df: DataFrame with the columns 'Model', 'Feature', 'Accuracy'
            and 'F1-Score', holding one row per (Model, Feature) pair.
    """
    print("\n[5/7] CREATING VISUALIZATIONS")
    print("-" * 80)

    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Bar plot
    pivot = results_df.pivot(index='Model', columns='Feature', values='Accuracy')
    pivot.plot(kind='bar', ax=axes[0], rot=45)
    axes[0].set_title('Accuracy Comparison', fontweight='bold', fontsize=14)
    axes[0].set_ylabel('Accuracy')
    axes[0].legend(title='Feature Type', bbox_to_anchor=(1.05, 1))
    axes[0].grid(axis='y', alpha=0.3)

    # Heatmap
    pivot_f1 = results_df.pivot(index='Model', columns='Feature', values='F1-Score')
    sns.heatmap(pivot_f1, annot=True, fmt='.3f', cmap='YlGnBu', ax=axes[1],
                cbar_kws={'label': 'F1-Score'})
    axes[1].set_title('F1-Score Heatmap', fontweight='bold', fontsize=14)

    plt.tight_layout()
    # savefig does not create missing directories
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/model_comparison.png', dpi=300, bbox_inches='tight')
    print(" Saved visualization: results/model_comparison.png")
    plt.close()


# STEP 6: BEST MODEL ANALYSIS


def analyze_best_model(best_model, best_vectorizer, X_train, X_test, y_train, y_test):
    """Plot the confusion matrix and print the classification report.

    The test documents are transformed with the already-fitted vectorizer and
    classified with the already-fitted model. The confusion matrix is saved to
    results/confusion_matrix.png (directory created if needed).

    Args:
        best_model: Fitted classifier exposing predict().
        best_vectorizer: Fitted vectorizer exposing transform().
        X_train: Unused; kept so existing callers keep working.
        X_test: Test documents.
        y_train: Unused; kept so existing callers keep working.
        y_test: True labels for X_test (1 = Sports, 0 = Politics).

    Returns:
        The predicted labels for X_test.
    """
    print("\n[6/7] BEST MODEL ANALYSIS")
    print("-" * 80)

    class_labels = [0, 1]
    target_names = ['Politics (0)', 'Sports (1)']

    # Only the test split is needed here
    X_test_vec = best_vectorizer.transform(X_test)

    # Predictions
    y_pred = best_model.predict(X_test_vec)

    # Confusion Matrix; fixing `labels` keeps it 2x2 and aligned with the
    # tick labels even when one class is missing from y_test / y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=target_names,
                yticklabels=target_names)
    plt.title('Confusion Matrix - Best Model', fontweight='bold', fontsize=14)
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    # savefig does not create missing directories
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/confusion_matrix.png', dpi=300, bbox_inches='tight')
    print(" Saved visualization: results/confusion_matrix.png")
    plt.close()

    # Classification Report
    print("\n Classification Report:")
    print("-" * 80)
    # `labels` must be given so target_names always matches the class count
    print(classification_report(y_test, y_pred, labels=class_labels,
                                target_names=target_names))

    return y_pred


# STEP 7: SAVE MODEL AND TEST


def save_and_test_model(best_model, best_vectorizer):
    """Pickle the model/vectorizer pair and classify six demo sentences.

    The pair is stored as a dict with the keys 'model' and 'vectorizer' in
    models/best_classifier.pkl (directory created if needed). Each demo
    sentence is then printed with its predicted label and, when the model
    offers predict_proba or decision_function, a confidence percentage.

    For decision_function models the score is squashed with a logistic
    sigmoid. That value is a monotone confidence indicator, not a calibrated
    probability.

    Args:
        best_model: Fitted classifier exposing predict() and optionally
            predict_proba() or decision_function().
        best_vectorizer: Fitted vectorizer exposing transform().
    """
    print("\n[7/7] SAVING MODEL & TESTING")
    print("-" * 80)

    # Save model
    os.makedirs('models', exist_ok=True)

    model_data = {
        'model': best_model,
        'vectorizer': best_vectorizer
    }

    # NOTE: pickle files must only ever be loaded from trusted sources.
    with open('models/best_classifier.pkl', 'wb') as f:
        pickle.dump(model_data, f)

    print(" Model saved to: models/best_classifier.pkl")

    # Test with new examples
    test_examples = [
        "The basketball team won the championship in overtime with a buzzer beater",
        "Senate voted on the healthcare reform bill after lengthy debate",
        "Olympic swimmer breaks world record in 200m freestyle finals",
        "President signs executive order addressing climate change policy",
        "Football striker scores hat-trick in crucial derby match victory",
        "Congressional committee investigates corruption allegations in government"
    ]

    print("\n Testing with new examples:")
    print("-" * 80)

    X_new = best_vectorizer.transform(test_examples)
    predictions = best_model.predict(X_new)

    # Get probabilities if available
    if hasattr(best_model, 'predict_proba'):
        probas = best_model.predict_proba(X_new)
    elif hasattr(best_model, 'decision_function'):
        decision = np.asarray(best_model.decision_function(X_new), dtype=float)
        # sigmoid(d) == 0.5 * (1 + tanh(d / 2)); unlike exp(d) / (1 + exp(d))
        # this form cannot overflow to inf/inf = nan for large |d|.
        positive = 0.5 * (1.0 + np.tanh(0.5 * decision))
        probas = np.column_stack([1 - positive, positive])
    else:
        probas = None

    for i, (text, pred) in enumerate(zip(test_examples, predictions)):
        label_name = "Sports" if pred == 1 else "Politics"

        print(f"\n{i+1}. {text}")
        if probas is not None:
            # Column index equals the label because the classes are 0 and 1
            confidence = probas[i][pred] * 100
            print(f"   → Label: {pred} ({label_name}) | Confidence: {confidence:.1f}%")
        else:
            print(f"   → Label: {pred} ({label_name})")


# MAIN EXECUTION


def main():
    """Run the full pipeline: load, explore, split, train, plot, analyze, save.

    Stops early with an explanatory message when no documents were loaded or
    when only one of the two classes is present, because the stratified split
    and the classifiers need both classes.
    """

    # Load data
    df = load_data_from_files()

    if df.empty:
        print("\n ERROR: No data loaded. Please check your files:")
        print("   - data/sports.txt")
        print("   - data/politics.txt")
        return

    # A single-class dataset cannot be split with stratification or classified
    if df['label'].nunique() < 2:
        print("\n ERROR: Only one class was loaded. Both files are required:")
        print("   - data/sports.txt")
        print("   - data/politics.txt")
        return

    # EDA
    df = perform_eda(df)

    # Train-test split
    X_train, X_test, y_train, y_test = prepare_train_test_split(df)

    # Train models
    results_df, best_model, best_vectorizer, best_config = train_all_models(
        X_train, X_test, y_train, y_test
    )

    # Visualize
    visualize_results(results_df)

    # Analyze best model (the returned predictions are not needed here)
    analyze_best_model(best_model, best_vectorizer,
                       X_train, X_test, y_train, y_test)

    # Save and test
    save_and_test_model(best_model, best_vectorizer)

    print("\n" + "="*80)
    print(" CLASSIFICATION COMPLETE!")
    print("="*80)
    print("\nGenerated Files:")
    print("   results/eda_analysis.png - Data exploration")
    print("   results/model_comparison.png - Model performance")
    print("   results/confusion_matrix.png - Best model accuracy")
    print("   results/experiment_results.csv - Detailed results")
    print("   models/best_classifier.pkl - Trained model")
    print("="*80)

if __name__ == "__main__":
    main()

SPORTS VS POLITICS TEXT CLASSIFIER
Label Convention: 1 = Sports, 0 = Politics

[1/7] LOADING DATA FROM FILES
--------------------------------------------------------------------------------
 Loaded sports data from data/sports.txt
 Loaded politics data from data/politics.txt

 Dataset Summary:
   Total documents: 60
   Sports (label=1): 30
   Politics (label=0): 30

[2/7] EXPLORATORY DATA ANALYSIS
--------------------------------------------------------------------------------

 Text Statistics by Category:
--------------------------------------------------------------------------------

Sports (label=1):
  Count: 30
  Avg length: 86.9 characters
  Avg words: 12.0 words
  Min words: 10
  Max words: 14

Politics (label=0):
  Count: 30
  Avg length: 89.0 characters
  Avg words: 10.9 words
  Min words: 9
  Max words: 13

✓ Saved visualization: results/eda_analysis.png

 Top Words by Category:
--------------------------------------------------------------------------------

Sports (label=1

In [11]:
"""
Helper Script: Create Data Files from DataFrames
Converts pandas DataFrames to the required text file format
"""

import pandas as pd
import os

# USAGE EXAMPLES


if __name__ == "__main__":
    # Header banner
    print(
        "=" * 80,
        "DATA FILE CREATOR",
        "Creates sports.txt and politics.txt in correct format",
        "=" * 80,
        sep="\n",
    )

    # Example 1: Create sample files
    print("\n Creating sample data files...\n")
    sports_df, politics_df = create_sample_data_files()

    # Closing banner with the follow-up command
    print(
        "\n" + "=" * 80,
        " DONE! You can now run:",
        "   python classifier_with_labels.py",
        "=" * 80,
        sep="\n",
    )

    # Example 2: If you already have DataFrames
    #
    # Uncomment and use this if you have your own DataFrames:
    #
    # # Load your data
    # sports_df = pd.read_csv('your_sports_data.csv')  # Must have 'content' column
    # politics_df = pd.read_csv('your_politics_data.csv')  # Must have 'content' column
    #
    # # Create files
    # create_data_files_from_dataframes(sports_df, politics_df)

DATA FILE CREATOR
Creates sports.txt and politics.txt in correct format

 Creating sample data files...

✓ Created data/sports.txt with 30 articles
✓ Created data/politics.txt with 30 articles

 Files created successfully in 'data/' directory

 DONE! You can now run:
   python classifier_with_labels.py
