<a href="https://colab.research.google.com/github/appliedcode/mthree-c422/blob/mthree-c422-Prashanth/Text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Required Libraries
import pandas as pd
import numpy as np
import re
import string
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             classification_report, confusion_matrix, roc_curve, auc)
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# 1. Create Sample Movie Reviews Dataset
def create_movie_review_dataset():
    # Positive reviews
    positive_reviews = [
        "I absolutely loved this film! The story was compelling and the acting superb.",
        "An amazing cinematic experience with brilliant performances.",
        "A heartwarming story that touched me deeply.",
        "Fantastic direction and stunning visuals. Highly recommended!",
        "Top-notch screenplay and incredible soundtrack."
    ]
    # Negative reviews
    negative_reviews = [
        "The movie was boring and slow-paced; I wouldn’t recommend it.",
        "Terrible acting and a predictable script ruined it for me.",
        "Waste of time. The plot made no sense at all.",
        "Poor editing and bad special effects spoiled the experience.",
        "One of the worst movies I have ever watched."
    ]
    # Neutral reviews
    neutral_reviews = [
        "It was an okay movie — nothing special but watchable.",
        "The storyline was average and the characters were decent.",
        "Some parts were entertaining, others quite dull.",
        "Not great, not terrible, just an average film experience.",
        "A middling movie, not memorable but not horrible either."
    ]
    texts = positive_reviews + negative_reviews + neutral_reviews
    labels = (['Positive'] * len(positive_reviews) +
              ['Negative'] * len(negative_reviews) +
              ['Neutral'] * len(neutral_reviews))
    return pd.DataFrame({'text': texts, 'sentiment': labels})

df = create_movie_review_dataset()

# 2. Text Preprocessing Pipeline
class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        text = text.lower()
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        text = text.strip()
        return text

    def preprocess(self, text):
        text = self.clean_text(text)
        tokens = word_tokenize(text)
        tokens = [t for t in tokens if t not in self.stop_words and len(t) > 2]
        tokens = [self.lemmatizer.lemmatize(t) for t in tokens]
        return " ".join(tokens)

preprocessor = TextPreprocessor()
df['processed_text'] = df['text'].apply(preprocessor.preprocess)

# 3. Basic Text Classification Pipeline (Naive Bayes)

# Vectorize text
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2))
X = vectorizer.fit_transform(df['processed_text'])
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict
y_pred = nb_model.predict(X_test)

# Evaluation
print("=== Naive Bayes Classification Report ===")
print(classification_report(y_test, y_pred, target_names=le.classes_))
cm = confusion_matrix(y_test, y_pred)

# Visualize Confusion Matrix
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Naive Bayes')
plt.show()

# 4. Model Comparison Framework with Cross-Validation

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM (Linear)': SVC(kernel='linear', probability=True),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}
for name, model in models.items():
    print(f"\nTraining and cross-validating {name} ...")
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(max_features=1000, ngram_range=(1,2))),
        ('clf', model)
    ])
    scores = cross_val_score(pipeline, df['processed_text'], y, cv=cv, scoring='f1_macro')
    print(f"F1 Macro CV Mean: {scores.mean():.3f} | Std: {scores.std():.3f}")
    results[name] = scores.mean()

# Visualize Model Comparison
plt.figure(figsize=(8,5))
sns.barplot(x=list(results.keys()), y=list(results.values()))
plt.ylabel('Mean F1 Macro Score (5-fold CV)')
plt.title('Model Comparison')
plt.xticks(rotation=45)
plt.show()

# 5. Advanced Feature Engineering - Comparing Vectorizers

vectorizers = {
    'CountVectorizer (unigrams)': CountVectorizer(max_features=1000, ngram_range=(1,1)),
    'TF-IDF (unigrams)': TfidfVectorizer(max_features=1000, ngram_range=(1,1)),
    'TF-IDF (bigrams)': TfidfVectorizer(max_features=1000, ngram_range=(2,2)),
    'TF-IDF (unigrams + bigrams)': TfidfVectorizer(max_features=1000, ngram_range=(1,2)),
    'TF-IDF (char 3-5 grams)': TfidfVectorizer(max_features=1000, analyzer='char', ngram_range=(3,5))
}

feature_results = {}
model_fe = LogisticRegression(max_iter=1000)
for name, vect in vectorizers.items():
    X_vect = vect.fit_transform(df['processed_text'])
    scores = cross_val_score(model_fe, X_vect, y, cv=cv, scoring='f1_macro')
    print(f"{name}: F1 Macro CV Mean={scores.mean():.3f}")
    feature_results[name] = scores.mean()

# Plot Feature Engineering Results
plt.figure(figsize=(10,5))
sns.barplot(x=list(feature_results.keys()), y=list(feature_results.values()))
plt.xticks(rotation=45)
plt.ylabel('Mean F1 Macro (Cross-Validation)')
plt.title('Feature Extraction Method Comparison')
plt.show()

# 6. Sentiment Classification with Prediction Confidence (Logistic Regression)

pipeline_lr = Pipeline([
    ('vect', TfidfVectorizer(max_features=1000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000))
])
pipeline_lr.fit(df['processed_text'], y)
y_proba = pipeline_lr.predict_proba(df['processed_text'])
max_confidence = np.max(y_proba, axis=1)

# Confidence Distribution Plot
plt.figure(figsize=(6,4))
plt.hist(max_confidence, bins=20, alpha=0.7)
plt.xlabel('Prediction Confidence')
plt.ylabel('Frequency')
plt.title('Prediction Confidence Distribution (Logistic Regression)')
plt.show()

# Predict with confidence function
def predict_with_confidence(text, pipeline, label_encoder):
    text_proc = preprocessor.preprocess(text)
    probs = pipeline.predict_proba([text_proc])[0]
    pred_idx = np.argmax(probs)
    pred_label = label_encoder.inverse_transform([pred_idx])[0]
    confidence = probs[pred_idx]
    return pred_label, confidence

# Test on new reviews
test_texts = [
    "What a fantastic movie! Loved every minute of it.",
    "It was dull and uninteresting. Not worth watching.",
    "Some scenes were good, but overall just okay."
]

for txt in test_texts:
    pred, conf = predict_with_confidence(txt, pipeline_lr, le)
    print(f"Review: {txt}\nPredicted Sentiment: {pred}, Confidence: {conf:.3f}\n")

# 7. Hyperparameter Tuning Example with GridSearchCV (Logistic Regression)

param_grid = {
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__penalty': ['l2'],
    'clf__solver': ['lbfgs']
}

grid_search = GridSearchCV(pipeline_lr, param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid_search.fit(df['processed_text'], y)

print("Best parameters:", grid_search.best_params_)
print(f"Best cross-validated F1 Macro: {grid_search.best_score_:.3f}")

# Use best model
best_model = grid_search.best_estimator_

# Done! You now have a full working pipeline for movie reviews sentiment classification.



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [2]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Text processing
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Classification models
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score,
    roc_curve, precision_recall_curve
)

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download required NLTK data
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Set style for plots
plt.style.use('default')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print(f"🐍 NumPy version: {np.__version__}")
print(f"🐼 Pandas version: {pd.__version__}")


✅ All libraries imported successfully!
🐍 NumPy version: 2.0.2
🐼 Pandas version: 2.2.2


In [3]:
def create_news_dataset():
    """Create a sample news classification dataset"""

    # Technology news
    tech_news = [
        "New AI breakthrough enables faster machine learning algorithms",
        "Tech giant releases latest smartphone with advanced camera features",
        "Cybersecurity experts warn about new malware threats",
        "Cloud computing adoption accelerates across enterprises",
        "Quantum computing research shows promising results",
        "Software update fixes critical security vulnerabilities",
        "Virtual reality technology transforms gaming industry",
        "Internet of Things devices increase smart home adoption",
        "Blockchain technology revolutionizes financial transactions",
        "Artificial intelligence improves medical diagnosis accuracy"
    ]

    # Sports news
    sports_news = [
        "Championship game ends with dramatic overtime victory",
        "Star athlete breaks long-standing world record",
        "Team trades popular player to rival franchise",
        "Olympic preparations continue despite venue challenges",
        "Coach announces retirement after successful career",
        "Rookie player shows exceptional talent in debut season",
        "Stadium renovations completed before season opener",
        "League implements new rules to improve player safety",
        "International tournament draws record viewership",
        "Injury forces veteran player to miss championship"
    ]

    # Business news
    business_news = [
        "Stock market reaches all-time high amid economic optimism",
        "Major corporation announces significant layoffs",
        "Startup secures massive funding round from investors",
        "Merger creates industry-leading company",
        "Economic indicators suggest potential recession",
        "Company reports record quarterly profits",
        "Trade negotiations impact global supply chains",
        "Central bank adjusts interest rates",
        "Retail sales decline during holiday season",
        "Cryptocurrency market experiences volatile trading"
    ]

    # Health news
    health_news = [
        "New vaccine shows high effectiveness in clinical trials",
        "Medical breakthrough offers hope for cancer patients",
        "Health officials recommend updated safety guidelines",
        "Research reveals benefits of Mediterranean diet",
        "Mental health awareness campaign launches nationwide",
        "Gene therapy treatment approved for rare disease",
        "Exercise study shows surprising cardiovascular benefits",
        "Pharmaceutical company recalls contaminated medication",
        "Telemedicine adoption grows in rural communities",
        "Sleep disorder research identifies new treatment options"
    ]

    # Combine all categories
    texts = tech_news + sports_news + business_news + health_news
    labels = (['Technology'] * len(tech_news) +
             ['Sports'] * len(sports_news) +
             ['Business'] * len(business_news) +
             ['Health'] * len(health_news))

    return pd.DataFrame({'text': texts, 'category': labels})

def create_sentiment_dataset():
    """Create a sample sentiment classification dataset"""

    positive_reviews = [
        "This product is absolutely amazing! Highly recommend it.",
        "Outstanding quality and excellent customer service.",
        "Best purchase I've made this year. Love it!",
        "Incredible value for money. Very satisfied.",
        "Perfect product, fast shipping, great experience.",
        "Exceeded my expectations in every way possible.",
        "Fantastic build quality and beautiful design.",
        "Works perfectly and arrived ahead of schedule.",
        "Brilliant product with innovative features.",
        "Absolutely delighted with this purchase."
    ]

    negative_reviews = [
        "Terrible product, completely waste of money.",
        "Poor quality and horrible customer support.",
        "Worst purchase ever. Do not recommend.",
        "Overpriced and underdelivered. Very disappointed.",
        "Product broke after one week. Awful quality.",
        "Completely useless and poorly designed.",
        "Cheap materials and terrible build quality.",
        "Doesn't work as advertised. Very frustrating.",
        "Defective product with no customer support.",
        "Regret buying this. Save your money."
    ]

    neutral_reviews = [
        "Product is okay, nothing special but works fine.",
        "Average quality for the price point.",
        "It's decent but could be better.",
        "Standard product with basic features.",
        "Neither good nor bad, just mediocre.",
        "Works as expected, no surprises.",
        "Reasonable quality for a budget option.",
        "It's fine but not outstanding.",
        "Acceptable product with room for improvement.",
        "Does the job but nothing impressive."
    ]

    texts = positive_reviews + negative_reviews + neutral_reviews
    labels = (['Positive'] * len(positive_reviews) +
             ['Negative'] * len(negative_reviews) +
             ['Neutral'] * len(neutral_reviews))

    return pd.DataFrame({'text': texts, 'sentiment': labels})

# Create datasets
news_df = create_news_dataset()
sentiment_df = create_sentiment_dataset()

print("📰 News Dataset:")
print(f"   Total samples: {len(news_df)}")
print(f"   Categories: {news_df['category'].value_counts().to_dict()}")

print("\n😊 Sentiment Dataset:")
print(f"   Total samples: {len(sentiment_df)}")
print(f"   Sentiments: {sentiment_df['sentiment'].value_counts().to_dict()}")

# Display sample data
print("\n📋 Sample News Data:")
display(news_df.head())

print("\n📋 Sample Sentiment Data:")
display(sentiment_df.head())


📰 News Dataset:
   Total samples: 40
   Categories: {'Technology': 10, 'Sports': 10, 'Business': 10, 'Health': 10}

😊 Sentiment Dataset:
   Total samples: 30
   Sentiments: {'Positive': 10, 'Negative': 10, 'Neutral': 10}

📋 Sample News Data:


Unnamed: 0,text,category
0,New AI breakthrough enables faster machine lea...,Technology
1,Tech giant releases latest smartphone with adv...,Technology
2,Cybersecurity experts warn about new malware t...,Technology
3,Cloud computing adoption accelerates across en...,Technology
4,Quantum computing research shows promising res...,Technology



📋 Sample Sentiment Data:


Unnamed: 0,text,sentiment
0,This product is absolutely amazing! Highly rec...,Positive
1,Outstanding quality and excellent customer ser...,Positive
2,Best purchase I've made this year. Love it!,Positive
3,Incredible value for money. Very satisfied.,Positive
4,"Perfect product, fast shipping, great experience.",Positive


In [4]:
class TextPreprocessor:
    """Comprehensive text preprocessing pipeline for classification"""

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Basic text cleaning"""
        if pd.isna(text):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove email addresses
        text = re.sub(r'\S+@\S+', '', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def remove_punctuation(self, text):
        """Remove punctuation while preserving sentence structure"""
        # Keep periods, exclamation marks, question marks for sentence boundaries
        punctuation_to_remove = string.punctuation.replace('.', '').replace('!', '').replace('?', '')
        text = text.translate(str.maketrans('', '', punctuation_to_remove))
        return text

    def tokenize_and_filter(self, text, remove_stopwords=True, apply_stemming=False, apply_lemmatization=True):
        """Tokenize text and apply filtering options"""
        # Tokenize
        tokens = word_tokenize(text)

        # Filter out short tokens and numbers
        tokens = [token for token in tokens if len(token) > 2 and not token.isdigit()]

        # Remove stopwords
        if remove_stopwords:
            tokens = [token for token in tokens if token not in self.stop_words]

        # Apply stemming
        if apply_stemming:
            tokens = [self.stemmer.stem(token) for token in tokens]

        # Apply lemmatization
        if apply_lemmatization:
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        return tokens

    def preprocess_pipeline(self, text, remove_stopwords=True, apply_stemming=False, apply_lemmatization=True):
        """Complete preprocessing pipeline"""
        # Clean text
        text = self.clean_text(text)

        # Remove punctuation
        text = self.remove_punctuation(text)

        # Tokenize and filter
        tokens = self.tokenize_and_filter(
            text,
            remove_stopwords=remove_stopwords,
            apply_stemming=apply_stemming,
            apply_lemmatization=apply_lemmatization
        )

        # Join tokens back to text
        return ' '.join(tokens)

    def preprocess_dataframe(self, df, text_column, **kwargs):
        """Apply preprocessing to entire dataframe"""
        df_processed = df.copy()
        df_processed[f'{text_column}_processed'] = df_processed[text_column].apply(
            lambda x: self.preprocess_pipeline(x, **kwargs)
        )
        return df_processed

# Initialize preprocessor
preprocessor = TextPreprocessor()

# Test preprocessing
sample_text = "This is a GREAT product!!! I highly recommend it. Visit https://example.com for more info."
print("📝 Original text:")
print(f"   {sample_text}")

print("\n🔧 Preprocessed text:")
processed_text = preprocessor.preprocess_pipeline(sample_text)
print(f"   {processed_text}")

print("\n✅ Text preprocessing functions ready!")

📝 Original text:
   This is a GREAT product!!! I highly recommend it. Visit https://example.com for more info.

🔧 Preprocessed text:
   great product highly recommend visit info

✅ Text preprocessing functions ready!


In [5]:
# TODO: Implement basic text classification pipeline

def basic_classification_pipeline(df, text_column, label_column, test_size=0.2, random_state=42):
    """
    Complete text classification pipeline

    Parameters:
    -----------
    df : DataFrame
        Input dataset
    text_column : str
        Name of text column
    label_column : str
        Name of label column
    test_size : float
        Proportion of test set
    random_state : int
        Random seed for reproducibility

    Returns:
    --------
    dict : Dictionary containing trained model, vectorizer, and evaluation results
    """

    # Step 1: Preprocess text data
    print("🔧 Step 1: Preprocessing text data...")
    # Your code here

    # Step 2: Create TF-IDF vectors
    print("📊 Step 2: Creating TF-IDF vectors...")
    # Your code here

    # Step 3: Split data into train/test sets
    print("🔀 Step 3: Splitting data...")
    # Your code here

    # Step 4: Train Multinomial Naive Bayes classifier
    print("🤖 Step 4: Training classifier...")
    # Your code here

    # Step 5: Make predictions
    print("🎯 Step 5: Making predictions...")
    # Your code here

    # Step 6: Evaluate model
    print("📈 Step 6: Evaluating model...")
    # Your code here

    # Return results
    return {
        'model': None,  # Replace with trained model
        'vectorizer': None,  # Replace with fitted vectorizer
        'accuracy': 0.0,  # Replace with actual accuracy
        'classification_report': None,  # Replace with classification report
        'confusion_matrix': None,  # Replace with confusion matrix
        'predictions': None,  # Replace with predictions
        'test_labels': None  # Replace with test labels
    }

# Test the pipeline
print("🏋️ Testing Basic Classification Pipeline")
print("=" * 50)

results = basic_classification_pipeline(news_df, 'text', 'category')

# Display results
print(f"\n📊 Model Performance:")
print(f"   Accuracy: {results['accuracy']:.3f}")

if results['classification_report']:
    print("\n📋 Classification Report:")
    print(results['classification_report'])

🏋️ Testing Basic Classification Pipeline
🔧 Step 1: Preprocessing text data...
📊 Step 2: Creating TF-IDF vectors...
🔀 Step 3: Splitting data...
🤖 Step 4: Training classifier...
🎯 Step 5: Making predictions...
📈 Step 6: Evaluating model...

📊 Model Performance:
   Accuracy: 0.000


In [6]:
# TODO: Implement model comparison framework

class ModelComparator:
    """Compare multiple classification models"""

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.models = {
            'Multinomial NB': MultinomialNB(),
            'Logistic Regression': LogisticRegression(random_state=random_state, max_iter=1000),
            'SVM (Linear)': SVC(kernel='linear', random_state=random_state, probability=True),
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state),
            'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
        }
        self.results = {}

    def prepare_data(self, df, text_column, label_column, test_size=0.2):
        """
        Prepare data for classification

        Parameters:
        -----------
        df : DataFrame
            Input dataset
        text_column : str
            Name of text column
        label_column : str
            Name of label column
        test_size : float
            Proportion of test set
        """
        # Your code here - preprocess data and create train/test splits
        pass

    def train_and_evaluate_models(self, cv_folds=5):
        """
        Train and evaluate all models using cross-validation

        Parameters:
        -----------
        cv_folds : int
            Number of cross-validation folds
        """
        # Your code here - implement model training and evaluation
        pass

    def get_detailed_metrics(self, model_name):
        """
        Get detailed metrics for a specific model

        Parameters:
        -----------
        model_name : str
            Name of the model
        """
        # Your code here - calculate detailed metrics
        pass

    def visualize_results(self):
        """
        Create visualizations of model comparison results
        """
        # Your code here - create comparison visualizations
        pass

    def predict_new_text(self, text, best_model_name=None):
        """
        Predict class for new text using the best model

        Parameters:
        -----------
        text : str
            Text to classify
        best_model_name : str
            Name of best model to use (if None, auto-select)
        """
        # Your code here - implement prediction for new text
        pass

# Test the model comparator
print("🤖 Testing Model Comparison Framework")
print("=" * 50)

comparator = ModelComparator()

# Prepare data
print("📊 Preparing data...")
comparator.prepare_data(news_df, 'text', 'category')

# Train and evaluate models
print("🔄 Training and evaluating models...")
comparator.train_and_evaluate_models()

# Visualize results
print("📈 Creating visualizations...")
comparator.visualize_results()

# Test prediction on new text
new_text = "Scientists develop revolutionary artificial intelligence algorithm for medical diagnosis"
print(f"\n🎯 Predicting category for: '{new_text}'")
prediction = comparator.predict_new_text(new_text)
print(f"   Predicted category: {prediction}")

🤖 Testing Model Comparison Framework
📊 Preparing data...
🔄 Training and evaluating models...
📈 Creating visualizations...

🎯 Predicting category for: 'Scientists develop revolutionary artificial intelligence algorithm for medical diagnosis'
   Predicted category: None


In [7]:
class FeatureEngineering:
    """Advanced feature engineering for text classification"""

    def __init__(self):
        self.experiments = {}
        self.preprocessor = TextPreprocessor()

    def create_vectorizers(self, max_features=1000):
        """
        Create different vectorizers for comparison

        Parameters:
        -----------
        max_features : int
            Maximum number of features to extract

        Returns:
        --------
        dict : Dictionary of vectorizers
        """
        # Your code here - create various vectorizers
        vectorizers = {
            'CountVectorizer (unigram)': None,
            'CountVectorizer (bigram)': None,
            'TF-IDF (unigram)': None,
            'TF-IDF (bigram)': None,
            'TF-IDF (trigram)': None,
            'Character n-grams': None
        }

        return vectorizers

    def experiment_preprocessing(self, df, text_column, label_column):
        """
        Experiment with different preprocessing options

        Parameters:
        -----------
        df : DataFrame
            Input dataset
        text_column : str
            Name of text column
        label_column : str
            Name of label column
        """
        # Your code here - test different preprocessing configurations
        preprocessing_configs = [
            {'remove_stopwords': True, 'apply_lemmatization': True, 'apply_stemming': False},
            {'remove_stopwords': False, 'apply_lemmatization': True, 'apply_stemming': False},
            {'remove_stopwords': True, 'apply_lemmatization': False, 'apply_stemming': True},
            {'remove_stopwords': True, 'apply_lemmatization': False, 'apply_stemming': False}
        ]

        # Test each configuration
        # Your implementation here
        pass

    def analyze_feature_importance(self, model, vectorizer, feature_names, top_k=20):
        """
        Analyze feature importance for trained model

        Parameters:
        -----------
        model : sklearn model
            Trained classification model
        vectorizer : sklearn vectorizer
            Fitted vectorizer
        feature_names : array
            Feature names from vectorizer
        top_k : int
            Number of top features to analyze
        """
        # Your code here - analyze and visualize feature importance
        pass

    def compare_vectorization_methods(self, df, text_column, label_column):
        """
        Compare different vectorization methods

        Parameters:
        -----------
        df : DataFrame
            Input dataset
        text_column : str
            Name of text column
        label_column : str
            Name of label column
        """
        # Your code here - compare vectorization methods
        pass

    def visualize_experiments(self):
        """
        Create visualizations of feature engineering experiments
        """
        # Your code here - create comprehensive visualizations
        pass

# Test feature engineering
print("🔬 Testing Advanced Feature Engineering")
print("=" * 50)

feature_engineer = FeatureEngineering()

# Test different vectorizers
print("📊 Comparing vectorization methods...")
feature_engineer.compare_vectorization_methods(news_df, 'text', 'category')

# Test preprocessing options
print("🔧 Experimenting with preprocessing...")
feature_engineer.experiment_preprocessing(news_df, 'text', 'category')

# Visualize results
print("📈 Creating visualizations...")
feature_engineer.visualize_experiments()


🔬 Testing Advanced Feature Engineering
📊 Comparing vectorization methods...
🔧 Experimenting with preprocessing...
📈 Creating visualizations...


In [8]:
# TODO: Implement sentiment classification challenge

class SentimentClassifier:
    """Advanced sentiment classification with confidence analysis"""

    def __init__(self, random_state=42):
        self.random_state = random_state
        self.models = {}
        self.vectorizer = None
        self.label_encoder = LabelEncoder()
        self.results = {}

    def prepare_sentiment_data(self, df, text_column, label_column, test_size=0.2):
        """
        Prepare sentiment data for classification

        Parameters:
        -----------
        df : DataFrame
            Input sentiment dataset
        text_column : str
            Name of text column
        label_column : str
            Name of sentiment label column
        test_size : float
            Proportion of test set
        """
        # Your code here - prepare sentiment data
        pass

    def train_sentiment_models(self):
        """
        Train multiple models for sentiment classification
        """
        # Your code here - train various models
        self.models = {
            'Naive Bayes': MultinomialNB(),
            'Logistic Regression': LogisticRegression(random_state=self.random_state),
            'SVM': SVC(probability=True, random_state=self.random_state)
        }

        # Train each model
        # Your implementation here
        pass

    def evaluate_with_roc_curves(self):
        """
        Evaluate models using ROC curves and AUC scores
        """
        # Your code here - create ROC curves and calculate AUC
        pass

    def analyze_prediction_confidence(self, model_name):
        """
        Analyze prediction confidence for a specific model

        Parameters:
        -----------
        model_name : str
            Name of the model to analyze
        """
        # Your code here - analyze prediction confidence
        pass

    def predict_sentiment_with_confidence(self, text, model_name='Logistic Regression'):
        """
        Predict sentiment with confidence score

        Parameters:
        -----------
        text : str
            Text to analyze
        model_name : str
            Model to use for prediction

        Returns:
        --------
        dict : Prediction results with confidence
        """
        # Your code here - implement prediction with confidence
        return {
            'text': text,
            'predicted_sentiment': None,
            'confidence': 0.0,
            'probabilities': {}
        }

    def create_confusion_matrices(self):
        """
        Create confusion matrices for all models
        """
        # Your code here - create and visualize confusion matrices
        pass

# Test sentiment classifier
print("😊 Testing Sentiment Classification Challenge")
print("=" * 50)

sentiment_classifier = SentimentClassifier()

# Prepare data
print("📊 Preparing sentiment data...")
sentiment_classifier.prepare_sentiment_data(sentiment_df, 'text', 'sentiment')

# Train models
print("🤖 Training sentiment models...")
sentiment_classifier.train_sentiment_models()

# Evaluate with ROC curves
print("📈 Creating ROC curves...")
sentiment_classifier.evaluate_with_roc_curves()

# Analyze confidence
print("🎯 Analyzing prediction confidence...")
sentiment_classifier.analyze_prediction_confidence('Logistic Regression')

# Test custom examples
test_examples = [
    "This product is absolutely fantastic! Best purchase ever!",
    "Terrible quality, waste of money. Very disappointed.",
    "It's okay, nothing special but works fine."
]

print("\n🧪 Testing custom examples:")
for example in test_examples:
    result = sentiment_classifier.predict_sentiment_with_confidence(example)
    print(f"   Text: '{example[:50]}...'")
    print(f"   Prediction: {result['predicted_sentiment']} (confidence: {result['confidence']:.3f})")

# Create confusion matrices
print("\n📊 Creating confusion matrices...")
sentiment_classifier.create_confusion_matrices()

😊 Testing Sentiment Classification Challenge
📊 Preparing sentiment data...
🤖 Training sentiment models...
📈 Creating ROC curves...
🎯 Analyzing prediction confidence...

🧪 Testing custom examples:
   Text: 'This product is absolutely fantastic! Best purchas...'
   Prediction: None (confidence: 0.000)
   Text: 'Terrible quality, waste of money. Very disappointe...'
   Prediction: None (confidence: 0.000)
   Text: 'It's okay, nothing special but works fine....'
   Prediction: None (confidence: 0.000)

📊 Creating confusion matrices...


In [9]:
# TODO: Implement cross-validation and model selection

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

class ModelSelector:
    """Comprehensive model selection with cross-validation"""

    def __init__(self, cv_folds=5, random_state=42):
        self.cv_folds = cv_folds
        self.random_state = random_state
        self.cv_strategy = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
        self.pipelines = {}
        self.grid_search_results = {}
        self.cv_results = {}

    def create_pipelines(self):
        """
        Create sklearn pipelines for different models
        """
        # Your code here - create pipelines with preprocessing and models
        self.pipelines = {
            'nb_pipeline': None,  # Naive Bayes pipeline
            'lr_pipeline': None,  # Logistic Regression pipeline
            'svm_pipeline': None,  # SVM pipeline
            'rf_pipeline': None   # Random Forest pipeline
        }
        pass

    def define_hyperparameter_grids(self):
        """
        Define hyperparameter grids for grid search

        Returns:
        --------
        dict : Hyperparameter grids for each model
        """
        # Your code here - define hyperparameter grids
        param_grids = {
            'nb_pipeline': {
                # Naive Bayes parameters
            },
            'lr_pipeline': {
                # Logistic Regression parameters
            },
            'svm_pipeline': {
                # SVM parameters
            },
            'rf_pipeline': {
                # Random Forest parameters
            }
        }

        return param_grids

    def perform_cross_validation(self, X, y, scoring=['accuracy', 'f1_macro', 'precision_macro', 'recall_macro']):
        """
        Perform cross-validation for all models

        Parameters:
        -----------
        X : array-like
            Feature matrix
        y : array-like
            Target labels
        scoring : list
            List of scoring metrics
        """
        # Your code here - perform cross-validation
        pass

    def hyperparameter_tuning(self, X, y, scoring='f1_macro'):
        """
        Perform hyperparameter tuning using grid search

        Parameters:
        -----------
        X : array-like
            Feature matrix
        y : array-like
            Target labels
        scoring : str
            Scoring metric for optimization
        """
        # Your code here - perform grid search
        pass

    def create_evaluation_report(self):
        """
        Create comprehensive evaluation report
        """
        # Your code here - create detailed evaluation report
        pass

    def visualize_cv_results(self):
        """
        Visualize cross-validation results
        """
        # Your code here - create cross-validation visualizations
        pass

    def get_best_model(self, metric='f1_macro'):
        """
        Get the best performing model based on specified metric

        Parameters:
        -----------
        metric : str
            Metric to use for model selection

        Returns:
        --------
        dict : Best model information
        """
        # Your code here - select best model
        return {
            'model_name': None,
            'best_params': {},
            'best_score': 0.0,
            'model': None
        }

# Test model selection
print("🎯 Testing Cross-Validation and Model Selection")
print("=" * 50)

# Prepare data for model selection
print("📊 Preparing data...")
# Use news dataset for this example
df_processed = preprocessor.preprocess_dataframe(news_df, 'text')
X_text = df_processed['text_processed']
y = df_processed['category']

# Initialize model selector
model_selector = ModelSelector(cv_folds=5)

# Create pipelines
print("🔧 Creating pipelines...")
model_selector.create_pipelines()

# Perform cross-validation
print("🔄 Performing cross-validation...")
model_selector.perform_cross_validation(X_text, y)

# Hyperparameter tuning
print("⚙️ Hyperparameter tuning...")
model_selector.hyperparameter_tuning(X_text, y)

# Create evaluation report
print("📋 Creating evaluation report...")
model_selector.create_evaluation_report()

# Visualize results
print("📈 Creating visualizations...")
model_selector.visualize_cv_results()

# Get best model
best_model_info = model_selector.get_best_model()
print(f"\n🏆 Best Model: {best_model_info['model_name']}")
print(f"   Best Score: {best_model_info['best_score']:.3f}")
print(f"   Best Parameters: {best_model_info['best_params']}")

🎯 Testing Cross-Validation and Model Selection
📊 Preparing data...
🔧 Creating pipelines...
🔄 Performing cross-validation...
⚙️ Hyperparameter tuning...
📋 Creating evaluation report...
📈 Creating visualizations...

🏆 Best Model: None
   Best Score: 0.000
   Best Parameters: {}


In [10]:
print("🧪 COMPREHENSIVE TESTING OF TEXT CLASSIFICATION EXERCISES")
print("=" * 70)

# Test 1: Basic Pipeline
print("\n1️⃣ Testing Basic Classification Pipeline")
print("-" * 40)
try:
    results_basic = basic_classification_pipeline(news_df, 'text', 'category')
    if results_basic['accuracy'] > 0:
        print(f"   ✅ Basic pipeline working - Accuracy: {results_basic['accuracy']:.3f}")
    else:
        print("   ❌ Basic pipeline needs implementation")
except Exception as e:
    print(f"   ❌ Error in basic pipeline: {e}")

# Test 2: Model Comparison
print("\n2️⃣ Testing Model Comparison Framework")
print("-" * 40)
try:
    comparator_test = ModelComparator()
    comparator_test.prepare_data(news_df, 'text', 'category')
    print("   ✅ Model comparator initialized successfully")
except Exception as e:
    print(f"   ❌ Error in model comparator: {e}")

# Test 3: Feature Engineering
print("\n3️⃣ Testing Feature Engineering")
print("-" * 40)
try:
    feature_eng_test = FeatureEngineering()
    vectorizers = feature_eng_test.create_vectorizers()
    print(f"   ✅ Feature engineering initialized - {len(vectorizers)} vectorizers created")
except Exception as e:
    print(f"   ❌ Error in feature engineering: {e}")

# Test 4: Sentiment Classifier
print("\n4️⃣ Testing Sentiment Classifier")
print("-" * 40)
try:
    sentiment_test = SentimentClassifier()
    sentiment_test.prepare_sentiment_data(sentiment_df, 'text', 'sentiment')
    print("   ✅ Sentiment classifier initialized successfully")
except Exception as e:
    print(f"   ❌ Error in sentiment classifier: {e}")

# Test 5: Model Selection
print("\n5️⃣ Testing Model Selection")
print("-" * 40)
try:
    selector_test = ModelSelector()
    selector_test.create_pipelines()
    print("   ✅ Model selector initialized successfully")
except Exception as e:
    print(f"   ❌ Error in model selector: {e}")

# Summary
print("\n📊 TESTING SUMMARY")
print("=" * 30)
print("\n📝 Next Steps:")
print("   1. Implement the TODO sections in each exercise")
print("   2. Test with your own datasets")
print("   3. Experiment with different hyperparameters")
print("   4. Try advanced preprocessing techniques")
print("   5. Compare results with transformer models")

print("\n🎓 Learning Outcomes Achieved:")
print("   ✅ Text preprocessing for classification")
print("   ✅ Multiple classification algorithms")
print("   ✅ Model evaluation and comparison")
print("   ✅ Feature engineering techniques")
print("   ✅ Cross-validation and model selection")

print("\n🚀 Ready for Session 7: Sentiment Analysis Deep Dive!")

🧪 COMPREHENSIVE TESTING OF TEXT CLASSIFICATION EXERCISES

1️⃣ Testing Basic Classification Pipeline
----------------------------------------
🔧 Step 1: Preprocessing text data...
📊 Step 2: Creating TF-IDF vectors...
🔀 Step 3: Splitting data...
🤖 Step 4: Training classifier...
🎯 Step 5: Making predictions...
📈 Step 6: Evaluating model...
   ❌ Basic pipeline needs implementation

2️⃣ Testing Model Comparison Framework
----------------------------------------
   ✅ Model comparator initialized successfully

3️⃣ Testing Feature Engineering
----------------------------------------
   ✅ Feature engineering initialized - 6 vectorizers created

4️⃣ Testing Sentiment Classifier
----------------------------------------
   ✅ Sentiment classifier initialized successfully

5️⃣ Testing Model Selection
----------------------------------------
   ✅ Model selector initialized successfully

📊 TESTING SUMMARY

📝 Next Steps:
   1. Implement the TODO sections in each exercise
   2. Test with your own dataset