# Text Sentiment Classification Demo

This notebook demonstrates a text sentiment classification pipeline that predicts whether a given text expresses positive or negative sentiment. It uses TF-IDF features with simple, fast-to-train classifiers (logistic regression, Naive Bayes, and an SVM), compares them, and uses the best one for prediction.

## 1. Setup and Imports

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# Make the project's modules importable: 'src' is looked up next to the kernel's
# working directory (i.e. <cwd>/../src), so this assumes the notebook is run from
# a folder that is a sibling of 'src'.
project_root = os.path.dirname(os.getcwd())
sys.path.append(os.path.join(project_root, 'src'))

# Import our modules
from data_prep import download_movie_reviews, preprocess_data, split_and_save_data
from feature_extract import TfidfFeatureExtractor
from train import train_model
from evaluate import evaluate_model, plot_confusion_matrix, plot_roc_curve
from predict import predict_sentiment, preprocess_text

# Plot appearance: seaborn's 'whitegrid' style, then a 12x8 default figure size
# (individual cells override the size with their own figsize where needed).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

# pandas display settings: show every column, cap printed rows at 20, and widen
# the console representation to 1000 characters.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 20),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## 2. Data Preparation

Let's download and prepare the dataset for sentiment analysis.

In [None]:
# Load the raw reviews through the project's data_prep helper
# (its download/caching behaviour lives in src/data_prep and is not shown here).
df = download_movie_reviews()

# Report the frame's (rows, columns), then leave a five-row preview as the cell output
print(f'Dataset shape: {df.shape}')
df.head(5)

In [None]:
# Check class distribution
sentiment_counts = df['sentiment'].value_counts()
# Use the '\n' escape: a raw line break inside a single-quoted (f-)string is a
# SyntaxError and would stop the cell from running at all.
print(f'Class distribution:\n{sentiment_counts}')

# Plot class distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='sentiment', data=df, palette=['#ff9999', '#66b3ff'])
plt.title('Class Distribution')
plt.xlabel('Sentiment (0=Negative, 1=Positive)')
plt.ylabel('Count')
# Relabel the two bars; this relies on the 0/1 label encoding stated in the x-label
plt.xticks([0, 1], ['Negative', 'Positive'])
plt.show()

In [None]:
# Preprocess the data
# NOTE: this rebinds `df`, so re-running the cell feeds already-preprocessed rows
# back into preprocess_data; re-run the download cell first for a clean state.
df = preprocess_data(df)

# Display a few examples after preprocessing
print('Examples after preprocessing:')
# Cap the sample at the frame size (sample() raises when n > len(df)) and pin the
# seed so the previewed rows are identical on every Restart & Run All.
preview_rows = df.sample(n=min(5, len(df)), random_state=42)
for _, row in preview_rows.iterrows():
    print(f"\nSentiment: {'Positive' if row['sentiment'] == 1 else 'Negative'}")
    text = row['text']
    # Only show the ellipsis when the text was actually cut off at 200 characters
    print(f"Text: {text[:200]}{'...' if len(text) > 200 else ''}")

In [None]:
# Hand the preprocessed frame to the project's splitter and unpack the two
# partitions it returns (presumably it also persists them, per its name -- confirm
# in src/data_prep).
train_df, test_df = split_and_save_data(df)

# Report each partition's (rows, columns) as a quick sanity check on the split
print(
    f'Training set shape: {train_df.shape}',
    f'Testing set shape: {test_df.shape}',
    sep='\n',
)

## 3. Feature Extraction

Now, let's extract TF-IDF features from the text data.

In [None]:
# Configure the project's TF-IDF wrapper (arguments are presumably forwarded to a
# scikit-learn TfidfVectorizer -- confirm in src/feature_extract).
extractor = TfidfFeatureExtractor(
    max_features=5000,
    min_df=5,
    max_df=0.7,
    ngram_range=(1, 2),
)

# Labels come straight from the split frames
y_train, y_test = train_df['sentiment'], test_df['sentiment']

# Fit on the training texts only, then reuse the fitted extractor for the test texts
X_train = extractor.fit_transform(train_df['text'])
X_test = extractor.transform(test_df['text'])

print(
    f'Training features shape: {X_train.shape}',
    f'Testing features shape: {X_test.shape}',
    sep='\n',
)

In [None]:
# Get the top features (terms) by TF-IDF score
feature_names = extractor.get_feature_names()

# Calculate the average TF-IDF score for each feature.
# np.asarray(...).ravel() flattens the column means whether X_train is a scipy
# sparse matrix (mean() returns a 1xN np.matrix), a sparse array, or a dense
# ndarray; the previous `.A1` attribute only exists on np.matrix.
tfidf_mean = np.asarray(X_train.mean(axis=0)).ravel()

# Create a DataFrame of features and their scores
feature_scores = pd.DataFrame({'feature': feature_names, 'score': tfidf_mean})

# Sort by score in descending order and keep the 20 highest-scoring terms
top_features = feature_scores.sort_values('score', ascending=False).head(20)

# Plot the top features
plt.figure(figsize=(10, 8))
sns.barplot(x='score', y='feature', data=top_features, palette='viridis')
plt.title('Top 20 Features by TF-IDF Score')
plt.xlabel('Average TF-IDF Score')
plt.ylabel('Feature (Term)')
plt.tight_layout()
plt.show()

## 4. Model Training

Let's train different models and compare their performance.

In [None]:
# Classifier identifiers passed to the project's train_model helper
model_types = ['logistic_regression', 'naive_bayes', 'svm']

# Per-model results, keyed by the identifier above
models = {}
train_accuracies = {}
test_metrics = {}

for model_type in model_types:
    print(f'\nTraining {model_type} model...')

    # Fit on the training features and keep the fitted estimator for later cells
    model = train_model(X_train, y_train, model_type=model_type)
    models[model_type] = model

    # Accuracy on the data the model was fitted on (useful for spotting overfitting)
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracies[model_type] = train_accuracy
    print(f'Training accuracy: {train_accuracy:.4f}')

    # Held-out metrics: predict once, then score with each metric in turn
    y_test_pred = model.predict(X_test)
    test_accuracy, test_precision, test_recall, test_f1 = (
        scorer(y_test, y_test_pred)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    test_metrics[model_type] = {
        'accuracy': test_accuracy,
        'precision': test_precision,
        'recall': test_recall,
        'f1': test_f1,
    }

    print(
        f'Test accuracy: {test_accuracy:.4f}',
        f'Test precision: {test_precision:.4f}',
        f'Test recall: {test_recall:.4f}',
        f'Test F1 score: {test_f1:.4f}',
        sep='\n',
    )

In [None]:
# Compare model performance: one row per model, one column per metric, with the
# model identifier exposed as a regular 'Model' column.
metrics_df = (
    pd.DataFrame(test_metrics)
    .T
    .rename_axis('Model')
    .reset_index()
)

# Long format (Model, Metric, Score) so seaborn can group the bars by metric
metrics_df_melted = metrics_df.melt(id_vars=['Model'], var_name='Metric', value_name='Score')

# Plot model comparison
plt.figure(figsize=(12, 8))
sns.barplot(data=metrics_df_melted, x='Model', y='Score', hue='Metric', palette='viridis')
plt.title('Model Performance Comparison')
plt.xlabel('Model')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.legend(title='Metric')
plt.tight_layout()
plt.show()

# Display the metrics table (returns a new frame; metrics_df keeps its 'Model' column)
metrics_df.set_index('Model')

## 5. Model Evaluation

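Let's evaluate the best model — the one with the highest test accuracy — in more detail. Because the model is both selected and evaluated on the same test set, treat the reported scores as slightly optimistic.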

In [None]:
# Pick the model whose row has the highest test accuracy (first one wins on ties)
best_row_index = metrics_df['accuracy'].idxmax()
best_model_type = metrics_df.loc[best_row_index, 'Model']
best_model = models[best_model_type]

print(f'Best model: {best_model_type} with accuracy {metrics_df.loc[metrics_df["Model"] == best_model_type, "accuracy"].values[0]:.4f}')

# Predictions of the selected model on the held-out set
y_pred = best_model.predict(X_test)

# Rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

# Display names for the two classes, shared by the heatmap and the text report
class_labels = ['Negative', 'Positive']

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title(f'Confusion Matrix - {best_model_type}')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 as text
print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=class_labels))

In [None]:
# Plot ROC curves for every model that exposes a continuous score:
# predict_proba when available, otherwise decision_function.
plt.figure(figsize=(10, 8))

for model_type, model in models.items():
    try:
        if hasattr(model, 'predict_proba'):
            # Column 1 is taken as the positive class (0/1 label encoding)
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, 'decision_function'):
            # Models without probabilities (e.g. a LinearSVC, if that is what
            # train_model returns for 'svm'): roc_curve also accepts
            # non-thresholded decision values as scores.
            y_score = model.decision_function(X_test)
        else:
            print(f'Skipping ROC curve for {model_type}: no predict_proba or decision_function')
            continue

        # Calculate ROC curve and AUC
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)

        # Plot ROC curve
        plt.plot(fpr, tpr, lw=2, label=f'{model_type} (AUC = {roc_auc:.2f})')
    except Exception as exc:
        # Stay best-effort (one failing model must not abort the plot), but report
        # the cause and let KeyboardInterrupt / SystemExit propagate.
        print(f'Error plotting ROC curve for {model_type}: {exc}')

# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

## 6. Feature Importance

Let's examine which features (words/terms) are most important for classification.

In [None]:
# Inspect the logistic regression coefficients, if that model was trained
if 'logistic_regression' in models:
    lr_model = models['logistic_regression']

    # Term names from the extractor, paired with the first row of coef_
    feature_names = extractor.get_feature_names()
    coefficients = lr_model.coef_[0]

    # One row per term, ordered by coefficient magnitude (largest first)
    feature_importance = (
        pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
        .assign(abs_coefficient=lambda frame: np.abs(frame['coefficient']))
        .sort_values('abs_coefficient', ascending=False)
    )

    # The 15 strongest terms on each side of zero
    is_positive = feature_importance['coefficient'] > 0
    is_negative = feature_importance['coefficient'] < 0
    top_positive = feature_importance.loc[is_positive].head(15)
    top_negative = feature_importance.loc[is_negative].head(15)

    # Same bar chart twice: positive-sentiment terms first, then negative-sentiment terms
    importance_panels = (
        (top_positive, 'Greens_r', 'Top 15 Features Indicating Positive Sentiment'),
        (top_negative, 'Reds_r', 'Top 15 Features Indicating Negative Sentiment'),
    )
    for panel_data, panel_palette, panel_title in importance_panels:
        plt.figure(figsize=(12, 6))
        sns.barplot(x='coefficient', y='feature', data=panel_data, palette=panel_palette)
        plt.title(panel_title)
        plt.xlabel('Coefficient (Importance)')
        plt.ylabel('Feature (Term)')
        plt.tight_layout()
        plt.show()

## 7. Prediction Demo

Let's use our best model to make predictions on new text inputs.

In [None]:
# Sample texts for prediction
sample_texts = [
    "I really enjoyed this product. It works great and the customer service was excellent!",
    "This is the worst experience I've ever had. The product broke after one use.",
    "The movie was okay, not great but not terrible either.",
    "I'm very satisfied with my purchase and would recommend it to others."
]

# Make predictions using the best model
print(f'Making predictions using the {best_model_type} model:\n')

results = []
for text in sample_texts:
    # Preprocess the text
    preprocessed_text = preprocess_text(text)

    # Transform the text to TF-IDF features
    X = extractor.transform([preprocessed_text])

    # Make prediction
    prediction = best_model.predict(X)[0]

    # Get the probability of the predicted class, if the model can provide one
    probability = None
    if hasattr(best_model, 'predict_proba'):
        try:
            class_probs = best_model.predict_proba(X)[0]
            # predict_proba columns follow classes_, so look the label up there
            # instead of using the label value itself as a column index.
            class_order = list(getattr(best_model, 'classes_', range(len(class_probs))))
            probability = class_probs[class_order.index(prediction)]
        except Exception as exc:
            # Confidence is optional: report why it is missing and carry on
            print(f"Confidence unavailable: {exc}")

    # Create result dictionary
    result = {
        'text': text,
        'sentiment': 'Positive' if prediction == 1 else 'Negative',
        'confidence': probability
    }

    results.append(result)

    # Print prediction details
    print(f"Text: {text}")
    print(f"Predicted sentiment: {result['sentiment']}")
    if result['confidence'] is not None:
        print(f"Confidence: {result['confidence']:.4f}")
    print()

# Create a DataFrame of results
results_df = pd.DataFrame(results)
results_df

In [None]:
# Create a function for interactive prediction
def predict_interactive(text):
    """Classify a single text with the notebook's best model and print the details.

    Relies on notebook-level names defined in earlier cells: ``preprocess_text``,
    ``extractor`` (the fitted feature extractor) and ``best_model``.

    Args:
        text: Raw input text to classify.

    Returns:
        A ``(prediction, probability)`` tuple. ``prediction`` is the label returned
        by ``best_model.predict``; ``probability`` is the model's probability for
        that label, or ``None`` when the model has no ``predict_proba`` or the
        probability could not be computed.
    """
    # Preprocess the text
    preprocessed_text = preprocess_text(text)

    # Transform the text to TF-IDF features
    X = extractor.transform([preprocessed_text])

    # Make prediction
    prediction = best_model.predict(X)[0]

    # Get the probability of the predicted class, if the model can provide one
    probability = None
    if hasattr(best_model, 'predict_proba'):
        try:
            class_probs = best_model.predict_proba(X)[0]
            # predict_proba columns follow classes_, so look the label up there
            # instead of using the label value itself as a column index.
            class_order = list(getattr(best_model, 'classes_', range(len(class_probs))))
            probability = class_probs[class_order.index(prediction)]
        except Exception as exc:
            # Confidence is optional: report why it is missing and carry on
            print(f"Confidence unavailable: {exc}")

    # Print prediction details
    print(f"\nText: {text}")
    print(f"Preprocessed text: {preprocessed_text}")
    print(f"Predicted sentiment: {'Positive' if prediction == 1 else 'Negative'}")
    if probability is not None:
        print(f"Confidence: {probability:.4f}")

    return prediction, probability

# Try it with your own text
# predict_interactive("Your text here")

## 8. Conclusion

In this notebook, we've demonstrated a complete text sentiment classification pipeline using TF-IDF features and machine learning classifiers. The model can effectively predict whether a given text expresses a positive or negative sentiment.

Key components of the pipeline include:

1. Data preparation and preprocessing
2. TF-IDF feature extraction
3. Model training and selection
4. Model evaluation and analysis
5. Prediction interface

The model is simple, fast to train, and easy to deploy, making it suitable for various applications requiring sentiment analysis.