# In [None]:
from IPython import get_ipython
from IPython.display import display
# %%
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler # Import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pickle # Import pickle
from itertools import cycle # Import cycle
# %%
# Read the preprocessed message dataset into a DataFrame
df = pd.read_csv('processed_dataset.csv')
# %%
# First look at the data: dimensions, then the leading rows
print("Dataset shape:", df.shape)
print("\nSample of the dataset:", df.head(), sep="\n")
# %%
# Count the missing entries in every column
print("\nMissing values in each column:", df.isnull().sum(), sep="\n")
# %%
# Check unique values in the classification column.
# The labels are later read from a column spelled 'Type', so the lookup here is
# case-insensitive instead of being tied to the exact spelling 'type'.
print("\nUnique values in the classification column:")
# Print the actual column names so the label column can be identified by eye
print("DataFrame columns:", df.columns.tolist())
type_like_columns = [col for col in df.columns if str(col).strip().lower() == 'type']
if type_like_columns:
    for col in type_like_columns:
        print(df[col].unique())
else:
    print("Column 'type' not found. Please check DataFrame columns.")
# %%
# Standardize the type values - treat case insensitively and NaN as spam
def standardize_type(x):
    """Map a raw label value to a canonical lowercase category.

    Missing values (as judged by ``pd.isna``) are treated as 'spam'. Any other
    value is converted to a string if necessary, lowercased and stripped. If
    the cleaned text contains 'spam', 'ham' or 'promo' -- tested in that order,
    as a substring rather than an exact match -- that keyword is returned.
    Otherwise the cleaned text itself is returned as its own category.
    """
    if pd.isna(x):
        return 'spam'

    cleaned = (x if isinstance(x, str) else str(x)).lower().strip()

    # Order matters: a label containing both 'spam' and 'ham' resolves to 'spam'
    for keyword in ('spam', 'ham', 'promo'):
        if keyword in cleaned:
            return keyword

    return cleaned

# Name of the column holding the raw labels. It is assumed to be spelled
# 'Type'; change this if the CSV header differs.
correct_type_column_name = 'Type'

# Fail fast when the label column is missing: every later step reads
# 'standardized_type', so carrying on would only surface as an unrelated
# KeyError further down.
if correct_type_column_name not in df.columns:
    raise KeyError(
        f"Error: The column '{correct_type_column_name}' was not found in the DataFrame. "
        "Please check the column names in df.columns.tolist() and update the code. "
        f"Available columns: {df.columns.tolist()}"
    )

df['standardized_type'] = df[correct_type_column_name].apply(standardize_type)
# %%
# Show how many rows fall into each standardized category
print("\nStandardized type distribution:", df['standardized_type'].value_counts(), sep="\n")
# %%
# Turn the string categories into integer class ids for the classifier
label_encoder = LabelEncoder()
df['type_encoded'] = label_encoder.fit_transform(df['standardized_type'])

# %%
# Build a class-name -> integer-id lookup and print it for reference
encoded_mapping = {
    name: code
    for name, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print("\nEncoded class mapping:")
for class_name, encoded_value in encoded_mapping.items():
    print(f"{class_name} -> {encoded_value}")


# %%
# Feature columns: everything except the label columns (raw, standardized,
# encoded) and the raw text column
features = [
    col
    for col in df.columns
    if col not in ('type', 'text', 'standardized_type', 'type_encoded', correct_type_column_name)
]
print("\nFeatures used for classification:", features, sep="\n")

# %%
# Feature matrix and encoded target
X, y = df[features], df['type_encoded']
# %%
# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


# %%
# Split the training columns by dtype so each group gets its own preprocessing.
# NOTE(review): a column whose dtype is none of int64/float64/object/category
# (e.g. bool) lands in neither list and is discarded by remainder='drop' below
# -- confirm that is intended.
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X_train.select_dtypes(include=['object', 'category']).columns)

print("\nNumeric features:", numeric_features)
print("Categorical features:", categorical_features)

# Column-wise preprocessing: standardize the numeric columns, one-hot encode
# the categorical ones (categories unseen during fit are ignored at transform
# time), and drop every column not listed in either group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='drop',
)

# Chain preprocessing and the classifier so both are fitted and applied together
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

# Fitting the pipeline fits the preprocessor first and then trains the forest
print("\nTraining Random Forest model using Pipeline...")
pipeline.fit(X_train, y_train)


# %%
# Accuracy of the fitted pipeline on the data it was trained on
train_preds = pipeline.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=train_preds)
print(f"\nTraining accuracy (Pipeline): {train_accuracy:.3f}")

# %%
# Accuracy of the fitted pipeline on the held-out test split
test_preds = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_preds)
print(f"Test accuracy (Pipeline): {test_accuracy:.3f}")


# %%
# Get the original class names for display
original_class_names = label_encoder.classes_
# LabelEncoder assigns ids 0..n-1 in the order of classes_. Passing the full id
# list explicitly keeps the report rows and the matrix axes aligned with
# original_class_names even if a class is absent from the test split.
class_ids = np.arange(len(original_class_names))

# %%
# Print detailed classification report
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        test_preds,
        labels=class_ids,
        target_names=original_class_names,
    )
)

# %%
# Display confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, test_preds, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=original_class_names,
            yticklabels=original_class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig('confusion_matrix_multiclass.png')
print("\nConfusion matrix saved as 'confusion_matrix_multiclass.png'")


# %%
# Feature importance.
# The forest only sees the preprocessor's output, so its importances line up
# with the transformed columns (numeric columns first, then the one-hot
# columns), not with the original feature list.
classifier = pipeline.named_steps['classifier']
importances = classifier.feature_importances_

# Names of the transformed columns. get_feature_names_out() is the supported
# way; the except branch rebuilds an approximation for scikit-learn versions
# that do not provide it.
try:
    processed_feature_names = list(
        pipeline.named_steps['preprocessor'].get_feature_names_out(features)
    )
except AttributeError:
    # Numeric columns keep their names; each categorical column expands to
    # one "<column>_<category>" name per learned category.
    processed_feature_names = list(numeric_features)
    if categorical_features:
        ohe = pipeline.named_steps['preprocessor'].named_transformers_['cat']
        ohe_categories = getattr(ohe, 'categories_', [])
        for i, cat_col in enumerate(categorical_features):
            if i < len(ohe_categories):
                processed_feature_names.extend(
                    f"{cat_col}_{cat}" for cat in ohe_categories[i]
                )
            else:
                # Without the learned categories the column count is unknown;
                # the length check below then reports the mismatch.
                print(f"Warning: Could not determine categories for {cat_col}. Feature names might be incorrect.")


if len(importances) == len(processed_feature_names):
    feature_importance = pd.DataFrame({
        'Feature': processed_feature_names,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    print("\nFeature Importance:")
    print(feature_importance)

    # Only the top 20 rows are drawn, so size the figure from those rows
    # rather than from the full (possibly much longer) table.
    top_features = feature_importance.head(20)
    plt.figure(figsize=(12, max(4, 0.5 * len(top_features))))
    sns.barplot(x='Importance', y='Feature', data=top_features)
    plt.title('Feature Importance (Processed Features)')
    plt.tight_layout()
    plt.savefig('feature_importance_multiclass.png')
    print("Feature importance plot saved as 'feature_importance_multiclass.png'")

    # Display top 5 most important features
    print("\nTop 5 most important features:")
    print(feature_importance.head(5))
else:
    print("Could not match feature importances to processed feature names.")
    print("Number of importances:", len(importances))
    print("Number of processed feature names:", len(processed_feature_names))
    print("Feature importances:", importances)  # Print raw importances


# %%
# The tune_hyperparameters function can also be updated to use the pipeline
def tune_hyperparameters(param_grid=None, cv=5, scoring='accuracy', n_jobs=-1):
    """Grid-search the random forest inside the preprocessing pipeline.

    Reads the module-level ``preprocessor``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``. Prints the best parameters, the best
    cross-validation score and the test accuracy of the refitted best model.

    Args:
        param_grid: Mapping of pipeline parameter names to candidate values.
            Classifier parameters must carry the ``classifier__`` prefix.
            When None, the default forest grid defined below is used.
        cv: Cross-validation setting passed to GridSearchCV.
        scoring: Scoring setting passed to GridSearchCV.
        n_jobs: Parallelism setting passed to GridSearchCV.

    Returns:
        The best pipeline found by the search (``best_estimator_``).
    """
    print("\nPerforming hyperparameter tuning...")
    from sklearn.model_selection import GridSearchCV

    if param_grid is None:
        # Default grid; names are prefixed with the pipeline step name
        param_grid = {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [None, 10, 20],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }

    # A separate pipeline for the search that reuses the preprocessor
    # definition from the main pipeline.
    grid_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    grid_search = GridSearchCV(
        grid_pipeline,
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
        scoring=scoring
    )

    # Search on the training split only; the test split stays untouched
    grid_search.fit(X_train, y_train)
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

    # The best estimator from grid search is a fitted pipeline
    best_pipeline = grid_search.best_estimator_

    # Evaluate the best pipeline on the test set
    best_preds = best_pipeline.predict(X_test)
    best_accuracy = accuracy_score(y_test, best_preds)
    print(f"Tuned model test accuracy (Pipeline): {best_accuracy:.3f}")

    return best_pipeline
# %%
# Calculate and plot ROC curves for the multiclass model (One-vs-Rest)

# Predicted class probabilities for the test set; the pipeline applies its
# preprocessing internally.
test_probs = pipeline.predict_proba(X_test)

# Number of classes known to the label encoder
n_classes = len(label_encoder.classes_)

# Overall ROC AUC.
# 'ovr' is a value for multi_class, not for average; passing average='ovr'
# makes roc_auc_score raise, so the score was never reported. The macro average
# (unweighted mean of the per-class one-vs-rest AUCs) is used instead.
# With exactly two classes roc_auc_score expects a 1-D score, so the second
# probability column is passed.
try:
    if n_classes == 2:
        roc_auc_ovr = roc_auc_score(y_test, test_probs[:, 1])
    else:
        roc_auc_ovr = roc_auc_score(y_test, test_probs, average='macro', multi_class='ovr')
    print(f"\nROC AUC (One-vs-Rest): {roc_auc_ovr:.3f}")
except ValueError as e:
    print(f"\nCould not calculate ROC AUC: {e}")
    print("This might happen if there is only one class present in the test set.")


# Per-class ROC curve and AUC, keyed by encoded class id
fpr = dict()      # False positive rates per class
tpr = dict()      # True positive rates per class
roc_auc = dict()  # Area under the curve per class

for i in range(n_classes):
    # One-vs-Rest: the current class is the positive label (1), every other
    # class is negative (0).
    y_true_binary = (y_test == i).astype(int)

    # Predicted probability of the current class
    y_score = test_probs[:, i]

    fpr[i], tpr[i], _ = roc_curve(y_true_binary, y_score)
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot one curve per class; colors repeat if there are more classes than colors
plt.figure(figsize=(10, 8))
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'red', 'purple', 'brown'])

for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} ({1:.2f})'.format(label_encoder.classes_[i], roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2, label='Random (AUC = 0.50)')  # Chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # Slight headroom above 1
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve - Multiclass (One-vs-Rest)')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.savefig('roc_curve_multiclass_ovr.png')
print("\nROC curve plot saved as 'roc_curve_multiclass_ovr.png'")


# %%
# Persist everything needed for later predictions in a single pickle file.
# 'pipeline' is the model fitted above; point this at the tuned pipeline
# instead if hyperparameter tuning was run.
final_pipeline_to_save = pipeline

with open('message_classifier_model.pkl', 'wb') as file:
    # Bundle: fitted pipeline, label encoder and the original feature names
    pickle.dump(
        {
            'model': final_pipeline_to_save,
            'label_encoder': label_encoder,
            'features': features,
        },
        file,
    )
print("\nPipeline saved as 'message_classifier_model.pkl'")


# %%
# Function to make predictions on new data using the loaded pipeline
def _categorical_feature_names(fitted_pipeline):
    """Return the column names handled by the pipeline's 'cat' transformer.

    Looks for a 'preprocessor' step whose transformer list has an entry named
    'cat', which is the layout the training code in this file builds. Any
    other layout yields an empty set.
    """
    steps = getattr(fitted_pipeline, 'named_steps', None)
    if not steps or 'preprocessor' not in steps:
        return set()

    preprocessor = steps['preprocessor']
    transformers = (
        getattr(preprocessor, 'transformers_', None)
        or getattr(preprocessor, 'transformers', None)
        or []
    )
    for entry in transformers:
        if len(entry) == 3 and entry[0] == 'cat':
            columns = entry[2]
            if isinstance(columns, str):
                return {columns}
            try:
                return set(columns)
            except TypeError:
                return set()
    return set()


def predict_message_type(model_data, new_data):
    """Predict message types for new rows using a saved model bundle.

    Args:
        model_data: Dict with the keys 'model' (fitted pipeline),
            'label_encoder' (fitted label encoder) and 'features' (list of
            the original feature column names).
        new_data: DataFrame with one row per message. Feature columns missing
            from it are filled with a default: '' for columns the pipeline
            treats as categorical, 0 otherwise. Extra columns are ignored.

    Returns:
        DataFrame with a 'Predicted_Type' column holding the decoded labels
        and one '<class>_Probability' column per class. It has a fresh
        default index, one row per row of ``new_data``.
    """
    pipeline = model_data['model']
    label_encoder = model_data['label_encoder']
    original_features = model_data['features']

    # Column kinds are read from the pipeline itself so the function also
    # works in a fresh process where the training data is not available.
    categorical_columns = _categorical_feature_names(pipeline)

    # Assemble the input column by column on new_data's index, so scalar
    # defaults are broadcast to every row regardless of column order.
    column_values = {}
    for col in original_features:
        if col in new_data.columns:
            column_values[col] = new_data[col]
        else:
            column_values[col] = '' if col in categorical_columns else 0
    processed_new_data = pd.DataFrame(
        column_values,
        index=new_data.index,
        columns=list(original_features),
    )

    # Predict through the pipeline (preprocessing included) and decode labels
    predictions_encoded = pipeline.predict(processed_new_data)
    predictions = label_encoder.inverse_transform(predictions_encoded)

    # Class probabilities, one column per class known to the model
    probabilities = pipeline.predict_proba(processed_new_data)

    results = pd.DataFrame({
        'Predicted_Type': predictions
    })

    # Name the probability columns after the classes the model reports, so
    # the names match the column order of predict_proba. Fall back to the
    # encoder's class list if the model does not expose classes_.
    model_classes = getattr(pipeline, 'classes_', None)
    if model_classes is None:
        class_names = label_encoder.classes_
    else:
        class_names = label_encoder.inverse_transform(model_classes)

    for i, class_name in enumerate(class_names):
        results[f'{class_name}_Probability'] = probabilities[:, i]

    return results
# %%
print("\nModel training and evaluation complete!")
print("To use this model for predictions:")
print("1. Load the model: model_data = pickle.load(open('message_classifier_model.pkl', 'rb'))")
print("2. Create new data with the same columns used for training (or a subset that will be handled by the function's logic).")
print("   Example: sample_data = pd.DataFrame({'has_phone_number': [1], 'word_count': [15], ...})")
print("3. Call: predict_message_type(model_data, sample_data)")

# %%
# Add a simple test prediction example
print("\nExample prediction code:")
print("import pickle")
print("import pandas as pd")
# Load the model bundle back from disk. The file is opened in a with-block so
# the handle is closed right after loading.
# NOTE: unpickling executes code embedded in the file; only load pickle files
# from a trusted source.
with open('message_classifier_model.pkl', 'rb') as model_file:
    model_data = pickle.load(model_file)
print("model_data = pickle.load(open('message_classifier_model.pkl', 'rb'))")

# Use the first row of X_test as a stand-in for new data; replace it with the
# actual rows to classify.
if not X_test.empty:
    sample_data = X_test.head(1).copy()
    # Individual feature values can be overwritten on sample_data here to try
    # a different input.
    print("\nUsing a sample from X_test for prediction example:")
    print(sample_data)
    predictions = predict_message_type(model_data, sample_data)
    print("\nPrediction results:")
    print(predictions)
else:
    # No test rows available: print a template the user can fill in instead
    print("\nX_test is empty, skipping prediction example.")
    print("# Create a sample input with same columns as the training data")
    print("sample_data = pd.DataFrame({")
    if features:
        shown_features = features[:5]  # Only the first five features are listed
        for i, feat in enumerate(shown_features):
            # Placeholder by expected kind: 0 for numeric features, '' otherwise
            default_val = 0 if feat in numeric_features else ''
            print(f"    '{feat}': [{default_val}]{',' if i < len(shown_features) - 1 else ''}")
    else:
        print("    # No features defined in the loaded model.")
    print("    # Add other features as needed based on the 'features' list")
    print("})")
    print("# predictions = predict_message_type(model_data, sample_data)")


# In [None]:
# %%
# Binary ROC analysis restricted to the 'spam' and 'ham' classes
#
# Names expected from earlier cells:
# - pipeline: the fitted scikit-learn pipeline
# - X_test / y_test: test features and their encoded integer labels
# - label_encoder: the fitted LabelEncoder

print("\nCalculating and plotting binary ROC curve for 'Spam' vs 'Ham'...")

# Class probabilities for the test set (preprocessing happens inside the pipeline)
try:
    test_probs = pipeline.predict_proba(X_test)
except Exception as e:
    print(f"Error getting predicted probabilities from pipeline: {e}")
    test_probs = None  # Marks the failure for the check below

if test_probs is not None:
    # Position of 'spam' among the encoder's classes = its probability column
    try:
        spam_class_index = list(label_encoder.classes_).index('spam')
    except ValueError:
        print("Error: 'spam' class not found in label encoder classes. Cannot proceed with binary ROC.")
        spam_class_index = None  # Marks that 'spam' is unknown to the encoder
    else:
        print(f"Found 'spam' class at index: {spam_class_index}")

    if spam_class_index is not None:
        # Predicted probability of 'spam' for every test row
        y_test_proba_spam = test_probs[:, spam_class_index]

        # Encoded ids of the two classes of interest (None when a class is absent)
        try:
            if 'ham' in label_encoder.classes_:
                ham_encoded_label = label_encoder.transform(['ham'])[0]
            else:
                ham_encoded_label = None
            if 'spam' in label_encoder.classes_:
                spam_encoded_label = label_encoder.transform(['spam'])[0]
            else:
                spam_encoded_label = None
        except Exception as e:
            print(f"Error getting encoded labels for 'ham' or 'spam': {e}. Skipping binary ROC.")
            ham_encoded_label = None
            spam_encoded_label = None


        if ham_encoded_label is None or spam_encoded_label is None:
            print("\nCould not get encoded labels for 'ham' or 'spam'. Skipping binary ROC plot.")
        else:
            # Keep only the test rows whose true label is 'ham' or 'spam'
            ham_spam_mask_test = y_test.isin([ham_encoded_label, spam_encoded_label])

            y_test_ham_spam_encoded = y_test[ham_spam_mask_test]
            y_test_proba_ham_spam = y_test_proba_spam[ham_spam_mask_test]

            # Binary target: 'spam' is the positive class (1), 'ham' the negative (0)
            y_test_binary = (y_test_ham_spam_encoded == spam_encoded_label).astype(int)

            # A ROC curve needs both a positive and a negative example
            if len(np.unique(y_test_binary)) <= 1:
                print("\nCannot plot binary ROC for 'Spam' vs 'Ham': Filtered test set does not contain both classes.")
                print(f"Unique values in filtered y_test_binary: {np.unique(y_test_binary)}")
            else:
                from sklearn.metrics import roc_curve, auc

                # ROC curve points and the area under them
                fpr, tpr, thresholds = roc_curve(y_test_binary, y_test_proba_ham_spam)
                roc_auc = auc(fpr, tpr)

                print(f"\nArea under ROC Curve (AUC) for 'Spam' class (vs 'Ham'): {roc_auc:.4f}")

                # Draw the curve together with the chance diagonal
                plt.figure(figsize=(8, 6))
                plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
                plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random (AUC = 0.50)')
                plt.xlim([0.0, 1.0])
                plt.ylim([0.0, 1.05])
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title('Receiver Operating Characteristic (ROC) Curve for Spam (vs Ham)')
                plt.legend(loc="lower right")
                plt.grid(True)
                plt.tight_layout()
                plt.savefig('roc_curve_spam_vs_ham.png')
                print("ROC curve plot saved as 'roc_curve_spam_vs_ham.png'")
    # %%