In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load data from CSV file.
# The path is relative, so pandas resolves it against the kernel's current working directory.
df = pd.read_csv('processed_dataset.csv')

In [None]:
# Display basic information about the dataset:
# its (rows, columns) shape followed by the first rows returned by head().
print("Dataset shape:", df.shape)
print("\nSample of the dataset:")
print(df.head())

In [None]:
# Check for missing values: count the null entries per column.
# Null labels matter later because standardize_type() maps them to 'spam'.
print("\nMissing values in each column:")
print(df.isnull().sum())

In [None]:
# Check unique values in the 'label' column (the raw target, before standardisation).
# The heading names the column that is actually inspected so the output is not misleading.
print("\nUnique values in 'label' column:")
print(df['label'].unique())

In [None]:
# Standardize the type values - treat case insensitively and NaN as spam
def standardize_type(x):
    """Collapse a raw label into a canonical lower-case category.

    Missing values (anything ``pd.isna`` reports as null) are treated as
    ``'spam'``. Non-string values are converted with ``str()``. The text is
    lower-cased and stripped, then checked for the substrings ``'spam'``,
    ``'ham'`` and ``'promo'`` in that priority order; the first one found is
    returned. Anything else is returned as the cleaned text itself.
    """
    if pd.isna(x):
        return 'spam'

    text = x if isinstance(x, str) else str(x)
    cleaned = text.lower().strip()

    # Substring match, checked in priority order: spam wins over ham, ham over promo.
    for category in ('spam', 'ham', 'promo'):
        if category in cleaned:
            return category

    # Unknown categories pass through in their cleaned form.
    return cleaned

# Run standardize_type over every raw label and store the result in a new
# 'standardized_type' column; the original 'label' column is left as it was.
df['standardized_type'] = df['label'].apply(standardize_type)

In [None]:
# Display distribution of standardized types (row count per category),
# which also reveals any unexpected categories that passed through unchanged.
print("\nStandardized type distribution:")
print(df['standardized_type'].value_counts())

In [None]:
# Create a label encoder for the target variable.
# It is fitted on the standardized categories and the resulting integer code
# for each row is stored in a new 'type_encoded' column.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['type_encoded'] = label_encoder.fit_transform(df['standardized_type'])

In [None]:
# Build a {category name -> integer code} lookup from the fitted encoder and list it
class_labels = label_encoder.classes_
class_codes = label_encoder.transform(class_labels)
encoded_mapping = {label: code for label, code in zip(class_labels, class_codes)}

print("\nEncoded class mapping:")
for class_name, encoded_value in encoded_mapping.items():
    print(f"{class_name} -> {encoded_value}")


In [None]:
# Every column is a feature unless it is the raw text or one of the target-related columns
excluded_columns = {'type', 'text', 'standardized_type', 'type_encoded', 'label'}
features = [col for col in df.columns if col not in excluded_columns]

print("\nFeatures used for classification:")
print(features)

In [None]:
# Prepare features and target.
# The target is the integer-encoded standardized label rather than the raw 'label'
# column: the raw column can still contain NaN and inconsistent spellings, and the
# label_encoder that is later used for report class names and for
# inverse_transform() on predictions only understands these encoded values.
X = df[features]
y = df['type_encoded']

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# Scale the two length-based features into the 0-1 range.
# (An earlier StandardScaler pass was removed: its output was overwritten straight
# away by the MinMaxScaler result, which is fitted on the unscaled data.)

# Work on copies so the unscaled X_train / X_test stay available to later cells
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Identify the columns to scale
columns_to_scale = ['avg_word_length', 'word_length']

# Fit on the training split only and reuse the fitted scaler for the test split,
# so the scaling parameters are never influenced by test data.
# MinMaxScaler is already imported in the notebook's import cell.
min_max_scaler = MinMaxScaler()
X_train_scaled[columns_to_scale] = min_max_scaler.fit_transform(X_train[columns_to_scale])
X_test_scaled[columns_to_scale] = min_max_scaler.transform(X_test[columns_to_scale])

# Verify the scaled values are between 0 and 1
print("\nScaled training data sample (avg_word_length and word_length should be between 0-1):")
print(X_train_scaled[columns_to_scale].describe())


In [None]:
# Fit a 100-tree Random Forest (fixed seed) on the scaled training features
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# Predict and score each split in turn
y_train_pred = rf_classifier.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred)

y_test_pred = rf_classifier.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nRandom Forest Classifier Results:")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

# Per-class metrics on the held-out split.
# NOTE(review): target_names assumes exactly two classes, sorted so that ham
# comes before spam — confirm against the labels actually present in y.
print("\nClassification Report (Test Set):")
test_report = classification_report(y_test, y_test_pred, target_names=['Ham','Spam'])
print(test_report)

In [None]:
# Confusion matrix of the scaled-feature model on the test split
print("\nConfusion Matrix (Test Set):")
conf_matrix = confusion_matrix(y_test, y_test_pred)
print(conf_matrix)

# Pair every training column with its importance score, highest first
importance_table = {
    'Feature': X_train.columns,
    'Importance': rf_classifier.feature_importances_,
}
feature_importances = pd.DataFrame(importance_table)
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importances)

In [None]:
# Train Random Forest model on the unscaled training features (all feature columns).
# NOTE(review): `rf` is re-created and refitted in the following cell using only the
# int64/float64 columns, so the model fitted here is superseded — confirm whether
# this cell is still needed.
print("\nTraining Random Forest model...")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)


In [None]:
# Keep only the int64 / float64 feature columns; every other dtype is dropped
numeric_dtypes = ['int64', 'float64']
X_train_numeric = X_train.select_dtypes(include=numeric_dtypes)
X_test_numeric = X_test.select_dtypes(include=numeric_dtypes)

# Refit `rf` on the numeric subset; this is the model the later cells evaluate and save
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_numeric, y_train)

In [None]:
# Evaluate model on training data.
# `rf` was fitted on X_train_numeric, so it must be given the same column subset;
# passing the full X_train fails whenever a non-numeric feature column exists.
train_preds = rf.predict(X_train_numeric)
train_accuracy = accuracy_score(y_train, train_preds)
print(f"\nTraining accuracy: {train_accuracy:.3f}")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Split the feature columns by dtype: int64/float64 columns vs. object columns
numeric_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X_train.select_dtypes(include=['object']).columns)

# Numeric columns pass through untouched; object columns are one-hot encoded,
# and categories unseen during fitting are ignored at prediction time.
numeric_step = ('num', 'passthrough', numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Chain preprocessing and the classifier so both are fitted and applied together
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipeline.fit(X_train, y_train)

# Training-set accuracy of the full pipeline
train_preds = pipeline.predict(X_train)
train_accuracy = accuracy_score(y_train, train_preds)
print(f"\nTraining accuracy: {train_accuracy:.3f}")

# And for test data when ready
# test_preds = pipeline.predict(X_test)
# test_accuracy = accuracy_score(y_test, test_preds)
# print(f"Test accuracy: {test_accuracy:.3f}")

In [None]:
# Evaluate model on test data.
# `rf` was fitted on the numeric-only columns (X_train_numeric), so the test split
# must be reduced to the same columns (X_test_numeric) before predicting.
test_preds = rf.predict(X_test_numeric)
test_accuracy = accuracy_score(y_test, test_preds)
print(f"Test accuracy: {test_accuracy:.3f}")


In [None]:
# Get the original class names for display.
# These are the categories the encoder was fitted on (the standardized types).
original_class_names = label_encoder.classes_


In [None]:
# Print detailed classification report for the test-set predictions.
# NOTE(review): target_names must line up one-to-one with the sorted labels found in
# y_test / test_preds — confirm they match label_encoder.classes_.
print("\nClassification Report:")
print(classification_report(y_test, test_preds, target_names=original_class_names))


In [None]:
# Draw the test-set confusion matrix as an annotated heatmap and write it to disk
cm = confusion_matrix(y_test, test_preds)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=original_class_names,
    yticklabels=original_class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.tight_layout()
fig.savefig('confusion_matrix_multiclass.png')
print("\nConfusion matrix saved as 'confusion_matrix_multiclass.png'")


In [None]:
# Feature importance.
# `rf` was fitted on X_train_numeric, so its importances line up with those columns.
# Using the full `features` list would raise a length-mismatch error (or mislabel
# the bars) whenever a non-numeric feature column was dropped before fitting.
feature_importance = pd.DataFrame({
    'Feature': X_train_numeric.columns,
    'Importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

# Horizontal bar chart, most important feature first, saved next to the notebook
plt.figure(figsize=(12, 10))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance_multiclass.png')
print("Feature importance plot saved as 'feature_importance_multiclass.png'")


In [None]:
# Display top 5 most important features.
# feature_importance is already sorted in descending order, so head(5) is the top five.
print("\nTop 5 most important features:")
print(feature_importance.head(5))


In [None]:
def tune_hyperparameters():
    """Grid-search Random Forest hyper-parameters and return the best model.

    Reads the notebook-level ``X_train`` / ``y_train`` for a 5-fold
    cross-validated search scored on accuracy, then reports the accuracy of
    the winning model on ``X_test`` / ``y_test``.

    Returns
    -------
    RandomForestClassifier
        The estimator refitted by ``GridSearchCV`` on the whole training set
        with the best parameter combination.
    """
    print("\nPerforming hyperparameter tuning...")
    from sklearn.model_selection import GridSearchCV

    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        scoring='accuracy',
        refit=True  # explicit: the best configuration is refitted on all of X_train
    )

    grid_search.fit(X_train, y_train)
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

    # GridSearchCV has already refitted the best configuration (same random_state)
    # on the full training set, so reuse it instead of training an identical forest again.
    best_rf = grid_search.best_estimator_

    # Evaluate
    best_preds = best_rf.predict(X_test)
    best_accuracy = accuracy_score(y_test, best_preds)
    print(f"Tuned model test accuracy: {best_accuracy:.3f}")

    return best_rf

In [None]:
# Save the model and label encoder.
# The stored feature list must be exactly the columns `rf` was fitted on
# (X_train_numeric), because predict_message_type() selects new_data[features]
# before calling the model; saving the broader `features` list would make
# prediction fail whenever a non-numeric column had been dropped.
import pickle
with open('message_classifier_model.pkl', 'wb') as file:
    pickle.dump(
        {
            'model': rf,
            'label_encoder': label_encoder,
            'features': list(X_train_numeric.columns),
        },
        file,
    )
print("\nModel saved as 'message_classifier_model.pkl'")


In [None]:
# Function to make predictions on new data
def predict_message_type(model_data, new_data):
    """Predict the message type for every row of ``new_data``.

    Parameters
    ----------
    model_data : dict
        Bundle with the keys ``'model'`` (fitted classifier providing
        ``predict`` and ``predict_proba``), ``'label_encoder'`` (providing
        ``inverse_transform`` and ``classes_``) and ``'features'`` (ordered
        list of the column names the model expects).
    new_data : pandas.DataFrame
        Rows to classify. Columns listed in ``features`` but absent here are
        filled with 0 after a warning is printed. The caller's frame is never
        modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with a ``Predicted_Type`` column and one
        ``<class>_Probability`` column per class.
    """
    # Extract model components
    model = model_data['model']
    label_encoder = model_data['label_encoder']
    features = model_data['features']

    # Ensure new_data has the same features as training data
    missing_features = [f for f in features if f not in new_data.columns]
    if missing_features:
        print(f"Warning: Missing features: {missing_features}")
        # Work on a copy so the default columns do not leak into the caller's frame
        new_data = new_data.copy()
        for feature in missing_features:
            new_data[feature] = 0  # Add missing features with default values

    new_data_features = new_data[features]

    # Make predictions and decode them back to class names
    predictions_encoded = model.predict(new_data_features)
    predictions = label_encoder.inverse_transform(predictions_encoded)

    # Get probabilities for each class
    probabilities = model.predict_proba(new_data_features)

    # Create results DataFrame
    results = pd.DataFrame({
        'Predicted_Type': predictions
    })

    # predict_proba columns follow model.classes_, which can be a subset of the
    # encoder's classes; decode those so every probability column gets the right name.
    model_classes = getattr(model, 'classes_', None)
    if model_classes is None:
        class_names = label_encoder.classes_
    else:
        class_names = label_encoder.inverse_transform(model_classes)

    # Add probability columns for each class
    for i, class_name in enumerate(class_names):
        results[f'{class_name}_Probability'] = probabilities[:, i]

    return results

In [None]:
# Closing summary: print the two steps needed to reuse the saved model
# (unpickle the bundle written earlier, then call predict_message_type on new rows).
print("\nModel training and evaluation complete!")
print("To use this model for predictions:")
print("1. Load the model: model_data = pickle.load(open('message_classifier_model.pkl', 'rb'))")
print("2. Call: predict_message_type(model_data, new_data)")


In [None]:
# Add a simple test prediction example.
# Nothing is executed here: the snippet is only printed as text for the reader to copy.
# Its sample frame lists just two feature columns and leaves the rest as a placeholder.
print("\nExample prediction code:")
print("import pickle")
print("model_data = pickle.load(open('message_classifier_model.pkl', 'rb'))")
print("# Create a sample input with same columns as the training data")
print("sample_data = pd.DataFrame({")
print("    'has_phone_number': [1, 0, 0],")
print("    'has_special_chars': [1, 1, 1],")
print("    # Add other features...")
print("})")
print("predictions = predict_message_type(model_data, sample_data)")
print("print(predictions)")