In [1]:
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,learning_curve
from itertools import cycle
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from log import get_logger
from collections import Counter
import logging

In [2]:
# Absolute, machine-specific filesystem locations used by the rest of the notebook.
# NOTE(review): these point into one user's home directory; adjust them before
# running on another machine.
# DATA_PATH: pickle file read by load_data().
DATA_PATH = '/Users/ashishsingh/Desktop/SIGN/character and numbers/data/processed_data/data.pickle'
# MODEL_DIR: default output directory of save_best_model().
MODEL_DIR = '/Users/ashishsingh/Desktop/SIGN/character and numbers/models'
# GRAPH_DIR: default output directory of every plot_* function.
GRAPH_DIR = '/Users/ashishsingh/Desktop/SIGN/character and numbers/graph'

In [3]:
# Logger setup
# Raise the threshold of the "matplotlib" logger to ERROR so that its
# lower-severity records are not emitted while figures are produced.
logging.getLogger("matplotlib").setLevel(logging.ERROR)
# Module-wide logger used by every function below. get_logger is imported from
# the project-local `log` module; presumably it returns a configured
# logging.Logger -- confirm against that module.
logger = get_logger(__name__)

In [4]:
def preprocess_labels(labels):
    """
    Map raw class labels onto integer codes.

    A fresh LabelEncoder is fitted on `labels`; the function hands back a
    tuple of (encoded labels, fitted encoder) so the encoder can later be
    used to translate codes back into the original label values.
    """
    encoder = LabelEncoder()
    return encoder.fit_transform(labels), encoder

In [5]:
def load_data(path, expected_features=84):
    """
    Load the processed dataset from a pickle file and validate it.

    The pickle is expected to hold a dict with the keys 'data' (a sequence of
    per-sample feature sequences) and 'labels' (one label per sample).

    Parameters
    ----------
    path : str
        Location of the pickle file.
    expected_features : int, optional
        Number of features every sample must have (default 84).

    Returns
    -------
    tuple
        (data, labels) as NumPy arrays.

    Raises
    ------
    ValueError
        If any sample does not have `expected_features` features, or if the
        number of samples and the number of labels differ.
    """
    logger.info(f"Loading data from {path}")
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load pickles produced by a trusted source.
    # The context manager guarantees the file handle is closed.
    with open(path, 'rb') as f:
        data_dict = pickle.load(f)

    raw_samples = data_dict['data']

    # Validate feature length on the raw samples, *before* building the array:
    # recent NumPy versions refuse to build an array from ragged input, which
    # would hide the per-sample diagnostic below behind a generic error.
    for i, sample in enumerate(raw_samples):
        if len(sample) != expected_features:
            logger.error(f"Sample {i} has {len(sample)} features, expected {expected_features}")
            raise ValueError("Feature length mismatch in data.")

    data = np.asarray(raw_samples)
    labels = np.asarray(data_dict['labels'])

    # Every sample needs exactly one label; fail early with a clear message.
    if len(data) != len(labels):
        logger.error(f"Got {len(data)} samples but {len(labels)} labels")
        raise ValueError("Number of samples and labels do not match.")

    # Count label occurrences
    label_counts = Counter(labels)

    logger.info(f"Data loaded successfully with {len(data)} total samples.")
    logger.info(f"Number of unique labels: {len(label_counts)}")
    for label, count in label_counts.items():
        logger.info(f"Label '{label}': {count} samples")

    return data, labels


In [6]:
def train_and_evaluate_models(x_train, x_test, y_train, y_test):
    """
    Fit each candidate classifier and score it on the held-out split.

    Returns two dicts keyed by model name: the accuracy obtained on
    (x_test, y_test) and the fitted estimator itself.
    """
    logger.info("Training models...")

    # (name, estimator) pairs, evaluated in this order.
    candidates = [
        ("RandomForest", RandomForestClassifier(
            n_estimators=200, max_depth=20,
            min_samples_split=5, min_samples_leaf=2,
            random_state=42
        )),
        ("XGBoost", XGBClassifier(
            n_estimators=100, max_depth=6,
            learning_rate=0.1, random_state=42,
            use_label_encoder=False, eval_metric='mlogloss'
        )),
        ("SVM", SVC(kernel='rbf', C=1, probability=True, random_state=42)),
        ("LogisticRegression", LogisticRegression(max_iter=500, random_state=42)),
        ("KNN", KNeighborsClassifier(n_neighbors=5)),
        ("DecisionTree", DecisionTreeClassifier(random_state=42)),
        ("NaiveBayes", GaussianNB()),
        ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)),
    ]

    results, trained_models = {}, {}

    for name, estimator in candidates:
        logger.info(f"Training {name}...")
        estimator.fit(x_train, y_train)
        # Accuracy on the test split is the single comparison metric.
        score = accuracy_score(y_test, estimator.predict(x_test))
        results[name] = score
        trained_models[name] = estimator
        logger.info(f"{name} Accuracy: {score*100:.2f}%")

    return results, trained_models

In [7]:
def plot_model_accuracies(results, save_dir=GRAPH_DIR):
    """
    Write a bar chart comparing model accuracies to 'model_accuracies.png'.

    `results` maps model name to accuracy as a fraction; bars and their
    annotations are shown in percent. The image is saved inside `save_dir`.
    """
    logger.info("Plotting model accuracies...")
    save_path = os.path.join(save_dir, 'model_accuracies.png')

    model_names = list(results.keys())
    percentages = [value*100 for value in results.values()]

    plt.figure(figsize=(8, 6))
    plt.bar(model_names, percentages, color='lightgreen')
    plt.ylabel('Accuracy (%)')
    plt.title('Model Accuracy Comparison')
    plt.ylim(0, 100)
    # Annotate each bar with its value, placed slightly above the bar top.
    for position, pct in enumerate(percentages):
        plt.text(position, pct + 1, f"{pct:.2f}%", ha='center')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

    logger.info(f"Model accuracies saved to {save_path}")

In [8]:
def plot_confusion_matrix(y_true, y_pred, labels, save_dir=GRAPH_DIR):
    """
    Render the confusion matrix of the best model to
    'best_model_confusion_matrix.png' inside `save_dir`.

    `labels` supplies the tick labels shown on both axes.
    """
    logger.info("Plotting confusion matrix for best model...")
    save_path = os.path.join(save_dir, 'best_model_confusion_matrix.png')

    matrix = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=labels,
    ).plot(cmap='Blues', xticks_rotation=45)
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

    logger.info(f"Confusion matrix saved to {save_path}")

In [9]:
def plot_pca(X, y, save_dir=GRAPH_DIR):
    """
    Save a 2-component PCA scatter plot of the data, colored by label.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to PCA.fit_transform.
    y : array-like
        One label per row of X; each distinct value gets its own color and
        legend entry.
    save_dir : str, optional
        Directory receiving 'pca_visualization.png'.
    """
    logger.info("Plotting PCA visualization...")
    # Coerce to ndarray so the `y == label` comparisons below yield boolean
    # masks even when a plain list is passed.
    y = np.asarray(y)
    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X)
    save_path = os.path.join(save_dir, 'pca_visualization.png')

    plt.figure(figsize=(10, 8))  # Larger figure for clarity
    unique_labels = np.unique(y)
    # plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9. The second argument resamples the
    # colormap to one discrete color per class.
    colors = plt.get_cmap('viridis', len(unique_labels))
    for i, label in enumerate(unique_labels):
        mask = y == label
        plt.scatter(X_reduced[mask, 0], X_reduced[mask, 1],
                    color=colors(i), label=f'Class {label}', alpha=0.6)  # alpha keeps overlaps visible
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.title(f'PCA Visualization of Data (Explained Variance: {pca.explained_variance_ratio_.sum()*100:.2f}%)')
    plt.legend(title="Classes", bbox_to_anchor=(1.05, 1), loc='upper left')  # legend outside the axes
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, format='png')  # Higher resolution
    plt.close()

    logger.info(f"PCA visualization saved to {save_path}")

In [10]:
def plot_class_distribution(labels, save_dir=GRAPH_DIR):
    """
    Write a bar chart of samples per class to 'class_distribution.png'
    inside `save_dir`.
    """
    logger.info("Plotting class distribution...")
    unique_labels, counts = np.unique(labels, return_counts=True)
    save_path = os.path.join(save_dir, 'class_distribution.png')

    plt.figure(figsize=(8, 6))
    plt.bar(unique_labels, counts, color='skyblue')
    plt.xlabel('Classes')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Dataset')
    plt.xticks(unique_labels)
    # Print each class count just above its bar.
    for class_label, class_count in zip(unique_labels, counts):
        plt.text(class_label, class_count + 10, str(class_count), ha='center')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

    logger.info(f"Class distribution saved to {save_path}")

In [11]:
def plot_feature_importance(model, best_model_name, save_dir=GRAPH_DIR):
    """
    Write a bar chart of the 20 largest feature importances to
    'feature_importance.png' inside `save_dir`.

    Models that do not expose `feature_importances_` are skipped with a
    warning and nothing is written.
    """
    # Guard clause: nothing to plot for estimators without importances.
    if not hasattr(model, 'feature_importances_'):
        logger.warning(f"{best_model_name} does not support feature_importances_")
        return

    logger.info(f"Plotting feature importance for {best_model_name}...")
    importances = model.feature_importances_
    descending = np.argsort(importances)[::-1]
    indices = descending[:20]  # keep only the top 20 for readability
    save_path = os.path.join(save_dir, 'feature_importance.png')

    positions = range(len(indices))
    tick_labels = [f"Feature {i}" for i in indices]

    plt.figure(figsize=(10, 6))
    plt.bar(positions, importances[indices], color='salmon')
    plt.xticks(positions, tick_labels, rotation=90)
    plt.xlabel('Features (Indexed)')
    plt.ylabel('Importance Score')
    plt.title(f'Feature Importances for {best_model_name}')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

    logger.info(f"Feature importance saved to {save_path}")

In [12]:
def plot_roc_curve(model, x_test, y_test, le, save_dir=GRAPH_DIR):
    """
    Save one-vs-rest ROC curves (plus the micro-average) for the best model.

    Parameters
    ----------
    model : fitted estimator
        Must provide predict_proba.
    x_test : array-like
        Test features.
    y_test : array-like
        Test labels as integer codes 0..n_classes-1 (they are binarized
        against range(len(le.classes_))).
    le : LabelEncoder
        Fitted encoder; its classes_ supply the class count and legend names.
    save_dir : str, optional
        Directory receiving 'roc_curve.png'.

    Notes
    -----
    Assumes column i of predict_proba corresponds to encoded class i --
    TODO confirm for models trained on a subset of the classes.
    """
    logger.info("Plotting ROC curve for best model...")
    n_classes = len(le.classes_)
    y_test_bin = label_binarize(y_test, classes=range(n_classes))
    if n_classes == 2:
        # label_binarize collapses two-class problems into a single column
        # (the indicator of class 1). Rebuild the two-column one-hot layout so
        # it lines up with the two predict_proba columns used below.
        y_test_bin = np.hstack((1 - y_test_bin, y_test_bin))
    y_prob = model.predict_proba(x_test)
    fpr = {}
    tpr = {}
    roc_auc = {}

    # Per-class one-vs-rest curves.
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_prob.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    save_path = os.path.join(save_dir, 'roc_curve.png')
    plt.figure(figsize=(8, 6))
    # The palette is cycled, so colors repeat when there are more than 10 classes.
    colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'ROC curve of class {le.classes_[i]} (AUC = {roc_auc[i]:.2f})')

    plt.plot(fpr["micro"], tpr["micro"],
             label=f'micro-average ROC curve (AUC = {roc_auc["micro"]:.2f})',
             color='deeppink', linestyle=':', linewidth=4)

    # Diagonal reference line (TPR == FPR).
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multi-Class ROC Curve')
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

    logger.info(f"ROC curve saved to {save_path}")

In [13]:
def plot_learning_curve(model, x_train, y_train, save_dir=GRAPH_DIR):
    """
    Write the cross-validated learning curve of the best model to
    'learning_curve.png' inside `save_dir`.

    Uses 5-fold cross-validation over ten training-set fractions from 10%
    to 100%; the shaded bands show one standard deviation across folds.
    """
    logger.info("Plotting learning curve for best model...")
    fractions = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, test_scores = learning_curve(
        model, x_train, y_train, cv=5, n_jobs=-1, train_sizes=fractions
    )

    # Aggregate across the CV folds (axis 1) for every training-set size.
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

    train_low, train_high = train_mean - train_std, train_mean + train_std
    test_low, test_high = test_mean - test_std, test_mean + test_std

    save_path = os.path.join(save_dir, 'learning_curve.png')
    plt.figure(figsize=(8, 6))
    plt.plot(train_sizes, train_mean, '--', color="blue", label="Training score")
    plt.plot(train_sizes, test_mean, color="green", label="Cross-validation score")
    plt.fill_between(train_sizes, train_low, train_high, color="blue", alpha=0.15)
    plt.fill_between(train_sizes, test_low, test_high, color="green", alpha=0.15)
    plt.xlabel('Number of Training Samples')
    plt.ylabel('Accuracy Score')
    plt.title('Learning Curve for Best Model')
    plt.legend(loc='best')
    plt.tight_layout()
    plt.savefig(save_path)
    plt.close()

    logger.info(f"Learning curve saved to {save_path}")

In [14]:
def save_best_model(best_model_name, model, le, save_dir=MODEL_DIR):
    """
    Persist the best model and its label encoder with pickle.

    Parameters
    ----------
    best_model_name : str
        Name of the model; used for logging only.
    model : object
        Fitted estimator, stored as {'model': model} in 'best_model.p'.
    le : LabelEncoder
        Fitted label encoder, stored as-is in 'labels.p'.
    save_dir : str, optional
        Output directory; created (including parents) if it does not exist.
    """
    logger.info(f"Saving best model: {best_model_name}")
    # Saving is the last step after training; make sure a missing output
    # directory cannot cause the trained model to be lost.
    os.makedirs(save_dir, exist_ok=True)
    model_path = os.path.join(save_dir, 'best_model.p')
    labels_path = os.path.join(save_dir, 'labels.p')

    with open(model_path, 'wb') as f:
        pickle.dump({'model': model}, f)
    with open(labels_path, 'wb') as f:
        pickle.dump(le, f)

    logger.info(f"Best model saved to {model_path}")
    logger.info(f"Label encoder saved to {labels_path}")

In [15]:
# End-to-end pipeline: load data, split, train all candidate models, pick the
# most accurate one on the test split, write diagnostic plots, persist the winner.
if __name__ == "__main__":
    # Load and preprocess data
    data, labels = load_data(DATA_PATH)
    labels_encoded, le = preprocess_labels(labels)

    # Train-test split: 80/20, shuffled with a fixed seed.
    # Stratification uses the raw labels while the targets are the encoded
    # ones; both describe the same classes.
    x_train, x_test, y_train, y_test = train_test_split(
        data,
        labels_encoded,
        test_size=0.2,
        shuffle=True,
        stratify=labels,
        random_state=42
    )
    # Class distribution of the full dataset (raw labels, before splitting).
    plot_class_distribution(labels)

    # Train models
    results, trained_models = train_and_evaluate_models(x_train, x_test, y_train, y_test)

    # Plot accuracies
    plot_model_accuracies(results)

    # Find best model: the one with the highest test accuracy.
    best_model_name = max(results, key=results.get)
    best_model = trained_models[best_model_name]
    logger.info(f"Best model: {best_model_name} with accuracy {results[best_model_name]*100:.2f}%")

    # Confusion matrix for best model
    y_pred_best = best_model.predict(x_test)
    plot_confusion_matrix(y_test, y_pred_best, le.classes_)

    # Further diagnostics for the best model. plot_feature_importance logs a
    # warning and skips when the model has no feature_importances_.
    plot_feature_importance(best_model, best_model_name)
    plot_roc_curve(best_model, x_test, y_test, le)
    # learning_curve re-fits the model with 5-fold CV on the training split.
    plot_learning_curve(best_model, x_train, y_train)

    # Advanced visualizations (PCA is fitted on the full dataset, not just the training split)
    plot_pca(data, labels_encoded)

    # Save best model & label encoder
    save_best_model(best_model_name, best_model, le)

2025-09-27 12:31:51,348 | INFO     | __main__ | Loading data from /Users/ashishsingh/Desktop/SIGN/character and numbers/data/processed_data/data.pickle
2025-09-27 12:31:51,559 | INFO     | __main__ | Data loaded successfully with 51824 total samples.
2025-09-27 12:31:51,560 | INFO     | __main__ | Number of unique labels: 36
2025-09-27 12:31:51,560 | INFO     | __main__ | Label 'R': 633 samples
2025-09-27 12:31:51,560 | INFO     | __main__ | Label 'U': 742 samples
2025-09-27 12:31:51,561 | INFO     | __main__ | Label '9': 3044 samples
2025-09-27 12:31:51,561 | INFO     | __main__ | Label '0': 3078 samples
2025-09-27 12:31:51,561 | INFO     | __main__ | Label '7': 3388 samples
2025-09-27 12:31:51,562 | INFO     | __main__ | Label 'I': 927 samples
2025-09-27 12:31:51,562 | INFO     | __main__ | Label 'N': 964 samples
2025-09-27 12:31:51,562 | INFO     | __main__ | Label 'G': 719 samples
2025-09-27 12:31:51,563 | INFO     | __main__ | Label '6': 3335 samples
2025-09-27 12:31:51,563 | INFO