In [5]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import pickle
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from time import time
import argparse
import os
import sys

def main(dataset_dir, output_dir):
    # Importing the datasets
    try:
        df_train = pd.read_csv(os.path.join(dataset_dir, 'train_data.csv')).sample(n=25000, random_state=42)
        df_test = pd.read_csv(os.path.join(dataset_dir, 'test_data.csv'))
    except FileNotFoundError as e:
        print(f'Error: {e}')
        sys.exit(1)  # Exit the script if datasets are not found
    except pd.errors.EmptyDataError as e:
        print(f'Error: One of the CSV files is empty or malformed: {e}')
        sys.exit(1)
    except Exception as e:
        print(f'An unexpected error occurred while reading the datasets: {e}')
        sys.exit(1)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    print(f"Dataset directory: {dataset_dir}")
    print(f"Output directory: {output_dir}")

    print(f"\nTraining Dataset has {df_train.shape[0]} rows and {df_train.shape[1]} columns")
    print(f"Testing Dataset has {df_test.shape[0]} rows and {df_test.shape[1]} columns")

    # Display first 5 rows of training data
    print("\nFirst 5 rows of Training Data:")
    print(df_train.head(5))

    # Check if 'sentiment' column exists
    if 'sentiment' not in df_train.columns or 'sentiment' not in df_test.columns:
        print("Error: 'sentiment' column not found in one of the datasets.")
        sys.exit(1)

    sentiment_counts_train = df_train['sentiment'].value_counts()
    print("\nSentiment distribution in Training Data:")
    print(sentiment_counts_train)

    # Display first 5 rows of testing data
    print("\nFirst 5 rows of Testing Data:")
    print(df_test.head(5))

    sentiment_counts_test = df_test['sentiment'].value_counts()
    print("\nSentiment distribution in Testing Data:")
    print(sentiment_counts_test)

    # Check if 'sentence' column exists
    if 'sentence' not in df_train.columns or 'sentence' not in df_test.columns:
        print("Error: 'sentence' column not found in one of the datasets.")
        sys.exit(1)

    trainX = df_train['sentence'].astype(str)  # Ensure all entries are strings
    trainY = df_train['sentiment']
    testX = df_test['sentence'].astype(str)
    testY = df_test['sentiment']

    # Initialize TfidfVectorizer with appropriate parameters to handle large datasets
    tf_vec = TfidfVectorizer()
    # tf_vec = TfidfVectorizer(
    #     max_features=100000,  # Limit to top 100k features to manage memory
    #     ngram_range=(1, 2),    # Use unigrams and bigrams
    #     stop_words='english',  # Remove English stop words
    #     lowercase=True,
    #     strip_accents='unicode'
    # )

    # Transform training data
    print("\nStarting TF-IDF vectorization on training data...")
    start = time()
    X_train_tf = tf_vec.fit_transform(trainX)
    end = time()
    print(f"Time to transform training data: {end - start:.2f}s")
    print(f"Training Data Shape: n_samples={X_train_tf.shape[0]}, n_features={X_train_tf.shape[1]}")

    # Transform testing data
    print("\nStarting TF-IDF vectorization on testing data...")
    start = time()
    X_test_tf = tf_vec.transform(testX)
    duration = time() - start
    print(f"Time taken to extract features from test data: {duration:.2f} seconds")
    print(f"Testing Data Shape: n_samples={X_test_tf.shape[0]}, n_features={X_test_tf.shape[1]}")

    # Defining the parameter grid for Random Forest
    param_dist = {
        'n_estimators': [100, 200, 300],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True, False]
    }

    # Initialize the Random Forest Classifier
    rf_classifier = RandomForestClassifier(random_state=42)

    # Setting up GridSearchCV to find the best parameters
    random_search = GridSearchCV(
        estimator=rf_classifier,
        param_grid=param_dist,
        scoring='roc_auc',  # Optimize for AUC-ROC
        cv=3,  # 3-fold cross-validation
        verbose=2,
        n_jobs=-1  # Use all available cores
    )

    # Start the random search
    print("\nStarting GridSearchCV for Random Forest...")
    start = time()
    try:
        random_search.fit(X_train_tf, trainY)
    except Exception as e:
        print(f"An error occurred during GridSearchCV: {e}")
        sys.exit(1)
    end = time()
    print(f"GridSearchCV completed in {end - start:.2f}s")

    print("\nBest parameters found:")
    print(random_search.best_params_)

    print(f"\nBest cross-validation AUC-ROC score: {random_search.best_score_:.4f}")

    # Evaluate the model with the best parameters on the test set
    best_rf_classifier = random_search.best_estimator_
    print("\nMaking predictions on the test set...")
    start = time()
    try:
        y_pred = best_rf_classifier.predict(X_test_tf)
        if hasattr(best_rf_classifier, "predict_proba"):
            y_pred_proba = best_rf_classifier.predict_proba(X_test_tf)[:, 1]  # Probability estimates for the positive class
        else:
            # If predict_proba is not available, use decision_function
            y_pred_proba = best_rf_classifier.decision_function(X_test_tf)
            y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (y_pred_proba.max() - y_pred_proba.min())  # Normalize to [0,1]
    except Exception as e:
        print(f"An error occurred during prediction: {e}")
        sys.exit(1)
    prediction_time = time() - start
    print(f"Prediction time: {prediction_time:.4f}s")

    # Calculate Accuracy
    acc = metrics.accuracy_score(testY, y_pred)
    print(f"\nAccuracy on test set: {acc*100:.2f}%")

    # Calculate AUC-ROC
    try:
        auc = roc_auc_score(testY, y_pred_proba)
        print(f"AUC-ROC on test set: {auc:.4f}")
    except ValueError as e:
        print(f"Error computing AUC-ROC: {e}")
        auc = None

    # Classification report
    print("\nClassification report for the optimized classifier: \n")
    print(metrics.classification_report(testY, y_pred))

    # Create a confusion matrix heatmap
    conf_matrix = confusion_matrix(testY, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])

    plt.title("Confusion Matrix Heatmap")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")

    cm_path = os.path.join(output_dir, 'confusion_matrix_heatmap.png')
    plt.savefig(cm_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Confusion matrix heatmap saved as '{cm_path}'")

    # Plot ROC Curve if AUC is computable
    if auc is not None:
        fpr, tpr, thresholds = roc_curve(testY, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {auc:.4f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")

        roc_path = os.path.join(output_dir, 'roc_curve.png')
        plt.savefig(roc_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"ROC curve saved as '{roc_path}'")
    else:
        print("ROC curve was not plotted due to inability to compute AUC-ROC.")

    # Save the model and the vectorizer
    try:
        model_path = os.path.join(output_dir, 'best_rf_classifier_model.pkl')
        with open(model_path, 'wb') as model_file:
            pickle.dump(best_rf_classifier, model_file)
        print(f"Model saved as '{model_path}'")
    except Exception as e:
        print(f"An error occurred while saving the model: {e}")

    try:
        vec_path = os.path.join(output_dir, 'tfidf_vectorizer.pkl')
        with open(vec_path, 'wb') as vec_file:
            pickle.dump(tf_vec, vec_file)
        print(f"TfidfVectorizer saved as '{vec_path}'")
    except Exception as e:
        print(f"An error occurred while saving the vectorizer: {e}")

In [6]:
main('/home/dgxuser16/NTL/mccarthy/ahmad/Projects/ML_Course_Proj/data/twitter', 'output_new')

Dataset directory: /home/dgxuser16/NTL/mccarthy/ahmad/Projects/ML_Course_Proj/data/twitter
Output directory: output_new

Training Dataset has 25000 rows and 2 columns
Testing Dataset has 359 rows and 2 columns

First 5 rows of Training Data:
                                                  sentence  sentiment
1110964  chillin at the rooftoop on a rainy sunday than...          1
442422   me loves you too fran i don t feel too good ri...          0
348915   u guys r so funny p boston tonight so close ma...          0
575434   can t sleep i had that chance with that chunky...          0
289960   i miss my tv it s at my apartment in san anton...          0

Sentiment distribution in Training Data:
sentiment
0    12634
1    12366
Name: count, dtype: int64

First 5 rows of Testing Data:
                                            sentence  sentiment
0  i loooooooovvvvvveee my kindle not that the dx...          1
1  reading my kindle love it lee childs is good read          1
2  ok first ass

In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
import pickle
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import argparse
import os
import sys

def main(dataset_dir, output_dir):
    # Importing the datasets
    try:
        df_train = pd.read_csv(os.path.join(dataset_dir, 'train_data.csv')).sample(n=100000, random_state=42)
        df_test = pd.read_csv(os.path.join(dataset_dir, 'test_data.csv'))
    except FileNotFoundError as e:
        print(f'Error: {e}')
        sys.exit(1)
    except pd.errors.EmptyDataError as e:
        print(f'Error: One of the CSV files is empty or malformed: {e}')
        sys.exit(1)
    except Exception as e:
        print(f'An unexpected error occurred while reading the datasets: {e}')
        sys.exit(1)

    os.makedirs(output_dir, exist_ok=True)

    print(f"Dataset directory: {dataset_dir}")
    print(f"Output directory: {output_dir}")

    print(f"\nTraining data shape: {df_train.shape}\n")
    print(f"Testing data shape: {df_test.shape}")

    if 'sentiment' not in df_train.columns or 'sentiment' not in df_test.columns:
        print("Error: 'sentiment' column not found in one of the datasets.")
        sys.exit(1)

    trainX = df_train['sentence'].astype(str)
    trainY = df_train['sentiment']
    testX = df_test['sentence'].astype(str)
    testY = df_test['sentiment']

    tf_vec = TfidfVectorizer()
    print("\nStarting TF-IDF vectorization on training data...")
    start = time()
    X_train_tf = tf_vec.fit_transform(trainX)
    end = time()
    print(f"Time to transform training data: {end - start:.2f}s")

    print("\nStarting TF-IDF vectorization on testing data...")
    start = time()
    X_test_tf = tf_vec.transform(testX)
    print(f"Time taken to extract features from test data: {time() - start:.2f}s")

    # Initialize Random Forest Classifier with default parameters
    rf_classifier = RandomForestClassifier(random_state=42, bootstrap=True, criterion='entropy', max_features='log2', n_estimators=300)
    
    print("\nTraining Random Forest Classifier...")
    start = time()
    try:
        rf_classifier.fit(X_train_tf, trainY)
    except Exception as e:
        print(f"An error occurred during model training: {e}")
        sys.exit(1)
    print(f"Training completed in {time() - start:.2f}s")

    print("\nMaking predictions on the test set...")
    start = time()
    try:
        y_pred = rf_classifier.predict(X_test_tf)
        if hasattr(rf_classifier, "predict_proba"):
            y_pred_proba = rf_classifier.predict_proba(X_test_tf)[:, 1]
        else:
            y_pred_proba = rf_classifier.decision_function(X_test_tf)
            y_pred_proba = (y_pred_proba - y_pred_proba.min()) / (y_pred_proba.max() - y_pred_proba.min())
    except Exception as e:
        print(f"An error occurred during prediction: {e}")
        sys.exit(1)
    print(f"Prediction time: {time() - start:.2f}s")

    acc = metrics.accuracy_score(testY, y_pred)
    print(f"\nAccuracy on test set: {acc*100:.2f}%")

    try:
        auc = roc_auc_score(testY, y_pred_proba)
        print(f"AUC-ROC on test set: {auc:.4f}")
    except ValueError as e:
        print(f"Error computing AUC-ROC: {e}")
        auc = None

    print("\nClassification report for the Random Forest classifier: \n")
    print(metrics.classification_report(testY, y_pred))

    conf_matrix = confusion_matrix(testY, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])

    cm_path = os.path.join(output_dir, 'confusion_matrix_heatmap.png')
    plt.savefig(cm_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Confusion matrix heatmap saved as '{cm_path}'")

    if auc is not None:
        fpr, tpr, thresholds = roc_curve(testY, y_pred_proba)
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {auc:.4f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")

        roc_path = os.path.join(output_dir, 'roc_curve.png')
        plt.savefig(roc_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"ROC curve saved as '{roc_path}'")
    else:
        print("ROC curve was not plotted due to inability to compute AUC-ROC.")

    try:
        model_path = os.path.join(output_dir, 'rf_classifier_model.pkl')
        with open(model_path, 'wb') as model_file:
            pickle.dump(rf_classifier, model_file)
        print(f"Model saved as '{model_path}'")
    except Exception as e:
        print(f"An error occurred while saving the model: {e}")

    try:
        vec_path = os.path.join(output_dir, 'tfidf_vectorizer.pkl')
        with open(vec_path, 'wb') as vec_file:
            pickle.dump(tf_vec, vec_file)
        print(f"TfidfVectorizer saved as '{vec_path}'")
    except Exception as e:
        print(f"An error occurred while saving the vectorizer: {e}")

In [2]:
main('/home/dgxuser16/NTL/mccarthy/ahmad/Projects/ML_Course_Proj/data/twitter', 'output_final')

Dataset directory: /home/dgxuser16/NTL/mccarthy/ahmad/Projects/ML_Course_Proj/data/twitter
Output directory: output_final

Training data shape: (100000, 2)

Testing data shape: (359, 2)

Starting TF-IDF vectorization on training data...
Time to transform training data: 1.09s

Starting TF-IDF vectorization on testing data...
Time taken to extract features from test data: 0.01s

Training Random Forest Classifier...
Training completed in 1609.89s

Making predictions on the test set...
Prediction time: 2.01s

Accuracy on test set: 81.62%
AUC-ROC on test set: 0.8820

Classification report for the Random Forest classifier: 

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       177
           1       0.81      0.84      0.82       182

    accuracy                           0.82       359
   macro avg       0.82      0.82      0.82       359
weighted avg       0.82      0.82      0.82       359

Confusion matrix heatmap saved as 'output_fina