In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
import cupy as cp
from sklearnex import patch_sklearn

In [1]:
def mainBareBones():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000) 
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # AdaBoost Model
    print("\nInitializing AdaBoost Classifier...")
    model = AdaBoostClassifier(n_estimators=50, random_state=42)

    # Train the model
    print("\nTraining the AdaBoost model...")
    model.fit(X_train, y_train)
    print("Model training completed!")

    # Predictions on test data
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(X_test)

    # Evaluation
    print("\nEvaluating the model...")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainBareBones()



Initializing AdaBoost Classifier...

Training the AdaBoost model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
              precision    recall  f1-score   support

  hatespeech       0.85      0.39      0.54      1325
      normal       0.44      0.96      0.61      1502
   offensive       0.72      0.09      0.16      1195

    accuracy                           0.52      4022
   macro avg       0.67      0.48      0.43      4022
weighted avg       0.66      0.52      0.45      4022



In [2]:
def mainCuPY():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    tfidf = TfidfVectorizer(max_features=5000) 
    X = tfidf.fit_transform(data['comment'])

    # Move TF-IDF matrix to CuPy
    X = cp.array(X.toarray())

    # Target variable
    y = cp.array(data['label'])  # Move labels to GPU

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

    print("\nInitializing AdaBoost Classifier...")
    model = AdaBoostClassifier(n_estimators=50, random_state=42)

    # Train the model 
    print("\nTraining the AdaBoost model...")
    model.fit(cp.asnumpy(X_train), cp.asnumpy(y_train))  # Convert CuPy arrays
    print("Model training completed!")

    # Predictions on test data
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(cp.asnumpy(X_test))

    # Evaluation
    print("\nEvaluating the model...")
    print(classification_report(cp.asnumpy(y_test), y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainCuPY()


Training set shape: (16087, 5000), Test set shape: (4022, 5000)

Initializing AdaBoost Classifier...

Training the AdaBoost model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
              precision    recall  f1-score   support

  hatespeech       0.85      0.39      0.54      1325
      normal       0.44      0.96      0.61      1502
   offensive       0.72      0.09      0.16      1195

    accuracy                           0.52      4022
   macro avg       0.67      0.48      0.43      4022
weighted avg       0.66      0.52      0.45      4022



In [None]:
# Apply Scikit-learn patches
patch_sklearn()

In [3]:
def mainIntelex():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',  
        'label': 'category', 
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000) 
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # AdaBoost Model
    print("\nInitializing AdaBoost Classifier...")
    model = AdaBoostClassifier(n_estimators=50, random_state=42)

    # Train the model
    print("\nTraining the AdaBoost model...")
    model.fit(X_train, y_train)
    print("Model training completed!")

    # Predictions on test data
    print("\nMaking predictions on the test set...")
    y_pred = model.predict(X_test)

    # Evaluation
    print("\nEvaluating the model...")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

if __name__ == "__main__":
    mainIntelex()



Initializing AdaBoost Classifier...

Training the AdaBoost model...
Model training completed!

Making predictions on the test set...

Evaluating the model...
              precision    recall  f1-score   support

  hatespeech       0.85      0.39      0.54      1325
      normal       0.44      0.96      0.61      1502
   offensive       0.72      0.09      0.16      1195

    accuracy                           0.52      4022
   macro avg       0.67      0.48      0.43      4022
weighted avg       0.66      0.52      0.45      4022

