In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from cupyx.scipy.sparse import csr_matrix
import warnings
import cupy as cp
from sklearnex import patch_sklearn

In [20]:
#Suppress FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning)


def mainBareBones():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',
        'label': 'category',
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # LightGBM Classifier
    model = LGBMClassifier(
        objective='multiclass',
        num_class=len(label_encoder.classes_),
        random_state=42,
        force_col_wise=True
    )
    model.fit(X_train, y_train)

    # Predictions on test data
    y_pred = model.predict(X_test)

    # Evaluation
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


if __name__ == "__main__":
    mainBareBones()


[LightGBM] [Info] Total Bins 64010
[LightGBM] [Info] Number of data points in the train set: 16087, number of used features: 1775
[LightGBM] [Info] Start training from score -1.137269
[LightGBM] [Info] Start training from score -0.934925
[LightGBM] [Info] Start training from score -1.249350
              precision    recall  f1-score   support

  hatespeech       0.75      0.69      0.72      1325
      normal       0.62      0.80      0.70      1502
   offensive       0.58      0.42      0.49      1195

    accuracy                           0.65      4022
   macro avg       0.65      0.63      0.63      4022
weighted avg       0.65      0.65      0.64      4022



In [21]:
def optimize_datatypes(data):
    for col in data.select_dtypes(include=['int']).columns:
        data[col] = cp.asnumpy(cp.array(data[col], dtype=cp.int32))
    for col in data.select_dtypes(include=['float']).columns:
        data[col] = cp.asnumpy(cp.array(data[col], dtype=cp.float32))
    return data


def mainCuPY():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',
        'label': 'category',
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # Optimize datatypes
    data = optimize_datatypes(data)

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000)
    X_csr = tfidf.fit_transform(data['comment'])
    X = csr_matrix(X_csr)

    # Target variable
    y = cp.array(data['label'])

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # LightGBM Classifier
    model = LGBMClassifier(
        objective='multiclass',
        num_class=len(label_encoder.classes_),
        random_state=42,
        force_col_wise=True
    )
    model.fit(cp.asnumpy(X_train.toarray()), cp.asnumpy(y_train))

    # Predictions on test data
    y_pred = model.predict(cp.asnumpy(X_test.toarray()))

    # Evaluation
    print(classification_report(cp.asnumpy(y_test), y_pred, target_names=label_encoder.classes_))


if __name__ == "__main__":
    mainCuPY()

[LightGBM] [Info] Total Bins 64010
[LightGBM] [Info] Number of data points in the train set: 16087, number of used features: 1775
[LightGBM] [Info] Start training from score -1.137269
[LightGBM] [Info] Start training from score -0.934925
[LightGBM] [Info] Start training from score -1.249350
              precision    recall  f1-score   support

  hatespeech       0.75      0.69      0.72      1325
      normal       0.62      0.80      0.70      1502
   offensive       0.58      0.42      0.49      1195

    accuracy                           0.65      4022
   macro avg       0.65      0.63      0.63      4022
weighted avg       0.65      0.65      0.64      4022



In [16]:
patch_sklearn() # Apply Intel optimizations

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [17]:
def mainIntelex():
    # Load the dataset
    data = pd.read_csv('final_hateXplain.csv', dtype={
        'comment': 'string',
        'label': 'category',
        'Race': 'category',
        'Religion': 'category',
        'Gender': 'category',
        'Sexual Orientation': 'category',
        'Miscellaneous': 'category'
    })

    # Preprocessing
    # Fill missing values in the 'comment' column
    data['comment'] = data['comment'].fillna('')

    # Encode the 'label' column (e.g., normal, offensive, hatespeech)
    label_encoder = LabelEncoder()
    data['label'] = label_encoder.fit_transform(data['label'])

    # TF-IDF Vectorization for the 'comment' column
    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(data['comment'])

    # Target variable
    y = data['label']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # LightGBM Classifier
    model = LGBMClassifier(
        objective='multiclass',
        num_class=len(label_encoder.classes_),
        random_state=42,
        force_col_wise=True
    )
    model.fit(X_train, y_train)

    # Predictions on test data
    y_pred = model.predict(X_test)

    # Evaluation
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


if __name__ == "__main__":
    mainIntelex()

[LightGBM] [Info] Total Bins 64010
[LightGBM] [Info] Number of data points in the train set: 16087, number of used features: 1775
[LightGBM] [Info] Start training from score -1.137269
[LightGBM] [Info] Start training from score -0.934925
[LightGBM] [Info] Start training from score -1.249350
              precision    recall  f1-score   support

  hatespeech       0.75      0.69      0.72      1325
      normal       0.62      0.80      0.70      1502
   offensive       0.58      0.42      0.49      1195

    accuracy                           0.65      4022
   macro avg       0.65      0.63      0.63      4022
weighted avg       0.65      0.65      0.64      4022

