In [1]:
import re
import traceback
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, concat_ws, regexp_replace, size, split, when, lit, expr
from pyspark.sql.types import StringType, IntegerType, DoubleType, ArrayType
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StopWordsRemover, HashingTF, IDF, VectorAssembler, NGram, Word2Vec
)

try:
    from spark_rapids_ml.classification import LogisticRegression
    print("Using spark_rapids_ml.classification.LogisticRegression")
except ImportError:
    from pyspark.ml.classification import LogisticRegression
    print("Using pyspark.ml.classification.LogisticRegression (spark_rapids_ml not found)")

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = (SparkSession.builder
    .appName("GPU_Optimized_RAPIDS")
    .master("local[*]")
    .config("spark.driver.memory","24g")
    .config("spark.executor.memory", "24g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.cores", "4")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .config("spark.driver.host", "localhost")
    .config("spark.rapids.sql.explain", "ALL")
    .config("spark.rapids.sql.allowMultipleJars", "ALWAYS") 
    .config("spark.rapids.sql.enabled", "true")
    .config("spark.rapids.memory.hostStorageFraction", "0.8")
    .config("spark.rapids.memory.deviceStorageFraction", "0.8")
    .config("spark.rapids.sql.concurrentGpuTasks", "2")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "8g")
    .config("spark.rapids.memory.gpu.allocation.limit", "0.9")
    .config("spark.rapids.host.shim.async", "true")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.rapids.sql.exec.CollectLimitExec", "true")
    .config("spark.rapids.sql.rowBasedUDF.enabled", "true")
    .config("spark.sql.inMemoryColumnarStorage.batchSerializer", "com.nvidia.spark.rapids.shims.SparkShimServiceProvider")
    .getOrCreate()
)

data_path = "/mnt/c/Users/BerenÜnveren/Desktop/BIL401/data/train.csv"
try:
    print("--- Data Loading ---")
    start_load_time = time.time()

    df = spark.read.format("csv") \
        .option("header", "true") \
        .option("quote", "\"") \
        .option("multiLine", "true") \
        .option("inferSchema", "true") \
        .load(data_path)
    load_time = time.time() - start_load_time
    print(f"Data load time: {load_time:.2f} seconds")
    print("data schema:")
    df.printSchema()
    print("Y column distribution after load (should be HQ, LQ_EDIT, LQ_CLOSE):")
    df.groupBy("Y").count().show()

    print("--- Data Cleaning and Feature Engineering (initial steps) ---")
    start_clean_feature_time = time.time()

    df_clean = df.na.drop(subset=["Title", "Body", "Tags", "Y"]) \
        .withColumn("CleanBody", regexp_replace(col("Body"), "<.*?>", "")) \
        .withColumn("text", concat_ws(" ", col("Title"), col("CleanBody")))

    df_clean = df_clean.withColumn("tags_list",split(expr("TRIM(REPLACE(REPLACE(Tags, '<', ' '), '>', ' '))")," "))

    df_processed = df_clean.withColumn("label",
                                       when(col("Y") == "HQ", 0.0)
                                       .when(col("Y") == "LQ_EDIT", 1.0)
                                       .when(col("Y") == "LQ_CLOSE", 2.0)
                                       .otherwise(lit(None))
                                       .cast(DoubleType())
                                      )
    
    df_processed = df_processed.na.drop(subset=["label"])
    df_processed = df_processed.withColumn("words", split(col("text"), " "))
    df_processed = df_processed.withColumn("title_len", length(col("Title"))) \
        .withColumn("body_len", length(col("CleanBody"))) \
        .withColumn("punct_count", length(regexp_replace(col("text"), "[?!]", ""))) \
        .withColumn("avg_word_len",
                    (length(regexp_replace(col("text"), " ", "")).cast(DoubleType()) /\
                     (size(split(col("text"), " ")) + lit(1e-6)))\
                   )

    clean_feature_time = time.time() - start_clean_feature_time
    print(f"Initial data cleaning and feature engineering time: {clean_feature_time:.2f} seconds")

    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    ngram = NGram(n=2, inputCol="filtered_words", outputCol="bigrams")
    hashing_tf_text = HashingTF(inputCol="filtered_words", outputCol="raw_text_features", numFeatures=20000)
    idf_text = IDF(inputCol="raw_text_features", outputCol="text_features")
    hashing_tf_bigrams = HashingTF(inputCol="bigrams", outputCol="raw_bigrams_features", numFeatures=20000)
    idf_bigrams = IDF(inputCol="raw_bigrams_features", outputCol="bigrams_features")
    w2v = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_words", outputCol="w2v_features")
    hashing_tf_tags = HashingTF(inputCol="tags_list", outputCol="raw_tags_features", numFeatures=5000)
    idf_tags = IDF(inputCol="raw_tags_features", outputCol="tags_tags")
    feature_assembler = VectorAssembler(
        inputCols=["text_features", "bigrams_features", "w2v_features", "tags_tags",
                   "title_len", "body_len", "punct_count", "avg_word_len"],
        outputCol="features"
    )

    (train_data, test_data) = df_processed.randomSplit([0.8, 0.2], seed=42)
    
    train_data.cache()
    test_data.cache()
    print(f"Train data count: {train_data.count()}, Test data count: {test_data.count()}")
    print("\n--- Logistic Regression Training ---")

    lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10) 
    lr_pipeline = Pipeline(stages=[
        stopwords_remover,
        ngram,
        hashing_tf_text, idf_text,
        hashing_tf_bigrams, idf_bigrams,
        w2v,
        hashing_tf_tags, idf_tags,
        feature_assembler,
        lr
    ])

    start_lr_train_time = time.time()
    lr_model = lr_pipeline.fit(train_data)
    lr_train_time = time.time() - start_lr_train_time
    print(f"Logistic Regression training time: {lr_train_time:.2f} seconds")

    start_lr_predict_time = time.time()
    lr_predictions = lr_model.transform(test_data)
    lr_predict_time = time.time() - start_lr_predict_time
    print(f"Logistic Regression prediction time: {lr_predict_time:.2f} seconds")

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    accuracy_lr = evaluator.setMetricName("accuracy").evaluate(lr_predictions)
    f1_score_lr = evaluator.setMetricName("f1").evaluate(lr_predictions)

    print("\nLogistic Regression Results:")
    print(f"Accuracy: {accuracy_lr:.4f}")
    print(f"F1 Score: {f1_score_lr:.4f}")
    print("Confusion Matrix:")
    
    lr_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

except Exception as e:
    print(f"An error occurred: {e}")
    traceback.print_exc()
finally:
    try:
        if 'spark' in locals() and spark is not None:
            spark.stop()
    except Exception as e:
        print(f"Error stopping Spark session: {e}")
        traceback.print_exc()

Using spark_rapids_ml.classification.LogisticRegression


25/08/06 16:48:16 WARN Utils: Your hostname, DESKTOP-15VE119 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/08/06 16:48:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/06 16:48:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/06 16:48:18 WARN RapidsPluginUtils: RAPIDS Accelerator 24.02.0 using cudf 24.02.1.
25/08/06 16:48:18 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.
25/08/06 16:48:18 WARN RapidsPluginUtils: Multiple cudf jars found in the classpath:
revison: dd34fdbe35e68ba56a2183f11ed822ddaa6c927b
	jar URL: jar:file:/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/jars/rapids-4-spark_2.12-24.02.0.jar
	version=24.02.1
	user=
	

--- Data Loading ---


                                                                                

Data load time: 6.79 seconds
data schema:
root
 |-- Id: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- Tags: string (nullable = true)
 |-- CreationDate: string (nullable = true)
 |-- Y: string (nullable = true)

Y column distribution after load (should be HQ, LQ_EDIT, LQ_CLOSE):


25/08/06 16:48:34 WARN GpuOverrides: 
*Exec <CollectLimitExec> will run on GPU
  *Partitioning <SinglePartition$> will run on GPU
  *Exec <HashAggregateExec> will run on GPU
    *Expression <AggregateExpression> count(1) will run on GPU
      *Expression <Count> count(1) will run on GPU
    *Expression <Alias> cast(count(1)#18L as string) AS count#26 will run on GPU
      *Expression <Cast> cast(count(1)#18L as string) will run on GPU
    *Exec <ShuffleExchangeExec> will run on GPU
      *Partitioning <HashPartitioning> will run on GPU
      *Exec <HashAggregateExec> will run on GPU
        *Expression <AggregateExpression> partial_count(1) will run on GPU
          *Expression <Count> count(1) will run on GPU
        *Exec <FileSourceScanExec> will run on GPU

25/08/06 16:48:34 WARN GpuOverrides: 
*Exec <CollectLimitExec> will run on GPU
  *Partitioning <SinglePartition$> will run on GPU
  *Exec <HashAggregateExec> will run on GPU
    *Expression <AggregateExpression> count(1) will ru

+--------+-----+
|       Y|count|
+--------+-----+
|LQ_CLOSE|15000|
|      HQ|15000|
| LQ_EDIT|15000|
+--------+-----+

--- Data Cleaning and Feature Engineering (initial steps) ---
Initial data cleaning and feature engineering time: 0.41 seconds


25/08/06 16:48:37 WARN GpuOverrides: 
*Exec <SampleExec> will run on GPU
  *Exec <SortExec> will run on GPU
    *Expression <SortOrder> Id#0 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> Title#1 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> Body#2 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> Tags#3 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> CreationDate#4 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> Y#5 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> CleanBody#58 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> text#66 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> tags_list#75 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> label#85 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> words#106 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> title_len#118 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> body_len#131 ASC NULLS FIRST wi

Train data count: 35997, Test data count: 9003

--- Logistic Regression Training ---


25/08/06 16:48:48 WARN GpuOverrides: 
! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec
  ! <CreateExternalRow> createexternalrow(newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(raw_text_features,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow
    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke
      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInsta

Logistic Regression training time: 179.09 seconds


25/08/06 16:51:48 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [w2v_features#3700, tags_tags#3755, text_features#3623, features#3787, raw_tags_features#3728, bigrams_features#3672, raw_bigrams_features#3648, raw_text_features#3601]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [w2v_features#3700, tags_tags#3755, text_features#3623, features#3787, raw_tags_features#3728, bigrams_features#3672, raw_bigrams_features#3648, raw_text_features#3601]
  @Partitioning <SinglePartition$> could run on GPU
  !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [w2v_features#3700, tags_tags#3755, text_features#3623, features#3787, raw_tags_features#3728, bigrams_features#3672, raw_bigrams_features#3648, raw_text_features#3601]; unsupported data types in 

Logistic Regression prediction time: 2.55 seconds


25/08/06 16:51:49 WARN GpuOverrides: 
! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec
  ! <CreateExternalRow> createexternalrow(staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, prediction#4264, true, false, true), staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, label#85, true, false, true), staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, 1.0#4434, true, false, true), newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(label,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objec


Logistic Regression Results:
Accuracy: 0.7077
F1 Score: 0.7072
Confusion Matrix:


25/08/06 16:52:04 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/08/06 16:52:08 WARN GpuOverrides:                                (0 + 1) / 1]
*Exec <ProjectExec> will run on GPU
  *Expression <Alias> cast(label#85 as string) AS label#5085 will run on GPU
    *Expression <Cast> cast(label#85 as string) will run on GPU
  *Expression <Alias> cast(prediction#4264 as string) AS prediction#5086 will run on GPU
    *Expression <Cast> cast(prediction#4264 as string) will run on GPU
  *Expression <Alias> cast(count#5078L as string) AS count#5087 will run on GPU
    *Expression <Cast> cast(count#5078L as string) will run on GPU
  *Exec <SortExec> will run on GPU
    *Expression <SortOrder> label#85 ASC NULLS FIRST will run on GPU
    *Expression <SortOrder> prediction#4264 ASC NULLS FIRST will run on GPU
    *Exec <ShuffleExchangeExec> will run on GPU
      *Partitioning <RangePartitioning> will run on GPU
        *Expression <SortOrder> label#85 ASC NULLS FIRST will run

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0| 2469|
|  0.0|       1.0|  185|
|  0.0|       2.0|  392|
|  1.0|       0.0|  253|
|  1.0|       1.0| 1971|
|  1.0|       2.0|  756|
|  2.0|       0.0|  442|
|  2.0|       1.0|  604|
|  2.0|       2.0| 1931|
+-----+----------+-----+

