In [1]:
import re
import traceback
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, length, concat_ws, regexp_replace, size, split
from pyspark.sql.types import StringType, IntegerType, DoubleType, ArrayType
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, 
    VectorAssembler, NGram, Word2Vec
)
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def clean_tags(tags):
    if tags is None: return []
    return tags.replace('<', ' ').replace('>', ' ').strip().split()

clean_tags_udf = udf(clean_tags, ArrayType(StringType()))
spark = (SparkSession.builder
    .appName("GPU_IMP_RECOMMENDED")
    .master("local[*]")
    .config("spark.driver.memory","12g")
    .config("spark.executor.memory", "12g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.cores", "4")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .config("spark.driver.host", "localhost")
    .config("spark.rapids.sql.explain", "DEBUG")
    .config("spark.rapids.sql.allowMultipleJars", "ALWAYS")
    .config("spark.rapids.sql.enabled", "true")
    .config("spark.rapids.memory.hostStorageFraction", "0.8")
    .config("spark.rapids.memory.deviceStorageFraction", "0.8")
    .config("spark.rapids.sql.concurrentGpuTasks", "2") 
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "4g")
    .config("spark.rapids.memory.gpu.allocation.limit", "0.9")
    .config("spark.rapids.host.shim.async", "true")

    .getOrCreate()
)
        
data_path = "/mnt/c/Users/BerenÜnveren/Desktop/BIL401/data/train.csv"

try:
    print("--- Data Loading ---")
    start_load_time = time.time()
    df = spark.read.format("csv") \
        .option("header", "true") \
        .option("quote", "\"") \
        .option("escape", "\"") \
        .option("multiLine", "true") \
        .option("inferSchema", "true") \
        .load(data_path)
    load_time = time.time() - start_load_time
    print(f"Data load time: {load_time:.2f} seconds")
    
    print("data schema:")
    df.printSchema()
    print("Y column distribution after load (should be HQ, LQ_EDIT, LQ_CLOSE):")
    df.groupBy("Y").count().show()

    print("--- Data Cleaning and Feature Engineering (initial steps) ---")
    start_clean_feature_time = time.time()
    df_clean = df.na.drop(subset=["Title", "Body", "Tags", "Y"]) \
        .withColumn("CleanBody", regexp_replace(col("Body"), "<.*?>", "")) \
        .withColumn("text", concat_ws(" ", col("Title"), col("CleanBody"))) \
        .withColumn("tags_list", clean_tags_udf(col("Tags")))
    
    df_featured = df_clean.withColumn("title_len", length(col("Title"))) \
        .withColumn("body_len", length(col("CleanBody"))) \
        .withColumn("punct_count", length(col("text")) - length(regexp_replace(col("text"), "[?!]", ""))) \
        .withColumn("avg_word_len", length(regexp_replace(col("text"), " ", "")) / (size(split(col("text"), " ")) + 1e-6))
    clean_feature_time = time.time() - start_clean_feature_time
    print(f"Initial data cleaning and feature engineering time: {clean_feature_time:.2f} seconds")

    label_indexer = StringIndexer(inputCol="Y", outputCol="label", handleInvalid="skip")
    
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    ngram = NGram(n=2, inputCol="filtered_words", outputCol="bigrams")
    
    hashing_tf_text = HashingTF(inputCol="filtered_words", outputCol="raw_text_features", numFeatures=20000)
    idf_text = IDF(inputCol="raw_text_features", outputCol="text_features")
    
    hashing_tf_bigrams = HashingTF(inputCol="bigrams", outputCol="raw_bigrams_features", numFeatures=20000)
    idf_bigrams = IDF(inputCol="raw_bigrams_features", outputCol="bigrams_features")
    
    w2v = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_words", outputCol="w2v_features")
    
    hashing_tf_tags = HashingTF(inputCol="tags_list", outputCol="raw_tags_features", numFeatures=5000)
    idf_tags = IDF(inputCol="raw_tags_features", outputCol="tags_features")

    feature_assembler = VectorAssembler(
        inputCols=["text_features", "bigrams_features", "w2v_features", "tags_features", 
                   "title_len", "body_len", "punct_count", "avg_word_len"],
        outputCol="features"
    )

    (train_data, test_data) = df_featured.randomSplit([0.8, 0.2], seed=42)
    train_data.cache(); test_data.cache()
    print(f"Train data count: {train_data.count()}, Test data count: {test_data.count()}")

    print("\n--- Logistic Regression Training ---")
    lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
    lr_pipeline = Pipeline(stages=[
        label_indexer, tokenizer, stopwords_remover, ngram, 
        hashing_tf_text, idf_text, hashing_tf_bigrams, idf_bigrams, w2v, hashing_tf_tags, idf_tags, 
        feature_assembler, lr
    ])
    
    start_lr_train_time = time.time()
    lr_model = lr_pipeline.fit(train_data)
    lr_train_time = time.time() - start_lr_train_time
    print(f"Logistic Regression training time: {lr_train_time:.2f} seconds")

    start_lr_predict_time = time.time()
    lr_predictions = lr_model.transform(test_data)
    lr_predict_time = time.time() - start_lr_predict_time
    print(f"Logistic Regression prediction time: {lr_predict_time:.2f} seconds")

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    accuracy_lr = evaluator.setMetricName("accuracy").evaluate(lr_predictions)
    f1_score_lr = evaluator.setMetricName("f1").evaluate(lr_predictions)
    
    print("\nLogistic Regression Results:")
    print(f"Accuracy: {accuracy_lr:.4f}")
    print(f"F1 Score: {f1_score_lr:.4f}")
    print("Confusion Matrix:")
    lr_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

    print("\n--- Random Forest Training with Cross-Validation ---")
    rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)
    rf_pipeline = Pipeline(stages=[
        label_indexer, tokenizer, stopwords_remover, ngram, 
        hashing_tf_text, idf_text, hashing_tf_bigrams, idf_bigrams, w2v, hashing_tf_tags, idf_tags, 
        feature_assembler, rf
    ])

    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [50, 100]) \
        .addGrid(rf.maxDepth, [5, 10]) \
        .build()

    crossval = CrossValidator(estimator=rf_pipeline, 
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
                              numFolds=3) 
    
    start_rf_train_time = time.time()
    cv_model = crossval.fit(train_data)
    rf_train_time = time.time() - start_rf_train_time
    print(f"Random Forest training (with CV) time: {rf_train_time:.2f} seconds")
    
    best_rf_model = cv_model.bestModel
    start_rf_predict_time = time.time()
    rf_predictions = best_rf_model.transform(test_data)
    rf_predict_time = time.time() - start_rf_predict_time
    print(f"Random Forest prediction time: {rf_predict_time:.2f} seconds")

    accuracy_rf = evaluator.setMetricName("accuracy").evaluate(rf_predictions)
    f1_score_rf = evaluator.setMetricName("f1").evaluate(rf_predictions)
    
    print("\nRandom Forest Results:")
    print(f"Accuracy: {accuracy_rf:.4f}")
    print(f"F1 Score: {f1_score_rf:.4f}")

    best_params_stage = best_rf_model.stages[-1]
    print(f"Best parameters: numTrees={best_params_stage.getNumTrees()}, maxDepth={best_params_stage.getMaxDepth()}")
    
    print("Confusion Matrix:")
    rf_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

except Exception as e:
    print(f"An error occurred: {e}")
    traceback.print_exc()
finally:
    spark.stop()

25/07/22 17:53:06 WARN Utils: Your hostname, DESKTOP-15VE119 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/07/22 17:53:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/22 17:53:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/22 17:53:08 WARN RapidsPluginUtils: RAPIDS Accelerator 24.02.0 using cudf 24.02.1.
25/07/22 17:53:08 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.
25/07/22 17:53:08 WARN RapidsPluginUtils: Multiple cudf jars found in the classpath:
revison: dd34fdbe35e68ba56a2183f11ed822ddaa6c927b
	jar URL: jar:file:/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/jars/rapids-4-spark_2.12-24.02.0.jar
	version=24.02.1
	user=
	

--- Data Loading ---


                                                                                

Data load time: 4.93 seconds
data schema:
root
 |-- Id: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- Tags: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- Y: string (nullable = true)

Y column distribution after load (should be HQ, LQ_EDIT, LQ_CLOSE):


25/07/22 17:53:28 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  !Exec <HashAggregateExec> cannot run on GPU because not all expressions can be replaced
    @Expression <AttributeReference> Y#5 could run on GPU
    @Expression <AggregateExpression> count(1) could run on GPU
      @Expression <Count> count(1) could run on GPU
        @Expression <Literal> 1 could run on GPU
    @Expression <AttributeReference> count(1)#18L could run on GPU
    @Expression <Alias> toprettystring(Y#5, Some(Europe/Istanbul)) AS toprettystring(Y)#25 could run on GPU
      !Expression <ToPrettyString> toprettystring(Y#5, 

+--------+-----+
|       Y|count|
+--------+-----+
|LQ_CLOSE|15000|
|      HQ|15000|
| LQ_EDIT|15000|
+--------+-----+

--- Data Cleaning and Feature Engineering (initial steps) ---
Initial data cleaning and feature engineering time: 0.18 seconds


25/07/22 17:53:32 WARN GpuOverrides: 
        ! <BatchEvalPythonExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.python.BatchEvalPythonExec
          @Expression <PythonUDF> clean_tags(Tags#3)#66 could not block GPU acceleration
            @Expression <AttributeReference> Tags#3 could run on GPU
          @Expression <AttributeReference> pythonUDF0#127 could run on GPU
              !Exec <FileSourceScanExec> cannot run on GPU because GpuCSVScan does not support modified escape chars; Not supported timezone type Europe/Istanbul.

25/07/22 17:53:32 WARN GpuOverrides: 
        ! <BatchEvalPythonExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.python.BatchEvalPythonExec
          @Expression <PythonUDF> clean_tags(Tags#3)#66 could not block GPU acceleration
            @Expression <AttributeReference> Tags#3 could run on GPU
          @Expression <AttributeRefer

Train data count: 35997, Test data count: 9003

--- Logistic Regression Training ---


25/07/22 17:53:43 WARN GpuOverrides: 
!Exec <ObjectHashAggregateExec> cannot run on GPU because not all expressions can be replaced
  @Expression <AggregateExpression> stringindexeraggregator(org.apache.spark.ml.feature.StringIndexerAggregator@6df56cf2, Some(createexternalrow(Y#5.toString, StructField(Y,StringType,true))), Some(interface org.apache.spark.sql.Row), Some(StructType(StructField(Y,StringType,true))), encodeusingserializer(input[0, java.lang.Object, true], true), decodeusingserializer(input[0, binary, true], Array[org.apache.spark.util.collection.OpenHashMap], true), encodeusingserializer(input[0, java.lang.Object, true], true), BinaryType, true, 0, 0) could run on GPU
    ! <ComplexTypedAggregateExpression> StringIndexerAggregator(org.apache.spark.sql.Row) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
      ! <CreateExternalRow> createexternalrow(Y#5.toString, StructField

Logistic Regression training time: 125.47 seconds
Logistic Regression prediction time: 0.40 seconds


25/07/22 17:55:48 WARN GpuOverrides: 
! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec
  ! <CreateExternalRow> createexternalrow(staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, prediction#4065, true, false, true), staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, label#3698, true, false, true), staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, 1.0#4126, true, false, true), newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,false), StructField(label,DoubleType,false), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.o


Logistic Regression Results:
Accuracy: 0.6898
F1 Score: 0.6891
Confusion Matrix:


25/07/22 17:55:57 WARN GpuOverrides:                                (0 + 1) / 1]
!Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced
  @Expression <Alias> toprettystring(label#3698, Some(Europe/Istanbul)) AS toprettystring(label)#4695 could run on GPU
    !Expression <ToPrettyString> toprettystring(label#3698, Some(Europe/Istanbul)) cannot run on GPU because class org.apache.spark.sql.catalyst.expressions.ToPrettyString is not supported with timezone settings: (JVM: Europe/Istanbul, session: Europe/Istanbul). Set both of the timezones to UTC to enable class org.apache.spark.sql.catalyst.expressions.ToPrettyString support
      @Expression <AttributeReference> label#3698 could run on GPU
  @Expression <Alias> toprettystring(prediction#4065, Some(Europe/Istanbul)) AS toprettystring(prediction)#4696 could run on GPU
    !Expression <ToPrettyString> toprettystring(prediction#4065, Some(Europe/Istanbul)) cannot run on GPU because class org.apache.spark.sql.cata

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0| 1902|
|  0.0|       1.0|  668|
|  0.0|       2.0|  407|
|  1.0|       0.0|  858|
|  1.0|       1.0| 1843|
|  1.0|       2.0|  279|
|  2.0|       0.0|  371|
|  2.0|       1.0|  210|
|  2.0|       2.0| 2465|
+-----+----------+-----+


--- Random Forest Training with Cross-Validation ---


25/07/22 17:55:57 WARN GpuOverrides: 
    !Exec <InMemoryTableScanExec> cannot run on GPU because ParquetCachedBatchSerializer is not being used
      @Expression <AttributeReference> Body#2 could run on GPU
      @Expression <AttributeReference> CleanBody#49 could run on GPU
      @Expression <AttributeReference> CreationDate#4 could run on GPU
      @Expression <AttributeReference> Id#0 could run on GPU
      @Expression <AttributeReference> Tags#3 could run on GPU
      @Expression <AttributeReference> Title#1 could run on GPU
      @Expression <AttributeReference> Y#5 could run on GPU
      @Expression <AttributeReference> avg_word_len#113 could run on GPU
      @Expression <AttributeReference> body_len#88 could run on GPU
      @Expression <AttributeReference> punct_count#100 could run on GPU
      @Expression <AttributeReference> tags_list#67 could run on GPU
      @Expression <AttributeReference> text#57 could run on GPU
      @Expression <AttributeReference> title_len#77 could 

An error occurred: An error occurred while calling o880.evaluate


Traceback (most recent call last):
  File "/tmp/ipykernel_9317/3562095174.py", line 153, in <module>
    cv_model = crossval.fit(train_data)
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/ml/base.py", line 205, in fit
    return self._fit(dataset)
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/ml/tuning.py", line 847, in _fit
    for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/multiprocessing/pool.py", line 873, in next
    raise value
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/ml/tuning.py", line 847, in <lambda>
    for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
  File "/home/bunveren/miniconda3

ConnectionRefusedError: [Errno 111] Connection refused