In [1]:
import re
import traceback
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, length, concat_ws, regexp_replace, size, split
from pyspark.sql.types import StringType, IntegerType, DoubleType, ArrayType
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer,
    VectorAssembler, NGram, Word2Vec
)
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def clean_tags(tags):
    if tags is None: return []
    return tags.replace('<', ' ').replace('>', ' ').strip().split()

clean_tags_udf = udf(clean_tags, ArrayType(StringType()))

spark = (SparkSession.builder
    .appName("GPU_IMP_RECOMMENDED")
    .master("local[*]")
    .config("spark.driver.memory","12g")
    .config("spark.executor.memory", "12g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.cores", "4")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    .config("spark.driver.host", "localhost")
    .config("spark.rapids.sql.explain", "DEBUG")
    .config("spark.rapids.sql.allowMultipleJars", "ALWAYS")
    .config("spark.rapids.sql.enabled", "true")
    .config("spark.rapids.memory.hostStorageFraction", "0.8")
    .config("spark.rapids.memory.deviceStorageFraction", "0.8")
    .config("spark.rapids.sql.concurrentGpuTasks", "2")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "4g")
    .config("spark.rapids.memory.gpu.allocation.limit", "0.9")
    .config("spark.rapids.host.shim.async", "true")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.rapids.sql.exec.CollectLimitExec", "true") 
    # .config("spark.rapids.sql.csv.read.escape.enabled", "true") # Bu, özel kaçış karakterleri için GPU'yu zorlayabilir, ancak test edilmesi gerekir.
    .getOrCreate()
)

data_path = "/mnt/c/Users/BerenÜnveren/Desktop/BIL401/data/train.csv"

try:
    print("--- Data Loading ---")
    start_load_time = time.time()
    df = spark.read.format("csv") \
        .option("header", "true") \
        .option("quote", "\"") \
        .option("escape", "\"") \
        .option("multiLine", "true") \
        .option("inferSchema", "true") \
        .load(data_path)
    load_time = time.time() - start_load_time
    print(f"Data load time: {load_time:.2f} seconds")

    print("data schema:")
    df.printSchema()
    print("Y column distribution after load (should be HQ, LQ_EDIT, LQ_CLOSE):")
    df.groupBy("Y").count().show()

    print("--- Data Cleaning and Feature Engineering (initial steps) ---")
    start_clean_feature_time = time.time()
    df_clean = df.na.drop(subset=["Title", "Body", "Tags", "Y"]) \
        .withColumn("CleanBody", regexp_replace(col("Body"), "<.*?>", "")) \
        .withColumn("text", concat_ws(" ", col("Title"), col("CleanBody"))) \
        .withColumn("tags_list", clean_tags_udf(col("Tags"))) # Bu UDF hala CPU'da çalışacak

    df_featured = df_clean.withColumn("title_len", length(col("Title"))) \
        .withColumn("body_len", length(col("CleanBody"))) \
        .withColumn("punct_count", length(col("text")) - length(regexp_replace(col("text"), "[?!]", ""))) \
        .withColumn("avg_word_len", length(regexp_replace(col("text"), " ", "")) / (size(split(col("text"), " ")) + 1e-6))
    clean_feature_time = time.time() - start_clean_feature_time
    print(f"Initial data cleaning and feature engineering time: {clean_feature_time:.2f} seconds")

    # StringIndexer, Tokenizer, StopWordsRemover, HashingTF, IDF, NGram, Word2Vec, VectorAssembler
    # gibi MLlib özellik transformatörlerinin çoğu, RAPIDS tarafından tam GPU hızlandırmasına sahip değildir
    # veya VectorUDT gibi desteklenmeyen tipler nedeniyle CPU'ya geri döner.
    # Bu adımlar GPU'da çalışmadığı için bellek sorunlarına neden olabilir.

    label_indexer = StringIndexer(inputCol="Y", outputCol="label", handleInvalid="skip")
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    ngram = NGram(n=2, inputCol="filtered_words", outputCol="bigrams")

    hashing_tf_text = HashingTF(inputCol="filtered_words", outputCol="raw_text_features", numFeatures=20000)
    idf_text = IDF(inputCol="raw_text_features", outputCol="text_features")

    hashing_tf_bigrams = HashingTF(inputCol="bigrams", outputCol="raw_bigrams_features", numFeatures=20000)
    idf_bigrams = IDF(inputCol="raw_bigrams_features", outputCol="bigrams_features")

    w2v = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_words", outputCol="w2v_features")

    hashing_tf_tags = HashingTF(inputCol="tags_list", outputCol="raw_tags_features", numFeatures=5000)
    idf_tags = IDF(inputCol="raw_tags_features", outputCol="tags_tags")

    feature_assembler = VectorAssembler(
        inputCols=["text_features", "bigrams_features", "w2v_features", "tags_tags",
                   "title_len", "body_len", "punct_count", "avg_word_len"],
        outputCol="features"
    )

    (train_data, test_data) = df_featured.randomSplit([0.8, 0.2], seed=42)
    train_data.cache(); test_data.cache()
    print(f"Train data count: {train_data.count()}, Test data count: {test_data.count()}")

    print("\n--- Logistic Regression Training ---")
    lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
    lr_pipeline = Pipeline(stages=[
        label_indexer, tokenizer, stopwords_remover, ngram,
        hashing_tf_text, idf_text, hashing_tf_bigrams, idf_bigrams, w2v, hashing_tf_tags, idf_tags,
        feature_assembler, lr
    ])

    start_lr_train_time = time.time()
    lr_model = lr_pipeline.fit(train_data)
    lr_train_time = time.time() - start_lr_train_time
    print(f"Logistic Regression training time: {lr_train_time:.2f} seconds")

    start_lr_predict_time = time.time()
    lr_predictions = lr_model.transform(test_data)
    lr_predict_time = time.time() - start_lr_predict_time
    print(f"Logistic Regression prediction time: {lr_predict_time:.2f} seconds")

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    accuracy_lr = evaluator.setMetricName("accuracy").evaluate(lr_predictions)
    f1_score_lr = evaluator.setMetricName("f1").evaluate(lr_predictions)

    print("\nLogistic Regression Results:")
    print(f"Accuracy: {accuracy_lr:.4f}")
    print(f"F1 Score: {f1_score_lr:.4f}")
    print("Confusion Matrix:")
    lr_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

    print("\n--- Random Forest Training with Cross-Validation ---")
    # çoklu-GPU Dask üzerinden desteklenir. CPU geri dönüşleri hala bir darboğazdır.
    rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)
    rf_pipeline = Pipeline(stages=[
        label_indexer, tokenizer, stopwords_remover, ngram,
        hashing_tf_text, idf_text, hashing_tf_bigrams, idf_bigrams, w2v, hashing_tf_tags, idf_tags,
        feature_assembler, rf
    ])

    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [50, 100]) \
        .addGrid(rf.maxDepth, [5, 10]) \
        .build()

    crossval = CrossValidator(estimator=rf_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1"),
                              numFolds=3)

    start_rf_train_time = time.time()
    cv_model = crossval.fit(train_data)
    rf_train_time = time.time() - start_rf_train_time
    print(f"Random Forest training (with CV) time: {rf_train_time:.2f} seconds")

    best_rf_model = cv_model.bestModel
    start_rf_predict_time = time.time()
    rf_predictions = best_rf_model.transform(test_data)
    rf_predict_time = time.time() - start_rf_predict_time
    print(f"Random Forest prediction time: {rf_predict_time:.2f} seconds")

    accuracy_rf = evaluator.setMetricName("accuracy").evaluate(rf_predictions)
    f1_score_rf = evaluator.setMetricName("f1").evaluate(rf_predictions)

    print("\nRandom Forest Results:")
    print(f"Accuracy: {accuracy_rf:.4f}")
    print(f"F1 Score: {f1_score_rf:.4f}")

    best_params_stage = best_rf_model.stages[-1]
    print(f"Best parameters: numTrees={best_params_stage.getNumTrees()}, maxDepth={best_params_stage.getMaxDepth()}")

    print("Confusion Matrix:")
    rf_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

except Exception as e:
    print(f"An error occurred: {e}")
    traceback.print_exc()
finally:
    try:
        spark.stop()
    except Exception as e:
        print(f"Error stopping Spark session: {e}")
        traceback.print_exc()

25/07/22 21:44:40 WARN Utils: Your hostname, DESKTOP-15VE119 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/07/22 21:44:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/22 21:44:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/22 21:44:42 WARN RapidsPluginUtils: RAPIDS Accelerator 24.02.0 using cudf 24.02.1.
25/07/22 21:44:42 WARN RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.
25/07/22 21:44:42 WARN RapidsPluginUtils: Multiple cudf jars found in the classpath:
revison: dd34fdbe35e68ba56a2183f11ed822ddaa6c927b
	jar URL: jar:file:/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/jars/rapids-4-spark_2.12-24.02.0.jar
	version=24.02.1
	user=
	

--- Data Loading ---


25/07/22 21:44:56 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

Data load time: 14.26 seconds
data schema:
root
 |-- Id: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- Tags: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- Y: string (nullable = true)

Y column distribution after load (should be HQ, LQ_EDIT, LQ_CLOSE):


25/07/22 21:45:10 WARN GpuOverrides: 
        !Exec <FileSourceScanExec> cannot run on GPU because GpuCSVScan does not support modified escape chars

25/07/22 21:45:10 WARN GpuOverrides: 
        !Exec <FileSourceScanExec> cannot run on GPU because GpuCSVScan does not support modified escape chars

25/07/22 21:45:10 WARN GpuOverrides: 
        !Exec <FileSourceScanExec> cannot run on GPU because GpuCSVScan does not support modified escape chars

25/07/22 21:45:10 WARN GpuOverrides: 
    !Exec <FileSourceScanExec> cannot run on GPU because GpuCSVScan does not support modified escape chars

                                                                                

+--------+-----+
|       Y|count|
+--------+-----+
|LQ_CLOSE|15000|
|      HQ|15000|
| LQ_EDIT|15000|
+--------+-----+

--- Data Cleaning and Feature Engineering (initial steps) ---
Initial data cleaning and feature engineering time: 0.16 seconds


25/07/22 21:45:14 WARN GpuOverrides: 
        ! <BatchEvalPythonExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.python.BatchEvalPythonExec
          @Expression <PythonUDF> clean_tags(Tags#3)#76 could not block GPU acceleration
            @Expression <AttributeReference> Tags#3 could run on GPU
          @Expression <AttributeReference> pythonUDF0#137 could run on GPU
              !Exec <FileSourceScanExec> cannot run on GPU because GpuCSVScan does not support modified escape chars

25/07/22 21:45:14 WARN GpuOverrides: 
        ! <BatchEvalPythonExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.python.BatchEvalPythonExec
          @Expression <PythonUDF> clean_tags(Tags#3)#76 could not block GPU acceleration
            @Expression <AttributeReference> Tags#3 could run on GPU
          @Expression <AttributeReference> pythonUDF0#203 could run on GPU
        

Train data count: 35997, Test data count: 9003

--- Logistic Regression Training ---


25/07/22 21:45:25 WARN GpuOverrides: 
!Exec <ObjectHashAggregateExec> cannot run on GPU because not all expressions can be replaced
  @Expression <AggregateExpression> stringindexeraggregator(org.apache.spark.ml.feature.StringIndexerAggregator@3be02a9f, Some(createexternalrow(Y#5.toString, StructField(Y,StringType,true))), Some(interface org.apache.spark.sql.Row), Some(StructType(StructField(Y,StringType,true))), encodeusingserializer(input[0, java.lang.Object, true], true), decodeusingserializer(input[0, binary, true], Array[org.apache.spark.util.collection.OpenHashMap], true), encodeusingserializer(input[0, java.lang.Object, true], true), BinaryType, true, 0, 0) could run on GPU
    ! <ComplexTypedAggregateExpression> StringIndexerAggregator(org.apache.spark.sql.Row) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
      ! <CreateExternalRow> createexternalrow(Y#5.toString, StructField

Logistic Regression training time: 124.93 seconds
Logistic Regression prediction time: 0.55 seconds


25/07/22 21:47:31 WARN GpuOverrides: 
! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec
  ! <CreateExternalRow> createexternalrow(staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, prediction#4075, true, false, true), staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, label#3708, true, false, true), staticinvoke(class java.lang.Double, ObjectType(class java.lang.Double), valueOf, 1.0#4136, true, false, true), newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,false), StructField(label,DoubleType,false), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.o


Logistic Regression Results:
Accuracy: 0.6915
F1 Score: 0.6910
Confusion Matrix:


25/07/22 21:47:40 WARN DAGScheduler: Broadcasting large task binary with size 4.6 MiB
                                                                                

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0| 1921|
|  0.0|       1.0|  660|
|  0.0|       2.0|  396|
|  1.0|       0.0|  870|
|  1.0|       1.0| 1838|
|  1.0|       2.0|  272|
|  2.0|       0.0|  371|
|  2.0|       1.0|  208|
|  2.0|       2.0| 2467|
+-----+----------+-----+


--- Random Forest Training with Cross-Validation ---


25/07/22 21:47:40 WARN GpuOverrides: 
    !Exec <InMemoryTableScanExec> cannot run on GPU because ParquetCachedBatchSerializer is not being used
      @Expression <AttributeReference> Body#2 could run on GPU
      @Expression <AttributeReference> CleanBody#59 could run on GPU
      @Expression <AttributeReference> CreationDate#4 could run on GPU
      @Expression <AttributeReference> Id#0 could run on GPU
      @Expression <AttributeReference> Tags#3 could run on GPU
      @Expression <AttributeReference> Title#1 could run on GPU
      @Expression <AttributeReference> Y#5 could run on GPU
      @Expression <AttributeReference> avg_word_len#123 could run on GPU
      @Expression <AttributeReference> body_len#98 could run on GPU
      @Expression <AttributeReference> punct_count#110 could run on GPU
      @Expression <AttributeReference> tags_list#77 could run on GPU
      @Expression <AttributeReference> text#67 could run on GPU
      @Expression <AttributeReference> title_len#87 could 

An error occurred: An error occurred while calling o2736.fit


Traceback (most recent call last):
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
  File "/home/bunvere

Error stopping Spark session: [Errno 111] Connection refused


Traceback (most recent call last):
  File "/tmp/ipykernel_991/1675114968.py", line 189, in <module>
    spark.stop()
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/sql/session.py", line 1799, in stop
    self._jvm.SparkSession.clearDefaultSession()
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/py4j/java_gateway.py", line 1712, in __getattr__
    answer = self._gateway_client.send_command(
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/py4j/java_gateway.py", line 1036, in send_command
    connection = self._get_connection()
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/py4j/clientserver.py", line 284, in _get_connection
    connection = self._create_new_connection()
  File "/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/py4j/clientserver.py", line 291, in _create_new_connection
    connection.connect_to_java_server()
