In [2]:
import traceback
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, length, regexp_replace, size, split
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def main():
    spark = (SparkSession.builder
    .appName("GPU_IMP")
        .master("local[*]")
        .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
        .config("spark.driver.host", "localhost")
        .getOrCreate()
    )
        
    data_path = "/mnt/c/Users/BerenÜnveren/Desktop/BIL401/data/train.csv"

    schema = StructType([
        StructField("Id", IntegerType(), True),
        StructField("Title", StringType(), True),
        StructField("Body", StringType(), True),
        StructField("Y", StringType(), True)
    ])

    df = spark.read.format("csv") \
        .schema(schema) \
        .option("header", "true") \
        .option("quote", "\"") \
        .option("multiLine", "true") \
        .load(data_path)
    
    df.printSchema()
    df.groupBy("Y").count().show()

    df_clean = df.na.drop(subset=["Title", "Body", "Y"]) \
        .withColumn("CleanBody", regexp_replace(col("Body"), "<.*?>", "")) \
        .withColumn("text", concat_ws(" ", col("Title"), col("CleanBody")))

    df_featured = df_clean.withColumn("title_len", length(col("Title"))) \
        .withColumn("body_len", length(col("CleanBody"))) \
        .withColumn("punct_count", length(col("text")) - length(regexp_replace(col("text"), "[?!]", ""))) \
        .withColumn("avg_word_len", length(regexp_replace(col("text"), " ", "")) / (size(split(col("text"), " ")) + 1e-6))
    
    label_indexer = StringIndexer(inputCol="Y", outputCol="label", handleInvalid="skip")
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20000)
    idf = IDF(inputCol="raw_features", outputCol="text_features")
    
    feature_assembler = VectorAssembler(
        inputCols=["text_features", "title_len", "body_len", "punct_count", "avg_word_len"],
        outputCol="features"
    )

    (train_data, test_data) = df_featured.randomSplit([0.8, 0.2], seed=42)
    train_data.cache()
    test_data.cache()
    
    lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
    lr_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, feature_assembler, lr])
    lr_model = lr_pipeline.fit(train_data)
    lr_predictions = lr_model.transform(test_data)
    
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    lr_accuracy = evaluator.setMetricName("accuracy").evaluate(lr_predictions)
    lr_f1 = evaluator.setMetricName("f1").evaluate(lr_predictions)
    
    print("\nLogistic Regression Evaluation")
    print(f"Accuracy: {lr_accuracy:.4f}")
    print(f"F1 Score: {lr_f1:.4f}")
    print("Confusion Matrix:")
    lr_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

    rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
    rf_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, feature_assembler, rf])
    
    rf_model = rf_pipeline.fit(train_data)
    rf_predictions = rf_model.transform(test_data)
    rf_accuracy = evaluator.setMetricName("accuracy").evaluate(rf_predictions)
    rf_f1 = evaluator.setMetricName("f1").evaluate(rf_predictions)

    print("\nRandom Forest Evaluation")
    print(f"Accuracy: {rf_accuracy:.4f}")
    print(f"F1 Score: {rf_f1:.4f}")
    print("Confusion Matrix:")
    rf_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        print(f"An error occurred: {e}")
        traceback.print_exc()
    finally:
        from pyspark.sql import SparkSession
        spark = SparkSession.getActiveSession()
        if spark:
            spark.stop()


25/07/21 00:26:53 WARN RapidsPluginUtils: Multiple cudf jars found in the classpath:
revison: dd34fdbe35e68ba56a2183f11ed822ddaa6c927b
	jar URL: jar:file:/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/jars/rapids-4-spark_2.12-24.02.0.jar
	version=24.02.1
	user=
	revision=dd34fdbe35e68ba56a2183f11ed822ddaa6c927b
	branch=HEAD
	date=2024-02-28T05:34:16Z
	url=https://github.com/rapidsai/cudf.git
	jar URL: jar:file:/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/jars/cudf-24.02.2-cuda12.jar
	version=24.02.2
	user=
	revision=dd34fdbe35e68ba56a2183f11ed822ddaa6c927b
	branch=HEAD
	date=2024-02-28T07:51:45Z
Please make sure there is only one cudf jar in the classpath. If it is impossible to fix the classpath you can suppress the error by setting spark.rapids.sql.allowMultipleJars to ALWAYS, but this can cause unpredictable behavior as the plugin may pick up the wrong jar.
25/07/21 00:26:53 WARN RapidsPluginUtils: RAPIDS Accelera

root
 |-- Id: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- Y: string (nullable = true)



25/07/21 00:26:54 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  !Exec <HashAggregateExec> cannot run on GPU because not all expressions can be replaced
    @Expression <AttributeReference> Y#1236 could run on GPU
    @Expression <AggregateExpression> count(1) could run on GPU
      @Expression <Count> count(1) could run on GPU
        @Expression <Literal> 1 could run on GPU
    @Expression <AttributeReference> count(1)#1245L could run on GPU
    @Expression <Alias> toprettystring(Y#1236, Some(Europe/Istanbul)) AS toprettystring(Y)#1252 could run on GPU
      !Expression <ToPrettyString> toprettyst

+--------------------+-----+
|                   Y|count|
+--------------------+-----+
|<node.js><windows...|    1|
|<php><arrays><ite...|    1|
|<fortran><subrout...|    1|
|<asp.net-core><en...|    3|
|         <resharper>|    2|
|<sql><sql-server>...|    1|
|<sql><oracle><sna...|    1|
|<java><arraylist>...|    1|
| <regex><python-3.x>|    1|
|<intellij-idea><a...|    1|
|<java><java-strea...|    1|
|<angular><angular...|    2|
|<apache-spark><st...|    1|
|<java><mysql><sql...|    1|
|<python><scikit-l...|    1|
|<amazon-web-servi...|    1|
|<javascript><requ...|    1|
|<docker><apk><alp...|    1|
|<c#><asp.net><dat...|    2|
|<php><database><l...|    1|
+--------------------+-----+
only showing top 20 rows



25/07/21 00:26:55 WARN GpuOverrides: 
!Exec <ObjectHashAggregateExec> cannot run on GPU because not all expressions can be replaced
  @Expression <AggregateExpression> stringindexeraggregator(org.apache.spark.ml.feature.StringIndexerAggregator@6339ea37, Some(createexternalrow(Y#1236.toString, StructField(Y,StringType,true))), Some(interface org.apache.spark.sql.Row), Some(StructType(StructField(Y,StringType,true))), encodeusingserializer(input[0, java.lang.Object, true], true), decodeusingserializer(input[0, binary, true], Array[org.apache.spark.util.collection.OpenHashMap], true), encodeusingserializer(input[0, java.lang.Object, true], true), BinaryType, true, 0, 0) could run on GPU
    ! <ComplexTypedAggregateExpression> StringIndexerAggregator(org.apache.spark.sql.Row) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
      ! <CreateExternalRow> createexternalrow(Y#1236.toString, Struc

An error occurred: An error occurred while calling o399.fit.
: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.linalg.DenseMatrix$.zeros(Matrices.scala:507)
	at org.apache.spark.ml.classification.LogisticRegression.createInitialSolution(LogisticRegression.scala:840)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:619)
	at org.apache.spark.ml.classification.LogisticRegression$$Lambda/0x00007fc6484d2ef0.apply(Unknown Source)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at org.apache.spark.ml.util.Instrumentation$$$Lambda/0x00007fc6484d34c0.apply(Unknown Source)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:497)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala