In [1]:
from pyspark.sql import SparkSession
# Smoke test: start a local Spark session with the RAPIDS Accelerator SQL
# plugin registered, run one trivial action, and report whether it succeeded.
spark = (
    SparkSession.builder
    .appName("GPU_FINAL_TEST")
    .master("local[*]")
    .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
    # NOTE(review): presumably pinned to localhost because the machine's
    # hostname resolves to a loopback address (see the startup warning in the
    # captured log) -- confirm.
    .config("spark.driver.host", "localhost")
    .getOrCreate()
)

# Everything after session creation runs under try/finally so the session
# (and the plugin resources attached to it) is always released, even when
# building the DataFrame fails before the count() check is reached.
try:
    print("Spark Session oluşturuldu.")

    data = [("Java", 20000), ("Python", 100000), ("Scala", 3000)]
    columns = ["language", "users_count"]
    df = spark.createDataFrame(data, columns)

    print("Basit bir DataFrame oluşturuldu. Şimdi count() işlemi başlıyor...")

    # Deliberately broad catch: this is a diagnostic boundary whose only job
    # is to report *any* failure of the first Spark action.
    try:
        record_count = df.count()
    except Exception as e:
        print(f"HATA! count() işlemi sırasında bir sorun oluştu: {e}")
        print("Temel Spark-GPU entegrasyonu çalışmıyor. Sorun ortamın kendisinde.")
    else:
        # NOTE(review): the captured GpuOverrides warnings show RDDScanExec
        # staying on the CPU for this locally-created DataFrame, so a
        # successful count() demonstrates that the plugin loads and plans
        # queries, not that this particular scan executed on the GPU.
        print(f"Başarılı, DataFrame'de {record_count} satır var.")
        print("Temel Spark-GPU entegrasyonu çalışıyor. Sorun pipeline'ın ileriki adımlarında.")
finally:
    spark.stop()


# Disabled draft of the full text-classification pipeline. It is kept as a
# bare triple-quoted string literal so that none of it executes; because the
# literal is the last expression of the cell, the notebook echoes its repr as
# the cell output.
#
# What the quoted script does when re-enabled:
#   * main() builds a local Spark session with the RAPIDS SQL plugin, reads
#     train.csv with an explicit (Id, Title, Body, Y) schema, drops rows with
#     null Title/Body/Y, strips "<...>" tags from Body and concatenates Title
#     and the cleaned body into a "text" column.
#   * It derives title_len, body_len, punct_count ('?' and '!' occurrences)
#     and avg_word_len, then assembles them with HashingTF/IDF text features.
#   * It splits 80/20 (seed=42), fits a LogisticRegression pipeline and a
#     RandomForestClassifier pipeline on the training split, and prints
#     accuracy, F1 and a label/prediction count table for each on the test
#     split.
#   * The __main__ guard prints any exception with its traceback and stops
#     the active Spark session in a finally clause.
#
# NOTE(review): escape sequences inside this (non-raw) literal are processed
# by Python -- backslash-newline joins the builder lines and "\n" becomes a
# real newline -- so the echoed repr is not a faithful copy of this source.
# Re-enable the script by removing the quotes, not by copying the output.
"""
import traceback
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, length, regexp_replace, size, split
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def main():
    spark = SparkSession.builder \
        .appName("GPU_IMP_Optimized") \
        .master("local[*]") \
        .config("spark.driver.memory", "8g") \
        .config("spark.plugins", "com.nvidia.spark.SQLPlugin") \
        .config("spark.sql.files.maxPartitionBytes", "512m") \
        .config("spark.rapids.sql.explain", "ALL") \
        .config("spark.rapids.ml.enabled", "true") \
        .config("spark.sql.session.timeZone", "UTC") \
        .getOrCreate()
        
    data_path = "/mnt/c/Users/BerenÜnveren/Desktop/BIL401/data/train.csv"

    schema = StructType([
        StructField("Id", IntegerType(), True),
        StructField("Title", StringType(), True),
        StructField("Body", StringType(), True),
        StructField("Y", StringType(), True)
    ])

    df = spark.read.format("csv") \
        .schema(schema) \
        .option("header", "true") \
        .option("quote", "\"") \
        .option("multiLine", "true") \
        .load(data_path)
    
    df.printSchema()
    df.groupBy("Y").count().show()

    df_clean = df.na.drop(subset=["Title", "Body", "Y"]) \
        .withColumn("CleanBody", regexp_replace(col("Body"), "<.*?>", "")) \
        .withColumn("text", concat_ws(" ", col("Title"), col("CleanBody")))

    df_featured = df_clean.withColumn("title_len", length(col("Title"))) \
        .withColumn("body_len", length(col("CleanBody"))) \
        .withColumn("punct_count", length(col("text")) - length(regexp_replace(col("text"), "[?!]", ""))) \
        .withColumn("avg_word_len", length(regexp_replace(col("text"), " ", "")) / (size(split(col("text"), " ")) + 1e-6))
    
    label_indexer = StringIndexer(inputCol="Y", outputCol="label", handleInvalid="skip")
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
    hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20000)
    idf = IDF(inputCol="raw_features", outputCol="text_features")
    
    feature_assembler = VectorAssembler(
        inputCols=["text_features", "title_len", "body_len", "punct_count", "avg_word_len"],
        outputCol="features"
    )

    (train_data, test_data) = df_featured.randomSplit([0.8, 0.2], seed=42)
    train_data.cache()
    test_data.cache()
    
    lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)
    lr_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, feature_assembler, lr])
    lr_model = lr_pipeline.fit(train_data)
    lr_predictions = lr_model.transform(test_data)
    
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    lr_accuracy = evaluator.setMetricName("accuracy").evaluate(lr_predictions)
    lr_f1 = evaluator.setMetricName("f1").evaluate(lr_predictions)
    
    print("\nLogistic Regression Evaluation")
    print(f"Accuracy: {lr_accuracy:.4f}")
    print(f"F1 Score: {lr_f1:.4f}")
    print("Confusion Matrix:")
    lr_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

    rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
    rf_pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, hashing_tf, idf, feature_assembler, rf])
    
    rf_model = rf_pipeline.fit(train_data)
    rf_predictions = rf_model.transform(test_data)
    rf_accuracy = evaluator.setMetricName("accuracy").evaluate(rf_predictions)
    rf_f1 = evaluator.setMetricName("f1").evaluate(rf_predictions)

    print("\nRandom Forest Evaluation")
    print(f"Accuracy: {rf_accuracy:.4f}")
    print(f"F1 Score: {rf_f1:.4f}")
    print("Confusion Matrix:")
    rf_predictions.groupBy("label", "prediction").count().orderBy("label", "prediction").show()

if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        print(f"An error occurred: {e}")
        traceback.print_exc()
    finally:
        from pyspark.sql import SparkSession
        spark = SparkSession.getActiveSession()
        if spark:
            spark.stop()
"""

25/07/20 23:01:55 WARN Utils: Your hostname, DESKTOP-15VE119 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/07/20 23:01:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/20 23:01:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/20 23:01:57 WARN RapidsPluginUtils: RAPIDS Accelerator 24.02.0 using cudf 24.02.1.
25/07/20 23:01:58 WARN RapidsPluginUtils: Multiple cudf jars found in the classpath:
revison: dd34fdbe35e68ba56a2183f11ed822ddaa6c927b
	jar URL: jar:file:/home/bunveren/miniconda3/envs/rapids-24.02/lib/python3.10/site-packages/pyspark/jars/rapids-4-spark_2.12-24.02.0.jar
	version=24.02.1
	user=
	revision=dd34fdbe35e68ba56a2183f11ed822ddaa6c927b
	branch=HEAD
	date=2024-02-28T05:34:16Z
	url=https:/

Spark Session oluşturuldu.


25/07/20 23:02:10 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


Basit bir DataFrame oluşturuldu. Şimdi count() işlemi başlıyor...


25/07/20 23:02:17 WARN GpuOverrides: 
        ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
          @Expression <AttributeReference> language#0 could run on GPU
          @Expression <AttributeReference> users_count#1L could run on GPU

25/07/20 23:02:18 WARN GpuOverrides: 
        ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
          @Expression <AttributeReference> language#0 could run on GPU
          @Expression <AttributeReference> users_count#1L could run on GPU

25/07/20 23:02:18 WARN GpuOverrides: 
        ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
          @Expression <AttributeReference> language#0 could run on GPU
          @Expression <AttributeReference> users_count#1L could run on GPU

25/07/20 23:02:1

BAŞARILI! DataFrame'de 3 satır var.
Temel Spark-GPU entegrasyonu çalışıyor. Sorun pipeline'ın ileriki adımlarında.


'\nimport traceback\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import col, concat_ws, length, regexp_replace, size, split\nfrom pyspark.sql.types import StructType, StructField, StringType, IntegerType\nfrom pyspark.ml import Pipeline\nfrom pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler\nfrom pyspark.ml.classification import LogisticRegression, RandomForestClassifier\nfrom pyspark.ml.evaluation import MulticlassClassificationEvaluator\n\ndef main():\n    spark = SparkSession.builder         .appName("GPU_IMP_Optimized")         .master("local[*]")         .config("spark.driver.memory", "8g")         .config("spark.plugins", "com.nvidia.spark.SQLPlugin")         .config("spark.sql.files.maxPartitionBytes", "512m")         .config("spark.rapids.sql.explain", "ALL")         .config("spark.rapids.ml.enabled", "true")         .config("spark.sql.session.timeZone", "UTC")         .getOrCreate()\n        \n    data_p