In [49]:
from pyspark.sql import SparkSession

In [50]:
import logging
logging.basicConfig(level=logging.ERROR)

In [51]:
# SparkSession oluşturma
spark = SparkSession.builder \
    .appName("SummarizationApp") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# HDFS'teki CSV dosyalarını okuma
train_data = spark.read.option("header", "true").csv("hdfs:///cnn_dailymail/train.csv")
test_data = spark.read.option("header", "true").csv("hdfs:///cnn_dailymail/test.csv")
validation_data = spark.read.option("header", "true").csv("hdfs:///cnn_dailymail/validation.csv")

In [52]:
from pyspark.sql.functions import lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# 1. 'id' ve 'highlights' sütunlarını kaldırma
train_data = train_data.drop("id", "highlights")
test_data = test_data.drop("id", "highlights")
validation_data = validation_data.drop("id", "highlights")

# 2. Null verileri düşürme
train_data = train_data.na.drop(subset=["article"])
test_data = test_data.na.drop(subset=["article"])
validation_data = validation_data.na.drop(subset=["article"])

# 3. Regex ile sadece alfabedik karakterleri ve boşlukları koruma
train_data = train_data.withColumn("article", regexp_replace("article", "[^a-zA-Z\s]", ""))
test_data = test_data.withColumn("article", regexp_replace("article", "[^a-zA-Z\s]", ""))
validation_data = validation_data.withColumn("article", regexp_replace("article", "[^a-zA-Z\s]", ""))

# 4. Tüm harfleri küçük harfe çevirme
train_data = train_data.withColumn("article", lower(train_data["article"]))
test_data = test_data.withColumn("article", lower(test_data["article"]))
validation_data = validation_data.withColumn("article", lower(validation_data["article"]))

# 5. Tokenizasyon
tokenizer = Tokenizer(inputCol="article", outputCol="tokens")
train_data = tokenizer.transform(train_data)
test_data = tokenizer.transform(test_data)
validation_data = tokenizer.transform(validation_data)

# 6. Stopwords kaldırma
remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
train_data = remover.transform(train_data)
test_data = remover.transform(test_data)
validation_data = remover.transform(validation_data)

# Sonuçları kontrol etmek için
train_data.show(5)



+--------------------+--------------------+--------------------+
|             article|              tokens|     filtered_tokens|
+--------------------+--------------------+--------------------+
|by  associated pr...|[by, , associated...|[, associated, pr...|
| grand forks and ...|[, grand, forks, ...|[, grand, forks, ...|
|cnn  ralph mata w...|[cnn, , ralph, ma...|[cnn, , ralph, ma...|
| an internal affa...|[, an, internal, ...|[, internal, affa...|
| a complaint alle...|[, a, complaint, ...|[, complaint, all...|
+--------------------+--------------------+--------------------+
only showing top 5 rows

