In [1]:
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("WordCount_MapReduce")
    .getOrCreate()
)


25/12/27 15:03:33 WARN Utils: Your hostname, spark resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/12/27 15:03:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/27 15:03:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Access the SparkContext behind the session for RDD operations.
# `sc` is the entry point used to read the input file as an RDD (sc.textFile).
sc = spark.sparkContext

In [3]:
# 1. Load Data as RDD (Not DataFrame)
# sc.textFile reads the file line by line into an RDD (one element per line).
print("\n[STEP 1] Loading Data...")

INPUT_PATH = "yapay_zeka.txt"  # relative path to the input text file
SAMPLE_LINES = 7               # how many lines to preview
PREVIEW_WIDTH = 60             # max characters shown per previewed line

lines_rdd = sc.textFile(INPUT_PATH)

print(f" -> Input RDD created. Sample of first {SAMPLE_LINES} lines:")
for line in lines_rdd.take(SAMPLE_LINES):
    # Append the ellipsis only when the line was actually cut, so short or
    # blank lines are not shown as if they had been truncated.
    preview = line[:PREVIEW_WIDTH] + ("..." if len(line) > PREVIEW_WIDTH else "")
    print(f"    '{preview}'")


[STEP 1] Loading Data...
 -> Input RDD created. Sample of first 7 lines:


[Stage 0:>                                                          (0 + 1) / 1]

    'Yapay Zeka (Artificial Intelligence)...'
    '...'
    'Yapay zeka (AI), insan zekasını taklit eden ve topladığı bil...'
    '...'
    '1. Yapay Zekanın Temel Türleri...'
    'Yapay zeka sistemleri, yeteneklerine ve karmaşıklık düzeyler...'
    '...'


                                                                                

In [4]:
# 2. Map Step: Split lines into words and map to (word, 1)
# flatMap: splits each line into multiple lower-cased tokens
# map:     trims punctuation, then turns each remaining word into (word, 1)
print("\n[STEP 2] Map Phase...")
print(" -> Splitting lines into words and assigning count 1 to each...")

# Characters trimmed from both ends of every token (ASCII punctuation plus
# common typographic quotes, ellipsis and dashes).
EDGE_PUNCTUATION = ".,;:!?()[]{}\"'“”‘’«»…-–—"

mapped_rdd = (
    lines_rdd
    # "İ".lower() yields "i" + U+0307 (combining dot), which would make e.g.
    # "İnsan" a different key from "insan"; fold it to a plain "i" first.
    # Plain "I" is left to the default lower() ("i") because the text mixes
    # in English terms such as "AI".
    # split() without arguments splits on any run of whitespace (tabs,
    # non-breaking spaces, repeated spaces), not only on single " ".
    .flatMap(lambda line: line.replace("İ", "i").lower().split())
    .map(lambda token: token.strip(EDGE_PUNCTUATION))
    .filter(lambda word: word != "")  # drop tokens that were only punctuation
    .map(lambda word: (word, 1))
)


[STEP 2] Map Phase...
 -> Splitting lines into words and assigning count 1 to each...


In [5]:
# Preview the first ten (word, 1) pairs produced by the map phase.
print(" -> Mapped RDD sample (Word, 1):")
mapped_sample = mapped_rdd.take(10)
print("   ", mapped_sample)

# NOTE(review): this repeats the reduceByKey of the reduce phase; it is kept
# so that any cell reading `reduced_rdd` keeps working.
reduced_rdd = mapped_rdd.reduceByKey(lambda left, right: left + right)

 -> Mapped RDD sample (Word, 1):
    [('yapay', 1), ('zeka', 1), ('artificial', 1), ('intelligence', 1), ('yapay', 1), ('zeka', 1), ('ai', 1), ('insan', 1), ('zekasını', 1), ('taklit', 1)]


In [6]:
# 3. Reduce Step: Aggregate counts
# reduceByKey: merges the values of each key (word) with the add function,
# turning the (word, 1) pairs into (word, total_count) pairs.
print("\n[STEP 3] Reduce Phase...")
print(" -> Shuffling and reducing keys to calculate total frequencies...")
output_rdd = mapped_rdd.reduceByKey(lambda a, b: a + b)
print(" -> Reduced RDD sample (Word, Total Count):")
# Sample the RDD built in this cell, so the preview shows exactly what the
# sort step consumes and the cell only depends on `mapped_rdd`.
print("   ", output_rdd.take(5))


[STEP 3] Reduce Phase...
 -> Shuffling and reducing keys to calculate total frequencies...
 -> Reduced RDD sample (Word, Total Count):
    [('yapay', 30), ('artificial', 1), ('ai', 4), ('zekasını', 2), ('eden', 2)]


                                                                                

In [7]:
# 4. Sort the (word, count) pairs
# sortBy orders the pairs by their count (index 1 of each tuple), highest
# first; the rows themselves are fetched later with take().
print("\n[STEP 4] Final Output...")
print(" -> Sorting results by descending frequency...")
sorted_output = output_rdd.sortBy(
    keyfunc=lambda word_count: word_count[1],
    ascending=False,
)


[STEP 4] Final Output...
 -> Sorting results by descending frequency...


In [8]:
# Print the 20 most frequent words as a two-column text table,
# framed by banner lines above the title and around the closing message.
banner = "=" * 60

print("\n" + banner)
print("     Displaying top 20 results")
print(banner)

results = sorted_output.take(20)  # list of (word, count) tuples
print("WORD".ljust(20) + " | " + "COUNT")
print("-" * 30)
for word, count in results:
    # Word column is left-aligned and padded to 20 characters.
    print(word.ljust(20), "|", count)

print("\n" + banner)
print("     ALGORITHM COMPLETED SUCCESSFULLY")
print(banner)


     Displaying top 20 results
WORD                 | COUNT
------------------------------
yapay                | 30
ve                   | 28
zeka                 | 22
bir                  | 15
bu                   | 12
zekanın              | 8
insan                | 7
ai                   | 4
teknolojinin         | 4
öğrenme              | 4
için                 | 4
gibi                 | 4
alt                  | 4
veri                 | 4
makine               | 3
derin                | 3
da                   | 3
etik                 | 3
veya                 | 3
en                   | 3

     ALGORITHM COMPLETED SUCCESSFULLY
