# jupyter lab --ip=0.0.0.0
# start-master.sh
# start-worker.sh spark://yash-kukrejade-6:7077

In [28]:
import re

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# from F import col, explode, lower, split

# Build (or reuse) the session against the standalone master. Dynamic
# allocation is switched on with shuffle tracking while the external shuffle
# service is switched off; idle executors are released after 30 seconds.
spark_session = (
    SparkSession.builder
    .master("spark://yash-kukrejade-6:7077")
    .appName("BadWords")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    # Fixed driver / block-manager ports instead of randomly chosen ones.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Handle for the RDD API; only WARN-level and more severe log messages are kept.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("WARN")

In [29]:
# root
#  |-- author: string (nullable = true) 
#  |-- body: string (nullable = true)
#  |-- content: string (nullable = true)
#  |-- content_len: long (nullable = true)
#  |-- id: string (nullable = true)
#  |-- normalizedBody: string (nullable = true)
#  |-- subreddit: string (nullable = true)
#  |-- subreddit_id: string (nullable = true)
#  |-- summary: string (nullable = true)
#  |-- summary_len: long (nullable = true) 
#  |-- title: string (nullable = true)

# Load the Reddit sample and preview its first 10 rows with all columns.
reddit_data = spark_session.read.json("/home/ubuntu/sample_2000.json")
reddit_data.show(10)

# Discard the listed columns; of the schema documented above, this leaves
# only `normalizedBody`, the column the later cells read.
reddit_data = reddit_data.drop(
    "content_len",
    "summary_len",
    "id",
    "subreddit_id",
    "body",
    "content",
    "summary",
    "title",
    "subreddit",
    "author",
)
reddit_data.show()

                                                                                

+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|            author|                body|             content|content_len|     id|      normalizedBody|           subreddit|subreddit_id|             summary|summary_len|   title|
+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|  raysofdarkmatter|I think it should...|I think it should...|        178|c69al3r|I think it should...|                math|    t5_2qh0n|Shifting seasonal...|          8|    null|
|           Stork13|Art is about the ...|Art is about the ...|        148|c6a9nxd|Art is about the ...|               funny|    t5_2qh33|Personal opinions...|          4|    null|
|     Cloud_dreamer|Ask me what I thi...|Ask me what I thi...|         76|c6acx4l|Ask me what I thi.

In [30]:
# Word list source: https://www.cs.cmu.edu/~biglou/resources/bad-words.txt
# Read the file through the RDD API, skip empty lines, and pull the remaining
# lines back to the driver as a plain Python list of strings.
bad_words = (
    spark_context.textFile("/home/ubuntu/bad_words.txt")
    .filter(lambda line: line != "")
    .collect()
)

                                                                                

In [31]:
# Pre-filter: keep only comments whose text contains at least one bad word
# (substring match), so later cells process fewer rows.
#
# * Both the comment text and the words are lower-cased, so this filter agrees
#   with the later counting step, which lower-cases the text before matching.
#   A case-sensitive filter here would silently discard comments such as
#   "Shit happens" before they could be counted.
# * Each word is passed through re.escape so an entry containing a regex
#   metacharacter (".", "*", "+", "(", ...) is matched literally instead of
#   altering or breaking the alternation pattern.
bad_word_pattern = "|".join(re.escape(word.lower()) for word in bad_words)
reddit_data = reddit_data.filter(
    F.lower(F.col("normalizedBody")).rlike(bad_word_pattern)
)

In [32]:
# Load the bad-word list as a single-column DataFrame (`bad_word`), dropping
# blank lines and duplicate entries so a repeated word in the file cannot
# multiply its count through the join below.
# A separate name is used so the Python list `bad_words` collected in an
# earlier cell is not overwritten by an object of a different type.
bad_words_df = (
    spark_session.read.text("/home/ubuntu/bad_words.txt")
    .filter(F.col("value") != "")
    .select(F.col("value").alias("bad_word"))
    .distinct()
)

# One row per lower-cased, whitespace-separated token of every comment.
words = reddit_data.select(
    F.explode(F.split(F.lower(F.col("normalizedBody")), "\\s+")).alias("word")
)

# An inner join keeps exactly the tokens that appear in the bad-word list
# (the same rows a left outer join followed by an `isNotNull` filter keeps),
# then occurrences are counted per word, most frequent first.
bad_words_counts = (
    words.join(bad_words_df, words.word == bad_words_df.bad_word, "inner")
    .groupBy("bad_word")
    .count()
    .orderBy("count", ascending=False)
)

# Show the top of the ranking (first 20 rows by default).
bad_words_counts.show()

# Stop the SparkSession; cells above must be re-run before using Spark again.
spark_session.stop()




+--------+-----+
|bad_word|count|
+--------+-----+
|    shit|  176|
| fucking|  170|
|    fuck|  127|
|     god|  103|
|    kill|   90|
|     kid|   87|
|   black|   86|
|     sex|   80|
|  stupid|   71|
|     ass|   66|
|    damn|   63|
|   fight|   59|
|   girls|   56|
|     fat|   54|
|   death|   52|
|     gun|   50|
|    hell|   49|
|     gay|   49|
|  killed|   46|
|  fucked|   45|
+--------+-----+
only showing top 20 rows



                                                                                