# Spread Profanity Analysis

**Author:** Simon Pislar

**Date Created:** March 14, 2024

**Description:** This notebook analyzes the spread of profanity across Reddit users and subreddits. It uses a dataset of profanity words and a dataset of Reddit posts to find the top 20 users who have used profanity across the most subreddits.

**Output:** A table of the 20 users who used profanity in the largest number of distinct subreddits, together with each user's subreddit count.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, explode, split, countDistinct, broadcast

# Build (or reuse) the Spark session for this analysis.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.193:7077")
    .appName("Spread_Profanity_Analysis")
    # Dynamic allocation, with shuffle tracking enabled and the external
    # shuffle service disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Executor sizing.
    .config("spark.executor.cores", 2)
    # Fixed network ports for the driver and the block manager.
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Keep a handle on the SparkContext and restrict Spark logging to "ERROR".
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

In [None]:
# Profanity list: a CSV file read with a header row and inferred column types;
# only the "text" and "severity_rating" columns are kept.
profanity_df = (
    spark_session.read
    .csv(
        "file:///home/ubuntu/profanity/profanity_en.csv",
        header=True,
        inferSchema=True,
    )
    .select("text", "severity_rating")
)

# Reddit corpus: a JSON file read without an explicit schema.
reddit_df = spark_session.read.json(
    "file:///home/ubuntu/volume/reddit/corpus-webis-tldr-17.json"
)


In [None]:
# Tokenize the Reddit dataset: lower-case each post body, split it on runs of
# non-word characters, and emit one row per token in a new "words" column.
reddit_tokenized = reddit_df.withColumn(
    "words", explode(split(lower(col("body")), "\\W+"))
)

# Keep only the tokens that appear in the profanity list. The tokens were
# lower-cased above, so the profanity terms are lower-cased here as well;
# otherwise a term containing an upper-case letter could never match.
# The (small) profanity table is marked for broadcast in the join.
# NOTE(review): tokens are single words, so multi-word profanity entries (if
# the list contains any) still cannot match here - confirm whether that matters.
reddit_profanity = reddit_tokenized.join(
    broadcast(profanity_df),
    col("words") == lower(col("text")),
    "inner",
)

# For each author, count the distinct subreddits in which at least one
# profanity token was found.
author_subreddit_count = reddit_profanity.groupBy("author").agg(
    countDistinct("subreddit").alias("subreddit_count")
)


In [None]:
# Get the top 20 users with profanity across the most subreddits.
# Ties on subreddit_count are broken by author name so that the same 20 rows
# come back, in the same order, on every run.
top_users = author_subreddit_count.orderBy(
    col("subreddit_count").desc(), col("author").asc()
).limit(20)

top_users.show()

In [None]:
# Finish the job: stop the Spark session now that the results have been shown.
# Re-running earlier cells after this requires re-creating the session first.
spark_session.stop()