In [None]:
from pyspark.sql import SparkSession, SQLContext

# Build (or reuse) the SparkSession for this notebook. The settings below fix
# the master URL, the shuffle file buffer, executor cores/memory and the
# driver / block-manager ports; no dynamic-allocation options are set here.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.157:7077")
    .appName("test")
    .config("spark.shuffle.file.buffer", "1mb")
    .config("spark.executor.cores", 8)
    .config("spark.executor.memory", "10g")
    .config("spark.driver.port", 9999)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Handle to the underlying SparkContext (RDD API entry point).
spark_context = spark_session.sparkContext

# Set the log level to WARN.
spark_context.setLogLevel("WARN")

In [None]:
# Read the corpus JSON file from HDFS and mark the resulting DataFrame as cached,
# since several later cells reuse it.
df = (
    spark_session.read
    .json("hdfs://192.168.2.157:9000/user/ubuntu/input/corpus-webis-tldr-17.json")
    .cache()
)

In [None]:
# Preview the loaded DataFrame (20 rows is the default for show()).
df.show(n=20)

In [None]:
from pyspark.sql.functions import col

# Keep only the subreddit column, then preview it.
subreddits_df = df.select("subreddit")
subreddits_df.show()

In [None]:
# Count how many rows each subreddit has; the result has the columns
# "subreddit" and "count".
subreddit_counts = subreddits_df.groupby("subreddit").count()

In [None]:
# Sort subreddits by their count, largest first.
sorted_counts = subreddit_counts.sort(col("count").desc())

In [None]:
# Display the ten subreddits with the highest counts.
sorted_counts.show(n=10)

In [None]:
# Display the ten subreddits with the highest counts.
# NOTE(review): this repeats the previous cell's output — confirm whether both
# cells are needed.
sorted_counts.show(n=10)

In [None]:
# The SparkSession is deliberately NOT stopped here: the following cell still
# uses `df` (and therefore the live session) to compute the length correlation.
# Stopping it at this point makes a top-to-bottom run fail in that cell.
# The session is stopped once, at the end of the notebook.

In [None]:
from pyspark.sql.functions import length

# Build a two-column DataFrame holding, for every record, the length of the
# "summary" field and the length of the "content" field.
summary_length = length("summary").alias("summary_length")
content_length = length("content").alias("content_length")
lengths_df = df.select(summary_length, content_length)

# Correlation between the two length columns (Spark's default method).
correlation = lengths_df.corr("summary_length", "content_length")

print("Correlation between summary length and text length:", correlation)

# All work is done: release the Spark resources held by this notebook.
spark_session.stop()