In [19]:
#Import Relevant Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import lit
from pyspark.sql.functions import regexp_replace, col, concat_ws
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, FloatType

# Clean The Review Data

In [6]:
# Start (or reuse) the Spark session for the review-cleaning stage.
spark = (
    SparkSession.builder
    .appName("Data Cleaning with Spark")
    .getOrCreate()
)

# Read the scraped review CSV for each game. The first row of every file is
# treated as the header, and column types are inferred from the data.
df_palworld = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/Palworld.csv')
df_craftopia = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/Craftopia.csv')
df_lethal = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/LethalCompany.csv')


                                                                                

In [7]:
# Remove every occurrence of the text 'EARLY ACCESS REVIEW' from the
# 'ReviewText' column (the pattern is a regex, but contains no metacharacters).
df_palworld = df_palworld.withColumn(
    "ReviewText",
    regexp_replace(col("ReviewText"), "EARLY ACCESS REVIEW", ""),
)

# Print the leading rows so the replacement can be checked by eye.
df_palworld.show()

+--------------------+-----------+------------+-------------------+---------------+
|          ReviewText|     Review|ReviewLength|          PlayHours|     DatePosted|
+--------------------+-----------+------------+-------------------+---------------+
|              i like|Recommended|          22| 25.8 hrs on record|Posted: 5 April|
|  Pokemon but better|Recommended|          33| 10.1 hrs on record|Posted: 5 April|
|           good game|Recommended|          25| 17.8 hrs on record|Posted: 5 April|
|You can kidnap pe...|Recommended|          37|  5.0 hrs on record|Posted: 5 April|
|           Fun game.|Recommended|          25|122.1 hrs on record|Posted: 5 April|
|                 123|Recommended|          20| 47.7 hrs on record|Posted: 5 April|
|Fun and interacti...|Recommended|          38| 49.7 hrs on record|Posted: 5 April|
|Love this game! Y...|Recommended|         126| 36.9 hrs on record|Posted: 5 April|
|good but DIGTOISE...|Recommended|         235| 64.0 hrs on record|Posted: 5

In [8]:
# Drop rows that are identical across all columns, then attach an 'index'
# column as a row identifier. monotonically_increasing_id() produces unique,
# increasing values, but they are not guaranteed to be consecutive.
df_palworld = (
    df_palworld
    .dropDuplicates()
    .withColumn("index", monotonically_increasing_id())
)

In [9]:
# Craftopia reviews: strip the 'EARLY ACCESS REVIEW' text, drop fully
# duplicated rows, then add an 'index' row identifier.
df_craftopia = (
    df_craftopia
    .withColumn("ReviewText", regexp_replace(col("ReviewText"), "EARLY ACCESS REVIEW", ""))
    .dropDuplicates()
    .withColumn("index", monotonically_increasing_id())
)

# Lethal Company reviews: same three steps, in the same order.
df_lethal = (
    df_lethal
    .withColumn("ReviewText", regexp_replace(col("ReviewText"), "EARLY ACCESS REVIEW", ""))
    .dropDuplicates()
    .withColumn("index", monotonically_increasing_id())
)


In [11]:
# Tag every review with the game it belongs to, so the source of each row is
# still identifiable once the three DataFrames are stacked together.
df_palworld = df_palworld.withColumn("Game", lit("Palworld"))
df_craftopia = df_craftopia.withColumn("Game", lit("Craftopia"))
# NOTE(review): the label here is "Lethal Companies", while the pricing data in
# this notebook is labelled "Lethal" — confirm the intended spelling before
# joining the two outputs on "Game".
df_lethal = df_lethal.withColumn("Game", lit("Lethal Companies"))

In [12]:
# Stack the three per-game review DataFrames into one, matching columns by
# name rather than by position.
df_combined = (
    df_lethal
    .unionByName(df_palworld)
    .unionByName(df_craftopia)
)

# Print the leading rows of the stacked result as a sanity check.
df_combined.show()

[Stage 9:>                                                          (0 + 1) / 1]

+--------------------+-----------+------------+-------------------+---------------+-----+----------------+
|          ReviewText|     Review|ReviewLength|          PlayHours|     DatePosted|index|            Game|
+--------------------+-----------+------------+-------------------+---------------+-----+----------------+
|                 jeb|Recommended|          20| 21.0 hrs on record|Posted: April 5|    0|Lethal Companies|
|I will leave my c...|Recommended|         322|  5.0 hrs on record|Posted: April 5|    1|Lethal Companies|
|                good|Recommended|          21|104.5 hrs on record|Posted: April 5|    2|Lethal Companies|
|                deez|Recommended|          21| 15.3 hrs on record|Posted: April 5|    3|Lethal Companies|
|  funny with friends|Recommended|          33| 56.1 hrs on record|Posted: April 5|    4|Lethal Companies|
|pretty fun game e...|Recommended|          99| 48.7 hrs on record|Posted: April 5|    5|Lethal Companies|
|                 gud|Recommended|   

                                                                                

In [14]:
# Save the combined review DataFrame to Parquet.
# 'overwrite' replaces output left by a previous run: the default save mode
# raises an error when the path already exists, which would make this cell
# fail on every re-run of the notebook. The other Parquet writes in this
# notebook use the same mode.
df_combined.write.mode('overwrite').parquet('df_combined.parquet')

In [15]:
# Stop the SparkSession used for the review data; the next section creates a
# new session before loading the game info files.
spark.stop()

# Clean Game Info Data

In [18]:
# Start (or reuse) a Spark session for the game-info stage.
spark = (
    SparkSession.builder
    .appName("Data Cleaning with Spark")
    .getOrCreate()
)

# Read the per-game info CSVs (header row present, column types inferred).
info_palworld = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/InfoPalworld.csv')
info_craftopia = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/InfoCraftopia.csv')
info_lethal = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/InfoLethal.csv')

# Stack the three DataFrames (columns matched by name), then:
#   * 'Tags': drop every character that is not a word character, whitespace
#     or a comma;
#   * 'In-Game Count': remove commas and cast the result to int.
combined_info = (
    info_lethal
    .unionByName(info_craftopia)
    .unionByName(info_palworld)
    .withColumn("Tags", regexp_replace(col("Tags"), r'[^\w\s,]', ''))
    .withColumn(
        "In-Game Count",
        regexp_replace(col("In-Game Count"), ",", "").cast("int"),
    )
)

# Write the cleaned result to Parquet, replacing any existing output.
combined_info.write.mode('overwrite').parquet('combined_info.parquet')

# Shut the session down now that the file has been written.
spark.stop()


                                                                                

# Clean Pricing Data

In [20]:
# Start (or reuse) a Spark session for the pricing stage.
spark = (
    SparkSession.builder
    .appName("Pricing Data Cleaning")
    .getOrCreate()
)

# Read the per-game pricing CSVs (header row present, column types inferred).
pricing_palworld = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/Palworld_pricing.csv')
pricing_craftopia = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/Craftopia_pricing.csv')
pricing_lethal = spark.read.options(header=True, inferSchema=True).csv('../Web Scraping/Lethal_pricing.csv')

24/04/08 19:16:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [21]:
# Label each pricing DataFrame with the game it describes.
pricing_lethal = pricing_lethal.withColumn("Game", lit("Lethal"))
pricing_craftopia = pricing_craftopia.withColumn("Game", lit("Craftopia"))
pricing_palworld = pricing_palworld.withColumn("Game", lit("Palworld"))

# Stack the labelled DataFrames into one, matching columns by name.
pricing_combined = (
    pricing_palworld
    .unionByName(pricing_craftopia)
    .unionByName(pricing_lethal)
)

In [22]:
def clean_price_udf(price_str):
    """Strip a scraped price down to digits, periods, and commas.

    Args:
        price_str: Raw price value. Normally a string such as ``"$19.99"``.
            ``None`` (a null cell) and non-string values (for example a
            column that schema inference typed as numeric) are also accepted.

    Returns:
        ``None`` when ``price_str`` is ``None``; otherwise a string holding
        only the digit, ``.`` and ``,`` characters of the input, in their
        original order.
    """
    if price_str is None:
        # Null cells reach a Python UDF as None. Iterating over None raises
        # TypeError and fails the whole job, so pass the null through.
        return None
    # The CSVs are read with inferSchema=True, so a price column may arrive
    # as a number instead of a string; normalise before filtering characters.
    text = price_str if isinstance(price_str, str) else str(price_str)
    return ''.join(ch for ch in text if ch.isdigit() or ch in '.,')

# Expose the cleaner twice: under the name "clean_price_udf" for Spark SQL,
# and as `clean_price` for use in DataFrame column expressions.
spark.udf.register("clean_price_udf", clean_price_udf, StringType())
clean_price = udf(clean_price_udf, StringType())

price_columns = ("Current Price", "Converted Price", "Lowest Recorded Price")

# First pass: run the cleaning UDF over every price column.
for price_column in price_columns:
    pricing_combined = pricing_combined.withColumn(price_column, clean_price(col(price_column)))

# Second pass: cast the cleaned strings to float.
# NOTE(review): the cleaner keeps commas, so values such as "1,299.99" may not
# cast cleanly — confirm they end up as the intended number rather than null.
for price_column in price_columns:
    pricing_combined = pricing_combined.withColumn(price_column, col(price_column).cast(FloatType()))


In [23]:
# Save the combined pricing DataFrame to Parquet, replacing any existing output
pricing_combined.write.mode('overwrite').parquet('pricing_combined.parquet')


                                                                                

In [24]:
# Stop the Spark session that was created for the pricing data
spark.stop()