In [1]:
#Import Relevant Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import lit
from pyspark.sql.functions import regexp_replace, col, concat_ws, when, regexp_extract
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, FloatType

# Clean The Review Data

In [57]:
# Initialize a SparkSession
spark = SparkSession.builder \
    .appName("Data Cleaning with Spark") \
    .getOrCreate()


def read_review_csv(path):
    """Load one scraped review CSV into a Spark DataFrame.

    All review files are read with the same reader settings, so they are
    configured once here instead of being repeated per file:

    * ``header`` / ``inferSchema``: first line holds the column names and the
      column types are inferred from the data.
    * ``multiLine``: a quoted field may contain line breaks.
    * ``quote`` / ``escape``: both are the double-quote character.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by ``spark.read...csv(path)``.
    """
    return (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .option("multiLine", "true")
        .option("quote", "\"")
        .option("escape", "\"")
        .csv(path)
    )


# Load the data into Spark DataFrames
df_palworld = read_review_csv('../Web Scraping/Palworld.csv')
df_craftopia = read_review_csv('../Web Scraping/Craftopia.csv')
df_lethal = read_review_csv('../Web Scraping/LethalCompany.csv')


In [58]:
# Remove the literal 'EARLY ACCESS REVIEW' marker from every value of 'ReviewText'
review_text_without_marker = regexp_replace(col("ReviewText"), "EARLY ACCESS REVIEW", "")
df_palworld = df_palworld.withColumn("ReviewText", review_text_without_marker)

# Print the leading rows so the replacement can be checked by eye
df_palworld.show()

+--------------------+-----------+------------+-------------------+---------------+
|          ReviewText|     Review|ReviewLength|          PlayHours|     DatePosted|
+--------------------+-----------+------------+-------------------+---------------+
|              i like|Recommended|          22| 25.8 hrs on record|Posted: 5 April|
|  Pokemon but better|Recommended|          33| 10.1 hrs on record|Posted: 5 April|
|           good game|Recommended|          25| 17.8 hrs on record|Posted: 5 April|
|You can kidnap pe...|Recommended|          37|  5.0 hrs on record|Posted: 5 April|
|           Fun game.|Recommended|          25|122.1 hrs on record|Posted: 5 April|
|                 123|Recommended|          20| 47.7 hrs on record|Posted: 5 April|
|Fun and interacti...|Recommended|          38| 49.7 hrs on record|Posted: 5 April|
|Love this game! Y...|Recommended|         126| 36.9 hrs on record|Posted: 5 April|
|good but DIGTOISE...|Recommended|         235| 64.0 hrs on record|Posted: 5

In [59]:
# List the distinct labels found in the 'Review' column
review_labels = df_palworld.select("Review").distinct()
review_labels.show()

+---------------+
|         Review|
+---------------+
|Not Recommended|
|    Recommended|
+---------------+



In [60]:
# Drop fully identical rows from df_palworld, then attach an 'index' column
# generated by monotonically_increasing_id() to serve as a row identifier
df_palworld = (
    df_palworld
    .dropDuplicates()
    .withColumn("index", monotonically_increasing_id())
)

In [61]:
# Calculate the number of rows in the df_palworld DataFrame
# (this is the row count after duplicates were dropped in the previous cell)
num_rows = df_palworld.count()

# Print the result
print("Number of rows in df_palworld:", num_rows)

Number of rows in df_palworld: 150


In [62]:
def _strip_marker_dedupe_and_index(reviews):
    """Apply the shared review clean-up steps to one DataFrame.

    Steps, in order: remove the 'EARLY ACCESS REVIEW' marker from
    'ReviewText', drop duplicate rows, and add an 'index' column from
    monotonically_increasing_id().
    """
    return (
        reviews
        .withColumn("ReviewText", regexp_replace("ReviewText", "EARLY ACCESS REVIEW", ""))
        .dropDuplicates()
        .withColumn("index", monotonically_increasing_id())
    )


# Cleaning df_craftopia
df_craftopia = _strip_marker_dedupe_and_index(df_craftopia)

# Cleaning df_lethal
df_lethal = _strip_marker_dedupe_and_index(df_lethal)


In [63]:
# Add a constant 'Game' column to each review frame so every row records
# which game it came from
df_lethal, df_palworld, df_craftopia = (
    frame.withColumn("Game", lit(game_name))
    for frame, game_name in (
        (df_lethal, "Lethal Companies"),
        (df_palworld, "Palworld"),
        (df_craftopia, "Craftopia"),
    )
)

In [64]:
# Stack the three review frames into one, matching columns by name
lethal_and_palworld = df_lethal.unionByName(df_palworld)
df_combined = lethal_and_palworld.unionByName(df_craftopia)

In [65]:
# Extract the numerical part of "PlayHours" (e.g. "25.8 hrs on record") and convert it to float.
# Commas are removed first so a value written with a thousands separator, such as
# "1,234.5 hrs on record", is read as 1234.5 instead of being cut off at the comma.
# The pattern is a raw string so that \d and \. reach the regex engine unchanged
# rather than being interpreted as (invalid) Python string escapes.
play_hours_text = regexp_replace(col("PlayHours"), ",", "")
df_combined = df_combined.withColumn(
    "PlayHours",
    regexp_extract(play_hours_text, r"(\d+\.\d+|\d+)", 0).cast("float"),
)
# Drop the 'index' column that was added to each frame before the union
df_combined = df_combined.drop("index")
# Show the updated DataFrame to verify the changes
df_combined.show()

+--------------------+---------------+------------+---------+---------------+----------------+
|          ReviewText|         Review|ReviewLength|PlayHours|     DatePosted|            Game|
+--------------------+---------------+------------+---------+---------------+----------------+
|           good game|    Recommended|          25|     34.1|Posted: April 5|Lethal Companies|
|  very lethal, 10/10|    Recommended|          33|     19.9|Posted: April 5|Lethal Companies|
|pretty fun game e...|    Recommended|          99|     48.7|Posted: April 5|Lethal Companies|
|                good|    Recommended|          21|     33.9|Posted: April 5|Lethal Companies|
|I saw my friend b...|    Recommended|          84|     58.4|Posted: April 5|Lethal Companies|
|The community is ...|    Recommended|          89|     19.6|Posted: April 5|Lethal Companies|
|             love it|    Recommended|          23|      4.6|Posted: April 5|Lethal Companies|
|The future pentac...|    Recommended|          47

In [66]:
# Cast 'ReviewLength' to an integer column
review_length_as_int = col("ReviewLength").cast("integer")
df_combined = df_combined.withColumn("ReviewLength", review_length_as_int)

In [67]:
# Print the schema of the df_combined DataFrame to check the column types after the casts above
df_combined.printSchema()


root
 |-- ReviewText: string (nullable = true)
 |-- Review: string (nullable = true)
 |-- ReviewLength: integer (nullable = true)
 |-- PlayHours: float (nullable = true)
 |-- DatePosted: string (nullable = true)
 |-- Game: string (nullable = false)



In [68]:
# Write the combined review data to Parquet, replacing any existing output at that path
df_combined.write.parquet('df_combined.parquet', mode='overwrite')

In [69]:
# Stop the SparkSession; the next section creates a new one with getOrCreate()
spark.stop()

# Clean Game Info Data

In [24]:
# Create (or reuse) the Spark session used for the game-info clean-up
spark = (
    SparkSession.builder
    .appName("Data Cleaning with Spark")
    .getOrCreate()
)

# Read the three game-info CSV files, taking column names from the header
# row and letting Spark infer the column types
info_palworld, info_craftopia, info_lethal = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in (
        '../Web Scraping/InfoPalworld.csv',
        '../Web Scraping/InfoCraftopia.csv',
        '../Web Scraping/InfoLethal.csv',
    )
)

# Rewrite the title "Lethal Company" to "Lethal Companies", the label used
# for this game in the review data; any other title is kept as it is
normalized_title = (
    when(col("Title") == "Lethal Company", "Lethal Companies")
    .otherwise(col("Title"))
)
info_lethal = info_lethal.withColumn("Title", normalized_title)

# Stack the three info frames into one, matching columns by name
lethal_and_craftopia = info_lethal.unionByName(info_craftopia)
combined_info = lethal_and_craftopia.unionByName(info_palworld)

# 'Tags': delete every character that is not a word character, whitespace, or a comma.
# NOTE(review): this also deletes hyphens (the displayed output shows "Online CoOp") - confirm that is intended.
# 'In-Game Count': delete commas, then cast the remaining text to int.
combined_info = (
    combined_info
    .withColumn("Tags", regexp_replace("Tags", r'[^\w\s,]', ''))
    .withColumn("In-Game Count", regexp_replace(col("In-Game Count"), ",", "").cast("int"))
)

24/04/11 16:18:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [25]:
# Display the combined game-info rows to check the cleaned 'Tags' and 'In-Game Count' values
combined_info.show()

+----------------+----------+-------------+--------------------+--------------------+
|           Title| Developer|In-Game Count|                Tags|          Categories|
+----------------+----------+-------------+--------------------+--------------------+
|Lethal Companies|  Zeekerss|        22876| Online CoOp,  Ho...|Single-player, On...|
|       Craftopia|Pocketpair|          335| Open World,  Cra...|Single-player, On...|
|        Palworld|Pocketpair|        60281| Multiplayer,  Op...|Single-player, On...|
+----------------+----------+-------------+--------------------+--------------------+



In [26]:
# Cast 'In-Game Count' to integer and print the schema to confirm the column types.
# NOTE(review): the column was already cast to "int" when combined_info was built,
# so this cast looks redundant - confirm before removing.
combined_info = combined_info.withColumn("In-Game Count", col("In-Game Count").cast("integer"))
combined_info.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Developer: string (nullable = true)
 |-- In-Game Count: integer (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Categories: string (nullable = true)



In [27]:
# Rename the 'Title' column to 'Game', the same column name the review data uses
# for the game label (presumably so the datasets can be joined on it - confirm with downstream use)
combined_info = combined_info.withColumnRenamed("Title", "Game")
combined_info.show()

+----------------+----------+-------------+--------------------+--------------------+
|            Game| Developer|In-Game Count|                Tags|          Categories|
+----------------+----------+-------------+--------------------+--------------------+
|Lethal Companies|  Zeekerss|        22876| Online CoOp,  Ho...|Single-player, On...|
|       Craftopia|Pocketpair|          335| Open World,  Cra...|Single-player, On...|
|        Palworld|Pocketpair|        60281| Multiplayer,  Op...|Single-player, On...|
+----------------+----------+-------------+--------------------+--------------------+



In [28]:
# Write the cleaned game-info data to Parquet, replacing any existing output at that path
combined_info.write.parquet('combined_info.parquet', mode='overwrite')

In [45]:
# Stop the Spark session; the pricing section creates a new one with getOrCreate()
spark.stop()

# Clean Pricing Data

In [42]:
# Create (or reuse) the Spark session used for the pricing clean-up
spark = (
    SparkSession.builder
    .appName("Pricing Data Cleaning")
    .getOrCreate()
)

# Read the three pricing CSV files, taking column names from the header row
# and letting Spark infer the column types
pricing_palworld, pricing_craftopia, pricing_lethal = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in (
        '../Web Scraping/Palworld_pricing.csv',
        '../Web Scraping/Craftopia_pricing.csv',
        '../Web Scraping/Lethal_pricing.csv',
    )
)

24/04/08 20:14:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [43]:
def _with_game_label(pricing_frame, game_name):
    """Return pricing_frame with a constant 'Game' column set to game_name."""
    return pricing_frame.withColumn("Game", lit(game_name))


# Label each pricing frame with its game
pricing_palworld = _with_game_label(pricing_palworld, "Palworld")
pricing_craftopia = _with_game_label(pricing_craftopia, "Craftopia")
pricing_lethal = _with_game_label(pricing_lethal, "Lethal Companies")

# Stack the labelled frames into one, matching columns by name
palworld_and_craftopia = pricing_palworld.unionByName(pricing_craftopia)
pricing_combined = palworld_and_craftopia.unionByName(pricing_lethal)

In [44]:
def clean_price_udf(price_str):
    """UDF to clean the price field and keep only digits, periods, and commas.

    Args:
        price_str: Raw price value. ``None`` (a null cell) is passed through
            unchanged instead of raising ``TypeError``. A non-string value
            (for example a column whose type was inferred as numeric) is
            converted with ``str()`` before filtering.

    Returns:
        A string made of the kept characters in their original order, or
        ``None`` when ``price_str`` is ``None``.
    """
    if price_str is None:
        return None
    return ''.join(ch for ch in str(price_str) if ch.isdigit() or ch in '.,')

# Register the cleaner under the SQL name "clean_price_udf" and also wrap it
# as a column-level UDF for use with the DataFrame API
spark.udf.register("clean_price_udf", clean_price_udf, StringType())
clean_price = udf(clean_price_udf, StringType())

price_columns = ("Current Price", "Converted Price", "Lowest Recorded Price")

# First pass: run the cleaning UDF over each price column
# NOTE(review): the displayed output shows 'Current Price' values such as 22.491
# beside a 'Converted Price' of 22.49 - the raw strings may contain extra digits
# that end up concatenated; verify against the source CSV.
for price_column in price_columns:
    pricing_combined = pricing_combined.withColumn(price_column, clean_price(col(price_column)))

# Second pass: cast each cleaned price column to float
for price_column in price_columns:
    pricing_combined = pricing_combined.withColumn(price_column, col(price_column).cast(FloatType()))


In [45]:
# Display the combined pricing rows to check the cleaned, float-cast price columns
pricing_combined.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-------------------+-------------+---------------+---------------------+--------+
|           Currency|Current Price|Converted Price|Lowest Recorded Price|    Game|
+-------------------+-------------+---------------+---------------------+--------+
|      British Pound|       22.491|          22.49|               22.491|Palworld|
|   South Asia - USD|        9.441|           7.47|                 7.47|Palworld|
|      Russian Ruble|      99010.0|           8.47|                 8.47|Palworld|
| South African Rand|      243.001|          10.31|                10.31|Palworld|
|  CIS - U.S. Dollar|       13.041|          10.32|                10.32|Palworld|
|       Chinese Yuan|       97.201|          10.63|                10.63|Palworld|
|LATAM - U.S. Dollar|       13.491|          10.68|                10.68|Palworld|
| MENA - U.S. Dollar|       13.491|          10.68|                10.68|Palworld|
|  Malaysian Ringgit|       64.801|           10.8|                 10.8|Palworld|
|   

                                                                                

In [39]:
# Write the cleaned pricing data to Parquet, replacing any existing output at that path
pricing_combined.write.parquet('pricing_combined.parquet', mode='overwrite')


                                                                                

In [40]:
# Stop the Spark session used for the pricing clean-up
spark.stop()