In [19]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
pyspark.__version__

'3.5.1'

In [4]:
spark = (
    SparkSession
    .builder
    .appName("video Game Sales Analysis dev-mode") 
    .master("local[*]")
    .config("spark.sql.adaptive.enabled", "true") 
    # .config("spark.sql.shuffle.partitions", "8") 
    .getOrCreate()
)

24/05/02 20:00:12 WARN Utils: Your hostname, codespaces-0d4183 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/05/02 20:00:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/02 20:00:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
print(spark.sparkContext.defaultParallelism)

2


24/05/02 20:00:27 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [8]:
vg_sales_df = (
    spark.read
    .option("header", "true")
    .csv("../input_data/vgsales.csv")
)

In [9]:
vg_sales_df.show(5)

+----+--------------------+--------+----+------------+---------+--------+--------+--------+-----------+------------+
|Rank|                Name|Platform|Year|       Genre|Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|
+----+--------------------+--------+----+------------+---------+--------+--------+--------+-----------+------------+
|   1|          Wii Sports|     Wii|2006|      Sports| Nintendo|   41.49|   29.02|    3.77|       8.46|       82.74|
|   2|   Super Mario Bros.|     NES|1985|    Platform| Nintendo|   29.08|    3.58|    6.81|       0.77|       40.24|
|   3|      Mario Kart Wii|     Wii|2008|      Racing| Nintendo|   15.85|   12.88|    3.79|       3.31|       35.82|
|   4|   Wii Sports Resort|     Wii|2009|      Sports| Nintendo|   15.75|   11.01|    3.28|       2.96|          33|
|   5|Pokemon Red/Pokem...|      GB|1996|Role-Playing| Nintendo|   11.27|    8.89|   10.22|          1|       31.37|
+----+--------------------+--------+----+------------+---------+

In [10]:
vg_sales_df.count()

16598

In [11]:
vg_sales_df.printSchema

<bound method DataFrame.printSchema of DataFrame[Rank: string, Name: string, Platform: string, Year: string, Genre: string, Publisher: string, NA_Sales: string, EU_Sales: string, JP_Sales: string, Other_Sales: string, Global_Sales: string]>

## Easy Level

In [15]:
# Find the number of unique platforms in the dataset.
unique_platforms_df = (
    vg_sales_df
    .select('Platform')
    .distinct()
)

unique_platforms_df.show()
unique_platforms_df.count()

+--------+
|Platform|
+--------+
|     3DO|
|      PC|
|     PS3|
|     NES|
|      PS|
|      DC|
|     GEN|
|     PS2|
|     3DS|
|    PCFX|
|      GG|
|    WiiU|
|    SNES|
|      GB|
|     SCD|
|     N64|
|     PS4|
|     PSP|
|    2600|
|    XOne|
+--------+
only showing top 20 rows



31

In [21]:
# Calculate the total global sales.
global_sales_df = (
    vg_sales_df
    .agg(f.sum('Global_Sales').alias('Total_Global_Sales'))
)

global_sales_df.show()

+------------------+
|Total_Global_Sales|
+------------------+
| 8920.440000001283|
+------------------+



In [23]:
# Find the top 5 publishers with the highest number of games.

top_5_pub_df = (
    vg_sales_df
    .groupBy('Publisher')
    .agg(f.count('Name').alias("game_count"))
    .orderBy(f.desc('game_count'))
    .limit(5)
)

top_5_pub_df.show()

+--------------------+----------+
|           Publisher|game_count|
+--------------------+----------+
|     Electronic Arts|      1351|
|          Activision|       975|
|  Namco Bandai Games|       932|
|             Ubisoft|       921|
|Konami Digital En...|       832|
+--------------------+----------+



## Medium Level

In [28]:
# Calculate the average global sales for each year.
avg_global_sales_per_year_df = (
    vg_sales_df
    .groupBy("Year")
    .agg({"Global_Sales":"avg"})
    .orderBy(f.desc('Year'))
)

avg_global_sales_per_year_df.show()

+----+--------------------+
|Year|   avg(Global_Sales)|
+----+--------------------+
| N/A| 0.36929889298892965|
|2020|                0.29|
|2017|0.016666666666666666|
|2016| 0.20619186046511667|
|2015| 0.43068403908794456|
|2014|  0.5791237113402036|
|2013|  0.6741941391941367|
|2012|  0.5533333333333309|
|2011|  0.4530201931518849|
|2010|   0.476926131850671|
|2009|  0.4663172606568796|
|2008| 0.47542016806722354|
|2007|  0.5084276206322741|
|2006|  0.5169047619047537|
|2005|  0.4887778958554704|
|2004|  0.5495543905635631|
|2003| 0.46174193548386955|
|2002| 0.47710494571773016|
|2001|  0.6876970954356828|
|2000|  0.5775358166189117|
+----+--------------------+
only showing top 20 rows



In [29]:
# Find the platform with the highest average sales.
platform_with_highest_avg_sales = (
    vg_sales_df
    .groupBy('Platform')
    .agg({"Global_Sales":"avg"})
    .withColumnRenamed('avg(Global_Sales)', 'avg_global_sales')
    .orderBy(f.desc('avg_global_sales'))
    .limit(5)
)
platform_with_highest_avg_sales.show()

+--------+------------------+
|Platform|  avg_global_sales|
+--------+------------------+
|      GB| 2.606632653061223|
|     NES|2.5619387755102028|
|     GEN|1.0503703703703704|
|    SNES|0.8370292887029299|
|     PS4|0.8276785714285696|
+--------+------------------+



In [40]:
# Determine the total sales for each region (NA, EU, JP, Other) for the top 3 publishers.
total_sales_for_each_region_df = (
    vg_sales_df
    .groupBy('Publisher')
    .agg(f.round(f.sum("NA_Sales"), 2).alias("NA_Sales"),
         f.round(f.sum("EU_Sales"), 2).alias("EU_Sales"),
         f.round(f.sum("JP_Sales"), 2).alias("JP_Sales"),
         f.round(f.sum("Other_Sales"), 2).alias("Other_Sales")
        )
    .withColumn('Total_Sales',
                f.round(f.col("NA_Sales") + f.col("EU_Sales") + f.col("JP_Sales") + f.col("Other_Sales"), 2)
               )
    .orderBy(f.desc("Total_Sales"))
    .limit(3)
)
total_sales_for_each_region_df.show()

+---------------+--------+--------+--------+-----------+-----------+
|      Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Total_Sales|
+---------------+--------+--------+--------+-----------+-----------+
|       Nintendo|  816.87|  418.74|  455.42|      95.33|    1786.36|
|Electronic Arts|  595.07|  371.27|   14.04|     129.77|    1110.15|
|     Activision|   429.7|  215.53|    6.54|      75.34|     727.11|
+---------------+--------+--------+--------+-----------+-----------+



In [42]:
# Calculate the percentage of sales contributed by each region to the global sales.

percent_contributed_by_each_region_df = (
    total_sales_for_each_region_df
    .withColumn('NA_%', f.round(f.col('NA_Sales')/f.col('Total_Sales')*100, 2))
    .withColumn('EU_%', f.round(f.col('EU_Sales')/f.col('Total_Sales')*100, 2))
    .withColumn('JP_%', f.round(f.col('JP_Sales')/f.col('Total_Sales')*100 ,2))
    .withColumn('OTH_%', f.round(f.col('Other_Sales')/f.col('Total_Sales')*100, 2))
)

percent_contributed_by_each_region_df.show()

+---------------+--------+--------+--------+-----------+-----------+-----+-----+-----+-----+
|      Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Total_Sales| NA_%| EU_%| JP_%|OTH_%|
+---------------+--------+--------+--------+-----------+-----------+-----+-----+-----+-----+
|       Nintendo|  816.87|  418.74|  455.42|      95.33|    1786.36|45.73|23.44|25.49| 5.34|
|Electronic Arts|  595.07|  371.27|   14.04|     129.77|    1110.15| 53.6|33.44| 1.26|11.69|
|     Activision|   429.7|  215.53|    6.54|      75.34|     727.11| 59.1|29.64|  0.9|10.36|
+---------------+--------+--------+--------+-----------+-----------+-----+-----+-----+-----+

