In [1]:
import os
import math
from dis import show_code

import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from cffi.model import char_array_type
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import ShortType

In [2]:
# Environment for PySpark: where Spark is installed and which Python
# executables it should launch.
# NOTE(review): PYSPARK_DRIVER_PYTHON / PYSPARK_DRIVER_PYTHON_OPTS are normally
# consumed by the bin/pyspark launcher; presumably they are not needed when the
# session is created from inside an already-running notebook — confirm.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.5-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [3]:
# Location of the video game sales CSV that is read into Spark further down.
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# as-is on a machine where this exact file exists.
df_path = r"F:\Datasets\CSV datasets\video games sales.csv"

In [4]:
# Local Spark session used by every cell below.
# NOTE(review): with master 'local[*]' all work runs inside the driver JVM, so the
# spark.executor.* and spark.dynamicAllocation.* settings presumably have no effect
# here; they are left unchanged in case the notebook is pointed at a real cluster —
# confirm before relying on them.
spark = (
    SparkSession.builder
    # Name shown in the Spark UI / logs; matches the dataset analysed in this notebook.
    .appName('Video game sales')
    .master('local[*]')
    .config('spark.executor.memory', '1g')
    .config('spark.executor.cores', '3')
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "1")
    .config("spark.dynamicAllocation.maxExecutors", "4")
    .config('spark.executor.memoryOverhead', '256m')
    .config("spark.driver.memory", "1g")
    .config("spark.driver.maxResultSize", "1g")
    # Adaptive query execution: let Spark coalesce small shuffle partitions at runtime.
    .config('spark.sql.adaptive.enabled', 'true')
    .config('spark.sql.adaptive.coalescePartitions.enabled', 'true')
    .config('spark.sql.adaptive.advisoryPartitionSizeInBytes', '16mb')
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config('spark.dynamicAllocation.executorIdleTimeout', '60s')
    .config('spark.sql.autoBroadcastJoinThreshold', '128mb')
    # Reuses an already-running session if one exists in this kernel.
    .getOrCreate()
)

In [5]:
# Load the CSV into a Spark DataFrame: the first line supplies the column
# names and Spark samples the data to infer each column's type.
df = spark.read.csv(df_path, header=True, inferSchema=True)

In [9]:
from pyspark.sql.types import *

In [16]:
# Cast every regional/global sales column to FloatType, one column at a time.
# withColumn on an existing name replaces it in place, so column order is unchanged.
for sales_column in ('NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'):
    df = df.withColumn(sales_column, F.col(sales_column).cast(FloatType()))

In [29]:
# Year contains the text 'N/A' for unknown release years. Swap it for the
# sentinel '1800' first, then cast the column to ShortType.
# NOTE(review): because of the sentinel, missing years are not reported as NULL
# by later null checks — filter on Year == 1800 when years matter.
year_as_text = F.regexp_replace('Year', 'N/A', '1800')
df = df.withColumn('Year', year_as_text.cast(ShortType()))

In [30]:
def _null_tally(column_name):
    """Return an aggregate expression counting NULLs in ``column_name``, aliased to that name."""
    is_missing = F.col(column_name).isNull()
    return F.sum(F.when(is_missing, 1).otherwise(0)).alias(column_name)


# One aggregation pass: a single row holding the NULL count of every column.
df.select(*map(_null_tally, df.columns)).show()

+----+----+--------+----+-----+---------+--------+--------+--------+-----------+------------+
|Rank|Name|Platform|Year|Genre|Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|
+----+----+--------+----+-----+---------+--------+--------+--------+-----------+------------+
|   0|   0|       0|   0|    0|        0|       0|       0|       0|          0|           0|
+----+----+--------+----+-----+---------+--------+--------+--------+-----------+------------+



In [31]:
def count_distinct(df, column_name):
    """Return how many distinct values ``column_name`` holds in ``df``.

    Args:
        df: DataFrame-like object exposing ``select(...).distinct().count()``.
        column_name: Name of the column to inspect.

    Returns:
        The number of rows left after de-duplicating the selected column.
    """
    unique_values = df.select(column_name).distinct()
    return unique_values.count()


# Distinct-value count for every column (runs one Spark job per column);
# the resulting dict is the value displayed by this cell.
{
    column_name: count_distinct(df, column_name)
    for column_name in df.columns
}

{'Rank': 16598,
 'Name': 11493,
 'Platform': 31,
 'Year': 40,
 'Genre': 12,
 'Publisher': 579,
 'NA_Sales': 409,
 'EU_Sales': 305,
 'JP_Sales': 244,
 'Other_Sales': 157,
 'Global_Sales': 623}

In [34]:
# Same statistic as the per-column loop, but computed in a single aggregation:
# one count_distinct expression per column, each aliased with a recognisable suffix.
count_exprs = [
    F.count_distinct(column_name).alias(column_name + '_distinct_count')
    for column_name in df.columns
]

# The aggregate yields exactly one row; turn it into {alias: count}.
distinct_counts = df.select(*count_exprs).first().asDict()

for aliased_name, n_unique in distinct_counts.items():
    # Strip the alias suffix to recover the original column name for the report.
    original_col = aliased_name.replace('_distinct_count', '')
    print(f"Column '{original_col}' has {n_unique} unique values")

Column 'Rank' has 16598 unique values
Column 'Name' has 11493 unique values
Column 'Platform' has 31 unique values
Column 'Year' has 40 unique values
Column 'Genre' has 12 unique values
Column 'Publisher' has 579 unique values
Column 'NA_Sales' has 409 unique values
Column 'EU_Sales' has 305 unique values
Column 'JP_Sales' has 244 unique values
Column 'Other_Sales' has 157 unique values
Column 'Global_Sales' has 623 unique values


In [44]:
# Lookup table from the abbreviations used in the `Platform` column to a
# human-readable platform name. It has 31 entries, the same number as the
# distinct Platform values reported earlier in the notebook.
# Keys are strings, including the all-digit "2600".
platform_full_names = {
    "3DO": "3DO Interactive Multiplayer",
    "PC": "Personal Computer",
    "PS3": "PlayStation 3",
    "NES": "Nintendo Entertainment System",
    "PS": "PlayStation",
    "DC": "Dreamcast",
    "GEN": "Sega Genesis",
    "PS2": "PlayStation 2",
    "3DS": "Nintendo 3DS",
    "PCFX": "PC-FX",
    "GG": "Game Gear",
    "WiiU": "Wii U",
    "SNES": "Super Nintendo Entertainment System",
    "GB": "Game Boy",
    "SCD": "Sega CD",
    "N64": "Nintendo 64",
    "PS4": "PlayStation 4",
    "PSP": "PlayStation Portable",
    "2600": "Atari 2600",
    "XOne": "Xbox One",
    "X360": "Xbox 360",
    "GBA": "Game Boy Advance",
    "WS": "WonderSwan",
    "Wii": "Wii",
    "GC": "GameCube",
    "PSV": "PlayStation Vita",
    "XB": "Xbox",
    "DS": "Nintendo DS",
    "TG16": "TurboGrafx-16",
    "NG": "Neo Geo",
    "SAT": "Sega Saturn"
}

In [55]:
def get_full_name(abbreviation):
    """Translate a platform abbreviation into its full name.

    Args:
        abbreviation: Platform code to look up in ``platform_full_names``, or ``None``.

    Returns:
        The full platform name, or ``None`` when ``abbreviation`` is ``None``
        or has no entry in ``platform_full_names``.
    """
    if abbreviation is not None:
        # dict.get already falls back to None for unknown codes.
        return platform_full_names.get(abbreviation)
    return None

# UDF wrapper around get_full_name. It is not needed for the column added below,
# but stays defined because other cells may reference it.
get_full_name_udf = F.udf(get_full_name, StringType())

# Flatten the dict into (key1, value1, key2, value2, ...) literals and build a
# map column, so the lookup runs as a native Spark expression instead of
# shipping every row to a Python worker through a UDF.
platform_name_lookup = F.create_map(*[
    F.lit(token)
    for pair in platform_full_names.items()
    for token in pair
])

# Platforms that are NULL or missing from the map yield NULL, matching what
# get_full_name returns for those inputs.
df = df.withColumn(
    "platform_full_name",
    platform_name_lookup[F.col("Platform")]
)

In [57]:
# Number of rows per platform, aggregated in Spark and then collected to the
# driver as a small pandas DataFrame with columns `Platform` and `count`.
platform_count = (
    df.select('Platform')
    .groupBy('Platform')
    .count()
    .toPandas()
)

In [58]:
# Show the per-platform counts held in the pandas DataFrame `platform_count`.
platform_count

Unnamed: 0,Platform,count
0,3DO,3
1,PC,960
2,PS3,1329
3,NES,98
4,PS,1196
5,DC,52
6,GEN,27
7,PS2,2161
8,3DS,509
9,PCFX,1
