# Video games sales dataset

In [None]:
# Matplotlib forms basis for visualization in Python
import matplotlib.pyplot as plt

# We will use the Seaborn library
import seaborn as sns

sns.set()

# Graphics in SVG format are more sharp and legible
%config InlineBackend.figure_format = 'svg'

# Increase the default plot size and set the color scheme
plt.rcParams["figure.figsize"] = (8, 5)
plt.rcParams["image.cmap"] = "viridis"
import pandas as pd

In [None]:
# Base URL of the folder holding the datasets; file names are appended to it
DATA_URL = (
    "https://raw.githubusercontent.com"
    "/Yorko/mlcourse.ai/main/data/"
)

In [None]:
# Download the CSV and drop every row that has at least one missing value
df = pd.read_csv(f"{DATA_URL}video_games_sales.csv").dropna(axis=0, how="any")
print(df.shape)

In [None]:
# Preview the first ten rows
df.head(n=10)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Column names, non-null counts, dtypes and memory usage of the frame
df.info()

In [None]:
# Normalise the column dtypes in a single pass: user score as float, release
# year and review counts as integers. The int64 casts would fail on NaN, so
# they rely on the dropna() applied when the data was loaded.
df = df.astype(
    {
        "User_Score": "float64",
        "Year_of_Release": "int64",
        "User_Count": "int64",
        "Critic_Count": "int64",
    }
)

In [None]:
# Subset of columns to inspect: what the game is, how it sold, how it was rated
useful_cols = (
    ["Name", "Platform", "Year_of_Release", "Genre", "Global_Sales"]
    + ["Critic_Score", "Critic_Count"]
    + ["User_Score", "User_Count", "Rating"]
)
df.loc[:, useful_cols].head()

In [None]:
# Plotting with df.plot(): the default line chart of the per-year sum of
# every column whose name contains "Sales" (one line per column).
(
    df.groupby("Year_of_Release")[[col for col in df.columns if "Sales" in col]]
    .sum()
    .plot()
)

In [None]:
# Per-year sum of every "*Sales" column drawn as grouped bars,
# with the tick labels rotated by 45 degrees.
(
    df.groupby("Year_of_Release")[[name for name in df.columns if "Sales" in name]]
    .sum()
    .plot.bar(rot=45)
)

In [None]:
# Per-year sum of every "*Sales" column drawn as an area chart
(
    df[[*(c for c in df.columns if "Sales" in c), "Year_of_Release"]]
    .groupby("Year_of_Release")
    .sum()
    .plot.area()
)

pairplot() - pairwise relationship plot; matrix of scatter plots

In [None]:
%config InlineBackend.figure_format = 'png'
sns.pairplot(
    df[["Global_Sales", "Critic_Score", "Critic_Count", "User_Score", "User_Count"]]
)

In [None]:
# Scatter-plot matrix over the whole frame (Seaborn plots the numeric columns)
sns.pairplot(data=df)

distplot() - distribution of observations. Deprecated function, shown here for reference only.

In [None]:
# Legacy API shown for reference; Seaborn emits a deprecation warning for it
sns.distplot(a=df["Critic_Score"])

Since distplot() is deprecated, use histplot() instead; kde=True overlays the kernel density estimate:

In [None]:
# Histogram of critic scores with a kernel density estimate drawn on top
sns.histplot(data=df, x="Critic_Score", kde=True)

jointplot() - relationship between two numerical variables; scatter plot + histogram

In [None]:
# Critic score against user score, with each variable's distribution on the margins
sns.jointplot(data=df, x="Critic_Score", y="User_Score", kind="scatter")

boxplot() - range of observations (outliers, ranges, median, quartiles, etc.)

In [None]:
# The five platforms with the most titles. value_counts() already returns the
# counts in descending order, so no additional sort is required.
top_platforms = df["Platform"].value_counts().head(5).index.values

# Reading the plot: the box spans the IQR (interquartile range), the line
# inside the box is the median, the whiskers reach the most extreme points
# within 1.5 * IQR of the box (Seaborn's default), and the individual points
# beyond the whiskers are outliers.
sns.boxplot(
    data=df[df["Platform"].isin(top_platforms)],
    x="Critic_Score",
    y="Platform",
    orient="h",
)

heatmap() - view the distribution of a numerical variable over two categorical ones

In [None]:
# Total global sales for every (platform, genre) pair. Combinations with no
# titles come out of the pivot as NaN and are replaced by 0 so that every
# heatmap cell has a value to annotate.
# aggfunc is given as the string "sum" and the cast uses astype(): passing the
# builtin `sum` and calling DataFrame.applymap both raise deprecation warnings
# on recent pandas releases.
platform_genre_sales = (
    df.pivot_table(
        index="Platform", columns="Genre", values="Global_Sales", aggfunc="sum"
    )
    .fillna(0)
    .astype(float)
)
sns.heatmap(platform_genre_sales, annot=True, fmt=".1f", linewidths=0.5)