# In [None]:
import matplotlib.pyplot as plt
import numpy as np  # importing numpy
import pandas as pd  # Importing pandas
import seaborn as sns
from matplotlib.ticker import MaxNLocator

# Load the video-games sales data set.
# NOTE(review): this is an absolute Windows path, so the script only runs where that file exists.
df = pd.read_csv(r'C:\UCDPA project datasets\Video Games Dataset.csv')

# Exploration of dataset:
print(df.head())  # first few rows
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() also printed a stray "None".
df.info()
print(df.shape)  # (number of rows, number of columns)
print(df.describe())  # descriptive statistics of the data set

# New column and sort: sales outside North America, ranked from highest to lowest
df["NonUS_Sales"] = df["Global_Sales"].sub(df["NorthAmerica_Sales"])  # global minus North-America sales
df_NonUS_Sales = df.sort_values(
    by=["NonUS_Sales", "Year"],
    ascending=[False, True],  # largest Non-US sales first; ties broken by earliest year
)
df_NonUS_Sales_simple = df_NonUS_Sales.loc[:, ["name", "Year", "NonUS_Sales"]]  # only the 3 columns to display
print(df_NonUS_Sales_simple.head())

# Box plot of Non-US sales per year, with every game overlaid as a swarm plot
box_ax = sns.boxplot(x='Year', y='NonUS_Sales', data=df_NonUS_Sales)
box_ax.set(title='Sales in regions other than US')
sns.swarmplot(
    data=df_NonUS_Sales,
    x="Year",
    y="NonUS_Sales",
    s=1.8,  # small, semi-transparent markers so the boxes stay visible underneath
    alpha=.5,
    edgecolor="black",
    linewidth=0.3,
)
plt.show()

# Grouping and sorting: total global sales per publisher, largest first
sales_by_publisher = df.groupby("Publisher")["Global_Sales"]
total_sales = sales_by_publisher.sum()  # one summed value per publisher
total_sales_Sorted = total_sales.sort_values(ascending=False)  # descending order
print(total_sales_Sorted)

# Bar plot of the accumulated global sales of each publisher
total_sales = (
    df.groupby("Publisher")["Global_Sales"]
    .sum()
    .reset_index()  # move Publisher from the index back into a regular column for plotting
)
publisher_ax = sns.barplot(
    data=total_sales,
    x="Global_Sales",
    y="Publisher",
    palette="mako",  # a more interesting colour palette
    ci=None,  # no error bars
)
publisher_ax.set(title="Accumulated Global Sales by Publisher")
plt.tight_layout()
plt.show()


# Grouping & aggregation
# Per-publisher total, maximum, minimum and mean of Global_Sales.
# The aggregations are named by string instead of being passed as
# np.sum / np.max / np.min / np.mean: pandas substitutes its own group-by
# methods for those NumPy callables, and recent pandas versions emit a
# FutureWarning when they are passed. String names also give the result
# stable column labels ("sum", "max", "min", "mean").
info_publisher_sales = df.groupby("Publisher")["Global_Sales"].agg(
    ["sum", "max", "min", "mean"]
)
print(info_publisher_sales)

# Sub-sets: games released in 2010, split by publisher
released_2010 = df["Year"] == 2010  # row mask shared by both publisher filters
display_cols = ["name", "Year", "Global_Sales"]  # columns kept for display

df_2010_Nintendo = df[released_2010 & (df["Publisher"] == "Nintendo")]  # 2010 games by Nintendo only
df_2010_Nintendo_simple = df_2010_Nintendo[display_cols]

df_2010_Activision = df[released_2010 & (df["Publisher"] == "Activision")]  # 2010 games by Activision only
df_2010_Activision_simple = df_2010_Activision[display_cols]

# Creation of list: each entry pairs a publisher label with its 2010 table
sales_2010 = [
    ["Nintendo", df_2010_Nintendo_simple],
    ["Activision", df_2010_Activision_simple],
]
print(sales_2010)

# Removing duplicates: keep only the first row found for each game name
unique_games = df.drop_duplicates(subset="name", keep="first")
print(unique_games)

# Merging data frames
df2 = pd.read_csv(r'C:\UCDPA project datasets\metacritic_games.csv')  # second csv data set
print(df2.head())

# Join the two tables on the shared 'name' column (pandas' default inner join).
df_df2 = pd.merge(df, df2, on="name")
print(df_df2.columns)

release_year_counts = df_df2['Year'].value_counts()  # rows per release year, most frequent first
print(release_year_counts)

print('df_df2 table shape:', df_df2.shape)  # review the size of the merged table


# Seaborn count plot: number of df2 rows for each platform
platform_ax = sns.countplot(data=df2, y="platform")
platform_ax.set(title="Titles released per Platform")
plt.show()

# Custom scatter plot: metascore against release year, coloured by genre
palette_colors = {  # one fixed colour per genre
    "Action": "green",
    "Adventure": "blue",
    'Shooter': "red",
    'Misc': "grey",
    'Role-Playing': "brown",
    'Sports': "pink",
    'Platform': "purple",
    'Fighting': "black",
    'Racing': "orange",
    'Simulation': "yellow",
    'Puzzle': "white",
}
genre_ax = sns.scatterplot(
    data=df_df2,
    x="Year",
    y="metascore",
    hue="Genre",
    palette=palette_colors,
)
genre_ax.set(title="Metascore evolution by genre")
plt.show()

# matplotlib scatter plot of metascore against user score
x = df2["metascore"]
y = df2['user_score'].astype(float)  # cast so the scores are treated as numbers

# One colour value is needed per plotted point. The count is taken from the
# data because plt.scatter raises ValueError when `c` has a different length
# from x/y; a hard-coded 20422 only worked for one exact row count.
N = len(x)

pairs = sorted(zip(x, y))  # order the points by metascore, then by user score
# zip(*[]) yields nothing to unpack, so fall back to two empty sequences.
new_x, new_y = (zip(*pairs) if pairs else ((), ()))

colors = np.random.rand(N)  # random colour value per point, for a more interesting visual
# Random values in [0, 900); computed but not passed to plt.scatter below.
area = (30 * np.random.rand(N))**2

plt.scatter(new_x, new_y, c=colors, alpha=0.5)
ax = plt.gca()
ax.yaxis.set_major_locator(MaxNLocator(11))  # limit the number of Y-axis ticks so the labels stay readable
plt.xlabel("Metascore")
plt.ylabel("User Score")
plt.title("Metascore/User Score Scatter Plot")
plt.tight_layout()
plt.show()