In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import missingno as msno
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore') 

# Load and Check Data

In [None]:
# Read the video game sales CSV into the working DataFrame `data`.
DATA_PATH = '../input/videogamesales/vgsales.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Pairwise correlation of the numeric columns only. The frame also holds text
# columns (Name, Platform, Genre, Publisher); on pandas >= 2.0 DataFrame.corr()
# no longer drops them silently and raises instead, so select numerics first.
data.select_dtypes(include=np.number).corr()

In [None]:
# Flag, per column, whether at least one value is missing.
data.isna().any()

In [None]:
# Count the missing values in each column.
data.isna().sum()

# Variable Description

* Rank - Ranking of overall sales
* Name - The game's name
* Platform - Platform of the game's release (e.g. PC, PS4)
* Year - Year of the game's release
* Genre - Genre of the game
* Publisher - Publisher of the game
* NA_Sales - Sales in North America (in millions)
* EU_Sales - Sales in Europe (in millions)
* JP_Sales - Sales in Japan (in millions)
* Other_Sales - Sales in the rest of the world (in millions)
* Global_Sales - Total worldwide sales (in millions)

In [None]:
# List the distinct values of every text (object-dtype) column,
# with a row of asterisks after each column's output.
divider = '*' * 50
for column_name in data.select_dtypes(include=object).columns:
    distinct_values = data[column_name].unique()
    print(f"{column_name}:\n{distinct_values} \n{divider}")

# Missing Value

In [None]:
# Missing values per column, shown before any filling is done.
data.isna().sum()

In [None]:
# Draw missingno's per-column bar chart for the frame, then render it.
msno.bar(data)
plt.show()

In [None]:
# Distinct release years present in the data (before filling gaps).
data['Year'].unique()

In [None]:
# First five rows whose Publisher is missing.
data.loc[data['Publisher'].isna()].head(5)

In [None]:
# The 20 most frequent publishers and how many rows each has.
data['Publisher'].value_counts().iloc[:20]

In [None]:
# Mean release year rounded to a whole number (used below as the fill value).
round(data['Year'].mean(), 0)

In [None]:
# Fill missing publishers with the label 'Unknown' and missing years with
# the rounded mean release year.
mean_year = round(data['Year'].mean(), 0)
data['Publisher'] = data['Publisher'].fillna('Unknown')
data['Year'] = data['Year'].fillna(mean_year)


In [None]:
# Rows that still have a missing Publisher after filling.
data.loc[data['Publisher'].isna()]

In [None]:
# Distinct release years after the missing values were filled.
data['Year'].unique()

# Univariate Description Analysis

* Numeric variable: Rank , Year , NA_Sales , EU_Sales , JP_Sales , Other_Sales , Global_Sales
* Categorical variable: Name , Platform , Genre , Publisher

In [None]:
# Split the frame by dtype: numeric columns vs. text (object) columns.
data_numeric = data.select_dtypes(include='number')
data_categorical = data.select_dtypes(include='object')

# Numeric Description

In [None]:
# Average sales per release year, ordered by the global average (highest first).
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
(
    data_numeric[['Year'] + sales_columns]
    .groupby('Year', as_index=False)
    .mean()
    .sort_values(by='Global_Sales', ascending=False)
)

In [None]:
# One boxplot per numeric column, placed in consecutive cells of a 3x3 grid.
plt.figure(figsize=(15, 15))
col = data_numeric.columns
for position, column_name in enumerate(col, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(x=data[column_name])
    # Show the column name with underscores replaced by spaces.
    plt.xlabel(column_name.replace('_', " "))
    plt.ylabel(' ')

# Categorical Description

In [None]:
# Genre and Platform count: one bar chart per column, each in its own figure.
for column_name in ("Genre", "Platform"):
    counts = data_categorical[column_name].value_counts()
    plt.figure(figsize=(10, 7))
    sns.barplot(x=counts.index, y=counts.values)
    plt.xlabel(f'{column_name}')
    plt.ylabel('Count')
    plt.xticks(rotation=90)

### 20 Publishers with the Most Games 

In [None]:
# Row counts for the 20 most frequent publishers (index = publisher name).
publishers = data_categorical['Publisher'].value_counts().iloc[:20]
publishers

In [None]:
# Bar chart of the counts stored in `publishers`.
plt.figure(figsize=(10, 7))
sns.barplot(x=publishers.index, y=publishers.to_numpy())

plt.title("20 Publishers with the Most Games")
plt.xlabel('Publishers')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

# Visualization

In [None]:
# Word cloud built from the Platform column, saved to graph.png and displayed.
# NOTE(review): the text is taken from data.Platform, not Publisher — confirm
# this is the intended column.
platform_text = ' '.join(data['Platform'])

plt.subplots(figsize=(10, 15))
cloud_builder = WordCloud(background_color="black", width=512, height=384)
wordcloud = cloud_builder.generate(platform_text)
plt.imshow(wordcloud)
plt.axis("off")

plt.savefig('graph.png')
plt.show()

In [None]:
# Correlation heatmap of the four regional sales columns.
regional_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
sales = data[regional_columns]

f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    sales.corr(),
    ax=ax,
    annot=True,
    fmt='.1f',
    linewidths=0.5,
    linecolor="red",
)
plt.show()

# Top 20 Publishers and Their Sales

In [None]:
# Names of the publishers in `publishers`, in the same order.
list_publisher = publishers.index.tolist()

# Regional sales summed per publisher (one row per publisher).
d_publisher = (
    data[['Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']]
    .groupby('Publisher', as_index=False)
    .sum()
)


In [None]:

# Regional sales totals for the publishers in `list_publisher`, drawn as a
# grouped bar chart (one bar per region for each publisher).

# Source column -> label shown in the chart legend.
region_labels = {
    'NA_Sales': 'North America',
    'EU_Sales': 'Europe',
    'JP_Sales': 'Japan',
    'Other_Sales': 'Other',
}

# Fetch all requested publishers in one vectorised lookup; .loc with a list
# keeps the order of list_publisher. Values stay floats: calling int() on a
# one-row Series is deprecated in pandas >= 2.1 and would truncate the totals,
# unlike the Genre and Platform charts, which plot the untruncated sums.
top_sales = d_publisher.set_index('Publisher').loc[list_publisher, list(region_labels)]

# Per-region lists, kept under their existing names.
publisher_NA_Sales = top_sales['NA_Sales'].tolist()
publisher_EU_Sales = top_sales['EU_Sales'].tolist()
publisher_JP_Sales = top_sales['JP_Sales'].tolist()
publisher_Other_Sales = top_sales['Other_Sales'].tolist()

# Wide table: one row per publisher, one column per region.
new_publishers = (
    top_sales
    .rename(columns=region_labels)
    .rename_axis('Publishers')
    .reset_index()
)

# Long table (Publishers, Sales_Area, Sales_Price), the shape seaborn's `hue` expects.
top20_publisher = pd.melt(
    new_publishers,
    id_vars=['Publishers'],
    value_vars=list(region_labels.values()),
    var_name='Sales_Area',
    value_name='Sales_Price',
)

plt.figure(figsize=(12, 10))
sns.barplot(data=top20_publisher, x='Publishers', y='Sales_Price', hue='Sales_Area')
plt.ylabel('Sales Price')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of video games by year: one bar per distinct Year value.
plt.figure(figsize=(10, 8))
sns.countplot(x='Year', data=data)
plt.xticks(rotation=90)
plt.show()


### **Most games were released between 2005 and 2013**

In [None]:
# Total Sales by Genres: regional sales summed per genre, as grouped bars.

# Source column -> label shown in the chart legend.
region_labels = {
    'NA_Sales': 'North America',
    'EU_Sales': 'Europe',
    'JP_Sales': 'Japan',
    'Other_Sales': 'Other',
}

# rename(columns=...) replaces set_axis(..., inplace=False): the `inplace`
# argument was removed in pandas 2.0 and now raises TypeError.
genre_sales = (
    data[['Genre', *region_labels]]
    .groupby('Genre', as_index=False)
    .sum()
    .rename(columns=region_labels)
)

# Long table (Genre, Sales_Area, Sales_Price), the shape seaborn's `hue` expects.
Genre_Sales = pd.melt(
    genre_sales,
    id_vars=['Genre'],
    value_vars=list(region_labels.values()),
    var_name='Sales_Area',
    value_name='Sales_Price',
)

plt.figure(figsize=(12, 10))
sns.barplot(data=Genre_Sales, x='Genre', y='Sales_Price', hue='Sales_Area')
plt.ylabel('Sales Price')
plt.show()

In [None]:
# Total Sales by Platforms: regional sales summed per platform, as grouped bars.

# Source column -> label shown in the chart legend.
region_labels = {
    'NA_Sales': 'North America',
    'EU_Sales': 'Europe',
    'JP_Sales': 'Japan',
    'Other_Sales': 'Other',
}

# rename(columns=...) replaces set_axis(..., inplace=False): the `inplace`
# argument was removed in pandas 2.0 and now raises TypeError.
platform_sales = (
    data[['Platform', *region_labels]]
    .groupby('Platform', as_index=False)
    .sum()
    .rename(columns=region_labels)
)

# Long table (Platform, Sales_Area, Sales_Price), the shape seaborn's `hue` expects.
Platform_Sales = pd.melt(
    platform_sales,
    id_vars=['Platform'],
    value_vars=list(region_labels.values()),
    var_name='Sales_Area',
    value_name='Sales_Price',
)

plt.figure(figsize=(12, 10))
sns.barplot(data=Platform_Sales, x='Platform', y='Sales_Price', hue='Sales_Area')
plt.ylabel('Sales Price')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Total regional sales per release year, drawn as four line charts on a
# hand-built 2x2 plotly grid (one panel per region).
sales_year = (
    data[['Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']]
    .groupby(['Year'], as_index=False)
    .sum()
)

# Each trace is bound to its own x/y axis pair; trace1 uses the default axes.
trace1 = go.Scatter(
    x=sales_year.Year,
    y=sales_year.NA_Sales,
    name="North America"
)
trace2 = go.Scatter(
    x=sales_year.Year,
    y=sales_year.EU_Sales,
    xaxis='x2',
    yaxis='y2',
    name="Europe"
)
trace3 = go.Scatter(
    x=sales_year.Year,
    y=sales_year.JP_Sales,
    xaxis='x3',
    yaxis='y3',
    name="Japan"
)
trace4 = go.Scatter(
    x=sales_year.Year,
    y=sales_year.Other_Sales,
    xaxis='x4',
    yaxis='y4',
    name="Other"
)

# Deliberately named `traces`, not `data`: rebinding `data` here would replace
# the sales DataFrame with a list and break every earlier cell on re-run.
traces = [trace1, trace2, trace3, trace4]

# Axis domains split the figure into halves with a gap between 0.45 and 0.55:
# x/y and x2/y2 use the lower y-range, x3/y3 and x4/y4 the upper one.
layout = go.Layout(
    xaxis=dict(
        domain=[0, 0.45]
    ),
    yaxis=dict(
        domain=[0, 0.45]
    ),
    xaxis2=dict(
        domain=[0.55, 1]
    ),
    xaxis3=dict(
        domain=[0, 0.45],
        anchor='y3'
    ),
    xaxis4=dict(
        domain=[0.55, 1],
        anchor='y4'
    ),
    yaxis2=dict(
        domain=[0, 0.45],
        anchor='x2'
    ),
    yaxis3=dict(
        domain=[0.55, 1]
    ),
    yaxis4=dict(
        domain=[0.55, 1],
        anchor='x4'
    ),
    title='Total Sales by Year'
)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)

### **The highest sales in all regions were between 2005 and 2013**