In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/kaggle/input/videogamesales/vgsales.csv')

In [3]:
df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [7]:
df.shape

(16598, 11)

In [8]:
df.columns

Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'],
      dtype='object')

In [12]:
df.isnull().any()

Rank            False
Name            False
Platform        False
Year             True
Genre           False
Publisher        True
NA_Sales        False
EU_Sales        False
JP_Sales        False
Other_Sales     False
Global_Sales    False
dtype: bool

In [13]:
categorical_df = df.select_dtypes('O')
categorical_features = categorical_df.columns
categorical_df.head()

Unnamed: 0,Name,Platform,Genre,Publisher
0,Wii Sports,Wii,Sports,Nintendo
1,Super Mario Bros.,NES,Platform,Nintendo
2,Mario Kart Wii,Wii,Racing,Nintendo
3,Wii Sports Resort,Wii,Sports,Nintendo
4,Pokemon Red/Pokemon Blue,GB,Role-Playing,Nintendo


In [58]:
numerical_df = df.select_dtypes(('int', 'float'))
numerical_features = numerical_df.columns
numerical_df.sample(5)

Unnamed: 0,Rank,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
4079,4081,0.44,0.01,0.0,0.04,0.49
2441,2443,0.47,0.32,0.0,0.06,0.85
14137,14139,0.02,0.01,0.0,0.0,0.03
9106,9108,0.13,0.01,0.0,0.0,0.14
3309,3311,0.34,0.23,0.0,0.04,0.61


In [17]:
df.isnull().sum()

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

In [18]:
df.Year = df.Year.fillna(df.Year.mean())

In [19]:
df.Year = df.Year.astype('int32')
df.Year

0        2006
1        1985
2        2008
3        2009
4        1996
         ... 
16593    2002
16594    2003
16595    2008
16596    2010
16597    2003
Name: Year, Length: 16598, dtype: int32

In [20]:
df.Publisher.value_counts(normalize=True)

Electronic Arts                 0.081681
Activision                      0.058948
Namco Bandai Games              0.056348
Ubisoft                         0.055683
Konami Digital Entertainment    0.050302
                                  ...   
Pow                             0.000060
Tryfirst                        0.000060
Interplay Productions           0.000060
Stainless Games                 0.000060
White Park Bay Software         0.000060
Name: Publisher, Length: 578, dtype: float64

In [22]:
df.Publisher = df.Publisher.fillna(df.Publisher.mode()[0])

In [24]:
from plotly import express as px

In [27]:
top_10_platform = df.Platform.value_counts().sort_values()
top_10_platform

fig = px.line(top_10_platform, title='Top Playing Platforms',
              labels={'value': "Counts",'index': "Name of the Platform"})

fig.show()

In [29]:
from plotly import graph_objects as go
from plotly.offline import iplot

In [33]:
year_wise_sales = df.loc[:, ['Name', 'Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales' ,'Global_Sales']].groupby(by =  'Year').sum()

fig1 = go.Scatter(x = year_wise_sales.index, y = year_wise_sales['NA_Sales'],
                  name = "North America's Sales",
                  line_shape='vh')

fig2 = go.Scatter(x = year_wise_sales.index, y = year_wise_sales['EU_Sales'],
                  name = "Europe's Sales",
                  line_shape='vh')

fig3 = go.Scatter(x = year_wise_sales.index, y = year_wise_sales['JP_Sales'],
                  name = "Japan's Sales",
                  line_shape='vh')

fig4 = go.Scatter(x = year_wise_sales.index, y = year_wise_sales['Other_Sales'],
                  name = "Other Sales",
                  line_shape='vh')

fig5 = go.Scatter(x = year_wise_sales.index, y = year_wise_sales['Global_Sales'],
                  name = "Global Sales",
                  line_shape='vh')

figs = [ fig1, fig2, fig3, fig4, fig5 ]

layout = dict(title = 'Year Wise Total Game Sales of North America, Europe, Japan and Other Country',
              xaxis= dict(title= 'Year' ),
              yaxis= dict(title= 'Total Sales In Millions',))

figure = dict(data = figs, layout = layout)

iplot(figure)

In [31]:
fig = px.scatter(df, x="Year", y="Global_Sales", color="Genre",
                 size='Global_Sales', hover_data=['Name'],
                 title="Year Wise Global Video Game Sales by Genere",
                 labels={'x':'Years', 'y':'Global Sales In Millions'})

fig.show()

In [32]:
top_sales = df.sort_values(by=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales'], ascending=False).head(10)

dicts_name = {
    'NA_Sales' : "North America Sales ( In Millions)",
    'EU_Sales' : "Europe Sales ( In Millions)",
    'JP_Sales' : "Japan Sales ( In Millions)",
    'Other_Sales' : "Other Sales ( In Millions)",}

for (key, title) in dicts_name.items():
    fig = px.sunburst(top_sales, path=['Genre', 'Publisher', 'Platform'], values=key, title= 'Top Selling by '+ title)
    fig.update_layout(
        grid= dict(columns=2, rows=2),
        margin = dict(t=40, l=2, r=2, b=5))
    fig.show()

In [35]:
from sklearn.preprocessing import LabelEncoder

In [36]:
data = df.copy()
le = LabelEncoder()
feature = ["Platform", "Genre"]
for col in feature:
    data[col] = le.fit_transform(df[col])

X = data[['Platform', 'Genre', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].values
y = data['Global_Sales'].values


In [37]:
from sklearn.model_selection import train_test_split

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [41]:
from sklearn.linear_model import LinearRegression

In [42]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

In [43]:
print(lr.score(X_test, y_test))

0.9999934761517112


In [44]:
from sklearn.neighbors import KNeighborsRegressor

In [45]:
regressor_knn = KNeighborsRegressor(n_neighbors = 3)
regressor_knn.fit(X_train,y_train)
pred = regressor_knn.predict(X_test)

In [51]:
from sklearn.metrics import r2_score

In [52]:
r2_score(y_test,pred)

0.8143797704058167

In [53]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)

In [55]:
r2_score(y_test, pred)

0.8432409890602865