In [1]:
import pandas as pd
import numpy as np
import re
%matplotlib inline
import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from numbers import Number
import statsmodels.formula.api as smf

def get_whole_dataset():
    column_names = ['Name', 'Series', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Rating', 'Predecessors_Count', 'Predecessors_Sales_Mean']
    raw_data = pd.read_csv('./dataset/video_games_sales_with_predecessors.csv', usecols = column_names, sep = ';')
    return raw_data

show_data = get_whole_dataset()
#show_data.head()
show_data

Unnamed: 0,Name,Series,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Rating,Predecessors_Count,Predecessors_Sales_Mean
0,Wii Sports,,Wii,2006,Sports,Nintendo,41.36,28.96,3.77,8.45,82.54,76.0,51,8.0,324,E,2,16.580000
1,Super Mario Bros.,,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,0.0,0,0.0,0,,12,10.276364
2,Mario Kart Wii,,Wii,2008,Racing,Nintendo,15.68,12.80,3.79,3.29,35.57,82.0,73,8.3,712,E,0,0.000000
3,Wii Sports Resort,,Wii,2009,Sports,Nintendo,15.61,10.95,3.28,2.95,32.78,80.0,73,8.0,193,E,0,0.000000
4,Pokemon Red/Pokemon Blue,,G,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,0.0,0,0.0,0,,1,2.060000
5,Tetris,,G,1989,Puzzle,Nintendo,23.20,2.26,4.22,0.58,30.26,0.0,0,0.0,0,,27,0.875385
6,New Super Mario Bros.,,DS,2006,Platform,Nintendo,11.28,9.15,6.50,2.88,29.81,89.0,65,8.5,433,E,3,14.573333
7,Wii Play,,Wii,2006,Misc,Nintendo,13.96,9.18,2.93,2.84,28.92,58.0,41,6.6,129,E,1,0.920000
8,New Super Mario Bros. Wii,,Wii,2009,Platform,Nintendo,14.48,6.95,4.70,2.25,28.38,87.0,80,8.4,595,E,0,0.000000
9,Duck Hunt,,NES,1984,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31,0.0,0,0.0,0,,9,0.150000


## Prepare data

Niektoré časti pre spracovanie ako spracovanie predchodcov boli vyhodnotené za pomoci ElasticSearch.

In [2]:
#remove not used columns
def remove_unused_columns(dataset):
    dataset = dataset.drop('Series', axis = 1)
    dataset = dataset.drop('Rating', axis = 1)
    dataset = dataset.drop('Publisher', axis = 1) #aj toto ???
    return dataset

In [10]:
d = get_whole_dataset()
genre_only = d[d['Genre'] == 'Racing']
genre_only['NA_Sales'].mean()

0.28120904836193245

In [47]:
# Years in dataset
# min(prepared_data.Year_of_Release) # 1985
# max(prepared_data.Year_of_Release) # 2016

genres = ['Sports', 'Platform', 'Racing', 'Role-Playing',
          'Puzzle', 'Misc','Shooter', 'Simulation', 'Action',
          'Fighting', 'Adventure','Strategy']

platforms = ['Wii', 'DS', 'X360', 'PS3', 'PS2', 'PS4',
             '3DS','PS', 'X', 'PC', 'PSP', 'WiiU', 'GC',
             'GBA', 'XOne', 'PSV', 'DC']


def get_decade(row):
    if row['Year_of_Release'] <= 1990:    
        return 1980
    elif row['Year_of_Release'] <= 2000:
        return 1990
    elif row['Year_of_Release'] <= 2010:
        return 2000
    else:
        return 2010

def label_genres(row, genre):
    if(row['Genre'] == genre):
        return 1
    else:
        return 0

def label_platforms(row, platform):
    if(row['Platform'] == platform):
        return 1
    else:
        return 0

def set_means(row, means):
    return means[row['Genre']]

def calculate_means(data, group):
    # get mean of sales for each genre in group(NA,EU,...)
    means = {}
    for genre in genres:
        genre_only = data[data['Genre'] == genre]
        means[genre] = genre_only[group].mean()
    return means

def mean_sales_of_genre_for_group(data):
    means = calculate_means(data,'Global_Sales')
    data['Global_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)
    means = calculate_means(data,'NA_Sales')
    data['NA_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)
    means = calculate_means(data,'EU_Sales')
    data['EU_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)
    means = calculate_means(data,'JP_Sales')
    data['JP_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)
    means = calculate_means(data,'Other_Sales')
    data['Other_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)

def get_filtered_data():
    data = get_whole_dataset()
    data = remove_unused_columns(data)

    # remove data with no user count with score
    data = data[data.User_Count != 0]
    # remove data with no critic count with score
    data = data[data.Critic_Count != 0]

    # remove all null columns
    data = data.dropna()
    
    # add Decade column
    data = data.copy()
    data['Decade'] = data.apply(get_decade,axis=1)
    
    # add count of genre for each part
    
    #add 0/1 to
    for genre in genres:
        data[genre] = data.apply(lambda row: label_genres(row,genre), axis=1)
    # remove genre column
    mean_sales_of_genre_for_group(data)
    data = data.drop('Genre', axis = 1)
    
    #add 0/1 to
    for platform in platforms:
        data[platform] = data.apply(lambda row: label_platforms(row,platform), axis=1)
    # remove platform column
    data = data.drop('Platform', axis = 1)
    
    # remove name column
    data = data.drop('Name', axis = 1)
    
    return data

## Linear Regression - Global Sales

In [51]:
data = get_filtered_data()

data = data.drop('NA_Sales', axis = 1)
data = data.drop('EU_Sales', axis = 1)
data = data.drop('JP_Sales', axis = 1)
data = data.drop('Other_Sales', axis = 1)
data = data.drop('NA_Mean_Sale_For_Genre', axis = 1)
data = data.drop('EU_Mean_Sale_For_Genre', axis = 1)
data = data.drop('JP_Mean_Sale_For_Genre', axis = 1)
data = data.drop('Other_Mean_Sale_For_Genre', axis = 1)

# create train set 80% and train set 20%
train_set, test_set = train_test_split(data, test_size = 0.2)

Y_train = train_set['Global_Sales']
X_train = train_set.drop('Global_Sales', axis = 1)

lin_regresion = linear_model.LinearRegression()
model = lin_regresion.fit(X_train, Y_train)



Unnamed: 0,Year_of_Release,Critic_Score,Critic_Count,User_Score,User_Count,Predecessors_Count,Predecessors_Sales_Mean,Decade,Sports,Racing,...,PS,X,PC,PSP,WiiU,GC,GBA,XOne,PSV,DC
7072,2015,66.0,30,7.0,47,46,0.243750,2010,0,1,...,0,0,0,0,0,0,0,0,0,0
1220,2000,80.0,27,8.3,100,81,0.481304,1990,0,0,...,0,0,0,0,0,0,0,0,0,0
3200,2001,78.0,24,8.8,5,0,0.000000,2000,1,0,...,0,0,0,0,0,0,0,0,0,0
9665,2007,60.0,7,8.3,48,2,0.045000,2000,0,0,...,0,0,0,0,0,0,0,0,0,0
12473,2002,67.0,7,8.6,14,49,0.454000,2000,0,0,...,0,0,0,0,0,0,0,0,0,0
13725,2006,72.0,26,7.0,4,51,0.459000,2000,0,1,...,0,0,0,0,0,0,0,0,0,0
2309,2002,80.0,23,7.6,5,55,0.552364,2000,1,0,...,0,0,0,0,0,0,0,0,0,0
15678,2003,74.0,22,8.7,76,2,0.130000,2000,0,0,...,0,0,0,0,0,0,0,0,0,0
2096,2009,87.0,63,8.2,95,20,0.400000,2000,0,1,...,0,0,0,0,0,0,0,0,0,0
5360,2006,61.0,21,9.3,6,11,0.281250,2000,0,0,...,0,0,0,0,0,0,0,0,0,0


{'Action': 0.36021754894851327,
 'Adventure': 0.13705357142857139,
 'Fighting': 0.37029702970297024,
 'Misc': 0.624966887417218,
 'Platform': 0.4707384615384617,
 'Puzzle': 0.28680412371134023,
 'Racing': 0.3648466257668713,
 'Role-Playing': 0.301789297658863,
 'Shooter': 0.49552341597796107,
 'Simulation': 0.30214022140221414,
 'Sports': 0.5055000000000006,
 'Strategy': 0.11680672268907565}