In [2]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import operator

ITERATION_FAKTOR = 20
GROUP_OF_PREDICTION = 'Other_Sales'

def get_whole_dataset():
    column_names = ['Name', 'Series', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score', 'Critic_Count', 'User_Score', 'User_Count', 'Rating', 'Predecessors_Count', 'Predecessors_Global_Sales_Mean', 'Predecessors_JP_Sales_Mean', 'Predecessors_EU_Sales_Mean', 'Predecessors_NA_Sales_Mean', 'Predecessors_Other_Sales_Mean']
    raw_data = pd.read_csv('./dataset/video_games_sales_with_predecessors.csv', usecols = column_names, sep = ';')
    return raw_data

show_data = get_whole_dataset()
#show_data.head()
show_data

Unnamed: 0,Name,Series,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,...,Critic_Count,User_Score,User_Count,Rating,Predecessors_Count,Predecessors_Global_Sales_Mean,Predecessors_JP_Sales_Mean,Predecessors_EU_Sales_Mean,Predecessors_NA_Sales_Mean,Predecessors_Other_Sales_Mean
0,Wii Sports,,Wii,2006,Sports,Nintendo,41.36,28.96,3.77,8.45,...,51,8.0,324,E,2,16.580000,1.660000,5.545000,7.890000,1.490000
1,Super Mario Bros.,,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,...,0,0.0,0,,12,10.276364,2.267273,2.540000,4.816364,0.653636
2,Mario Kart Wii,,Wii,2008,Racing,Nintendo,15.68,12.80,3.79,3.29,...,73,8.3,712,E,0,0.000000,0.000000,0.000000,0.000000,0.000000
3,Wii Sports Resort,,Wii,2009,Sports,Nintendo,15.61,10.95,3.28,2.95,...,73,8.0,193,E,0,0.000000,0.000000,0.000000,0.000000,0.000000
4,Pokemon Red/Pokemon Blue,,G,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,...,0,0.0,0,,1,2.060000,0.740000,0.520000,0.710000,0.080000
5,Tetris,,G,1989,Puzzle,Nintendo,23.20,2.26,4.22,0.58,...,0,0.0,0,,27,0.875385,0.208462,0.158077,0.473462,0.036154
6,New Super Mario Bros.,,DS,2006,Platform,Nintendo,11.28,9.15,6.50,2.88,...,65,8.5,433,E,3,14.573333,2.823333,3.873333,6.813333,1.066667
7,Wii Play,,Wii,2006,Misc,Nintendo,13.96,9.18,2.93,2.84,...,41,6.6,129,E,1,0.920000,0.180000,0.420000,0.230000,0.090000
8,New Super Mario Bros. Wii,,Wii,2009,Platform,Nintendo,14.48,6.95,4.70,2.25,...,80,8.4,595,E,0,0.000000,0.000000,0.000000,0.000000,0.000000
9,Duck Hunt,,NES,1984,Shooter,Nintendo,26.93,0.63,0.28,0.47,...,0,0.0,0,,9,0.150000,0.000000,0.000000,0.140000,0.010000


## Prepare data

Niektoré časti pre spracovanie ako spracovanie predchodcov boli vyhodnotené za pomoci ElasticSearch.

In [3]:
#remove not used columns
def remove_unused_columns(dataset):
    dataset = dataset.drop('Series', axis = 1)
    dataset = dataset.drop('Rating', axis = 1)
    dataset = dataset.drop('Publisher', axis = 1)
    return dataset

In [4]:
# Years in dataset
# min(prepared_data.Year_of_Release) # 1985
# max(prepared_data.Year_of_Release) # 2016

genres = ['Sports', 'Platform', 'Racing', 'Role-Playing',
          'Puzzle', 'Misc','Shooter', 'Simulation', 'Action',
          'Fighting', 'Adventure','Strategy']

platforms = ['Wii', 'DS', 'X360', 'PS3', 'PS2', 'PS4',
             '3DS','PS', 'X', 'PC', 'PSP', 'WiiU', 'GC',
             'GBA', 'XOne', 'PSV', 'DC']


def get_decade(row):
    if row['Year_of_Release'] <= 1990:    
        return 1980
    elif row['Year_of_Release'] <= 2000:
        return 1990
    elif row['Year_of_Release'] <= 2010:
        return 2000
    else:
        return 2010

def label_genres(row, genre):
    if(row['Genre'] == genre):
        return 1
    else:
        return 0

def label_platforms(row, platform):
    if(row['Platform'] == platform):
        return 1
    else:
        return 0

def set_means(row, means):
    return means[row['Genre']]

def calculate_means(data, group):
    # get mean of sales for each genre in group(NA,EU,...)
    means = {}
    for genre in genres:
        genre_only = data[data['Genre'] == genre]
        means[genre] = genre_only[group].mean()
    return means

def mean_sales_of_genre_for_group(data):
    means = calculate_means(data,'Global_Sales')
    data['Global_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)
    means = calculate_means(data,'NA_Sales')
    data['NA_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)
    means = calculate_means(data,'EU_Sales')
    data['EU_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)
    means = calculate_means(data,'JP_Sales')
    data['JP_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)
    means = calculate_means(data,'Other_Sales')
    data['Other_Mean_Sale_For_Genre'] = data.apply(lambda row: set_means(row,means), axis=1)

def get_filtered_data():
    data = get_whole_dataset()
    data = remove_unused_columns(data)

    # remove data with no user count with score
    data = data[data.User_Count != 0]
    # remove data with no critic count with score
    data = data[data.Critic_Count != 0]

    # remove all null columns
    data = data.dropna()
    
    # add Decade column
    data = data.copy()
    data['Decade'] = data.apply(get_decade,axis=1)
    
    # add count of genre for each part
    
    #add 0/1 to
    for genre in genres:
        data[genre] = data.apply(lambda row: label_genres(row,genre), axis=1)
    # remove genre column
    mean_sales_of_genre_for_group(data)
    data = data.drop('Genre', axis = 1)
    
    #add 0/1 to
    for platform in platforms:
        data[platform] = data.apply(lambda row: label_platforms(row,platform), axis=1)
    # remove platform column
    data = data.drop('Platform', axis = 1)
    
    # remove name column
    data = data.drop('Name', axis = 1)
    
    return data

## Linear Regression

In [5]:
def data_for_global():
    data = get_filtered_data()
    data = data.drop('NA_Sales', axis = 1)
    data = data.drop('EU_Sales', axis = 1)
    data = data.drop('JP_Sales', axis = 1)
#     data = data.drop('Other_Sales', axis = 1)
    data = data.drop('Global_Sales', axis = 1)
    data = data.drop('NA_Mean_Sale_For_Genre', axis = 1)
    data = data.drop('EU_Mean_Sale_For_Genre', axis = 1)
    data = data.drop('JP_Mean_Sale_For_Genre', axis = 1)
#     data = data.drop('Other_Mean_Sale_For_Genre', axis = 1)
    data = data.drop('Global_Mean_Sale_For_Genre', axis = 1)
    data = data.drop('Predecessors_JP_Sales_Mean', axis = 1)
    data = data.drop('Predecessors_EU_Sales_Mean', axis = 1)
    data = data.drop('Predecessors_NA_Sales_Mean', axis = 1)
#     data = data.drop('Predecessors_Other_Sales_Mean', axis = 1)
    data = data.drop('Predecessors_Global_Sales_Mean', axis = 1)
    return data

def print_model(coef, labels, rsq_error, mean_error, alpha = None):
    # View the R-Squared score
    alpha_str = '' if alpha is None else 'Alpha = ' + str(alpha) + ' '
    print alpha_str + 'R-Squared score: ' + str(rsq_error) + '\n'

    df = pd.DataFrame(coef, index = labels, columns = ['Coefficient'])
    df = df[df.Coefficient != 0]
    df = df.sort_values(by = 'Coefficient', ascending = False)
    print df

    alpha_str = '\n' if alpha is None else '\nAlpha = ' + str(alpha) + ' '
    print alpha_str + 'Mean squared error: ' + str(mean_error),
    print '\n__________________________________________________\n'

# View the R-Squared score
def get_rsq_error(model,X_test,Y_test):
    return model.score(X_test, Y_test)

# View the mean square error score
def get_mean_sq_error(model,X_test,Y_test):
    Y_pred = model.predict(X_test)
    return mean_squared_error(y_true = Y_test, y_pred = Y_pred)

In [6]:
data = data_for_global()

count_of_coef = (len(data.columns)-1)  # -1, one column(y) will be deleted 
coef_array = [0] * count_of_coef
rsq_error = 0
mean_error = 0

for _ in range(ITERATION_FAKTOR):
    # create train set 80% and train set 20%
    train_set, test_set = train_test_split(data, test_size = 0.2)

    # training set
    Y_train = train_set[GROUP_OF_PREDICTION]
    X_train = train_set.drop(GROUP_OF_PREDICTION, axis = 1)

    # test set
    Y_test = test_set[GROUP_OF_PREDICTION]
    X_test = test_set.drop(GROUP_OF_PREDICTION, axis = 1)

    lin_regresion = linear_model.LinearRegression()
    model = lin_regresion.fit(X_train, Y_train)
    
    coef_array = map(operator.add,coef_array,model.coef_)
    rsq_error += get_rsq_error(model,X_test,Y_test)
    mean_error += get_mean_sq_error(model,X_test,Y_test)

coef_array = map(operator.truediv,coef_array,[ITERATION_FAKTOR]*count_of_coef)
rsq_error = rsq_error / ITERATION_FAKTOR
mean_error = mean_error / ITERATION_FAKTOR

print_model(coef_array, list(X_train), rsq_error, mean_error)

# # Run the model on X_test and show the first five results
# print list(model.predict(X_test)[0:5])

# # View the first five test Y values
# print list(Y_test)[0:5]

R-Squared score: 0.174141149942

                               Coefficient
Predecessors_Other_Sales_Mean     0.663626
Misc                              0.035592
Sports                            0.019382
Racing                            0.010579
Other_Mean_Sale_For_Genre         0.004764
Critic_Count                      0.002020
Critic_Score                      0.000541
Puzzle                            0.000165
Predecessors_Count                0.000056
User_Count                        0.000054
Year_of_Release                  -0.000034
User_Score                       -0.000122
Decade                           -0.001994
Simulation                       -0.003644
Action                           -0.010526
Fighting                         -0.012820
Adventure                        -0.012875
Shooter                          -0.032407
Role-Playing                     -0.034557
Strategy                         -0.040927

Mean squared error: 0.0648439772758 
__________________________

## Lasso

In [7]:
data = data_for_global()

for alpha in [.0001, .1, 10]:
    
    count_of_coef = (len(data.columns)-1)  # -1, one column(y) will be deleted
    coef_array = [0] * count_of_coef
    rsq_error = 0
    mean_error = 0

    for _ in range(ITERATION_FAKTOR):
        # create train set 80% and train set 20%
        train_set, test_set = train_test_split(data, test_size = 0.2)
        # training set
        Y_train = train_set[GROUP_OF_PREDICTION]
        X_train = train_set.drop(GROUP_OF_PREDICTION, axis = 1)

        # test set
        Y_test = test_set[GROUP_OF_PREDICTION]
        X_test = test_set.drop(GROUP_OF_PREDICTION, axis = 1)

        lasso = linear_model.Lasso(alpha = alpha)
        model = lasso.fit(X_train, Y_train)

        coef_array = map(operator.add,coef_array,model.coef_)
        rsq_error += get_rsq_error(model,X_test,Y_test)
        mean_error += get_mean_sq_error(model,X_test,Y_test)

    coef_array = map(operator.truediv,coef_array,[ITERATION_FAKTOR]*count_of_coef)
    rsq_error = rsq_error / ITERATION_FAKTOR
    mean_error = mean_error / ITERATION_FAKTOR

    print_model(coef_array, list(X_train), rsq_error, mean_error, alpha)

Alpha = 0.0001 R-Squared score: 0.202447429951

                               Coefficient
Predecessors_Other_Sales_Mean     0.640976
Misc                              0.038301
Sports                            0.022145
Racing                            0.016212
Critic_Count                      0.002097
Critic_Score                      0.000583
Puzzle                            0.000538
Predecessors_Count                0.000054
User_Count                        0.000050
Year_of_Release                   0.000017
User_Score                       -0.000103
Simulation                       -0.000674
Decade                           -0.001973
Fighting                         -0.006330
Action                           -0.006565
Adventure                        -0.007420
Shooter                          -0.027902
Role-Playing                     -0.029636
Strategy                         -0.036798

Alpha = 0.0001 Mean squared error: 0.065334362909 
________________________________________

## Ridge

In [8]:
data = data_for_global()

count_of_coef = (len(data.columns)-1)  # -1, one column(y) will be deleted 
coef_array = [0] * count_of_coef
rsq_error = 0
mean_error = 0

for _ in range(ITERATION_FAKTOR):
    # create train set 80% and train set 20%
    train_set, test_set = train_test_split(data, test_size = 0.2)

    # training set
    Y_train = train_set[GROUP_OF_PREDICTION]
    X_train = train_set.drop(GROUP_OF_PREDICTION, axis = 1)

    # test set
    Y_test = test_set[GROUP_OF_PREDICTION]
    X_test = test_set.drop(GROUP_OF_PREDICTION, axis = 1)

    ridge = linear_model.Ridge()
    model = ridge.fit(X_train, Y_train)

    coef_array = map(operator.add,coef_array,model.coef_)
    rsq_error += get_rsq_error(model,X_test,Y_test)
    mean_error += get_mean_sq_error(model,X_test,Y_test)

coef_array = map(operator.truediv,coef_array,[ITERATION_FAKTOR]*count_of_coef)
rsq_error = rsq_error / ITERATION_FAKTOR
mean_error = mean_error / ITERATION_FAKTOR

print_model(coef_array, list(X_train), rsq_error, mean_error)

R-Squared score: 0.199864459038

                               Coefficient
Predecessors_Other_Sales_Mean     0.658791
Misc                              0.035659
Sports                            0.019098
Racing                            0.009289
Other_Mean_Sale_For_Genre         0.004996
Critic_Count                      0.002087
Critic_Score                      0.000532
Predecessors_Count                0.000059
User_Count                        0.000051
User_Score                        0.000007
Year_of_Release                  -0.000022
Decade                           -0.001947
Puzzle                           -0.002862
Simulation                       -0.005020
Action                           -0.010640
Adventure                        -0.013843
Fighting                         -0.013979
Shooter                          -0.033168
Role-Playing                     -0.035833
Strategy                         -0.041891

Mean squared error: 0.060639099676 
___________________________

## Elastic Net

In [9]:
data = data_for_global()

count_of_coef = (len(data.columns)-1)  # -1, one column(y) will be deleted 
coef_array = [0] * count_of_coef
rsq_error = 0
mean_error = 0

for _ in range(ITERATION_FAKTOR):
    # create train set 80% and train set 20%
    train_set, test_set = train_test_split(data, test_size = 0.2)

    # training set
    Y_train = train_set[GROUP_OF_PREDICTION]
    X_train = train_set.drop(GROUP_OF_PREDICTION, axis = 1)

    # test set
    Y_test = test_set[GROUP_OF_PREDICTION]
    X_test = test_set.drop(GROUP_OF_PREDICTION, axis = 1)

    elasticNet = linear_model.ElasticNet()
    model = elasticNet.fit(X_train, Y_train)
    
    coef_array = map(operator.add,coef_array,model.coef_)
    rsq_error += get_rsq_error(model,X_test,Y_test)
    mean_error += get_mean_sq_error(model,X_test,Y_test)

coef_array = map(operator.truediv,coef_array,[ITERATION_FAKTOR]*count_of_coef)
rsq_error = rsq_error / ITERATION_FAKTOR
mean_error = mean_error / ITERATION_FAKTOR

print_model(coef_array, list(X_train), rsq_error, mean_error)

R-Squared score: 0.0849978725871

              Coefficient
Critic_Count     0.000872
User_Count       0.000096

Mean squared error: 0.0790085040146 
__________________________________________________

