### Firstly, we import all the necessary packages

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression

import keras
from keras import optimizers
from keras import regularizers
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout
from keras.optimizers import SGD
from keras.constraints import maxnorm
from keras.models import load_model
from keras.wrappers.scikit_learn import KerasRegressor



Using TensorFlow backend.


### Loading the dataset

In [103]:
# Read the LEGO sets table from the CSV file into the working frame.
df = pd.read_csv(filepath_or_buffer='lego_Sets.csv')

### Removing variables that carry no predictive power (purely informational variables such as IDs or descriptions)

In [104]:
# Description and identifier columns are removed before any modelling.
uninformative_columns = ['prod_desc', 'prod_id', 'prod_long_desc', 'set_name']
df = df.drop(columns=uninformative_columns)

### Next, we are checking for missing values within the dataset

In [109]:
# Count the NaNs per column once and reuse the result for both summary columns.
na_counts = df.isna().sum()
missing_data = pd.DataFrame({
    'Number of occurencies': na_counts,
    'Percentage': 100 * na_counts / len(df),
})
# Display only the columns that actually contain missing values.
missing_data.loc[missing_data['Number of occurencies'] > 0]

Unnamed: 0,Number of occurencies,Percentage
num_reviews,1620,13.212625
play_star_rating,1620,13.212625
review_difficulty,2055,16.76046
star_rating,1620,13.212625
theme_name,3,0.024468
val_star_rating,1620,13.212625
star/play,1775,14.476796
star/val,1795,14.639915


Since the missing values in `review_difficulty` can easily be handled with an indicator column, we flag the rows where the review difficulty is unknown.

In [None]:
# Indicator for rows whose review difficulty is missing: 1 when unknown, 0 otherwise.
# The flag is built from the boolean mask so that every row gets a value;
# assigning 1 through `.loc` to a brand-new column leaves NaN in all other rows,
# and the `dropna()` applied later in the notebook would then discard them.
# (The column name keeps its original spelling.)
df["review_diff_unkown"] = df['review_difficulty'].isnull().astype(int)

In [106]:
# Ratio of the overall star rating to each of the two sub-ratings.
for ratio_name, denominator in (('star/play', 'play_star_rating'),
                                ('star/val', 'val_star_rating')):
    df[ratio_name] = df['star_rating'] / df[denominator]

In [107]:
# Column means of the two rating ratios, in the order (star/play, star/val).
star_p_mean, star_v_mean = (df[ratio].mean() for ratio in ('star/play', 'star/val'))

In [108]:
# Fill a missing play/value rating from the overall star rating when that is known.
# The ratios are defined as star_rating / <sub-rating>, so the estimate has to
# invert the average ratio: <sub-rating> = star_rating / mean ratio.
# Multiplying by the mean ratio moves the estimate in the wrong direction
# whenever the ratio differs from 1.
has_star = df['star_rating'].notna()
df.loc[df['play_star_rating'].isnull() & has_star, "play_star_rating"] = df["star_rating"] / star_p_mean
df.loc[df['val_star_rating'].isnull() & has_star, "val_star_rating"] = df["star_rating"] / star_v_mean

In [99]:
# Row labels of the records that have three or more missing fields.
nulls_per_row = df.isnull().sum(axis=1)
df.index[nulls_per_row >= 3]

Int64Index([   22,    32,    48,    69,    84,    92,    94,   108,   153,
              162,
            ...
            12080, 12082, 12096, 12106, 12108, 12111, 12113, 12136, 12137,
            12196],
           dtype='int64', length=1795)

### Checking the categorical variables - can we use one-hot encoding, or will it produce too many new columns?

In [42]:
# Split the columns by dtype: object columns go to `cat_vars`,
# all remaining columns go to `cont_vars`.
cat_vars = [column for column in df.columns if df[column].dtype == object]
cont_vars = [column for column in df.columns if df[column].dtype != object]

In [43]:
# Number of distinct values per categorical column. NaN counts as a value,
# because `unique()` returns it alongside the real categories.
# The loop runs over `cat_vars`; the previously used name `cat_cols` is not
# defined anywhere in the notebook and raises a NameError on a fresh kernel.
num_of_cats = {category: len(df[category].unique()) for category in cat_vars}

print(num_of_cats)

{'ages': 31, 'review_difficulty': 6, 'theme_name': 41, 'country': 21}


In [52]:
# Print the frequency table of every categorical column.
for column in cat_vars:
    frequencies = dict(df[column].value_counts())
    print(column, "\n", frequencies, "\n")


ages 
 {'6-12': 1476, '7-14': 1421, '8-14': 1180, '4-7': 957, '5-12': 911, '10+': 870, '2-5': 840, '7-12': 723, '9-14': 624, '16+': 420, '8-12': 350, '4-99': 311, '12+': 298, '6-14': 233, '8+': 226, '1½-3': 213, '14+': 212, '10-21': 184, '10-16': 148, '6+': 148, '1½-5': 113, '9-16': 92, '5+': 71, '11-16': 66, '9-12': 46, '12-16': 42, '9+': 21, '4+': 21, '5-8': 21, '10-14': 21, '7+': 2} 

review_difficulty 
 {'Easy': 4236, 'Average': 3765, 'Very Easy': 1139, 'Challenging': 1058, 'Very Challenging': 8} 

theme_name 
 {'Star Wars™': 1377, 'DUPLO®': 1166, 'City': 1092, 'Juniors': 978, 'THE LEGO® NINJAGO® MOVIE™': 796, 'BrickHeadz': 765, 'Friends': 606, 'THE LEGO® BATMAN MOVIE': 533, 'Technic': 505, 'Marvel Super Heroes': 414, 'Creator 3-in-1': 403, 'Classic': 332, 'Creator Expert': 317, 'Minecraft™': 303, 'Disney™': 285, 'DIMENSIONS™': 280, 'MINDSTORMS®': 272, 'NINJAGO®': 263, 'Speed Champions': 254, 'Architecture': 210, 'NEXO KNIGHTS™': 197, 'Elves': 195, 'DC Comics™ Super Heroes': 148, '

### Creating a correlation matrix - maybe we can omit some columns to get rid of NAs

In [110]:
# Pairwise correlations between the numeric columns, with `list_price` left out.
# Non-numeric columns are filtered out explicitly: from pandas 2.0 on, `corr()`
# no longer drops them silently and raises on string columns instead.
predictors = df.drop(columns=['list_price'])
corr = predictors.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,num_reviews,piece_count,play_star_rating,star_rating,val_star_rating,star/play,star/val
num_reviews,1.0,0.546618,-0.0451025,0.00711082,0.0148903,-0.00467004,-0.039724
piece_count,0.546618,1.0,0.014792,0.0739027,0.0636993,-0.00773725,-0.0342985
play_star_rating,-0.0451025,0.014792,1.0,0.613616,0.499188,-0.639263,-0.0508187
star_rating,0.00711082,0.0739027,0.613616,1.0,0.729778,0.060486,-0.0487282
val_star_rating,0.0148903,0.0636993,0.499188,0.729778,1.0,-0.0177912,-0.624812
star/play,-0.00467004,-0.00773725,-0.639263,0.060486,-0.0177912,1.0,0.0516611
star/val,-0.039724,-0.0342985,-0.0508187,-0.0487282,-0.624812,0.0516611,1.0


In [None]:
# Difficulty levels listed from lowest to highest; this order defines the integer codes.
difficulty_order = ['Very Easy',
                    'Easy',
                    'Average',
                    'Challenging',
                    'Very Challenging']

df['list_price'] = df['list_price'].astype(float)

# Turn the difficulty labels into an ordered categorical and keep only its codes.
# NOTE: pandas encodes missing values as -1 in `.cat.codes`.
difficulty = df['review_difficulty'].astype("category")
difficulty = difficulty.cat.reorder_categories(difficulty_order, ordered=True)
df['review_difficulty'] = difficulty.cat.codes

df['theme_name'] = df['theme_name'].astype("category")
df['ages'] = df['ages'].astype("category")


In [None]:
# Columns left out of the modelling frame. The descriptive columns may already
# have been removed from `df` in an earlier cell, so labels that are no longer
# present are ignored instead of raising a KeyError.
columns_to_drop = ['prod_desc',
                   'prod_id',
                   'prod_long_desc',
                   'set_name',
                   'theme_name',
                   'country']
df1 = df.drop(columns=columns_to_drop, errors='ignore')
print(df1.shape)

# One-hot encode the remaining categorical columns, then keep complete rows only.
df2 = pd.get_dummies(df1).dropna()
print(df2.shape)

In [None]:
# Target is the list price; every other column of `df2` is a feature.
Y = df2['list_price']
X = df2.drop(columns='list_price')

# Hold out 30% of the rows for testing; the fixed seed keeps the partition repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=361)