In [1]:
import pandas as pd
import numpy as np
import keras
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout
from keras import optimizers
from keras import regularizers
from keras.optimizers import SGD
from keras.constraints import maxnorm
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
import statsmodels.api as sm
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# Read the raw data set; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='lego_Sets.csv')

In [3]:
# Difficulty labels from lowest to highest; a label's position in this list is the
# integer code it is encoded to.
DIFFICULTY_LEVELS = ['Very Easy',
                     'Easy',
                     'Average',
                     'Challenging',
                     'Very Challenging']
difficulty_dtype = pd.api.types.CategoricalDtype(categories=DIFFICULTY_LEVELS,
                                                 ordered=True)

df['list_price'] = df['list_price'].astype(float)

# Guard: once the column holds numeric codes it must not be encoded again, otherwise
# re-running this cell would turn every value into "missing".
if not pd.api.types.is_numeric_dtype(df['review_difficulty']):
    # Fail loudly on labels outside the known scale instead of silently losing them.
    unexpected_levels = set(df['review_difficulty'].dropna().unique()) - set(DIFFICULTY_LEVELS)
    if unexpected_levels:
        raise ValueError('Unexpected review_difficulty labels: %r'
                         % sorted(unexpected_levels))

    # Casting straight to the ordered dtype (rather than astype("category") followed
    # by reorder_categories) does not raise when a level is absent from the data.
    difficulty_codes = df['review_difficulty'].astype(difficulty_dtype).cat.codes

    # .cat.codes encodes missing values as -1. Map them back to NaN so they are
    # handled as missing (e.g. by dropna) and not as a level below 'Very Easy'.
    df['review_difficulty'] = difficulty_codes.astype(float).where(difficulty_codes >= 0)

df['theme_name'] = df['theme_name'].astype("category")
df['ages'] = df['ages'].astype("category")


In [4]:
# Columns removed before modelling.
columns_to_discard = ['prod_desc',
                      'prod_id',
                      'prod_long_desc',
                      'set_name',
                      'theme_name',
                      'country']

# dropna() runs first, so a row is removed when *any* column -- including the ones
# discarded right afterwards -- holds a missing value. dropna() returns a new frame,
# so `df` itself is left untouched.
df1 = df.dropna().drop(columns=columns_to_discard)

# One-hot encode the remaining object/category columns; numeric columns pass through.
df2 = pd.get_dummies(df1)

In [5]:
# Target is the list price; every other column of the encoded frame is a feature.
Y = df2['list_price']
X = df2.drop(columns=['list_price'])

In [6]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=361
)

In [None]:
# --- Linear regression with scikit-learn -------------------------------------------
regr = linear_model.LinearRegression()
regr.fit(X_train, Y_train)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)


# --- The same regression with statsmodels ------------------------------------------
# statsmodels' OLS does not add an intercept by itself, so a constant column is added
# to the design matrices. Two details matter here:
#   * The augmented frames get their own names. Overwriting X_train / X_test would
#     leak the extra 'const' column into every later cell that uses them.
#   * has_constant='add' always inserts the column. With the default ('skip') the
#     constant is omitted whenever a split already contains a constant non-zero
#     column, which can leave train and test with different column sets.
X_train_sm = sm.add_constant(X_train, has_constant='add')
X_test_sm = sm.add_constant(X_test, has_constant='add')

model = sm.OLS(Y_train, X_train_sm).fit()
Y_pred = model.predict(X_test_sm)

print_model = model.summary()
print(print_model)

In [None]:
# Table pairing each observed test target with the corresponding prediction.
df_forecast = pd.DataFrame(dict(Actual=Y_test, Predicted=Y_pred))
df_forecast

In [None]:
# Error metrics of Y_pred against Y_test. The MSE is computed once and reused for
# the RMSE.
ols_mae = metrics.mean_absolute_error(Y_test, Y_pred)
ols_mse = metrics.mean_squared_error(Y_test, Y_pred)

print('Mean Absolute Error:', ols_mae)
print('Mean Squared Error:', ols_mse)
print('Root Mean Squared Error:', np.sqrt(ols_mse))

In [None]:
# Draw histograms for the columns of X on a single 12x10 figure.
X.hist(figsize=(12, 10))
plt.show()



In [None]:
# (column of df1, axis label) for every feature plotted against the list price.
price_scatter_specs = [
    ('review_difficulty', 'Review difficulty'),
    ('num_reviews', 'Number of reviews'),
    ('piece_count', 'Piece count'),
    ('play_star_rating', 'Play star rating'),
]

# One figure per feature: feature on the x axis, list price on the y axis.
for feature, label in price_scatter_specs:
    plt.scatter(df1[feature], df1['list_price'], color='green')
    plt.title('List price vs ' + label, fontsize=14)
    plt.xlabel(label, fontsize=14)
    plt.ylabel('List price', fontsize=14)
    plt.grid(True)
    plt.show()

In [23]:
def create_model(momentum=0.0,
                 activation='softplus',
                 learn_rate=0.1,
                 dropout_rate=0.0,
                 weight_constraint=1,
                 neurons=100,
                 init='lecun_uniform',
                 optimizer='adagrad'):
    """Build and compile a fully connected regression network.

    Architecture, in order: a 128-unit input layer, two hidden layers, a dropout
    layer, a third hidden layer and a single linear output unit. The input width
    is read from the module-level ``X_train`` at call time.

    Parameters
    ----------
    momentum, learn_rate :
        Accepted so that they can appear in a parameter grid, but NOT used
        anywhere in this function: the optimizer is passed to ``compile`` by
        name, so it runs with its own default settings.
    activation :
        Activation of the input layer and of the three hidden layers.
    dropout_rate :
        Rate of the dropout layer placed between the 2nd and 3rd hidden layer.
    weight_constraint :
        Max-norm limit applied to the kernels of the three hidden layers.
    neurons :
        Width of each of the three hidden layers.
    init :
        Kernel initializer used by every Dense layer.
    optimizer :
        Optimizer identifier handed to ``compile``.

    Returns
    -------
    The compiled ``Sequential`` model (MSE loss; MAE and MSE as metrics).

    NOTE(review): the original signature tagged activation, learn_rate, neurons,
    init and optimizer with ``#git`` -- presumably marking values the author had
    already settled on; confirm.
    """
    def hidden_layer():
        # Every hidden layer shares width, initializer, activation and constraint.
        return Dense(neurons,
                     kernel_initializer=init,
                     activation=activation,
                     kernel_constraint=maxnorm(weight_constraint))

    # Layers are created in exactly the order in which they are stacked.
    layer_stack = [
        Dense(128,
              kernel_initializer=init,
              input_dim=X_train.shape[1],
              activation=activation),
        hidden_layer(),
        hidden_layer(),
        Dropout(dropout_rate),
        hidden_layer(),
        Dense(1, kernel_initializer=init, activation='linear'),
    ]

    NN_model = Sequential()
    for layer in layer_stack:
        NN_model.add(layer)

    NN_model.compile(loss='mean_squared_error',
                     optimizer=optimizer,
                     metrics=['mean_absolute_error', 'mean_squared_error'])
    return NN_model


# Wrap the builder in a scikit-learn style estimator; each fit trains for 50 epochs
# with mini-batches of 32 samples and verbose progress output.
model = KerasRegressor(build_fn=create_model, epochs=50, batch_size=32, verbose=1)

In [None]:
def grid_search(model=model, param_names=('dropout_rate', 'weight_constraint'), cv=2):
    """Grid-search hyper-parameters of ``model`` with scikit-learn and report R^2.

    The search is fitted on the module-level ``X_train`` / ``Y_train``.

    Parameters
    ----------
    model :
        Estimator to tune. The default is bound when this function is defined.
    param_names :
        Names of the entries of the candidate table below that are searched.
        The default reproduces the original grid (dropout rate x weight
        constraint); add names here instead of un-commenting code.
    cv :
        Number of cross-validation folds.

    Returns
    -------
    The fitted ``GridSearchCV`` object, so that ``best_estimator_``,
    ``best_params_`` and ``cv_results_`` remain available to the caller.

    Raises
    ------
    ValueError
        If ``param_names`` contains a name missing from the candidate table.
    """
    # Candidate values for every tunable parameter.
    # NOTE: 'momentum' and 'learn_rate' are accepted by create_model() but not used
    # there, so searching over them cannot change the scores.
    search_space = {
        'epochs': [50, 100],          # add 50, 100, 150 etc
        'batch_size': [10, 32, 64],   # add 5, 10, 20, 40, 60, 80, 100 etc
        'activation': ['softmax', 'softplus', 'softsign', 'relu', 'tanh',
                       'sigmoid', 'hard_sigmoid', 'linear'],
        'momentum': [0.0, 0.2, 0.4, 0.6, 0.8, 0.9],
        'learn_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
        'dropout_rate': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        'weight_constraint': [1, 2, 3, 4, 5],
        'neurons': [30, 100, 128, 256],
        'init': ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal',
                 'glorot_uniform', 'he_normal', 'he_uniform'],
        'optimizer': ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax',
                      'Nadam'],
    }

    unknown_names = [name for name in param_names if name not in search_space]
    if unknown_names:
        raise ValueError('Unknown grid parameter(s): %r' % unknown_names)
    param_grid = {name: search_space[name] for name in param_names}

    grid = GridSearchCV(estimator=model,
                        param_grid=param_grid,
                        scoring=['r2'],
                        refit='r2',
                        n_jobs=1,
                        cv=cv)
    grid_result = grid.fit(X_train, Y_train)

    # Summarize results: best combination first, then every combination tried.
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    cv_results = grid_result.cv_results_
    for mean, stdev, param in zip(cv_results['mean_test_r2'],
                                  cv_results['std_test_r2'],
                                  cv_results['params']):
        print("%f (%f) with: %r" % (mean, stdev, param))

    return grid_result


# Keep the fitted search so that its results are not lost after printing.
grid_result = grid_search(model)

In [None]:
# `NN_model` is only a local variable inside create_model(), so on a fresh kernel it
# does not exist at this point. Build the network first, then load the checkpoint.
# NOTE(review): the checkpoint must come from a network with the same layer shapes as
# create_model() called with its defaults -- confirm against the run that saved it.
NN_model = create_model()

wights_file = 'Weights-957--230.78113.hdf5' # choose the best checkpoint
NN_model.load_weights(wights_file) # load it
NN_model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

In [None]:
# predict() output is indexed as a 2-D array; keep column 0 as a flat list with one
# prediction per test row.
neural_pred = list(NN_model.predict(X_test)[:, 0])

In [None]:
# Actual vs. predicted table for the neural network, built as a DataFrame like
# `df_forecast` (the original produced a plain dict wrapped in stray parentheses).
# Key-based access such as neural_forecast['Actual'] keeps working.
neural_forecast = pd.DataFrame({'Actual': Y_test, 'Predicted': neural_pred})

In [None]:
# Error metrics of neural_pred against Y_test. The MSE is computed once and reused
# for the RMSE.
neural_mae = metrics.mean_absolute_error(Y_test, neural_pred)
neural_mse = metrics.mean_squared_error(Y_test, neural_pred)

print('Mean Absolute Error:', neural_mae)
print('Mean Squared Error:', neural_mse)
print('Root Mean Squared Error:', np.sqrt(neural_mse))

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_breast_cancer


# Lasso regression with alpha=0.01.
# max_iter must be an integer: the original 10e5 is the float 1000000.0, which recent
# scikit-learn releases reject during parameter validation. The value is unchanged.
lasso001 = Lasso(alpha=0.01, max_iter=1000000)
lasso001.fit(X_train, Y_train)

# score() returns the coefficient of determination (R^2) on the given data.
train_score001 = lasso001.score(X_train, Y_train)
test_score001 = lasso001.score(X_test, Y_test)
# Number of features whose coefficient was not shrunk to exactly zero.
coeff_used001 = np.sum(lasso001.coef_ != 0)

print ("training score for alpha=0.01:", train_score001 )
print ("test score for alpha =0.01: ", test_score001)
print ("number of features used: for alpha =0.01:", coeff_used001)

lasso_pred = lasso001.predict(X_test)


In [None]:
# Error metrics of lasso_pred against Y_test. The MSE is computed once and reused
# for the RMSE.
lasso_mae = metrics.mean_absolute_error(Y_test, lasso_pred)
lasso_mse = metrics.mean_squared_error(Y_test, lasso_pred)

print('Mean Absolute Error:', lasso_mae)
print('Mean Squared Error:', lasso_mse)
print('Root Mean Squared Error:', np.sqrt(lasso_mse))