In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('lego_Sets.csv')

In [3]:
df.list_price = df.list_price.astype(float)
df['review_difficulty'] = df['review_difficulty'].astype("category")
df['review_difficulty'] = df['review_difficulty'].cat.reorder_categories(['Very Easy',
                                                               'Easy',
                                                               'Average',
                                                               'Challenging',
                                                               'Very Challenging'],
                                                                        ordered = True)
df['review_difficulty'] = df['review_difficulty'].cat.codes
df.theme_name = df.theme_name.astype("category")
df.ages = df.ages.astype("category")


In [4]:
df1 = df.copy().dropna()
df1 = df1.drop(['prod_desc', 
                'prod_id', 
                'prod_long_desc', 
                'set_name',
                'theme_name',
                'country'], 
                 axis=1)
df2 = pd.get_dummies(df1)

In [5]:
X, Y = df2.drop(['list_price'], axis=1), df2['list_price']

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(
                                                    X, 
                                                    Y, 
                                                    test_size=0.3, 
                                                    random_state=361)

In [7]:

X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)# adding a constant
 
model = sm.OLS(Y_train, X_train).fit()
Y_pred = model.predict(X_test) 
 
print_model = model.summary()
print(print_model)

Intercept: 
 30.901344531178218
Coefficients: 
 [ 1.99713057e-01  1.01323505e-01  1.12535631e+01  5.99413873e+00
 -2.64504311e+00 -1.66857111e+01  2.06111404e+01  1.28232295e+01
  4.73645085e+00  3.37507799e-14 -1.32384418e+01 -4.39353767e+00
 -6.13849638e+01  1.16130596e+01 -7.95315638e+01  1.84696563e+01
  1.12063828e+01  4.14989830e+01 -1.95749861e+01  1.23728216e+01
 -4.28254664e+00  4.49863005e-01  1.74470886e+00  2.71896236e+01
 -6.36881915e-01  4.80532815e+00 -1.19826474e+00  6.95260735e+00
  4.83318992e+00  4.14707702e+00  8.75302544e+00  4.68442572e+00
  4.96721051e+00 -1.61465773e+01 -1.11526757e+01 -8.10713244e-01
  1.04923692e+01]
                            OLS Regression Results                            
Dep. Variable:             list_price   R-squared:                       0.861
Model:                            OLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                     1257.
Date:                Tue, 04 Jun 

In [8]:
df_forecast = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
df_forecast

Unnamed: 0,Actual,Predicted
3452,21.3180,36.487817
10419,67.0878,59.078452
10134,6.0878,-8.238637
8084,60.9878,77.950198
3177,77.9922,100.337872
5556,36.0000,44.411773
12246,103.6878,133.638529
920,30.3924,49.036847
12202,170.7878,234.055797
12099,36.5878,50.931033


In [9]:
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))

Mean Absolute Error: 16.700803920243782
Mean Squared Error: 926.5778843273896
Root Mean Squared Error: 30.439741857108277


In [None]:
plt.scatter(df1['review_difficulty'], df1['list_price'], color='green')
plt.title('List price vs Review difficulty', fontsize=14)
plt.xlabel('Review difficulty', fontsize=14)
plt.ylabel('List price', fontsize=14)
plt.grid(True)
plt.show()

plt.scatter(df1['num_reviews'], df1['list_price'], color='green')
plt.title('List price vs Number of reviews', fontsize=14)
plt.xlabel('Number of reviews', fontsize=14)
plt.ylabel('List price', fontsize=14)
plt.grid(True)
plt.show()

plt.scatter(df1['piece_count'], df1['list_price'], color='green')
plt.title('List price vs Piece count', fontsize=14)
plt.xlabel('Piece count', fontsize=14)
plt.ylabel('List price', fontsize=14)
plt.grid(True)
plt.show()

plt.scatter(df1['play_star_rating'], df1['list_price'], color='green')
plt.title('List price vs Play star rating', fontsize=14)
plt.xlabel('Play star rating', fontsize=14)
plt.ylabel('List price', fontsize=14)
plt.grid(True)
plt.show()