# Predict album sales with an MLP

In [15]:
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import keras as K
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
import keras.backend.tensorflow_backend as KTF

In [2]:
# Create a TF1-style session with GPU memory growth enabled (allow_growth),
# then register it as the session used by the Keras TensorFlow backend.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
KTF.set_session(sess)

In [145]:
# Load the raw album table (path is relative to the working directory) and
# preview its first rows.
albums_csv = 'albums.csv'
data = pd.read_csv(albums_csv)
data.head()

Unnamed: 0,id,artist_id,album_title,genre,year_of_pub,num_of_tracks,num_of_sales,rolling_stone_critic,mtv_critic,music_maniac_critic
0,1,1767,Call me Cat Moneyless That Doggies,Folk,2006,11,905193,4.0,1.5,3.0
1,2,23548,Down Mare,Metal,2014,7,969122,3.0,4.0,5.0
2,3,17822,Embarrassed Hungry,Latino,2000,11,522095,2.5,1.0,2.0
3,4,19565,Standard Immediate Engineer Slovakia,Pop,2017,4,610116,1.5,2.0,4.0
4,5,24941,Decent Distance Georgian,Black Metal,2010,8,151111,4.5,2.5,1.0


In [146]:
# (rows, columns) of the raw table.
data.shape

(100000, 10)

## One-hot encode the genre column

In [147]:
# Replace the textual `genre` column with one indicator column per genre.
# The indicator columns are appended after the remaining original columns.
genre_one_hot = pd.get_dummies(data['genre'])
data = pd.concat([data.drop(['genre'], axis=1), genre_one_hot], axis=1)

## Delete unnecessary columns

In [148]:
# Remove the columns that this notebook does not feed to the model.
data = data.drop(['album_title', 'id', 'artist_id', 'year_of_pub'], axis=1)

In [149]:
# Preview the frame after encoding the genre and removing unused columns.
data.head(n=5)

Unnamed: 0,num_of_tracks,num_of_sales,rolling_stone_critic,mtv_critic,music_maniac_critic,Alternative,Ambient,Black Metal,Blues,Boy Band,...,Pop-Rock,Progressive,Punk,Rap,Retro,Rock,Techno,Trap,Unplugged,Western
0,11,905193,4.0,1.5,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,969122,3.0,4.0,5.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11,522095,2.5,1.0,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,610116,1.5,2.0,4.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,151111,4.5,2.5,1.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Prepare data

In [150]:
# The target is the sales count; every other column is a model input.
y = data["num_of_sales"]
x = data.drop("num_of_sales", axis=1)

In [151]:
# Hold out 20% of the rows for testing; random_state pins the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [174]:
# Number of training targets, as a 1-tuple.
y_train.shape

(80000,)

## MLP

Create a model with:
* Xavier initialization (Glorot uniform)
* Batch normalization after each hidden layer
* Nadam optimizer
* Mean absolute error (MAE) as the loss

In [176]:
# Regression MLP: three ReLU hidden layers (128 -> 64 -> 32), each followed by
# batch normalization, and one linear output unit for the sales figure.
# The input width is read from the training frame instead of being hard-coded,
# so the model still builds if the set of feature columns changes.
# NOTE(review): StandardScaler is imported but the features and target are fed
# to the network unscaled — confirm whether scaling was intended.
n_features = x_train.shape[1]

model = Sequential()
# The first hidden layer also declares the input shape.
model.add(Dense(128, input_shape=(n_features,), activation="relu",
                kernel_initializer='glorot_uniform', bias_initializer='zeros'))
model.add(BatchNormalization())
for units in (64, 32):
    model.add(Dense(units, activation="relu",
                    kernel_initializer='glorot_uniform', bias_initializer='zeros'))
    model.add(BatchNormalization())
# Linear output: an unbounded real-valued prediction.
model.add(Dense(1, activation="linear"))
model.compile(loss="mae", optimizer="nadam")

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_69 (Dense)             (None, 128)               5504      
_________________________________________________________________
batch_normalization_48 (Batc (None, 128)               512       
_________________________________________________________________
dense_70 (Dense)             (None, 64)                8256      
_________________________________________________________________
batch_normalization_49 (Batc (None, 64)                256       
_________________________________________________________________
dense_71 (Dense)             (None, 32)                2080      
_________________________________________________________________
batch_normalization_50 (Batc (None, 32)                128       
_________________________________________________________________
dense_72 (Dense)             (None, 1)                 33        
Total para

In [177]:
# Train for 100 epochs, with 20% of the training rows used for validation.
model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=100,
)

Train on 64000 samples, validate on 16000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

<keras.callbacks.History at 0x2190e9732b0>

## Save weights on disk

In [205]:
# Write the trained model's weights to an .h5 file in the working directory.
weights_path = 'modelwithoutDropout2layersRegulizer.h5'
model.save_weights(weights_path)

## Testing model

In [178]:
# Evaluate on the held-out split without a progress bar; the result is
# printed in the next cell as the test cost.
scores = model.evaluate(x=x_test, y=y_test, verbose=0)

In [180]:
# "costo de test" is Spanish for "test cost".
print("costo de test: ", scores, sep="")

costo de test: 250876.6443


## Manual Test

In [207]:
# Actual sales of the held-out row at position 12.
print("real value: ", y_test.iloc[12], sep="")

real value: 225722


In [206]:
# Predict sales for the held-out row at position 12, the same row whose real
# value is printed above. reshape(1, -1) makes a one-sample batch without
# hard-coding the number of feature columns.
sample = np.array(x_test.iloc[12]).reshape(1, -1)
print("predicted value: " + str(model.predict(sample)))

predicted value: [[398261.34]]
