In [1]:
import pandas as pd
from sklearn.preprocessing import normalize
import numpy as np
def convert_string_to_array(vector_string):
    """Parse a stringified vector into a normalized single-row 2D array.

    Parameters
    ----------
    vector_string : str
        Text holding whitespace-separated numbers. The first two and the
        last two characters are discarded before parsing (presumably a
        "[[" / "]]" wrapper -- TODO confirm against the CSV contents).

    Returns
    -------
    numpy.ndarray
        Array of shape (1, d), where d is the number of parsed values,
        after being passed through ``normalize``.

    Raises
    ------
    ValueError
        If any whitespace-separated token cannot be parsed by ``float``.
    """
    # Discard the two leading and two trailing characters.
    inner_text = vector_string[2:-2]
    # Whitespace-separated tokens -> floats -> 1D array.
    values = np.array(list(map(float, inner_text.split())))
    # `normalize` works on 2D input, so present the vector as a single row.
    single_row = values.reshape(1, -1)
    return normalize(single_row)
# Load the dataset; later cells read the `vectors`, `PRODUCT_LENGTH` and
# `PRODUCT_TYPE_ID` columns from this frame.
data = pd.read_csv("word2vec.csv")
print("csv read")

# Replace each stored vector string with its parsed, normalized (1, d) array.
parsed_vectors = data.vectors.apply(convert_string_to_array)
data['vectors'] = parsed_vectors
print("vectors converted")

# Stack the per-row arrays into a single array
# (expected shape (n_rows, 1, d), assuming every row has the same d).
nparr = np.array(data['vectors'].tolist())
print("nparr created")

# Drop the singleton axis introduced by the (1, d) row arrays.
nparr = np.squeeze(nparr, axis=1)
print("nparr squeezed")

# Regression target column used by the training cell.
product_length = data.PRODUCT_LENGTH

csv read
vectors converted
nparr created
nparr squeezed


In [16]:
# Min-max scale PRODUCT_TYPE_ID: subtract the minimum, divide by the range.
# The result lies in [0, 1] as long as the column holds at least two
# distinct values (a zero range would divide by zero).
nparrID = np.array(data['PRODUCT_TYPE_ID'].to_list())
x_min, x_max = nparrID.min(), nparrID.max()
id_range = x_max - x_min
nparrID = (nparrID - x_min) / id_range

In [23]:
# Append the scaled PRODUCT_TYPE_ID as one extra feature column after the
# embedding columns.
nparrID_col = nparrID.reshape((-1, 1))
# Width of the embedding part, read from one of the stored per-row arrays
# (each is shaped (1, d) by convert_string_to_array).
embedding_dim = data['vectors'].iloc[0].shape[1]
# Slice to the embedding width before concatenating so this cell is
# idempotent: re-running it no longer stacks an additional ID column onto
# `nparr` each time (which would change the feature count seen by the model).
npconcat = np.concatenate((nparr[:, :embedding_dim], nparrID_col), axis=1)
nparr = npconcat

0.12295081967213115


In [25]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from tensorflow.keras.callbacks import ReduceLROnPlateau
# Fixed seed so the train/validation/test partitions are the same on every
# run. (Weight initialization and dropout are still unseeded.)
SEED = 42

# split the data: hold out 5% as the test set, then 5% of the remainder as
# the validation set
X_train, X_test, y_train, y_test = train_test_split(
    nparr, product_length, test_size=0.05, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.05, random_state=SEED
)

# create a neural network model; the input width is taken from the data
# instead of being hard-coded, so it always matches the feature matrix
n_features = X_train.shape[1]
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(1024, input_shape=(n_features,), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1024, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='linear')
])

# One optimizer object, actually handed to compile(). Previously an unused
# Adam(lr=0.002) was created with the deprecated `lr` keyword while compile()
# built its own Adam(learning_rate=0.01); 0.01 is the rate that was really in
# effect, so it is kept here.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
# Multiply the learning rate by 0.2 when val_loss has not improved for 5 epochs.
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=1)

# compile the model: MAPE is both the training loss and the reported metric
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.MeanAbsolutePercentageError(),
    metrics=[tf.keras.metrics.MeanAbsolutePercentageError()],
)

# train the model
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[lr_scheduler],
)

# predict on test data
y_pred = model.predict(X_test)
# calculate evaluation metric once and reuse it for both the printout and the
# score (score = 100 * (1 - MAPE), floored at 0)
mape = metrics.mean_absolute_percentage_error(y_test, y_pred)
score = max(0, 100 * (1 - mape))
print(mape)
print('Score:', score)
model.save('mymodel')

In [4]:
# Run the trained model on the held-out test features and keep the
# predictions in `y_pred` for inspection.
y_pred = model.predict(X_test)



In [3]:
# Display the first ten model predictions.
y_pred[:10]

array([[ 268.04276],
       [ 277.26526],
       [ 521.2401 ],
       [ 167.57553],
       [ 496.8228 ],
       [ 957.27747],
       [ 418.289  ],
       [ 393.6325 ],
       [2209.6594 ],
       [ 479.679  ]], dtype=float32)

In [4]:
# Display the first ten true PRODUCT_LENGTH values from the test split.
y_test[:10]

1307313     196.850393
1457979     240.000000
314082      500.000000
824495      270.000000
139289      138.000000
2079520    1181.102361
690524      830.000000
1403365    2050.000000
222487     2400.000000
2111576     590.551180
Name: PRODUCT_LENGTH, dtype: float64

In [5]:
# Continue training the existing model for 30 more epochs. Validation uses
# the validation split (X_val, y_val), as in the first fit call, so the test
# split stays untouched until final evaluation.
model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30