In [2]:
import pandas as pd
df = pd.read_excel("/kaggle/input/ddssssd/Online Retail.xlsx")

In [3]:
# List of descriptions to drop
descriptions_to_drop = [
    'Manual', 'Adjust bad debt', 'AMAZON FEE', 'DOTCOM POSTAGE', 
    'Bank Charges', 'POSTAGE', 'SAMPLES', 'Discount', 
    'CRUK Commission', 'CARRIAGE'
]
# Dropping rows where 'Description' is in descriptions_to_drop
df = df[~df['Description'].isin(descriptions_to_drop)]

In [4]:
df = df[['Description', 'Country', 'UnitPrice']]
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [19]:
import numpy as np
# List of descriptions to drop
descriptions_to_drop = [
    'Manual', 'Adjust bad debt', 'AMAZON FEE', 'DOTCOM POSTAGE',
    'Bank Charges', 'POSTAGE', 'SAMPLES', 'Discount',
    'CRUK Commission', 'CARRIAGE'
]

# Dropping rows where 'Description' is in descriptions_to_drop
df = df[~df['Description'].isin(descriptions_to_drop)]

# Keep only necessary columns
df = df[['Description', 'Country', 'UnitPrice']]

# Dropping duplicates
df = df.drop_duplicates()

# Dropping rows with missing values
df = df.dropna()

# Ensure all descriptions are strings
df['Description'] = df['Description'].astype(str)

# Convert descriptions to lowercase
df['Description'] = df['Description'].str.lower()

# Drop rows where 'UnitPrice' is less than or equal to 0
df = df[df['UnitPrice'] > 0]

# Remove outliers in 'UnitPrice' (e.g., prices above 220)
df = df[df['UnitPrice'] < 220]

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import re
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate, Dropout
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.optimizers import Adam

# Assuming 'df' is already loaded with your data

# Data preprocessing steps
# ... [Your existing preprocessing code]

# Clean the descriptions
def clean_text(text):    
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    text = ' '.join(text.split())
    return text

df['Description'] = df['Description'].apply(clean_text)


# Tokenize the descriptions
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['Description'])
X_description_seq = tokenizer.texts_to_sequences(df['Description'])

# Pad the sequences
max_sequence_length = 12
X_description_padded = pad_sequences(X_description_seq, maxlen=max_sequence_length)

# Create an embedding matrix
vocab_size = min(10000, len(tokenizer.word_index) + 1)
embedding_dim = 50  # Using GloVe 50d embeddings you already have
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Encode 'Country'
ohe_country = OneHotEncoder(drop='first', sparse=False)
X_country_encoded = ohe_country.fit_transform(df[['Country']])

# Combine all features
X_combined = np.hstack((X_description_padded, X_country_encoded, additional_features))

# Split data into features (X) and target (y)
y = df['UnitPrice']
X_train_combined, X_test_combined, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)

# Separate the inputs
X_train_text = X_train_combined[:, :max_sequence_length]
X_train_country = X_train_combined[:, max_sequence_length:max_sequence_length + X_country_encoded.shape[1]]
X_train_additional = X_train_combined[:, -additional_features.shape[1]:]

X_test_text = X_test_combined[:, :max_sequence_length]
X_test_country = X_test_combined[:, max_sequence_length:max_sequence_length + X_country_encoded.shape[1]]
X_test_additional = X_test_combined[:, -additional_features.shape[1]:]

# Log transform the target variable
epsilon = 1e-6
y_train_log = np.log(y_train + epsilon)
y_test_log = np.log(y_test + epsilon)

# Build the model with additional features
text_input = Input(shape=(max_sequence_length,), name='text_input')
embedding_layer = Embedding(input_dim=vocab_size,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            trainable=True)(text_input)
lstm_layer = LSTM(128)(embedding_layer)

country_input = Input(shape=(X_country_encoded.shape[1],), name='country_input')
additional_input = Input(shape=(additional_features.shape[1],), name='additional_input')

combined = concatenate([lstm_layer, country_input, additional_input])

dense1 = Dense(256, activation='relu')(combined)
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(dense2)
dense3 = Dense(64, activation='relu')(dropout2)

output = Dense(1, name='output')(dense3)

model = Model(inputs=[text_input, country_input, additional_input], outputs=output)

# Compile the model
learning_rate = 0.0001
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Implement early stopping
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    [X_train_text, X_train_country, X_train_additional],
    y_train_log,
    epochs=25,
    batch_size=32,
    validation_data=([X_test_text, X_test_country, X_test_additional], y_test_log),
    callbacks=[early_stopping]
)

# Predict and evaluate
y_pred_log = model.predict([X_test_text, X_test_country, X_test_additional])
y_pred = np.exp(y_pred_log.flatten()) - epsilon
y_pred = np.maximum(y_pred, 0)

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Save the model and tokenizer
model.save('unitprice_prediction_model_v4.h5')
with open('tokenizer_v4.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
with open('ohe_country_v4.pkl', 'wb') as f:
    pickle.dump(ohe_country, f)
with open('scaler_v4.pkl', 'wb') as f:
    pickle.dump(scaler, f)
params = {
    'max_sequence_length': max_sequence_length,
    'epsilon': epsilon,
    'vocab_size': vocab_size,
    'embedding_dim': embedding_dim
}
with open('model_params_v4.pkl', 'wb') as f:
    pickle.dump(params, f)


Epoch 1/25




[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - loss: 1.0800 - val_loss: 0.6998
Epoch 2/25
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.7008 - val_loss: 0.5180
Epoch 3/25
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.5380 - val_loss: 0.4253
Epoch 4/25
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.4421 - val_loss: 0.3582
Epoch 5/25
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - loss: 0.3769 - val_loss: 0.3370
Epoch 6/25
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.3399 - val_loss: 0.3289
Epoch 7/25
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.3100 - val_loss: 0.2854
Epoch 8/25
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 0.2906 - val_loss: 0.2680
Epoch 9/25
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━