In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

In [3]:
print(train.shape)
print(test.shape)    

(1482535, 8)
(693359, 7)


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  int64  
 1   name               1482535 non-null  object 
 2   item_condition_id  1482535 non-null  int64  
 3   category_name      1476208 non-null  object 
 4   brand_name         849853 non-null   object 
 5   price              1482535 non-null  float64
 6   shipping           1482535 non-null  int64  
 7   item_description   1482531 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693359 entries, 0 to 693358
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   test_id            693359 non-null  int64 
 1   name               693359 non-null  object
 2   item_condition_id  693359 non-null  int64 
 3   category_name      690301 non-null  object
 4   brand_name         397834 non-null  object
 5   shipping           693359 non-null  int64 
 6   item_description   693359 non-null  object
dtypes: int64(3), object(4)
memory usage: 37.0+ MB


In [6]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


# Data processing

In [7]:
# 1. Split category_name and fill in missing values
def cat_split(text):
    if str(text) != "nan" and len(text.split("/"))==3:
        return text.split("/")
    else:return ("Other","Other","Other")
train["cat_1"], train["cat_2"], train["cat_3"] = zip(*train["category_name"].apply(lambda cat: cat_split(cat)))
test["cat_1"], test["cat_2"], test["cat_3"] = zip(*test["category_name"].apply(lambda cat: cat_split(cat)))

In [8]:
# 2. Fill in null values in category_name, brand_name and item_description
train['category_name'].fillna('Missing', inplace = True)
train['brand_name'].fillna('Missing', inplace = True)
train['item_description'].fillna(value="None", inplace=True)
test['category_name'].fillna('Missing', inplace = True)
test['brand_name'].fillna('Missing', inplace = True)
test['item_description'].fillna(value="None", inplace=True)

In [9]:
train.head(5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,cat_1,cat_2,cat_3
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,Missing,10.0,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,Missing,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,Missing,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


In [10]:
# id is not used, category_name is split to predict, but a specific combination may have an impact, so it is retained
train.drop(columns =["train_id"], inplace = True)
test.drop(columns =["test_id"], inplace = True)

In [11]:
# For train, there is a price attribute, which is a label that needs to be taken out separately
y_train = train['price']
train = train.drop(columns=['price'])

In [12]:
# Adjust y to log (y + 1)
y = np.log(y_train.values + 1)

In [13]:
# Non-numerical types need to be changed to numerical types
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
for col in train.select_dtypes(include=['object']).columns.tolist():
    train[col] = le.fit_transform(train[col])

In [14]:
train.head(5)

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,cat_1,cat_2,cat_3
0,640809,3,808,2887,1,806610,5,102,758
1,903932,3,86,3558,0,1090878,1,30,212
2,91532,1,1255,4181,1,115289,9,103,94
3,561143,1,485,2887,1,782305,3,55,401
4,45483,1,1182,2887,0,366652,9,58,532


In [15]:
# test is the same
le=LabelEncoder()
for col in test.select_dtypes(include=['object']).columns.tolist():
    test[col] = le.fit_transform(test[col])

In [16]:
test.head(5)

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,cat_1,cat_2,cat_3
0,110376,1,1119,2334,1,457436,9,58,621
1,21757,1,826,2334,1,31753,6,71,652
2,140281,1,922,814,1,125968,8,7,352
3,186403,2,1170,2334,0,11516,9,96,158
4,277503,3,790,2334,1,431600,6,14,616


In [17]:
# Split the dataset into 8:2
split_index = int(0.8 * len(train))
X_train, X_val = train[:split_index], train[split_index:]
y_train, y_val = y[:split_index], y[split_index:]

In [18]:
from sklearn.metrics import mean_squared_error, r2_score

def rmsle(Y, Y_pred):
    assert Y.shape == Y_pred.shape
    return np.sqrt(np.mean(np.square(Y_pred - Y)))

In [19]:
def run_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    
    ypre = model.predict(X_val)
    
    mse = mean_squared_error(y_val, ypre)
    r_sq = r2_score(y_val, ypre)
    rmlse_1 = rmsle(y_val, ypre)
    
    print("Mean Squared Error Value : "+"{:.2f}".format(mse))
    print("\nR-Squared Value : "+"{:.2f}".format(r_sq))
    print("\nRMLSE : "+"{:.2f}".format(rmlse_1))
    return model, mse, r_sq, rmlse_1

In [20]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
print("Linear Regression")
print("---------------------------------------")
model_2, mse_2, r_sq_2, rmlse_3 = run_model(linear_reg, X_train, y_train, X_val, y_val)

Linear Regression
---------------------------------------
Mean Squared Error Value : 0.52

R-Squared Value : 0.08

RMLSE : 0.72


In [21]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=0.1)
print("Ridge Regression")
print("---------------------------------------")
model_3, mse_3, r_sq_3, rmlse_3 = run_model(ridge_reg, X_train, y_train, X_val, y_val)

Ridge Regression
---------------------------------------
Mean Squared Error Value : 0.52

R-Squared Value : 0.08

RMLSE : 0.72


In [22]:
# import numpy as np
from sklearn.linear_model import RidgeCV
# from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error

# Define a list of candidate alpha values
alphas = np.logspace(-4, 4, 100)

# Selecting the optimal alpha value in cross validation using RidgeCV
ridge_cv = RidgeCV(alphas=alphas, cv=10) # 10 fold cross validation
print("Ridge Regression")
print("---------------------------------------")

# run model
model_4, mse_4, r_sq_4, rmlse_4 = run_model(ridge_cv, X_train, y_train, X_val, y_val)
# print the best alpha
print("Best alpha:", ridge_cv.alpha_)

Ridge Regression
---------------------------------------
Mean Squared Error Value : 0.52

R-Squared Value : 0.08

RMLSE : 0.72
Best alpha: 10.235310218990268


In [23]:
import lightgbm as lgb
# import numpy as np
from sklearn.model_selection import GridSearchCV

# create a LightGBM classification
lgbm_reg = lgb.LGBMRegressor()
print("LGBM Regression")
print("---------------------------------------")


# Define the grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [20, 40, 60],
    'num_leaves': [15, 31, 63],
    'max_depth': [3, 5, 7]
}

# Use GridSearchCV to improve the data
grid = GridSearchCV(lgbm_reg, param_grid, cv=5)


model_4, mse_4, r_sq_4, rmlse_4 = run_model(grid, X_train, y_train, X_val, y_val)

# To print the best combination of data
print("Best parameters found: ", grid.best_params_)


LGBM Regression
---------------------------------------
Mean Squared Error Value : 0.33

R-Squared Value : 0.41

RMLSE : 0.57
Best parameters found:  {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 60, 'num_leaves': 63}


In [24]:
# import lightgbm as lgb
# import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

# created lightGBM model
lgbm_reg = lgb.LGBMRegressor()
print("LGBM Regression")
print("---------------------------------------")

param_dist = {
    'learning_rate': sp_uniform(0.01, 0.2),
    'n_estimators': sp_randint(20, 100),
    'num_leaves': sp_randint(15, 64),
    'max_depth': sp_randint(3, 8),
    'min_child_samples': sp_randint(10, 50),
    'subsample': sp_uniform(0.5, 0.5),
    'colsample_bytree': sp_uniform(0.5, 0.5)
}

# use RandomizedSearchCV to perform parameter optimization
random_search = RandomizedSearchCV(lgbm_reg, param_distributions=param_dist, n_iter=50, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
random_search.fit(X_train, y_train)

# run model
model_4, mse_4, r_sq_4, rmlse_4 = run_model(random_search.best_estimator_, X_train, y_train, X_val, y_val)

print("Best parameters found: ", random_search.best_params_)



LGBM Regression
---------------------------------------
Mean Squared Error Value : 0.33

R-Squared Value : 0.41

RMLSE : 0.57
Best parameters found:  {'colsample_bytree': 0.5104296063761273, 'learning_rate': 0.20955079695161294, 'max_depth': 6, 'min_child_samples': 26, 'n_estimators': 88, 'num_leaves': 44, 'subsample': 0.8588721707751128}


In [25]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor()
print("RandomForest Regression")
print("---------------------------------------")
model_4, mse_4, r_sq_4, rmlse_4 = run_model(rf_reg,X_train, y_train, X_val, y_val)

RandomForest Regression
---------------------------------------
Mean Squared Error Value : 0.29

R-Squared Value : 0.48

RMLSE : 0.54


In [26]:
# from sklearn.ensemble import RandomForestRegressor

# Setting of the data
params = {
    'n_estimators': 200,
    'max_depth': 30,
    'min_samples_split': 4,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'min_impurity_decrease': 0.01
}

rf_reg = RandomForestRegressor(**params)
print("RandomForest Regression")
print("---------------------------------------")
model_4, mse_4, r_sq_4, rmlse_4 = run_model(rf_reg, X_train, y_train, X_val, y_val)


RandomForest Regression
---------------------------------------
Mean Squared Error Value : 0.53

R-Squared Value : 0.06

RMLSE : 0.73


In [27]:
# from sklearn.ensemble import RandomForestRegressor

# Setting of the data
params = {
    'n_estimators': 200,
    # 'max_depth': 25,
    # 'min_samples_split': 4,
    'min_samples_leaf': 5,
    # 'max_features': 0.8,
    # 'min_impurity_decrease': 0.01,
    'n_jobs': -1
}

rf_reg = RandomForestRegressor(**params)
print("RandomForest Regression")
print("---------------------------------------")
model_4, mse_4, r_sq_4, rmlse_4 = run_model(rf_reg, X_train, y_train, X_val, y_val)


RandomForest Regression
---------------------------------------
Mean Squared Error Value : 0.28

R-Squared Value : 0.50

RMLSE : 0.53


In [28]:
# from sklearn.ensemble import RandomForestRegressor

# Setting of the data
params = {
    'n_estimators': 250,
    # 'max_depth': 25,
    #'min_samples_split': 1,
    'min_samples_leaf': 3,
    #'max_features': 'auto',
    # 'min_impurity_decrease': 0.01,
    'n_jobs': -1
}

rf_reg = RandomForestRegressor(**params)
print("RandomForest Regression")
print("---------------------------------------")
model_4, mse_4, r_sq_4, rmlse_4 = run_model(rf_reg, X_train, y_train, X_val, y_val)


RandomForest Regression
---------------------------------------
Mean Squared Error Value : 0.28

R-Squared Value : 0.49

RMLSE : 0.53


In [29]:
# -------------------------- LSTM was added to the data processing to deal with the text features -------------------------------------

In [30]:
train_df = pd.read_table("train.tsv")
print(train_df.shape)
train_df.head(5)

(1482535, 8)


Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [31]:
# 1. Category_ Name split and fill the missing value
def cat_split(text):
    text = str(text)
    if len(text.split("/"))==3:
        return text.split("/")
    else:return ("Other","Other","Other")
train_df["cat_1"], train_df["cat_2"], train_df["cat_3"] = zip(*train_df["category_name"].apply(lambda cat: cat_split(cat)))
train_df.head(5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,cat_1,cat_2,cat_3
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,Women,Jewelry,Necklaces


In [32]:
# 2.Fill the category_ Name, brand_ Name and item_ Null in description
train_df['category_name'].fillna('Missing', inplace = True)
train_df['brand_name'].fillna('Missing', inplace = True)
train_df['item_description'].fillna(value="None", inplace=True)

In [33]:
# ID is useless, category_ Although name's prediction had been decoded, the specific combination might have an impact, so he kept it
train_df.drop(columns =["train_id"], inplace = True)

In [34]:
#As for the train, it had the attribute of price. Label had to take it out separately
y_train = train_df['price']
train_df2 = train_df.drop(columns=['price'])

In [35]:
#Adjust y to log (y+1)
y = np.log(y_train.values + 1)

In [36]:
#You should change the non value type to the value type
from sklearn.preprocessing import LabelEncoder

#Make the same changes here,'name'and'description'
col_temp = ['category_name','brand_name','cat_1','cat_2','cat_3']

le=LabelEncoder()
for col in col_temp:
    train_df2[col] = le.fit_transform(train_df[col])

In [37]:
train_df2.head(5)

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,cat_1,cat_2,cat_3
0,MLB Cincinnati Reds T Shirt Size XL,3,808,2887,1,No description yet,5,102,758
1,Razer BlackWidow Chroma Keyboard,3,86,3558,0,This keyboard is in great condition and works ...,1,30,212
2,AVA-VIV Blouse,1,1255,4181,1,Adorable top with a hint of lace and a key hol...,9,103,94
3,Leather Horse Statues,1,485,2887,1,New with tags. Leather horses. Retail for [rm]...,3,55,401
4,24K GOLD plated rose,1,1182,2887,0,Complete with certificate of authenticity,9,58,532


In [38]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model

In [39]:
#Word division

max_features = 10000
tokenizer = Tokenizer(num_words=max_features, split=" ")
tokenizer.fit_on_texts(train_df2["name"] + " " + train_df2["item_description"])
X_name = tokenizer.texts_to_sequences(train_df2["name"])
X_item_description = tokenizer.texts_to_sequences(train_df2["item_description"])

In [40]:
#Fill the sequence to get the same length

max_len_name = 10
max_len_item_description = 50
X_name = pad_sequences(X_name, maxlen=max_len_name)
X_item_description = pad_sequences(X_item_description, maxlen=max_len_item_description)

In [41]:
X_name_train, X_name_test, X_item_desc_train, X_item_desc_test, y_train, y_test = train_test_split( X_name, X_item_description, y, test_size=0.2, random_state=42)

In [42]:
#To define the LSTM model
embedding_dim = 128
lstm_units = 64

In [43]:
input_name = Input(shape=(max_len_name,))
embedding_name = Embedding(max_features, embedding_dim)(input_name)
lstm_name = LSTM(lstm_units)(embedding_name)

In [44]:
input_item_description = Input(shape=(max_len_item_description,))
embedding_item_description = Embedding(max_features, embedding_dim)(input_item_description)
lstm_item_description = LSTM(lstm_units)(embedding_item_description)

In [45]:
#Connect the two LSTM's output
concatenated = Concatenate()([lstm_name, lstm_item_description])

In [46]:
#A new model was defined, which was used to transform the text into a

 

encoder_model = Model(inputs=[input_name, input_item_description], outputs=concatenated)

In [47]:
from keras.callbacks import EarlyStopping

#Create the EarlyStopping object and start the early stop strategy
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,
    verbose=1,
    restore_best_weights=True,
    min_delta=0.001,
    mode="min"
)


#Training the LSTM model
encoder_model.compile(optimizer="adam", loss="mean_squared_error")
encoder_model.fit(
    [X_name_train, X_item_desc_train],
    y_train,
    validation_data=([X_name_test, X_item_desc_test], y_test),
    epochs=10,
    batch_size=64,
#     callbacks=[early_stopping]  # 在此处添加 early_stopping 回调
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x202c1c20af0>

In [48]:
from keras.models import load_model

#Save the model
encoder_model.save("encoder_model.h5")

# Loading the model
# loaded_model = load_model("encoder_model.h5")

In [49]:
#Using the LSTM model to transform the text into a vector
X_train_encoded = encoder_model.predict([X_name_train, X_item_desc_train])
X_test_encoded = encoder_model.predict([X_name_test, X_item_desc_test])



In [50]:
train_temp = train_df2.drop(columns =["item_description","name"])
train_temp.head(5)

Unnamed: 0,item_condition_id,category_name,brand_name,shipping,cat_1,cat_2,cat_3
0,3,808,2887,1,5,102,758
1,3,86,3558,0,1,30,212
2,1,1255,4181,1,9,103,94
3,1,485,2887,1,3,55,401
4,1,1182,2887,0,9,58,532


In [51]:
x_train,x_test = train_test_split(train_temp, test_size=0.2, random_state=42)

In [52]:
#Splice other preprocessed data with LSTM encoded text data
X_train_combined = np.concatenate((X_train_encoded,x_train.values), axis=1)
X_test_combined = np.concatenate((X_test_encoded,x_test.values), axis=1)

In [53]:
# from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
print("Linear Regression")
print("---------------------------------------")
model_2, mse_2, r_sq_2, rmlse_3 = run_model(linear_reg, X_train_combined, y_train, X_test_combined, y_test)

Linear Regression
---------------------------------------
Mean Squared Error Value : 0.52

R-Squared Value : 0.07

RMLSE : 0.72


In [54]:
# import lightgbm as lgb

lgbm_reg = lgb.LGBMRegressor()
print("LGBM Regression")
print("---------------------------------------")
model_4, mse_4, r_sq_4, rmlse_4 = run_model(lgbm_reg,X_train_combined, y_train, X_test_combined, y_test)

LGBM Regression
---------------------------------------
Mean Squared Error Value : 0.35

R-Squared Value : 0.38

RMLSE : 0.59


In [55]:
# from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor()
print("RandomForest Regression")
print("---------------------------------------")
model_4, mse_4, r_sq_4, rmlse_4 = run_model(rf_reg,X_train_combined, y_train, X_test_combined, y_test)

RandomForest Regression
---------------------------------------
Mean Squared Error Value : 0.31

R-Squared Value : 0.45

RMLSE : 0.56
