In [2]:
import pandas as pd
df=pd.read_csv('train.csv')
df=df.drop(columns=['image_link'])
df.sample(6)

Unnamed: 0,sample_id,catalog_content,price
44246,236341,Item Name: Lemon Lime Snow Cone Syrup (25 oz) ...,14.95
10974,14064,Item Name: Great Value Aspartame Sweetener Pac...,5.84
30583,264281,"Item Name: Progresso Chicken Broth, Reduced So...",3.535
39,274881,"Item Name: Starbucks Ground Coffee, 16 caffe v...",16.99
4543,82605,Item Name: Newmans Own Organic Wheat Free Fig ...,7.7
24555,145402,Item Name: Yummallo Marshmallows 185g (Twist M...,5.39


In [3]:
import re
def clean_text(text):
    text=text.lower()
    text=re.sub(r"<.*?>"," ",text)
    text=re.sub(r'http\S+|www\S+|https\S+','',text)
    text=re.sub(r'[^a-zA-Z0-9\s]','',text)
    text=re.sub(r'\s+',' ',text).strip()
    return text

In [4]:
df['cleaned_text']=df['catalog_content'].apply(clean_text)

In [5]:
df['cleaned_text'].str.len()

0          83
1         490
2         313
3        1264
4         139
         ... 
74995     748
74996    1645
74997     132
74998     869
74999     273
Name: cleaned_text, Length: 75000, dtype: int64

In [6]:
df['cleaned_text'].str.split().str.len()

0         18
1         80
2         59
3        205
4         26
        ... 
74995    130
74996    271
74997     21
74998    134
74999     47
Name: cleaned_text, Length: 75000, dtype: int64

In [7]:
df['cleaned_text'].str.len().describe()

count    75000.000000
mean       866.888160
std        816.969425
min         25.000000
25%        235.000000
50%        608.000000
75%       1223.000000
max       7715.000000
Name: cleaned_text, dtype: float64

In [25]:
from sentence_transformers import SentenceTransformer
model=SentenceTransformer('bert-base-nli-mean-tokens')
text=df['cleaned_text'].to_list()
embeddings=model.encode(text,batch_size=32,show_progress_bar=True,convert_to_numpy=True)

Batches: 100%|██████████| 2344/2344 [14:05<00:00,  2.77it/s]  


In [26]:
embeddings.shape

(75000, 768)

In [122]:
# Load the model (uses GPU automatically if available)
from sentence_transformers import SentenceTransformer
model=SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Convert cleaned text into embeddings
texts2=df["cleaned_text"].tolist()

embeddings2=model.encode(
    texts2,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True
)

print("Embedding shape:",embeddings2.shape)

Batches: 100%|██████████| 2344/2344 [34:34<00:00,  1.13it/s] 


Embedding shape: (75000, 768)


In [27]:
df["desc_len"]=df["cleaned_text"].str.len()
df["word_count"]=df["cleaned_text"].str.split().str.len()
df["num_digits"]=df["cleaned_text"].str.count(r"\d")

In [28]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
pca=PCA(n_components=300,random_state=42)
x_pca=pca.fit_transform(embeddings)
kmeans=KMeans(n_clusters=121,random_state=42)
df['automated_category_ID']=kmeans.fit_predict(x_pca)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
df['price_category_ID']=KMeans(n_clusters=5,random_state=42).fit_predict(df[['price']])

In [30]:
df['automated_category_ID'].value_counts()

automated_category_ID
26     1161
92     1143
14     1091
28     1021
91      996
       ... 
6       290
55      287
82      260
83      251
116     205
Name: count, Length: 121, dtype: int64

In [98]:
df.where(df['automated_category_ID']==26).dropna()

Unnamed: 0,sample_id,catalog_content,price,cleaned_text,desc_len,word_count,num_digits,automated_category_ID,price_category_ID
127,144305.0,Item Name: Maraschino Cherry Oolong Tea (50 te...,56.01,item name maraschino cherry oolong tea 50 tea ...,2341.0,390.0,29.0,26.0,1.0
144,142282.0,"Item Name: Raspberry Oolong Tea (50 tea bags, ...",38.54,item name raspberry oolong tea 50 tea bags zin...,2213.0,377.0,29.0,26.0,1.0
180,20013.0,Item Name: Orange Lemongrass Dandelion Root Te...,33.50,item name orange lemongrass dandelion root tea...,2407.0,395.0,28.0,26.0,1.0
318,147682.0,"Item Name: Gooseberry White Tea (50 tea bags, ...",45.27,item name gooseberry white tea 50 tea bags zin...,2352.0,388.0,27.0,26.0,1.0
453,2563.0,Item Name: Lemon Tarragon Chamomile Tea (Loose...,56.37,item name lemon tarragon chamomile tea loose 8...,2327.0,385.0,28.0,26.0,1.0
...,...,...,...,...,...,...,...,...,...
74761,143562.0,"Item Name: Corn Rooibos Tea (Loose) (4 oz, ZIN...",17.97,item name corn rooibos tea loose 4 oz zin 5319...,2226.0,382.0,26.0,26.0,0.0
74787,19474.0,Item Name: Root Beer Dandelion Root Tea (25 te...,32.30,item name root beer dandelion root tea 25 tea ...,2370.0,392.0,28.0,26.0,1.0
74846,5114.0,Item Name: Citrus Punch Green Tea (50 tea bags...,38.92,item name citrus punch green tea 50 tea bags z...,2329.0,396.0,29.0,26.0,1.0
74867,144874.0,Item Name: Peppermint Lemongrass White Tea (Lo...,57.63,item name peppermint lemongrass white tea loos...,2325.0,385.0,27.0,26.0,1.0


In [102]:
df.where((df['automated_category_ID']==26) & (df['price_category_ID']==2)).dropna()

Unnamed: 0,sample_id,catalog_content,price,cleaned_text,desc_len,word_count,num_digits,automated_category_ID,price_category_ID
831,145261.0,Item Name: Mulled Spice Chamomile Tea (50 tea ...,86.43,item name mulled spice chamomile tea 50 tea ba...,2362.0,388.0,29.0,26.0,2.0
972,145542.0,Item Name: Blackberry Rooibos Tea (50 tea bags...,86.02,item name blackberry rooibos tea 50 tea bags z...,2293.0,381.0,29.0,26.0,2.0
1323,1708.0,Item Name: Passionfruit Papaya Rooibos Tea (50...,86.02,item name passionfruit papaya rooibos tea 50 t...,2323.0,382.0,29.0,26.0,2.0
1361,148709.0,Item Name: Blueberry Lemon Rooibos Tea (50 tea...,86.02,item name blueberry lemon rooibos tea 50 tea b...,2325.0,385.0,29.0,26.0,2.0
2052,143908.0,Item Name: French Vanilla Rooibos Tea (Loose) ...,81.28,item name french vanilla rooibos tea loose 8 o...,2295.0,383.0,28.0,26.0,2.0
...,...,...,...,...,...,...,...,...,...
72771,20136.0,Item Name: Ginger Papaya Yerba Mate Tea (50 te...,93.46,item name ginger papaya yerba mate tea 50 tea ...,2389.0,398.0,29.0,26.0,2.0
72951,20959.0,Item Name: Apricot Chili Pepper Yerba Mate Tea...,93.46,item name apricot chili pepper yerba mate tea ...,2441.0,407.0,29.0,26.0,2.0
73017,144956.0,"Item Name: Carrot Chamomile Tea (50 tea bags, ...",86.43,item name carrot chamomile tea 50 tea bags zin...,2306.0,383.0,29.0,26.0,2.0
73745,147837.0,Item Name: Bergamot Lemon Lime Chamomile Tea (...,86.43,item name bergamot lemon lime chamomile tea 50...,2354.0,395.0,29.0,26.0,2.0


In [48]:
df['price_category_ID'].value_counts()

price_category_ID
0    53804
1    17098
2     3749
4      344
3        5
Name: count, dtype: int64

In [56]:
df.where(df['price_category_ID']==2).dropna()

Unnamed: 0,sample_id,catalog_content,price,cleaned_text,desc_len,word_count,num_digits,automated_category_ID,price_category_ID
7,222007.0,Item Name: VineCo Original Series Chilean Sauv...,94.00,item name vineco original series chilean sauvi...,2230.0,354.0,11.0,64.0,2.0
15,266475.0,Item Name: Organic Vinegar; Apple Cider\nValue...,81.44,item name organic vinegar apple cider value 10...,59.0,11.0,4.0,62.0,2.0
18,271423.0,Item Name: BulkSupplements.com Trehalose Powde...,109.97,item name bulksupplementscom trehalose powder ...,2757.0,433.0,18.0,80.0,2.0
20,267410.0,"Item Name: Food to Live Black-Eyed Peas, 25 Po...",98.99,item name food to live blackeyed peas 25 pound...,1604.0,281.0,16.0,99.0,2.0
38,196576.0,Item Name: Buchu Leaf (Organic) Tea (Loose) (4...,124.52,item name buchu leaf organic tea loose 4 oz zi...,577.0,101.0,32.0,27.0,2.0
...,...,...,...,...,...,...,...,...,...
74875,263110.0,"Item Name: Yocream Yogurt Mix, Cookies and Cre...",107.24,item name yocream yogurt mix cookies and cream...,104.0,20.0,5.0,98.0,2.0
74895,204173.0,Item Name: Tropical Mixed Dried Fruit Medley -...,89.99,item name tropical mixed dried fruit medley by...,2925.0,484.0,17.0,80.0,2.0
74926,247833.0,Item Name: Bloomingmore | Fresh Cut Gypsophili...,105.98,item name bloomingmore fresh cut gypsophilia 1...,1576.0,258.0,17.0,113.0,2.0
74959,52787.0,Item Name: Love and Quiches 1/4 Sheet Special ...,116.44,item name love and quiches 14 sheet special oc...,104.0,19.0,8.0,13.0,2.0


In [73]:
df['price'].where(df['price_category_ID']==3).dropna().describe()

count       5.000000
mean     1439.208000
std       771.543098
min       921.500000
25%      1010.540000
50%      1188.000000
75%      1280.000000
max      2796.000000
Name: price, dtype: float64

In [75]:
for i in range(5):
    print(f"Price Category ID: {i}")
    display(df['price'].where(df['price_category_ID']==i).dropna().describe())

Price Category ID: 0


count    53804.000000
mean        10.770182
std          6.575020
min          0.130000
25%          5.155000
50%          9.770000
75%         15.380000
max         25.910000
Name: price, dtype: float64

Price Category ID: 1


count    17098.000000
mean        41.077261
std         12.151528
min         25.920000
25%         30.582500
50%         37.990000
75%         49.708750
max         72.450000
Name: price, dtype: float64

Price Category ID: 2


count    3749.000000
mean      103.907431
std        27.801083
min        72.465000
25%        81.440000
50%        94.460000
75%       121.180000
max       189.900000
Name: price, dtype: float64

Price Category ID: 3


count       5.000000
mean     1439.208000
std       771.543098
min       921.500000
25%      1010.540000
50%      1188.000000
75%      1280.000000
max      2796.000000
Name: price, dtype: float64

Price Category ID: 4


count    344.000000
mean     276.195262
std       96.532720
min      189.980000
25%      212.912500
50%      239.165000
75%      311.675000
max      779.250000
Name: price, dtype: float64

In [103]:
tab_feats=df[['desc_len','word_count','num_digits','automated_category_ID','price_category_ID']].values

In [104]:
import numpy as np
x=np.hstack([embeddings,tab_feats])
y=df["price"].values
#x2=np.hstack([embeddings2,tab_feats])

In [105]:
#x=np.log1p(np.log1p(x))
y=np.log1p(np.log1p(y))
#x2=np.log1p(np.log1p(x2))

In [106]:
pd.DataFrame(x).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,763,764,765,766,767,768,769,770,771,772
count,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,...,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0,75000.0
mean,-0.768743,0.981656,-0.84926,0.075294,0.211644,-0.929654,0.227844,0.258686,0.518962,-0.341166,...,-0.890655,0.073654,-0.117545,0.356634,0.164734,866.88816,145.62516,14.50596,58.888773,0.346493
std,0.324655,0.306574,0.417551,0.189699,0.280356,0.355002,0.487686,0.375055,0.266908,0.304837,...,0.30112,0.207215,0.327481,0.408655,0.238027,816.969425,134.862567,8.531221,35.393983,0.617943
min,-1.808311,-0.331592,-2.439052,-0.921851,-0.890438,-2.107126,-1.403423,-1.589805,-0.896705,-1.428295,...,-2.045497,-0.920758,-1.830568,-1.40012,-0.872885,25.0,6.0,0.0,0.0,0.0
25%,-0.995073,0.783355,-1.12674,-0.041104,0.008777,-1.174639,-0.114017,0.004275,0.365352,-0.551613,...,-1.105427,-0.05681,-0.330589,0.055013,0.009256,235.0,41.0,8.0,26.0,0.0
50%,-0.801302,0.989269,-0.858102,0.081057,0.188742,-0.959499,0.21602,0.234608,0.545926,-0.343593,...,-0.897996,0.084015,-0.116298,0.340806,0.169028,608.0,103.0,13.0,59.0,0.0
75%,-0.577487,1.191023,-0.583814,0.197945,0.392063,-0.723099,0.546405,0.487436,0.703474,-0.145283,...,-0.686805,0.213204,0.101598,0.639913,0.326693,1223.0,204.0,19.0,90.0,1.0
max,0.841826,2.322398,1.278879,1.055276,1.470654,1.05701,2.262991,1.991799,1.442846,1.242883,...,0.538414,0.883568,1.315381,1.961597,1.056484,7715.0,1330.0,186.0,120.0,4.0


In [158]:
from sklearn.model_selection import train_test_split
x_train,x_val,y_train,y_val=train_test_split(x,y,test_size=0.2,random_state=42)

In [145]:
from xgboost import XGBRegressor
xgb=XGBRegressor(n_estimators=2500,
    learning_rate=0.02,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.2,
    reg_lambda=0.4,
    tree_method='hist',
    random_state=42)
xgb.fit(x_train,y_train)

In [159]:
y_xgb=xgb.predict(x_val)
np.expm1(np.expm1(y_xgb))

array([ 6.7407775, 48.82841  ,  7.845478 , ...,  4.5023794,  9.212451 ,
       35.186867 ], dtype=float32)

In [160]:
np.expm1(np.expm1(y_val))

array([12.195, 38.54 , 17.86 , ...,  3.34 ,  6.63 , 26.95 ])

In [161]:
def smape(y_true,y_pred):
    return np.mean(2*np.abs(y_true-y_pred)/(np.abs(y_true)+np.abs(y_pred)))*100

y_xgb=xgb.predict(x_val)
score=smape(np.expm1(np.expm1(y_val)),np.expm1(np.expm1(y_xgb)))
print("SMAPE:", score)

SMAPE: 40.396565049641865


In [153]:
testx=x[0:14380]
testy=y[0:14380]

In [154]:
predy=xgb.predict(testx)
smape(np.expm1(np.expm1(testy)),np.expm1(np.expm1(predy)))

17.48826858458779

In [155]:
np.expm1(np.expm1(testy))

array([ 4.89, 13.12,  1.97, ..., 18.95,  3.49, 19.99])

In [None]:
np.expm1(np.expm1(predy))

array([ 5.043171 , 12.241031 ,  1.95681  , ..., 12.605513 ,  4.0867543,
       18.231108 ], dtype=float32)