# Recommended Rank (Super market)

## Libraries

In [2]:
import warnings
import numpy as np
import re
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr
from xgboost import XGBRanker
from collections import Counter

In [3]:
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)


## important Functions

In [4]:
def spearman_scorer(y_true, y_pred):
    return spearmanr(y_true, y_pred).correlation


In [5]:
def most_common_word(text):
    words = re.findall(r'\b\w+\b', text.lower())
    
    if not words:
        return None
    
    counter = Counter(words)
    most_common = counter.most_common(1)
    return most_common[0][0] if most_common else None

In [6]:
def make_groups(X, group_size=10):
    n_samples = len(X)
    n_full_groups = n_samples // group_size
    remainder = n_samples % group_size

    groups = np.repeat(np.arange(n_full_groups), group_size)

    if remainder > 0:
        groups = np.concatenate([groups, np.full(remainder, n_full_groups)])
    
    return groups

## Load dataset

In [7]:
superMarket_dataSet=pd.read_csv("supermarkets_data.csv")
print(superMarket_dataSet.shape,end='\n\n')
print(superMarket_dataSet.columns)
superMarket_dataSet


(541910, 9)

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country', 'comment'],
      dtype='object')


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,comment
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,1/12/2010 8:26,2.55,17850.0,United Kingdom,good
1,536365,71053,WHITE METAL LANTERN,6,1/12/2010 8:26,3.39,17850.0,United Kingdom,good
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,1/12/2010 8:26,2.75,17850.0,United Kingdom,bad
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,1/12/2010 8:26,3.39,17850.0,United Kingdom,bad
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,1/12/2010 8:26,3.39,17850.0,United Kingdom,good
...,...,...,...,...,...,...,...,...,...
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,9/12/2011 12:50,2.10,12680.0,France,good
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,9/12/2011 12:50,4.15,12680.0,France,good
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,9/12/2011 12:50,4.15,12680.0,France,bad
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,9/12/2011 12:50,4.95,12680.0,France,good


## Pre Processing

### Mising values

In [8]:
missing_columns_list=[]
for col in superMarket_dataSet.columns:
    missing_percentage = (superMarket_dataSet[col].isna().sum() / superMarket_dataSet.shape[0]) * 100
    if missing_percentage > 0:
        print(f"{col} missing percentage:\t{missing_percentage}%")
        missing_columns_list.append(col)
    else:
        superMarket_dataSet.dropna(subset=[col], inplace=True)

Description missing percentage:	0.26831023601705084%
Customer ID missing percentage:	24.92664833643963%


In [9]:
for col in missing_columns_list:

    missing_value_percentage=(superMarket_dataSet.loc[:,col].isna().sum()/superMarket_dataSet.shape[0])*100

    if (missing_value_percentage)<0.5:
        superMarket_dataSet=superMarket_dataSet[superMarket_dataSet.loc[:,col].notna()]

    elif superMarket_dataSet.loc[:,col].dtype == np.float64 or superMarket_dataSet.loc[:,col].dtype == np.int64:
        median_col=superMarket_dataSet.loc[:,col].median()
        superMarket_dataSet.loc[:,col].fillna(median_col,inplace=True)
    else:    
        mode_col = superMarket_dataSet[col].mode()
        if not mode_col.empty:
            superMarket_dataSet[col] = superMarket_dataSet[col].fillna(mode_col[0])

In [10]:
superMarket_dataSet.describe()

Unnamed: 0,Quantity,Price,Customer ID
count,540456.0,540456.0,540456.0
mean,9.603113,4.623544,15254.13669
std,218.007397,96.88954,1487.896635
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,14364.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16261.0
max,80995.0,38970.0,18287.0


### Drop the unImportant coulmns (for cosine similarity)

In [11]:
superMarket_dataSet.drop(["Customer ID","Quantity","Quantity",'Invoice',"InvoiceDate"],axis=1,inplace=True)

In [12]:
superMarket_dataSet

Unnamed: 0,StockCode,Description,Price,Country,comment
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2.55,United Kingdom,good
1,71053,WHITE METAL LANTERN,3.39,United Kingdom,good
2,84406B,CREAM CUPID HEARTS COAT HANGER,2.75,United Kingdom,bad
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,3.39,United Kingdom,bad
4,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,United Kingdom,good
...,...,...,...,...,...
541905,22899,CHILDREN'S APRON DOLLY GIRL,2.10,France,good
541906,23254,CHILDRENS CUTLERY DOLLY GIRL,4.15,France,good
541907,23255,CHILDRENS CUTLERY CIRCUS PARADE,4.15,France,bad
541908,22138,BAKING SET 9 PIECE RETROSPOT,4.95,France,good


### Feature organizing

In [13]:
superMarket_dataSet["Description"] = superMarket_dataSet["Description"].str.replace(" ", "_")
superMarket_dataSet["Country"] = superMarket_dataSet["Country"].str.replace(" ", "_", regex=False)

superMarket_dataSet

Unnamed: 0,StockCode,Description,Price,Country,comment
0,85123A,WHITE_HANGING_HEART_T-LIGHT_HOLDER,2.55,United_Kingdom,good
1,71053,WHITE_METAL_LANTERN,3.39,United_Kingdom,good
2,84406B,CREAM_CUPID_HEARTS_COAT_HANGER,2.75,United_Kingdom,bad
3,84029G,KNITTED_UNION_FLAG_HOT_WATER_BOTTLE,3.39,United_Kingdom,bad
4,84029E,RED_WOOLLY_HOTTIE_WHITE_HEART.,3.39,United_Kingdom,good
...,...,...,...,...,...
541905,22899,CHILDREN'S_APRON_DOLLY_GIRL_,2.10,France,good
541906,23254,CHILDRENS_CUTLERY_DOLLY_GIRL_,4.15,France,good
541907,23255,CHILDRENS_CUTLERY_CIRCUS_PARADE,4.15,France,bad
541908,22138,BAKING_SET_9_PIECE_RETROSPOT_,4.95,France,good


In [14]:
grouped_superM_price = superMarket_dataSet.groupby("Description")["Price"].mean().reset_index()
grouped_superM_stock = superMarket_dataSet.groupby("Description")["StockCode"].apply(lambda x: ", ".join(x)).reset_index()
grouped_superM_cmnt = superMarket_dataSet.groupby("Description")["comment"].apply(lambda x: ", ".join(x)).reset_index()
grouped_superM_data=pd.concat([grouped_superM_price,grouped_superM_stock["StockCode"],grouped_superM_cmnt["comment"]],axis=1)

In [15]:
grouped_superM_data["comment"] = grouped_superM_data["comment"].apply(most_common_word)
grouped_superM_data["StockCode"] = grouped_superM_data["StockCode"].apply(most_common_word)

In [16]:
grouped_superM_data

Unnamed: 0,Description,Price,StockCode,comment
0,*Boombox_Ipod_Classic,16.980000,21120,good
1,*USB_Office_Mirror_Ball,8.470000,20954,bad
2,10_COLOUR_SPACEBOY_PEN,1.050917,22418,good
3,12_COLOURED_PARTY_BALLOONS,0.703000,22436,good
4,12_DAISY_PEGS_IN_WOOD_BOX,1.907143,21448,bad
...,...,...,...,...
4218,wrongly_marked_23343,0.000000,20713,bad
4219,wrongly_marked_carton_22804,0.000000,85123a,bad
4220,wrongly_sold_(22719)_barcode,0.000000,22467,good
4221,wrongly_sold_as_sets,0.000000,85172,good


In [17]:
grouped_superM_data[grouped_superM_data["Price"]==0.000000]=None

In [18]:
grouped_superM_data=grouped_superM_data[grouped_superM_data.loc[:,'Price'].notna()]

In [19]:
grouped_superM_data

Unnamed: 0,Description,Price,StockCode,comment
0,*Boombox_Ipod_Classic,16.980000,21120,good
1,*USB_Office_Mirror_Ball,8.470000,20954,bad
2,10_COLOUR_SPACEBOY_PEN,1.050917,22418,good
3,12_COLOURED_PARTY_BALLOONS,0.703000,22436,good
4,12_DAISY_PEGS_IN_WOOD_BOX,1.907143,21448,bad
...,...,...,...,...
4099,_RED_SPOT_GIFT_BAG_LARGE,1.374762,23438,bad
4100,_SET_2_TEA_TOWELS_I_LOVE_LONDON_,3.542730,22900,bad
4101,_SPACEBOY_BABY_GIFT_SET,15.935479,23007,bad
4102,_TOADSTOOL_BEDSIDE_LIGHT_,8.950000,23079,bad


In [20]:
encoder = OneHotEncoder(sparse_output=False)
stock_code_encoded = encoder.fit_transform(grouped_superM_data[["StockCode"]])

stock_code_df = pd.DataFrame(
    stock_code_encoded,
    columns=[f"stock_code_{col}" for col in encoder.categories_[0]]
)

stock_code_df

Unnamed: 0,stock_code_10002,stock_code_10080,stock_code_10120,stock_code_10123c,stock_code_10124a,stock_code_10124g,stock_code_10125,stock_code_10133,stock_code_11001,stock_code_15030,...,stock_code_dot,stock_code_gift_0001_10,stock_code_gift_0001_20,stock_code_gift_0001_30,stock_code_gift_0001_40,stock_code_gift_0001_50,stock_code_m,stock_code_pads,stock_code_post,stock_code_s
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4040,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Making our final data set

In [21]:
encoder_desc = LabelEncoder()
grouped_superM_data["Description"] = encoder_desc.fit_transform(grouped_superM_data["Description"])

encoder_stock = LabelEncoder()
grouped_superM_data["StockCode"] = encoder_stock.fit_transform(grouped_superM_data["StockCode"])

encoder_comment = LabelEncoder()
grouped_superM_data["comment"] = encoder_comment.fit_transform(grouped_superM_data["comment"])


In [22]:
grouped_superM_data

Unnamed: 0,Description,Price,StockCode,comment
0,0,16.980000,369,1
1,1,8.470000,270,0
2,2,1.050917,1300,1
3,3,0.703000,1318,1
4,4,1.907143,587,0
...,...,...,...,...
4099,4037,1.374762,2265,0
4100,4038,3.542730,1761,0
4101,4039,15.935479,1865,0
4102,4040,8.950000,1932,0


### Determine training dataset and Target

In [23]:
X = grouped_superM_data[["Description", "Price", "StockCode", "comment"]].values
y = grouped_superM_data["comment"].values  

## Model Selection

In [24]:
n_samples = 4042
group_size = 10
n_full_groups = n_samples // group_size   
remainder = n_samples % group_size      

groups = [group_size] * n_full_groups    
if remainder > 0:
    groups.append(remainder)            

print(sum(groups))  

4042


In [25]:
xg_boost_ranker_model = XGBRanker(
    objective="rank:pairwise",
    tree_method="hist",
    random_state=42,
    n_jobs=-1
)

In [26]:
xg_boost_ranker_model.fit(X,y,group=groups)

0,1,2
,objective,'rank:pairwise'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


## Recommendation

In [27]:
decs=[
    1,
    7,
    9
]


user_idx = grouped_superM_data[grouped_superM_data["Description"].isin(decs)].index
if len(user_idx) < 1:
    print("Your customer IDs are not in our dataset !!!")

In [28]:
customer_vec = np.mean(grouped_superM_data.iloc[user_idx].values, axis=0).reshape(1, -1)

In [29]:
similarity_score = cosine_similarity(grouped_superM_data, customer_vec).reshape(-1)


In [30]:
grouped_superM_data["Rank"] = xg_boost_ranker_model.predict(grouped_superM_data)


In [31]:
grouped_superM_data["Final_Score"] = grouped_superM_data["Rank"] + similarity_score * 0.5

In [32]:
user_predict = xg_boost_ranker_model.predict(customer_vec)
user_predict

array([-4.05346], dtype=float32)

In [33]:
n_recommended = 5
recommended_stock = grouped_superM_data[~grouped_superM_data["Description"].isin(decs)].sort_values("Final_Score" ,ascending=False).head(n_recommended)   
recommended_stock[["Description","StockCode","Price" ,"Rank", "Final_Score"]]
recommended_stock

customer_id_list = recommended_stock["Description"].tolist()  
result = grouped_superM_data[grouped_superM_data["Description"].isin(customer_id_list)]
result

result["Description"]=encoder_desc.inverse_transform(result["Description"])
result["StockCode"]=encoder_stock.inverse_transform(result["StockCode"])
result["comment"]=encoder_comment.inverse_transform(result["comment"])

result

Unnamed: 0,Description,Price,StockCode,comment,Rank,Final_Score
22,16_PIECE_CUTLERY_SET_PANTRY_DESIGN,17.131972,23253,good,4.05346,4.553455
35,36_FOIL_HEART_CAKE_CASES,2.213014,22956,good,4.05346,4.55344
42,3D_DOG_PICTURE_PLAYING_CARDS,3.263333,84558a,good,4.05346,4.553442
44,3D_SHEET_OF_CAT_STICKERS,1.181169,84559b,good,4.05346,4.553439
45,3D_SHEET_OF_DOG_STICKERS,1.245431,84559a,good,4.05346,4.553439
