In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBRanker
from collections import Counter


In [2]:
def most_common_word(text):
    words = text.split()
    counter = Counter(words)
    most_common = counter.most_common(1)
    return most_common[0][0] if most_common else None

In [3]:
superMarket_dataSet=pd.read_csv("supermarkets_data.csv")
superMarket_dataSet.drop(["Quantity","InvoiceDate"],axis=1,inplace=True)
superMarket_dataSet


Unnamed: 0,Invoice,StockCode,Description,Price,Customer ID,Country,comment
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,2.55,17850.0,United Kingdom,good
1,536365,71053,WHITE METAL LANTERN,3.39,17850.0,United Kingdom,good
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,2.75,17850.0,United Kingdom,bad
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,3.39,17850.0,United Kingdom,bad
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,3.39,17850.0,United Kingdom,good
...,...,...,...,...,...,...,...
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,2.10,12680.0,France,good
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4.15,12680.0,France,good
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4.15,12680.0,France,bad
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,4.95,12680.0,France,good


In [4]:
superMarket_dataSet.count()

Invoice        541910
StockCode      541910
Description    540456
Price          541910
Customer ID    406830
Country        541910
comment        541910
dtype: int64

In [5]:
missing_coulmn_list=[]
for col in superMarket_dataSet.columns:
    missing_percentage = (superMarket_dataSet[col].isna().sum() / superMarket_dataSet.shape[0]) * 100
    if missing_percentage > 0:
        print(f"{col} missing percentage:\t{missing_percentage}%")
        missing_coulmn_list.append(col)
    else:
        superMarket_dataSet.dropna(subset=[col], inplace=True)

Description missing percentage:	0.26831023601705084%
Customer ID missing percentage:	24.92664833643963%


In [6]:
for col in missing_coulmn_list:

    missing_value_percentage=(superMarket_dataSet.loc[:,col].isna().sum()/superMarket_dataSet.shape[0])*100

    if (missing_value_percentage)<0.5:
        superMarket_dataSet=superMarket_dataSet[superMarket_dataSet.loc[:,col].notna()]

    elif superMarket_dataSet.loc[:,col].dtype == np.float64 or superMarket_dataSet.loc[:,col].dtype == np.int64:
        median_col=superMarket_dataSet.loc[:,col].median()
        superMarket_dataSet.loc[:,col].fillna(median_col,inplace=True)
    else:    
        mode_col = superMarket_dataSet[col].mode()
        if not mode_col.empty:
            superMarket_dataSet[col] = superMarket_dataSet[col].fillna(mode_col[0])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  superMarket_dataSet.loc[:,col].fillna(median_col,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  superMarket_dataSet.loc[:,col].fillna(median_col,inplace=True)


In [7]:
superMarket_dataSet["Description"] = superMarket_dataSet["Description"].str.replace(" ", "_")
superMarket_dataSet["Country"] = superMarket_dataSet["Country"].str.replace(" ", "_", regex=False)

superMarket_dataSet

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  superMarket_dataSet["Description"] = superMarket_dataSet["Description"].str.replace(" ", "_")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  superMarket_dataSet["Country"] = superMarket_dataSet["Country"].str.replace(" ", "_", regex=False)


Unnamed: 0,Invoice,StockCode,Description,Price,Customer ID,Country,comment
0,536365,85123A,WHITE_HANGING_HEART_T-LIGHT_HOLDER,2.55,17850.0,United_Kingdom,good
1,536365,71053,WHITE_METAL_LANTERN,3.39,17850.0,United_Kingdom,good
2,536365,84406B,CREAM_CUPID_HEARTS_COAT_HANGER,2.75,17850.0,United_Kingdom,bad
3,536365,84029G,KNITTED_UNION_FLAG_HOT_WATER_BOTTLE,3.39,17850.0,United_Kingdom,bad
4,536365,84029E,RED_WOOLLY_HOTTIE_WHITE_HEART.,3.39,17850.0,United_Kingdom,good
...,...,...,...,...,...,...,...
541905,581587,22899,CHILDREN'S_APRON_DOLLY_GIRL_,2.10,12680.0,France,good
541906,581587,23254,CHILDRENS_CUTLERY_DOLLY_GIRL_,4.15,12680.0,France,good
541907,581587,23255,CHILDRENS_CUTLERY_CIRCUS_PARADE,4.15,12680.0,France,bad
541908,581587,22138,BAKING_SET_9_PIECE_RETROSPOT_,4.95,12680.0,France,good


In [8]:
print(superMarket_dataSet.columns.tolist())


['Invoice', 'StockCode', 'Description', 'Price', 'Customer ID', 'Country', 'comment']


In [10]:
# grouped_superM_des = superMarket_dataSet.groupby("Description")["Description"].apply(lambda x: ", ".join(x)).reset_index()
grouped_superM_price = superMarket_dataSet.groupby("Description")["Price"].mean().reset_index()

grouped_superM_stock = superMarket_dataSet.groupby("Description")["StockCode"].apply(lambda x: ", ".join(x)).reset_index()
grouped_superM_cmnt = superMarket_dataSet.groupby("Description")["comment"].apply(lambda x: ", ".join(x)).reset_index()


grouped_superM_data=pd.concat([grouped_superM_price,grouped_superM_stock["StockCode"]],axis=1)
grouped_superM_data
grouped_superM_cmnt["most_common_comment"] = grouped_superM_cmnt["comment"].apply(most_common_word)
grouped_superM_cmnt
grouped_superM_data


Unnamed: 0,Description,Price,StockCode
0,*Boombox_Ipod_Classic,16.980000,21120
1,*USB_Office_Mirror_Ball,8.470000,"20954, 20954"
2,10_COLOUR_SPACEBOY_PEN,1.050917,"22418, 22418, 22418, 22418, 22418, 22418, 2241..."
3,12_COLOURED_PARTY_BALLOONS,0.703000,"22436, 22436, 22436, 22436, 22436, 22436, 2243..."
4,12_DAISY_PEGS_IN_WOOD_BOX,1.907143,"21448, 21448, 21448, 21448, 21448, 21448, 2144..."
...,...,...,...
4218,wrongly_marked_23343,0.000000,20713
4219,wrongly_marked_carton_22804,0.000000,85123A
4220,wrongly_sold_(22719)_barcode,0.000000,22467
4221,wrongly_sold_as_sets,0.000000,85172


In [11]:
grouped_superM_data["StockCode"] = grouped_superM_data["StockCode"].apply(lambda stock: stock.split(","))

In [12]:
mlb=MultiLabelBinarizer()
stock_code=mlb.fit_transform(grouped_superM_data["StockCode"])
stock_code_df = pd.DataFrame(stock_code, columns=[f"stock_cod{g}" for g in mlb.classes_])

stock_code_df.head(50)

Unnamed: 0,stock_cod 10002,stock_cod 10080,stock_cod 10120,stock_cod 10123C,stock_cod 10124A,stock_cod 10124G,stock_cod 10125,stock_cod 10133,stock_cod 10135,stock_cod 11001,...,stock_codDOT,stock_codM,stock_codPADS,stock_codPOST,stock_codS,stock_codgift_0001_10,stock_codgift_0001_20,stock_codgift_0001_30,stock_codgift_0001_40,stock_codgift_0001_50
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
X_y = pd.concat([grouped_superM_data,grouped_superM_cmnt["most_common_comment"]], axis=1)

X_y_encoded=X_y.copy()

In [19]:
X_y_encoded["StockCode"] = X_y_encoded["StockCode"].apply(
    lambda x: ", ".join(map(str, x)) if isinstance(x, (list, tuple)) else str(x)
)

In [20]:
X_y_encoded

Unnamed: 0,Description,Price,StockCode,most_common_comment
0,*Boombox_Ipod_Classic,16.980000,389,2
1,*USB_Office_Mirror_Ball,8.470000,283,1
2,10_COLOUR_SPACEBOY_PEN,1.050917,1374,3
3,12_COLOURED_PARTY_BALLOONS,0.703000,1393,3
4,12_DAISY_PEGS_IN_WOOD_BOX,1.907143,620,1
...,...,...,...,...
4218,wrongly_marked_23343,0.000000,170,0
4219,wrongly_marked_carton_22804,0.000000,3666,0
4220,wrongly_sold_(22719)_barcode,0.000000,1425,2
4221,wrongly_sold_as_sets,0.000000,3722,2


In [25]:
encoder_desc = LabelEncoder()
X_y_encoded["Description"] = encoder_desc.fit_transform(X_y_encoded["Description"])

encoder_stock = LabelEncoder()
X_y_encoded["StockCode"] = encoder_stock.fit_transform(X_y_encoded["StockCode"])

encoder_comment = LabelEncoder()
X_y_encoded["most_common_comment"] = encoder_comment.fit_transform(X_y_encoded["most_common_comment"])
X_y_encoded


Unnamed: 0,Description,Price,StockCode,most_common_comment
0,0,16.980000,3212,2
1,1,8.470000,2035,1
2,2,1.050917,418,3
3,3,0.703000,439,3
4,4,1.907143,3771,1
...,...,...,...,...
4218,4218,0.000000,780,0
4219,4219,0.000000,2964,0
4220,4220,0.000000,475,2
4221,4221,0.000000,3027,2


In [26]:
y = X_y_encoded["most_common_comment"]

group = [len(X_y_encoded)]

In [27]:
model = XGBRanker(
    objective="rank:pairwise",
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8
)

model.fit(X_y_encoded, y, group=group)

0,1,2
,objective,'rank:pairwise'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [32]:
decs=[
    1,
    7,
    9
]


user_idx = X_y_encoded[X_y_encoded["Description"].isin(decs)].index
if len(user_idx) < 1:
    print("Your customer IDs are not in our dataset !!!")


In [33]:

encoder=LabelEncoder()
X_y_encoded["Description"] = encoder.fit_transform(X_y_encoded["Description"])
X_y_encoded["StockCode"] = encoder.fit_transform(X_y_encoded["StockCode"])
customer_vec = np.mean(X_y_encoded.iloc[user_idx].values, axis=0).reshape(1, -1)
# user_vec

In [34]:
similarity_score = cosine_similarity(X_y_encoded, customer_vec).reshape(-1)


In [35]:
X_y["Rank"] = model.predict(X_y_encoded)


In [36]:
X_y["Final_Score"] = X_y["Rank"] + similarity_score * 0.5

In [37]:
user_predict = model.predict(customer_vec)
user_predict

array([-3.3792927], dtype=float32)

In [42]:
n_recommended = 5
recommended_stock = X_y[~X_y["Description"].isin(decs)].sort_values("Final_Score" ,ascending=False).head(n_recommended)   
recommended_stock[["Description","StockCode","Price" ,"Rank", "Final_Score"]]
recommended_stock

customer_id_list = recommended_stock["Description"].tolist()  
result = X_y[X_y["Description"].isin(customer_id_list)]
result


Unnamed: 0,Description,Price,StockCode,most_common_comment,Rank,Final_Score
12,12_PENCILS_TALL_TUBE_RED_RETROSPOT,0.93108,"[20983, 20983, 20983, 20983, 20983, 20983...","good,",4.752856,5.252853
32,2_PICTURE_BOOK_EGGS_EASTER_DUCKS,1.6856,"[21457, 21457, 21457, 21457, 21457, 21457...","good,",4.752856,5.252846
44,3D_SHEET_OF_CAT_STICKERS,1.181169,"[84559B, 84559B, 84559B, 84559B, 84559b, ...","good,",4.752856,5.252799
45,3D_SHEET_OF_DOG_STICKERS,1.245431,"[84559A, 84559a, 84559A, 84559A, 84559A, ...","good,",4.752856,5.252796
69,4_BURGUNDY_WINE_DINNER_CANDLES,1.336429,"[72801G, 72801G, 72801G, 72801G, 72801G, ...","good,",4.752856,5.252661
