In [2]:
## SKIP THIS PART
# Ploomber related variables
upstream = None

# This is a placeholder, leave it as None
product = None

In [None]:
# Parameters
input_path = "products/cleaned_train_data.csv"
upstream = {"clean": {"nb": "C:\\Users\\berkayg\\Desktop\\Coding env\\test-case\\products\\clean_train_data.ipynb", "data": "C:\\Users\\berkayg\\Desktop\\Coding env\\ test-case\\products\\cleaned_train_data.csv"}}
product = {"nb": "C:\\Users\\berkayg\\Desktop\\Coding env\\ test-case\\products\\feature_generation_nb.ipynb", "data": "C:\\Users\\berkayg\\Desktop\\Coding env\\ test-case\\products\\processed_train_data.csv"}


## Feature Generation

Search terms table carries a great deal of potential to encrich the dataset. We first need to use text vectorization techniques to create a structural data. To this end we will use __Term Frequency–Inverse Document Frequency (TFIDF)__. Simply put, we will try to match the search terms with the content category names so that we can see if a searching event can be linked to a content in the df_products table.

In [4]:
from src.data.config_reader import *
from src.data.path_finder import *
from src.data.database_configurations import *

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
conn = connect_db()

In [6]:
# Get category names from the product table
df_cats = pd.read_sql("SELECT categoryname FROM df_product", conn)
df_cats = df_cats.dropna().drop_duplicates()
df_cats = df_cats.replace("[^\w ]", "", regex=True)
print(df_cats.shape)

(2529, 1)


In [7]:
# Get search terms
df_search = pd.read_sql("SELECT * FROM df_search_term", conn)
df_search = df_search.dropna().drop_duplicates(subset=["search_term"]).query("search_term != 'tum urunler'")
df_search = df_search.replace("[^\w ]", "", regex=True)
df_search.shape

(373425, 3)

In [8]:
# Initialize a TFIDF vectorizer to create structural text data
vectorizer_ntf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3))

# Transform both Vector 1 (Category Names) and Vector 2 (Search Terms)
X_ntf_cats = vectorizer_ntf.fit_transform(df_cats["categoryname"].to_numpy())
X_ntf_search = vectorizer_ntf.transform(df_search["search_term"].to_numpy())

In [9]:
# Calculate cosine similarities of each pair
X_ntf = cosine_similarity(X_ntf_search, X_ntf_cats)
X_ntf.shape

(373425, 2529)

In [10]:
# The data occupies a huge area in the memory so we try to get rid of the unsignificant rows/columns with 0 score
df_similarity = pd.concat([pd.DataFrame(X_ntf,index=df_search["search_term"], columns=df_cats["categoryname"])],axis=1).replace(0, np.nan)
df_similarity.dropna(how="all", inplace=True)
df_similarity.dropna(how="all", inplace=True, axis=1)
df_similarity.shape

(208791, 2487)

In [11]:
# En example of a matching pair (e.g., "Kazak" and "ananaslı kazak")
a = "ananaslı kazak"
df_similarity.loc[a].sort_values(ascending=False).head()

categoryname
Kazak                1.000000
Tesettür Kazak       0.607630
Büyük Beden Kazak    0.444389
Klasik Ayakkabı           NaN
Pantolon                  NaN
Name: ananaslı kazak, dtype: float64

In [12]:
# We now make some changes in the dataframe to fit it into a dictionary format for later use
melted = df_similarity.reset_index().melt(id_vars="search_term", value_vars=[k for k in df_similarity.columns if k != "search_term"]).dropna(subset=["value"])
melted = melted.query("value >= 0.15").sort_values(["search_term", "value"], ascending=False)
melted["rank"] = 1
melted["num"] = melted.groupby("search_term")["rank"].transform(np.cumsum)
melted = melted.query("num < 4")[["search_term", "categoryname"]]
melted.head()

Unnamed: 0,search_term,categoryname
255824352,əl kol diz dirsək ağrıları,Kol Düğmesi
162034741,űstű dolap alti cekmece,Dolap ve Gardrop
59291071,şşn musluk,Batarya Musluk
94844666,şırınga model kalem,Model Araçlar
327437840,şırınga model kalem,Kalem Setleri


In [23]:
# Since we see that some of the terms are matched with some category names, this can lead us to connecting search terms
# to product table which eventually be useful to create a new feature in the target table
search_term_dict = melted.set_index("search_term").to_dict("dict")["categoryname"]
search_term_dict["şık yüz maskesi"]

'Yüz Temizleyici'

In [None]:
# Search terms are replaced with category names to create implicit representations of search terms
df_search["target_categoryname"] = df_search["search_term"].map(search_term_dict)

In [14]:
df_products = pd.read_sql("SELECT categoryname, currentbugroupname FROM df_product", conn)
df_products.head()

Unnamed: 0,categoryname,currentbugroupname
0,Klasik Ayakkabı,Ayakkabı & Çanta
1,Pantolon,Branded Tekstil
2,Akvaryum Balık Yemleri,FMCG
3,Külot,Branded Tekstil
4,Tabak,Ev


In [20]:
products_dict = df_products.set_index("categoryname").to_dict("dict")["currentbugroupname"]
products_dict["Pantolon"]

'Branded Tekstil'

In [16]:
# Now we can match implicit representations of search terms with the product table
df_search["currentbugroupname"] = df_search["target_categoryname"].map(products_dict)

In [17]:
# Aggregating the values by summation
df_final = df_search.groupby(["userid", "currentbugroupname"], as_index=False).agg({"partition_date": "count"}).rename(columns={"partition_date": "search_count"})
df_final["userid"] = df_final["userid"].astype(str)

In [24]:
df_final.head()

Unnamed: 0,userid,currentbugroupname,search_count
0,12810,Aksesuar & Saat & Gözlük,1
1,12810,GAS,2
2,12810,Kozmetik,1
3,15148,Ev,1
4,23198,FMCG,1


In [31]:
# Read the main (train dataset) and merge it with the newly created dataframe
df = pd.read_csv(upstream['clean']['data'])
df["userid"] = df["userid"].astype(str)
df = df.merge(df_final, on=["userid", "currentbugroupname"], how="left")

In [32]:
df.head()

Unnamed: 0,index,userid,probability,target,currentbugroupname,y,gender,age,tenure,addtobasket_count_basket,productdetailcount_visit,quantity_trx,price_trx,fav_count_fav,search_count
0,0,10000970,,,Aksesuar & Saat & Gözlük,1,Bayan,50.0,85,0.0,336.0,2.0,225.89,45.0,3.0
1,1,10000970,,,Kozmetik,1,Bayan,50.0,85,0.0,138.0,3.0,102.62,9.0,
2,2,10000970,,,Mobilya,1,Bayan,50.0,85,0.0,0.0,0.0,0.0,0.0,
3,3,10000970,,,Private Label,1,Bayan,50.0,85,0.0,19.0,0.0,0.0,2.0,
4,4,10000970,,,FMCG,1,Bayan,50.0,85,0.0,4.0,0.0,0.0,0.0,


In [19]:
df.to_csv(product["data"], index=False)