In [1]:
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Path to the cleaned CSV produced by notebook 01_xiaosong_text_clean.ipynb;
# adjust it to your local layout if needed.
TRAIN_CSV_PATH = "rakuten_text_train_v1.csv"
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,productid,imageid,prdtypecode,designation_cleaned,description_cleaned,text_cleaned,dup_count,is_duplicated_group
0,3804725264,1263597046,10,olivia: personalisiertes notizbuch 150 seiten ...,,olivia: personalisiertes notizbuch 150 seiten ...,1,False
1,436067568,1008141237,2280,journal arts (le) n° 133 28/09/2001 l'art marc...,,journal arts (le) n° 133 28/09/2001 l'art marc...,1,False
2,201115110,938777978,50,grand stylet ergonomique bleu gamepad nintendo...,pilot style touch pen marque speedlink stylet ...,grand stylet ergonomique bleu gamepad nintendo...,1,False
3,50418756,457047496,1280,peluche donald europe disneyland 2000 (marionn...,,peluche donald europe disneyland 2000 (marionn...,1,False
4,278535884,1077757786,2705,guerre tuques,luc idées grandeur veut organiser jeu guerre b...,guerre tuques luc idées grandeur veut organise...,1,False


In [3]:
def safe_str(x):
    """Convert a value to a string, mapping missing values to "".

    Parameters
    ----------
    x : any
        Value to convert. Strings are returned unchanged.

    Returns
    -------
    str
        ``x`` itself if it is already a string, ``""`` if it is a scalar
        missing value (None, NaN, pd.NA, NaT), otherwise ``str(x)``.
    """
    if isinstance(x, str):
        return x
    # pd.isna() returns an array for list-like inputs, whose truth value is
    # ambiguous; only scalars can be "missing" here.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return ""
    return str(x)

# Make sure the cleaned title / description columns exist and hold only strings.
# Prefer the already-cleaned column; fall back to the raw one; fail with an
# explicit message when neither is present.
for clean_col, raw_col in (
    ("designation_cleaned", "designation"),
    ("description_cleaned", "description"),
):
    if clean_col in df.columns:
        source_col = clean_col
    elif raw_col in df.columns:
        source_col = raw_col
    else:
        raise KeyError(
            f"Neither '{clean_col}' nor '{raw_col}' is present in the DataFrame"
        )
    # safe_str already maps NaN to "", so no separate fillna is needed.
    df[clean_col] = df[source_col].apply(safe_str)

# Global text = title + description (used by the global TF-IDF).
# The outer strip removes the trailing space left when the description is empty.
df["text_cleaned"] = (
    df["designation_cleaned"].str.strip()
    + " "
    + df["description_cleaned"].str.strip()
).str.strip()

df[["designation_cleaned", "description_cleaned", "text_cleaned"]].head()

Unnamed: 0,designation_cleaned,description_cleaned,text_cleaned
0,olivia: personalisiertes notizbuch 150 seiten ...,,olivia: personalisiertes notizbuch 150 seiten ...
1,journal arts (le) n° 133 28/09/2001 l'art marc...,,journal arts (le) n° 133 28/09/2001 l'art marc...
2,grand stylet ergonomique bleu gamepad nintendo...,pilot style touch pen marque speedlink stylet ...,grand stylet ergonomique bleu gamepad nintendo...
3,peluche donald europe disneyland 2000 (marionn...,,peluche donald europe disneyland 2000 (marionn...
4,guerre tuques,luc idées grandeur veut organiser jeu guerre b...,guerre tuques luc idées grandeur veut organise...


In [None]:
# Global TF-IDF on title + description (n-grams 1 to 3).
tfidf_global = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),   # unigrams + bigrams + trigrams
    min_df=5,             # drop terms seen in fewer than 5 documents
    max_df=0.8,           # drop terms seen in more than 80% of documents
    lowercase=False,
    tokenizer=str.split,  # plain whitespace tokenization
    # A custom tokenizer makes the default token_pattern unused; setting it to
    # None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
)

X_tfidf_global = tfidf_global.fit_transform(df["text_cleaned"])

print("TF-IDF global - forme :", X_tfidf_global.shape)



TF-IDF global - forme : (84916, 50000)


In [None]:
# Separate TF-IDF: title vs description.

# Hyper-parameters shared by both vectorizers; only max_features differs.
TFIDF_FIELD_PARAMS = dict(
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.8,
    lowercase=False,
    tokenizer=str.split,  # plain whitespace tokenization
    # A custom tokenizer makes the default token_pattern unused; setting it to
    # None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
)

# TF-IDF for the title
tfidf_title = TfidfVectorizer(max_features=20000, **TFIDF_FIELD_PARAMS)
X_tfidf_title = tfidf_title.fit_transform(df["designation_cleaned"])
print("TF-IDF titre - forme :", X_tfidf_title.shape)

# TF-IDF for the description
tfidf_desc = TfidfVectorizer(max_features=30000, **TFIDF_FIELD_PARAMS)
X_tfidf_desc = tfidf_desc.fit_transform(df["description_cleaned"])
print("TF-IDF description - forme :", X_tfidf_desc.shape)


TF-IDF titre - forme : (84916, 20000)
TF-IDF description - forme : (84916, 30000)


In [6]:
# 4. Structural features on the text

# A number followed by a measurement unit, e.g. "10 cm", "5kg".
UNIT_PATTERN = re.compile(r"\b\d+\s*(cm|mm|kg|g|ml|l|m)\b", flags=re.IGNORECASE)
# Multiplier / dimension patterns: "10x20", "10 x 20", "x3", "3 x".
# The first alternative is required because \b cannot match between "x" and a
# digit, so the tight "10x20" form was missed by the two one-sided alternatives.
MULT_PATTERN = re.compile(
    r"\b\d+\s*x\s*\d+|\bx\s*\d+\b|\b\d+\s*x\b",
    flags=re.IGNORECASE,
)
# Any single digit character.
DIGIT_PATTERN = re.compile(r"\d")

def structural_stats(s: str) -> dict:
    """Compute simple structural indicators for one text value.

    The input is first normalised with ``safe_str``. The returned dict holds:
    ``len_char`` (character count), ``len_tokens`` (whitespace-separated
    token count), ``num_digits``, ``num_units`` and ``num_mult_pattern``
    (number of matches of DIGIT_PATTERN, UNIT_PATTERN and MULT_PATTERN).
    """
    text = safe_str(s)

    def _n_matches(pattern) -> int:
        # Count non-overlapping matches of a compiled pattern in the text.
        return sum(1 for _ in pattern.finditer(text))

    return {
        "len_char": len(text),
        "len_tokens": len(text.split()),
        "num_digits": _n_matches(DIGIT_PATTERN),
        "num_units": _n_matches(UNIT_PATTERN),
        "num_mult_pattern": _n_matches(MULT_PATTERN),
    }

# Apply the structural statistics to the title and the description.
for col in ["designation_cleaned", "description_cleaned"]:
    stats_series = df[col].apply(structural_stats)
    # Reuse df's index: column assignment aligns on index labels, so a fresh
    # RangeIndex would silently produce NaN if df is not 0..n-1 indexed.
    stats_df = pd.DataFrame(list(stats_series), index=df.index)
    for c in stats_df.columns:
        df[f"{col}_{c}"] = stats_df[c]

# Preview of a few length features
df[
    [
        "designation_cleaned_len_char",
        "designation_cleaned_len_tokens",
        "description_cleaned_len_char",
        "description_cleaned_len_tokens",
    ]
].head()


Unnamed: 0,designation_cleaned_len_char,designation_cleaned_len_tokens,description_cleaned_len_char,description_cleaned_len_tokens
0,80,10,0,0
1,161,24,0,0
2,72,10,546,70
3,57,7,0,0
4,13,2,127,18


In [7]:

# 5. Targeted semantic features

# Keyword groups, taken from the group's document.
# Accented and unaccented spellings are both listed where relevant.
KW_PISCINE = {
    "piscine",
    "gonflable",
    "intex",
    "galet",
    "tubulaire",
}
KW_BEBE = {
    "bébé",
    "bebe",
    "hochet",
    "0-24",
    "0-36",
    "0-12",
    "garçon",
    "garcon",
    "fille",
}
KW_LIVRES = {
    "édition",
    "edition",
    "poche",
    "tome",
    "roman",
}
KW_GAMING = {
    "ps4",
    "ps5",
    "xbox",
    "switch",
    "manette",
    "wifi",
}
KW_JOUETS = {
    "peluche",
    "figurine",
    "modèle",
    "modele",
    "collection",
}

# Group name -> keyword set; the names become part of the feature column names.
KEYWORD_GROUPS = dict(
    piscine=KW_PISCINE,
    bebe=KW_BEBE,
    livres=KW_LIVRES,
    gaming=KW_GAMING,
    jouets=KW_JOUETS,
)

# Punctuation trimmed from both ends of each token before the keyword lookup,
# so that tokens such as "peluche," or "(ps4)" still match. Inner characters
# (e.g. the hyphen of "0-24") are left untouched.
_KW_EDGE_PUNCT = ".,;:!?()[]{}\"'«»/-"


def keyword_features(s: str) -> dict:
    """Build keyword-based features for one text value.

    For each product family in KEYWORD_GROUPS, two features are created:
    - ``kw_<group>_present``: binary presence flag (0/1)
    - ``kw_<group>_count``: number of tokens belonging to the group

    The text is normalised with ``safe_str``, lower-cased and split on
    whitespace; surrounding punctuation is stripped from each token.
    """
    tokens = [tok.strip(_KW_EDGE_PUNCT) for tok in safe_str(s).lower().split()]
    feats = {}
    for group_name, keywords in KEYWORD_GROUPS.items():
        count = sum(tok in keywords for tok in tokens)
        feats[f"kw_{group_name}_present"] = int(count > 0)
        feats[f"kw_{group_name}_count"] = count
    return feats

# Compute the keyword features on the combined text and attach them to df.
kw_series = df["text_cleaned"].apply(keyword_features)
# Reuse df's index: column assignment aligns on index labels, so a fresh
# RangeIndex would silently produce NaN if df is not 0..n-1 indexed.
kw_df = pd.DataFrame(list(kw_series), index=df.index)

for c in kw_df.columns:
    df[c] = kw_df[c]

df[[c for c in df.columns if c.startswith("kw_")]].head()


Unnamed: 0,kw_piscine_present,kw_piscine_count,kw_bebe_present,kw_bebe_count,kw_livres_present,kw_livres_count,kw_gaming_present,kw_gaming_count,kw_jouets_present,kw_jouets_count
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,1
4,0,0,0,0,0,0,0,0,0,0


In [8]:
# Recap of the three sparse TF-IDF matrices.
for label, matrix in (
    ("TF-IDF global :", X_tfidf_global),
    ("TF-IDF titre  :", X_tfidf_title),
    ("TF-IDF desc   :", X_tfidf_desc),
):
    print(label, matrix.shape)

# Numeric feature columns: structural stats (prefixed by their source column)
# and keyword features (prefixed by "kw_"). str.startswith accepts a tuple.
feature_cols = [
    name
    for name in df.columns
    if name.startswith(("designation_cleaned_", "description_cleaned_", "kw_"))
]

print("Nombre de features numériques (structure + sémantique) :", len(feature_cols))
print(feature_cols[:10])

TF-IDF global : (84916, 50000)
TF-IDF titre  : (84916, 20000)
TF-IDF desc   : (84916, 30000)
Nombre de features numériques (structure + sémantique) : 20
['designation_cleaned_len_char', 'designation_cleaned_len_tokens', 'designation_cleaned_num_digits', 'designation_cleaned_num_units', 'designation_cleaned_num_mult_pattern', 'description_cleaned_len_char', 'description_cleaned_len_tokens', 'description_cleaned_num_digits', 'description_cleaned_num_units', 'description_cleaned_num_mult_pattern']
