In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Location of the cleaned CSV written by notebook 01_xiaosong_text_clean.ipynb;
# adjust this path to match your environment.
cleaned_csv_path = "rakuten_text_train_v1.csv"
df = pd.read_csv(cleaned_csv_path)
df.head()

Unnamed: 0,productid,imageid,prdtypecode,designation_cleaned,description_cleaned,text_cleaned,dup_count,is_duplicated_group
0,3804725264,1263597046,10,olivia: personalisiertes notizbuch 150 seiten ...,,olivia: personalisiertes notizbuch 150 seiten ...,1,False
1,436067568,1008141237,2280,journal arts (le) n° 133 28/09/2001 l'art marc...,,journal arts (le) n° 133 28/09/2001 l'art marc...,1,False
2,201115110,938777978,50,grand stylet ergonomique bleu gamepad nintendo...,pilot style touch pen marque speedlink stylet ...,grand stylet ergonomique bleu gamepad nintendo...,1,False
3,50418756,457047496,1280,peluche donald europe disneyland 2000 (marionn...,,peluche donald europe disneyland 2000 (marionn...,1,False
4,278535884,1077757786,2705,guerre tuques,luc idées grandeur veut organiser jeu guerre b...,guerre tuques luc idées grandeur veut organise...,1,False


In [3]:
def safe_str(x):
    """Coerce a single cell value to ``str``.

    Strings are returned unchanged, missing values (anything ``pd.isna``
    reports as NA, e.g. ``None`` or ``NaN``) become the empty string, and
    every other value goes through ``str()``.
    """
    if not isinstance(x, str):
        return "" if pd.isna(x) else str(x)
    return x

# Make sure both text columns hold real strings: missing values become "".
for text_col in ("designation_cleaned", "description_cleaned"):
    df[text_col] = df[text_col].fillna("").map(safe_str)

In [4]:
def _make_tfidf(max_features):
    """Build the TF-IDF vectorizer shared by the title and description fields.

    Only the vocabulary cap differs between the two fields, so every other
    setting lives here to keep the two configurations from drifting apart.

    Parameters
    ----------
    max_features : int
        Maximum vocabulary size kept by the vectorizer.

    Returns
    -------
    TfidfVectorizer
        An unfitted vectorizer producing 1- to 3-gram features.
    """
    return TfidfVectorizer(
        max_features=max_features,
        ngram_range=(1, 3),
        min_df=5,
        max_df=0.7,
        # No lowercasing here; presumably the cleaning notebook already did it - TODO confirm.
        lowercase=False,
        # Plain whitespace tokenisation of the pre-cleaned text.
        tokenizer=str.split,
        # The default regex is unused once a tokenizer is supplied; saying so
        # explicitly avoids the scikit-learn "token_pattern will not be used" warning.
        token_pattern=None,
    )


# TF-IDF for the title
tfidf_title = _make_tfidf(max_features=20000)

X_tfidf_title = tfidf_title.fit_transform(df["designation_cleaned"])
print("TF-IDF titre - forme :", X_tfidf_title.shape)

# TF-IDF for the description
tfidf_desc = _make_tfidf(max_features=30000)

X_tfidf_desc = tfidf_desc.fit_transform(df["description_cleaned"])
print("TF-IDF description - forme :", X_tfidf_desc.shape)


TF-IDF titre - forme : (84916, 20000)
TF-IDF description - forme : (84916, 30000)


In [5]:
# Structural features computed on the text

# A number followed by a measurement unit, e.g. "10 cm" or "500g".
UNIT_PATTERN = re.compile(r"\b\d+\s*(cm|mm|kg|g|ml|l|m)\b", flags=re.IGNORECASE)
# Multiplier notations such as "x2", "x 2", "2x" or "2 x".
# NOTE(review): both alternatives need a word boundary next to the "x", so a glued
# form like "10x20" matches neither - confirm whether that is intended.
MULT_PATTERN = re.compile(r"\bx\s*\d+\b|\b\d+\s*x\b", flags=re.IGNORECASE)
# Any single digit character.
DIGIT_PATTERN = re.compile(r"\d")

def structural_stats(s: str) -> dict:
    """Compute simple structural indicators for one text value.

    The input is first passed through ``safe_str``. The returned dict holds
    the character length, the whitespace-token count, and the number of
    non-overlapping matches of the digit, unit and multiplier patterns.
    """
    text = safe_str(s)

    def count_matches(pattern):
        # Number of non-overlapping matches of `pattern` in the text.
        return sum(1 for _ in pattern.finditer(text))

    return {
        "len_char": len(text),
        "len_tokens": len(text.split()),
        "num_digits": count_matches(DIGIT_PATTERN),
        "num_units": count_matches(UNIT_PATTERN),
        "num_mult_pattern": count_matches(MULT_PATTERN),
    }

# Apply to the title and the description
for col in ["designation_cleaned", "description_cleaned"]:
    stats_series = df[col].apply(structural_stats)
    # Build the stats frame on df's own index: the column assignment below
    # aligns on index labels, so a default 0..n-1 index would silently produce
    # NaN / misaligned rows whenever df has been filtered or re-indexed.
    stats_df = pd.DataFrame(stats_series.tolist(), index=df.index)
    for c in stats_df.columns:
        df[f"{col}_{c}"] = stats_df[c]

# Preview of a few length features
df[
    [
        "designation_cleaned_len_char",
        "designation_cleaned_len_tokens",
        "description_cleaned_len_char",
        "description_cleaned_len_tokens",
    ]
].head()


Unnamed: 0,designation_cleaned_len_char,designation_cleaned_len_tokens,description_cleaned_len_char,description_cleaned_len_tokens
0,80,10,0,0
1,161,24,0,0
2,72,10,546,70
3,57,7,0,0
4,13,2,127,18


In [6]:

# Targeted semantic features

# Keyword groups, taken from the group's document
KW_PISCINE = {"piscine", "gonflable", "intex", "galet", "tubulaire"}
KW_BEBE = {"bébé", "bebe", "hochet", "0-24", "0-36", "0-12", "garçon", "garcon", "fille"}
KW_LIVRES = {"édition", "edition", "poche", "tome", "roman"}
KW_GAMING = {"ps4", "ps5", "xbox", "switch", "manette", "wifi"}
KW_JOUETS = {"peluche", "figurine", "modèle", "modele", "collection"}

# Group name (used in the feature column names) -> keyword set.
KEYWORD_GROUPS = {
    "piscine": KW_PISCINE,
    "bebe": KW_BEBE,
    "livres": KW_LIVRES,
    "gaming": KW_GAMING,
    "jouets": KW_JOUETS,
}

def keyword_features(s: str) -> dict:
    """
    Build keyword features for one text value.

    The text is lower-cased and split on whitespace; tokens must equal a
    keyword exactly to count. For each product family in KEYWORD_GROUPS
    the result contains:
    - ``kw_<family>_present``: 1 if at least one token matches, else 0
    - ``kw_<family>_count``: number of matching tokens
    """
    words = safe_str(s).lower().split()
    features = {}
    for family, vocabulary in KEYWORD_GROUPS.items():
        hits = len([word for word in words if word in vocabulary])
        features[f"kw_{family}_present"] = 1 if hits else 0
        features[f"kw_{family}_count"] = hits
    return features

kw_series = df["text_cleaned"].apply(keyword_features)
# Keep df's index on the feature frame: the assignment below aligns on index
# labels, so a default 0..n-1 index would misalign rows (or yield NaN) if df
# were ever filtered or re-indexed before this cell runs.
kw_df = pd.DataFrame(kw_series.tolist(), index=df.index)

for c in kw_df.columns:
    df[c] = kw_df[c]

# Preview of the keyword features
df[[c for c in df.columns if c.startswith("kw_")]].head()


Unnamed: 0,kw_piscine_present,kw_piscine_count,kw_bebe_present,kw_bebe_count,kw_livres_present,kw_livres_count,kw_gaming_present,kw_gaming_count,kw_jouets_present,kw_jouets_count
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,1
4,0,0,0,0,0,0,0,0,0,0


In [7]:
# Recap of the two TF-IDF matrix shapes.
print("TF-IDF titre  :", X_tfidf_title.shape)
print("TF-IDF desc   :", X_tfidf_desc.shape)

# The engineered numeric columns are recognised by their name prefix: structural
# stats carry the source column name, keyword features start with "kw_".
numeric_prefixes = ("designation_cleaned_", "description_cleaned_", "kw_")
feature_cols = [c for c in df.columns if c.startswith(numeric_prefixes)]

print("Nombre de features numériques (structure + sémantique) :", len(feature_cols))
print(feature_cols[:10])

TF-IDF titre  : (84916, 20000)
TF-IDF desc   : (84916, 30000)
Nombre de features numériques (structure + sémantique) : 20
['designation_cleaned_len_char', 'designation_cleaned_len_tokens', 'designation_cleaned_num_digits', 'designation_cleaned_num_units', 'designation_cleaned_num_mult_pattern', 'description_cleaned_len_char', 'description_cleaned_len_tokens', 'description_cleaned_num_digits', 'description_cleaned_num_units', 'description_cleaned_num_mult_pattern']


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Scale the hand-crafted numeric features. with_mean=False skips centering,
# so zeros stay zeros when this block is stacked next to the sparse TF-IDF output.
num_scaler = StandardScaler(with_mean=False)

meta_cols = feature_cols

# ColumnTransformer: combines the text (TF-IDF) and the numeric features.
# It fits clones of the estimators passed in, so the vocabularies used by the
# model are learnt from whatever data the pipeline is fitted on, not from any
# earlier fit of tfidf_title / tfidf_desc.
preprocess = ColumnTransformer(
    transformers=[
        ("title_tfidf", tfidf_title, "designation_cleaned"),
        ("desc_tfidf", tfidf_desc, "description_cleaned"),
        ("numeric", num_scaler, meta_cols),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)

# Model: multiclass logistic regression
log_reg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",  # important for imbalanced classes
    solver="saga",
    n_jobs=-1,
    # saga shuffles the data while optimising; fixing the seed makes the fitted
    # coefficients (and the reported scores) reproducible from run to run.
    random_state=42,
)

# Full pipeline: preprocessing + model
clf_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", log_reg),
    ]
)

print(clf_pipeline)


Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('title_tfidf',
                                                  TfidfVectorizer(lowercase=False,
                                                                  max_df=0.7,
                                                                  max_features=20000,
                                                                  min_df=5,
                                                                  ngram_range=(1,
                                                                               3),
                                                                  tokenizer=<method 'split' of 'str' objects>),
                                                  'designation_cleaned'),
                                                 ('desc_tfidf',
                                                  TfidfVectorizer(lowercase=False,
                                                                  max_df=0.7,
     

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

# Pipeline input: the two text columns followed by the numeric meta features.
# Target: the product type code.
input_cols = ["designation_cleaned", "description_cleaned"] + meta_cols
X = df[input_cols]
y = df["prdtypecode"].values

# Stratified 80/20 train / validation split with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Taille X_train :", X_train.shape)
print("Taille X_valid :", X_valid.shape)

# Fit the full pipeline (preprocessing + model) on the training part only
print("Entraînement du modèle (pipeline complet)...")
clf_pipeline.fit(X_train, y_train)

# Predict on the validation set
y_pred = clf_pipeline.predict(X_valid)

# Evaluation: weighted F1 first, then the per-class report
weighted_f1 = f1_score(y_valid, y_pred, average="weighted")
print(f"\nWeighted F1 (validation) : {weighted_f1:.4f}\n")

report = classification_report(y_valid, y_pred)
print("Classification report :\n")
print(report)


Taille X_train : (67932, 22)
Taille X_valid : (16984, 22)
Entraînement du modèle (pipeline complet)...

Weighted F1 (validation) : 0.7628

Classification report :

              precision    recall  f1-score   support

          10       0.32      0.73      0.45       623
          40       0.75      0.59      0.66       502
          50       0.79      0.79      0.79       336
          60       0.90      0.77      0.83       166
        1140       0.67      0.78      0.72       534
        1160       0.79      0.90      0.84       791
        1180       0.57      0.58      0.57       153
        1280       0.77      0.47      0.58       974
        1281       0.57      0.57      0.57       414
        1300       0.82      0.85      0.83      1009
        1301       0.92      0.94      0.93       161
        1302       0.76      0.70      0.73       498
        1320       0.77      0.64      0.70       648
        1560       0.83      0.76      0.79      1015
        1920       0.89  

In [10]:
# Optional hyper-parameter search, currently disabled (every line is commented out).
# When uncommented it relies on clf_pipeline, X_train, y_train, X_valid, y_valid,
# f1_score and classification_report defined in the cells above.
# from sklearn.model_selection import GridSearchCV


# param_grid = {
#     "model__C": [0.5, 1.0, 2.0],
#     "preprocess__title_tfidf__max_features": [10000, 20000],
#     "preprocess__desc_tfidf__max_features": [20000, 30000],
# }

# grid = GridSearchCV(
#     estimator=clf_pipeline,
#     param_grid=param_grid,
#     scoring="f1_weighted",
#     cv=3,
#     n_jobs=-1,
#     verbose=2,
# )

# print("Lancement de la GridSearch (peut être un peu long)...")
# grid.fit(X_train, y_train)

# print("\nMeilleurs paramètres trouvés :", grid.best_params_)
# print("Meilleur score (F1 pondéré, CV) :", grid.best_score_)

# # Évaluation du meilleur modèle sur le jeu de validation
# best_model = grid.best_estimator_
# y_pred_best = best_model.predict(X_valid)
# best_f1 = f1_score(y_valid, y_pred_best, average="weighted")

# print(f"\nWeighted F1 du meilleur modèle sur validation : {best_f1:.4f}\n")
# print("Classification report du meilleur modèle :\n")
# print(classification_report(y_valid, y_pred_best))
