In [2]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import spacy
from spacy.tokens import Token
from unidecode import unidecode
import nltk
from nltk.stem.porter import *


Vendredi 9 - 12 - 2023
## P6-DS-OC | Classifier automatiquement des biens de consommation
---

<span style="color:#FFD700" font=10>**CONTEXT**</span>

 
Problème : peu d'efforts côté client lors de l'upload des photos d'articles en vente, entraînant une mauvaise catégorisation des articles.

Mission : faire une étude de faisabilité de l'automatisation de la catégorisation.

Objectif : faciliter la mise en ligne de nouveaux articles et la recherche de produits

<br>

<span style="color:#00FF00" font=10>**OUTILS**</span>

Un dataset d'articles contenant le nom de l'article, sa description et un lien de téléchargement de la photo, accompagné d'un notebook d'exemple d'étude de faisabilité.


<br>


<span style="color:#FF69B4" font=10>**TODO**</span>


Étape 1 : étude de faisabilité d'un modèle de catégorisation

1. Faire des analyses exploratoires sur les images ainsi que sur les textes décrivant les articles.

2. Appliquer une réduction de dimension sur les images.

3. Projeter sur un plan 2D pour analyser la possibilité de regroupement.

4. Définir une mesure d'évaluation pour confirmer les résultats.

5. Tester deux approches pour la feature extraction : bag-of-words et Tf-idf
6. Tester BERT avec une approche word/sentence embedding
7. Tester USE (Universal Sentence Encoder)



---

In [3]:
# Load the Flipkart sample and keep only the text-related columns.
# `category` is the first element of the " >> "-separated category tree,
# taken after stripping the surrounding quote / bracket characters.
df = (
    pd.read_csv("../data/flipkart_com-ecommerce_sample_1050.csv")
    .assign(
        category=lambda products: (
            products["product_category_tree"]
            .str.strip('"[]')
            .str.split(" >> ", expand=True)[0]
        )
    )
    .loc[:, ["uniq_id", "product_name", "category", "brand", "description"]]
)
df.head()


Unnamed: 0,uniq_id,product_name,category,brand,description
0,55b85ea15a1536d46b7190ad6fff8ce7,Elegance Polyester Multicolor Abstract Eyelet ...,Home Furnishing,Elegance,Key Features of Elegance Polyester Multicolor ...
1,7b72c92c2f6c40268628ec5f14c6d590,Sathiyas Cotton Bath Towel,Baby Care,Sathiyas,Specifications of Sathiyas Cotton Bath Towel (...
2,64d5d4a258243731dc7bbb1eef49ad74,Eurospa Cotton Terry Face Towel Set,Baby Care,Eurospa,Key Features of Eurospa Cotton Terry Face Towe...
3,d4684dcdc759dd9cdf41504698d737d8,SANTOSH ROYAL FASHION Cotton Printed King size...,Home Furnishing,SANTOSH ROYAL FASHION,Key Features of SANTOSH ROYAL FASHION Cotton P...
4,6325b6870c54cd47be6ebfbffa620ec7,Jaipur Print Cotton Floral King sized Double B...,Home Furnishing,Jaipur Print,Key Features of Jaipur Print Cotton Floral Kin...


In [4]:
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include=object)


Unnamed: 0,uniq_id,product_name,category,brand,description
count,1050,1050,1050,712,1050
unique,1050,1050,7,490,1050
top,55b85ea15a1536d46b7190ad6fff8ce7,Elegance Polyester Multicolor Abstract Eyelet ...,Home Furnishing,PRINT SHAPES,Key Features of Elegance Polyester Multicolor ...
freq,1,1,150,11,1


In [5]:
# Non-null counts of product ids and brands within each category.
df.groupby("category")[["uniq_id", "brand"]].count()


Unnamed: 0_level_0,uniq_id,brand
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Baby Care,150,134
Beauty and Personal Care,150,41
Computers,150,150
Home Decor & Festive Needs,150,148
Home Furnishing,150,150
Kitchen & Dining,150,79
Watches,150,10


In [6]:
import spacy
from spacy.tokens import Token
import re
from unidecode import unidecode

# Load spaCy's "en_core_web_sm" pipeline once; the text helpers reuse this `nlp` object.
nlp = spacy.load("en_core_web_sm")


def clean_text(text, force_is_alpha: bool = True, extra_words=()) -> list[str]:
    """
    Turn a raw text into a list of normalised, stemmed tokens.

    Processing steps, in order:
      1. run the spaCy pipeline ``nlp`` on ``text``;
      2. drop punctuation, whitespace and stop-word tokens;
      3. take each remaining token's lemma, lower-case it and transliterate
         it with ``unidecode``;
      4. stem the result with ``PorterStemmer``;
      5. if ``force_is_alpha`` is True, keep only tokens for which
         ``str.isalpha()`` holds;
      6. drop every token listed in ``extra_words``.

    Args:
        text (str): raw text to clean. ``None`` and float NaN (missing
            values) are treated as empty text and yield ``[]``.
        force_is_alpha (bool, optional): keep only purely alphabetic tokens.
            Defaults to True.
        extra_words (iterable of str, optional): additional tokens to remove.
            They are compared with the *final* (lemmatised and stemmed)
            tokens, so they must be given in that form. A single string is
            treated as one word. Defaults to no extra words.

    Returns:
        list[str]: cleaned tokens, in their order of appearance.
    """
    # Missing values are not text: return an empty token list instead of
    # handing them to the pipeline.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # A set gives constant-time membership tests; wrapping a bare string
    # avoids accidentally matching on its individual characters.
    if isinstance(extra_words, str):
        excluded = {extra_words}
    else:
        excluded = set(extra_words or ())

    stemmer = PorterStemmer()

    tokens = []
    for token in nlp(text):
        if token.is_punct or token.is_space or token.is_stop:
            continue
        # Lemmatise -> lower-case -> transliterate -> stem.
        word = stemmer.stem(unidecode(token.lemma_.lower()))
        # The alphabetic check is applied to the stemmed form.
        if force_is_alpha and not word.isalpha():
            continue
        if word in excluded:
            continue
        tokens.append(word)

    return tokens


In [7]:
def get_size(text):
    """
    Return the number of spaCy tokens in ``text``.

    Only the tokenizer is run (``nlp.make_doc``): counting tokens does not
    need the tagging / parsing / entity components that ``nlp(text)`` would
    also execute.

    Args:
        text (str): text to tokenize.

    Returns:
        int: number of tokens produced by the tokenizer of ``nlp``.
    """
    # NOTE(review): assumes no pipeline component merges or splits tokens, so
    # the count equals len(nlp(text)) -- confirm if custom components are added.
    return len(nlp.make_doc(text))


In [8]:
# Add, for every description:
#   size         - token count of the raw text (get_size)
#   description2 - cleaned token list (clean_text)
#   size2        - number of tokens left after cleaning
# Callables passed to `assign` are evaluated in order, so `size2` can read
# the `description2` column created just before it.
df = df.assign(
    size=lambda frame: frame["description"].map(get_size),
    description2=lambda frame: frame["description"].map(clean_text),
    size2=lambda frame: frame["description2"].map(len),
)

df.head()


Unnamed: 0,uniq_id,product_name,category,brand,description,size,description2,size2
0,55b85ea15a1536d46b7190ad6fff8ce7,Elegance Polyester Multicolor Abstract Eyelet ...,Home Furnishing,Elegance,Key Features of Elegance Polyester Multicolor ...,257,"[key, featur, eleg, polyest, multicolor, abstr...",142
1,7b72c92c2f6c40268628ec5f14c6d590,Sathiyas Cotton Bath Towel,Baby Care,Sathiyas,Specifications of Sathiyas Cotton Bath Towel (...,84,"[specif, sathiya, cotton, bath, towel, bath, t...",60
2,64d5d4a258243731dc7bbb1eef49ad74,Eurospa Cotton Terry Face Towel Set,Baby Care,Eurospa,Key Features of Eurospa Cotton Terry Face Towe...,250,"[key, featur, eurospa, cotton, terri, face, to...",142
3,d4684dcdc759dd9cdf41504698d737d8,SANTOSH ROYAL FASHION Cotton Printed King size...,Home Furnishing,SANTOSH ROYAL FASHION,Key Features of SANTOSH ROYAL FASHION Cotton P...,151,"[key, featur, santosh, royal, fashion, cotton,...",110
4,6325b6870c54cd47be6ebfbffa620ec7,Jaipur Print Cotton Floral King sized Double B...,Home Furnishing,Jaipur Print,Key Features of Jaipur Print Cotton Floral Kin...,231,"[key, featur, jaipur, print, cotton, floral, k...",146


In [9]:
from collections import Counter

# Document frequency: each word is counted at most once per description,
# so duplicates inside a description are removed first.
unique_words_per_line = df["description2"].apply(set)

# Flatten the per-description sets into one long list of words.
words_list = [word for word_set in unique_words_per_line for word in word_set]

# Number of descriptions in which each word appears.
counter = Counter(words_list)

# Keep the 20 words present in the most descriptions.
most_common_words = counter.most_common(20)

# Tabulate them and add the share of descriptions (in %) containing each word.
result_df = pd.DataFrame(most_common_words, columns=["word", "count"]).assign(
    percentage=lambda table: (table["count"] / len(df)) * 100
)

result_df.round(2)


Unnamed: 0,word,count,percentage
0,product,667,63.52
1,free,595,56.67
2,buy,578,55.05
3,deliveri,566,53.9
4,ship,565,53.81
5,cash,564,53.71
6,genuin,564,53.71
7,price,542,51.62
8,day,514,48.95
9,replac,493,46.95


In [10]:
def get_word_frequencies(text):
    """
    Count how often each word occurs in ``text``.

    The text is processed with the spaCy pipeline ``nlp``; only alphabetic
    tokens that are not stop words are kept, and each is lower-cased before
    being counted.

    Args:
        text (str): text to analyse.

    Returns:
        collections.Counter: lower-cased word -> number of occurrences.
    """
    return Counter(
        token.text.lower()
        for token in nlp(text)
        if not token.is_stop and token.is_alpha
    )


In [11]:
# Rebuild one space-separated string per product from its cleaned token list.
corpus = df["description2"].map(" ".join)
corpus


0       key featur eleg polyest multicolor abstract ey...
1       specif sathiya cotton bath towel bath towel re...
2       key featur eurospa cotton terri face towel set...
3       key featur santosh royal fashion cotton print ...
4       key featur jaipur print cotton floral king siz...
                              ...                        
1045    oren empow extra larg self adhes sticker pack ...
1046    wallmantra larg vinyl sticker sticker pack pri...
1047    buy uberlyf extra larg pigment polyvinyl film ...
1048    buy wallmantra medium vinyl sticker sticker on...
1049    buy uberlyf larg vinyl sticker onlin uberlyf l...
Name: description2, Length: 1050, dtype: object

In [None]:
# Preview the first rows of the working DataFrame.
df.head()


In [None]:
# Per-category non-null counts of products and brands.
# `df` has no "category_lvl_1" or "image" column (grouping on them would
# raise KeyError), so group on "category" and count the product ids instead.
df.groupby("category").agg({"uniq_id": "count", "brand": "count"})
