In [1]:
import collections
import functools
import os
import re
import unicodedata

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import wordcloud as wc

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
input_path = "data/cdiscount_train.csv.zip"
# Number of rows read from the input file (itself already a reduced extract).
nb_line = 100000
data_all = (
    pd.read_csv(input_path, sep=",", nrows=nb_line)
    .fillna("")  # missing values become empty strings
)
data_all.head()

Unnamed: 0,Categorie1,Categorie2,Categorie3,Description,Libelle,Marque
0,INFORMATIQUE,CONNECTIQUE - ALIMENTATION,BATTERIE,Batterie Acer Aspire One 751H-52Yr - Li-Ion 11...,Batterie Acer Aspire One 751H-52Yr,AUCUNE
1,TELEPHONIE - GPS,ACCESSOIRE TELEPHONE,COQUE - BUMPER - FACADE TELEPHONE,Coque rigide Bleu lagon pour ALCATEL OT / 6033...,Coque rigide Bleu lagon pour ALCATEL OT / 6033 …,MUZZANO
2,TELEPHONIE - GPS,ACCESSOIRE TELEPHONE,COQUE - BUMPER - FACADE TELEPHONE,Facades et coques CELLULAR LINE SHCKGALS 3 MIN...,Facades et coques CELLULAR LINE SHCKGALS 3 MINIP,CELLULAR LINE
3,TELEPHONIE - GPS,ACCESSOIRE TELEPHONE,COQUE - BUMPER - FACADE TELEPHONE,Coque meteore TPU LG Nexus 4 / E960,Coque meteore TPU LG Nexus 4 / E960,AUCUNE
4,TELEPHONIE - GPS,ACCESSOIRE TELEPHONE,COQUE - BUMPER - FACADE TELEPHONE,Coque souple Transparente pour LG G FLEX D959 ...,Coque souple Transparente pour LG G FLEX D959 m…,MUZZANO


In [3]:
@functools.lru_cache(maxsize=None)
def _french_stopwords():
    """Build, once, the set of stop words that `cleaning` filters out.

    The NLTK French stop words are stripped of their accents so that they
    match the accent-free text produced by `cleaning`; "voir" and
    "presentation" are added on top of them.

    Returns:
        frozenset of str: the accent-free stop words.
    """
    unaccented = {
        unicodedata.normalize('NFD', sw).encode('ascii', 'ignore').decode("utf-8")
        for sw in nltk.corpus.stopwords.words('french')
    }
    return frozenset(unaccented | {"voir", "presentation"})


@functools.lru_cache(maxsize=None)
def _french_stemmer():
    """Create, once, the French Snowball stemmer shared by all `cleaning` calls."""
    return nltk.stem.SnowballStemmer('french')


def cleaning(voca):
    """Normalise a text and return its stemmed tokens as a single string.

    The text is lower-cased, a few special characters are replaced, accents
    are stripped, every character other than ``a-z`` is turned into a space,
    stop words and tokens of two characters or fewer are dropped, and the
    remaining tokens are stemmed with the French Snowball stemmer.

    Args:
        voca (str): the raw text to clean.

    Returns:
        str: the stems in their original order, each one followed by a single
        space (so a non-empty result ends with a space); ``""`` when no token
        survives the filtering.
    """
    # STEP 1: lower-case the text and replace a few special characters.
    cleaned_voc = voca.lower()
    cleaned_voc = cleaned_voc.replace(u'\u2026', '.')  # horizontal ellipsis
    cleaned_voc = cleaned_voc.replace(u'\u00a0', ' ')  # no-break space
    cleaned_voc = cleaned_voc.replace(u'\u005F', ' ')  # underscore

    # STEP 2: strip accents (decompose, then drop every non-ASCII code point).
    cleaned_voc = unicodedata.normalize('NFD', cleaned_voc).encode('ascii', 'ignore').decode("utf-8")

    # STEP 3: keep only alphabet letters. Underscores were already replaced in
    # step 1, so none is left for the pattern to preserve.
    cleaned_voc = re.sub('[^a-z_]', ' ', cleaned_voc)

    # Tokenise; keep only tokens longer than two characters that are not
    # stop words. The stop-word set is cached, so it is not rebuilt per call.
    stopwords = _french_stopwords()
    tokens = [w for w in cleaned_voc.split(" ") if len(w) > 2 and w not in stopwords]

    # Stemming reduces each token to its root.
    stemmer = _french_stemmer()

    # Every stem is followed by a space, which keeps the historical output
    # format (trailing space included).
    return "".join(stemmer.stem(token) + " " for token in tokens)

In [4]:
# Keep the top-level category and the raw description of each product,
# then pair every category with its cleaned description.
data = data_all[["Categorie1", "Description"]]
list_data = [
    [category, cleaning(description)]
    for category, description in data.values
]

In [5]:
# Assemble the [category, cleaned description] pairs into a DataFrame.
data_cleaned = pd.DataFrame(list_data,columns= ["Categorie1","Description"]) 
data_cleaned

Unnamed: 0,Categorie1,Description
0,INFORMATIQUE,batter acer aspir one ion mah noir compatibl b...
1,TELEPHONIE - GPS,coqu rigid bleu lagon alcatel motif drapeau li...
2,TELEPHONIE - GPS,facad coqu cellular lin shckgal minip marqu ag...
3,TELEPHONIE - GPS,coqu meteor tpu nexus
4,TELEPHONIE - GPS,coqu soupl transparent flex motif keep calm an...
...,...,...
99995,DECO - LINGE - LUMINAIRE,souvenir franc tour eiffel miniatur prestig co...
99996,LIBRAIRIE,fast track wast fre manufacturing john dav fas...
99997,TELEPHONIE - GPS,fitbag bong giraf houss pochet telephon portab...
99998,DECO - LINGE - LUMINAIRE,grand tableau minn


In [6]:
# Each entry of list_data is a [category, cleaned description] pair;
# only the cleaned description is fed to the vectorizer.
list_tokens = [description for _category, description in list_data]
CV = CountVectorizer()
CV.fit(list_tokens)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

## CountVectorizer Analysis:




In [7]:
# Token -> feature index mapping learned by the fitted CountVectorizer.
CV.vocabulary_

{'batter': 4798,
 'acer': 314,
 'aspir': 3258,
 'one': 38199,
 'ion': 27233,
 'mah': 32322,
 'noir': 37207,
 'compatibl': 11521,
 'coqu': 12113,
 'rigid': 45105,
 'bleu': 6101,
 'lagon': 29830,
 'alcatel': 1266,
 'motif': 35575,
 'drapeau': 16285,
 'liberi': 30911,
 'film': 20526,
 'ultra': 53950,
 'fin': 20546,
 'original': 38497,
 'muzzano': 36203,
 'facad': 19687,
 'cellular': 9232,
 'lin': 31099,
 'shckgal': 47679,
 'minip': 34503,
 'marqu': 32904,
 'agree': 935,
 'samsungmobil': 46295,
 'galaxy': 22058,
 'minimatier': 34493,
 'caoutchouc': 8389,
 'soupl': 49077,
 'meteor': 34000,
 'tpu': 52897,
 'nexus': 36908,
 'transparent': 53059,
 'flex': 20859,
 'keep': 28721,
 'calm': 8124,
 'and': 1994,
 'play': 40920,
 'football': 21157,
 'footbal': 21156,
 'compatibilit': 11514,
 'iphon': 27275,
 'caracterist': 8503,
 'etuis': 19148,
 'corp': 12234,
 'enti': 18369,
 'styl': 50159,
 'couleur': 12452,
 'uni': 54079,
 'blanc': 5976,
 'ros': 45549,
 'seiko': 47194,
 'sfp': 47563,
 'homm': 254

In [8]:
# Encode every cleaned description with the fitted vocabulary.
features = CV.transform(list_tokens)
features
# 100,000 products for 57,925 tokens

<100000x57925 sparse matrix of type '<class 'numpy.int64'>'
	with 1061100 stored elements in Compressed Sparse Row format>

In [9]:
# Feature indices of product 167
features.getrow(167).indices

array([ 4798,  9230, 11521, 19191, 19660, 27246, 31268, 33643, 46483,
       51731, 52742, 53794], dtype=int32)

In [10]:
# A value of 1 indicates that token number 51731 is present in product 167.
# The sparse matrix is indexed directly: features.toarray() would first
# materialise the whole dense matrix just to read a single cell.
print(features[167, 51731])

1


In [11]:
# Which token is stored at index 51731?
[(token, index) for token, index in CV.vocabulary_.items() if index == 51731]

[('tension', 51731)]

In [13]:
# List the tokens that contain the substring "ads"
[(token, index) for token, index in CV.vocabulary_.items() if 'ads' in token]


[('adsl', 674),
 ('roadst', 45266),
 ('headset', 24794),
 ('sacochespaceipadspac', 46081),
 ('spreadsheet', 49462),
 ('roadstar', 45267),
 ('headscarv', 24793),
 ('headstomp', 24796),
 ('caadspatep', 7876),
 ('nadsl', 36330),
 ('bradshaw', 7017),
 ('roadsong', 45265),
 ('hadston', 24331),
 ('roadston', 45268),
 ('headsethic', 24795),
 ('roadsid', 45264)]

In [20]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(); use whichever this version provides.
feature_names = (
    CV.get_feature_names_out()
    if hasattr(CV, "get_feature_names_out")
    else CV.get_feature_names()
)
# list() keeps the printed form identical whether the names come back as a
# list or as an array.
print(list(feature_names[51731:51740]))

['tension', 'tensoval', 'tent', 'tentat', 'tentediver', 'tention', 'tentur', 'tenu', 'tenueavec']


##  Comments:

    - on a enlevé les tirets bas '_' dans le cleaning : on avait plein de mots bidons comme "__________ysl"
    - plein de mots sont mal écrits dès le départ dans les descriptions, comme "sacochespaceipadspace" ou "tenueavec"
    - des mots qui ne diffèrent que d'une seule lettre (ex. "football" / "footbal") : fusion de tokens ?