In [1]:
import pandas as pd

## Récupération de toutes les données

In [2]:
sep = ','

# Load the four raw CSV files, all decoded as latin-1.
# The 'id' column of the train and test sets is discarded right after loading.
data_train = pd.read_csv(
    "../data/train.csv", encoding='latin-1', sep=sep).drop(columns=['id'])
data_test = pd.read_csv(
    "../data/test.csv", encoding='latin-1', sep=sep).drop(columns=['id'])
data_attributes = pd.read_csv(
    "../data/attributes.csv", encoding='latin-1', sep=sep)
data_description = pd.read_csv(
    "../data/product_descriptions.csv", encoding='latin-1', sep=sep)

In [None]:
# Report, one line per loaded frame, the total number of null cells it holds.
print("\n".join(
    f"{label} : {frame.isnull().values.sum()} null values:"
    for label, frame in (
        ("Data train", data_train),
        ("Data test", data_test),
        ("Data attribute", data_attributes),
        ("Data description", data_description),
    )
))

In [3]:
from os.path import exists

# Derived file: every product description joined with its product title
# (titles come from the train and test sets, one row kept per product_uid).
description_title_path = "../data/product_descriptions_title.csv"

if not exists(description_title_path):
    # Build the file once so the notebook also runs on a fresh checkout.
    data_train_test = pd.concat([
        data_train[['product_uid', 'product_title']],
        data_test[['product_uid', 'product_title']],
    ])
    data_description_title = data_description.join(
        data_train_test[['product_uid', 'product_title']].set_index('product_uid'),
        on='product_uid')
    data_description_title = data_description_title.drop_duplicates(
        subset='product_uid')
    data_description_title.to_csv(description_title_path, index=False)

# Always read the frame back from disk so both paths yield the same result.
# NOTE(review): the file is written with to_csv's default encoding but read as
# latin-1 — confirm non-ASCII characters survive the round trip.
data_description_title = pd.read_csv(
    description_title_path, sep=sep, encoding='latin-1')

## Spécification

### Input

id, **product_uid**, *product_title*, **search_term**

### Other input

- product_descriptions.csv : **product_uid**, **product_description**
- attributes.csv : **product_uid**, **name**, **value**

### Output

Indice de correspondance entre la recherche et le produit


# TODO

- stemmer / clean : 
    - traiter les integers
    - traiter les marques
    


## Démarche

Pour créer notre modèle, on découpe le fichier *train.csv* : 67 % pour l'entraîner et 33 % pour le tester.


Déterminer ce qui a le plus d'impact : attributs, description, ...

Nettoyer les données.\
Supprimer les mots inutiles (text mining).

Voir Morelli pour discuter des attributs qui peuvent avoir des valeurs très différentes.



#### Questions

Est-ce que la relevance est un entier ou un float ?\
Est-ce qu'il sera nécessaire de faire un moteur de recherche ?


In [4]:
# Show the training frame (product_uid, product_title, search_term, relevance).
display(data_train)

Unnamed: 0,product_uid,product_title,search_term,relevance
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.50
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00
3,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67
...,...,...,...,...
74062,206638,Atlantic Windowpane 576 CD or 192 DVD Blu-Ray ...,tv riser glass,1.00
74063,206639,Philips 40-Watt Halogen R20 Flood Light Bulb (...,r20 halogen light,3.00
74064,206641,Schlage Camelot In-Active Aged Bronze Handlese...,schlage lock siena half dummy knob with,2.33
74065,206648,Plastec 11 in. x 24 in. Rose Garden Wall Decor...,zen garden decor,3.00


In [4]:
import string

def stemmer(texts):
    """Normalise texts: punctuation becomes spaces, then everything is lowercased.

    Despite its name, this function performs no stemming.

    Args:
        texts: iterable of strings.

    Returns:
        list of str: one cleaned string per input text, in input order. Each
        character found in ``string.punctuation`` is replaced by a single
        space; no whitespace is collapsed or trimmed.
    """
    return [
        ''.join(
            ' ' if char in string.punctuation else char
            for char in text
        ).lower()
        for text in texts
    ]

In [9]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Fetch the NLTK 'stopwords' corpus, which stopwords.words('english') reads below.
from nltk import download as ntlk_download
ntlk_download('stopwords')


def stemmer_ntlk(texts):
    """Clean and stem English texts with NLTK.

    Each text is lowercased, every character of ``string.punctuation`` is
    replaced by a space, English stop words are removed and each remaining
    word is reduced to its Porter stem.

    Args:
        texts: iterable of strings.

    Returns:
        list of str: one string per input text, in input order, made of the
        stemmed words joined by single spaces. The string is empty when no
        word survives the filtering, and carries no trailing space.
    """
    porter = PorterStemmer()
    stop_w = set(stopwords.words('english'))
    # Built once for all texts: maps every punctuation character to a space.
    punct_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation))

    res = []
    for text in texts:
        # Remove punctuation
        text_no_ponc = text.lower().translate(punct_to_space)

        # Drop stop words (e.g. "a", "the", ...)
        words_filtered = [w for w in text_no_ponc.split() if w not in stop_w]

        # read, reading => read
        res.append(' '.join(porter.stem(word) for word in words_filtered))

    return res

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
from os import makedirs
from os.path import dirname, exists

# Cache of the cleaned (stemmed) descriptions and titles: computing them over
# every product is slow, so the result is stored on disk and reused.
path = '../data/tmp/product_descriptions_title_clean.csv'
if exists(path):
    # Bind the cached frame to the name the following cells use; binding it to
    # `data_description_title` left `data_description_title_clean` undefined.
    data_description_title_clean = pd.read_csv(path, sep=sep)
else:
    columns = ["product_uid", "product_description",
           "clean_product_description", "product_title", "clean_product_title", ]

    # Copy the existing columns; the two "clean_*" columns are filled below.
    data_description_title_clean = pd.DataFrame(
        columns=columns, data=data_description_title)

    data_description_title_clean[columns[2]] = stemmer_ntlk(
        data_description_title_clean.product_description)

    data_description_title_clean[columns[4]] = stemmer_ntlk(
        data_description_title_clean.product_title)

    # Make sure the cache directory exists before writing into it.
    makedirs(dirname(path), exist_ok=True)
    data_description_title_clean.to_csv(path, index=False)

In [23]:
# Show the raw and cleaned description/title columns side by side.
display(data_description_title_clean)

Unnamed: 0,product_uid,product_description,clean_product_description,product_title,clean_product_title
0,100001,"Not only do angles make joints stronger, they ...",angl make joint stronger also provid consist s...,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie 12 gaug angl
1,100002,BEHR Premium Textured DECKOVER is an innovativ...,behr premium textur deckov innov solid color c...,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,behr premium textur deckov 1 gal sc 141 tugboa...
2,100003,Classic architecture meets contemporary design...,classic architectur meet contemporari design e...,STERLING Ensemble 33-1/4 in. x 60 in. x 75-1/4...,sterl ensembl 33 1 4 x 60 x 75 1 4 bath shower...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...,grape solar 265 watt polycrystallin pv solar p...,Grape Solar 265-Watt Polycrystalline Solar Pan...,grape solar 265 watt polycrystallin solar pane...
4,100005,Update your bathroom with the Delta Vero Singl...,updat bathroom delta vero singl handl shower f...,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,delta vero 1 handl shower faucet trim kit chro...
...,...,...,...,...,...
124423,224424,Create a neat yet stylish storage space for or...,creat neat yet stylish storag space organ bath...,stufurhome Norma 24 in. W x 16 in. D x 34 in. ...,stufurhom norma 24 w x 16 x 34 h linen storag ...
124424,224425,Our Bullnose Adirondack Chair Cushions fit Adi...,bullnos adirondack chair cushion fit adirondac...,Home Decorators Collection 49 in. D Alessandro...,home decor collect 49 alessandro spiceberri po...
124425,224426,Joist hangers are designed to provide support ...,joist hanger design provid support underneath ...,Simpson Strong-Tie HB 3-1/2 x 14 in. Top Flang...,simpson strong tie hb 3 1 2 x 14 top flang joi...
124426,224427,These socket cap screws are ideal for applicat...,socket cap screw ideal applic requir well tool...,1/4 in. -20 tpi x 1-1/2 in. Stainless Steel Bu...,1 4 20 tpi x 1 1 2 stainless steel button head...


In [15]:
# Count how many times each stemmed word occurs across all cleaned descriptions.
dico_word = {}

# Iterate over the single column needed instead of iterrows(), which builds a
# full Series per row. Missing values are skipped.
# NOTE(review): presumably an empty cleaned description comes back as NaN when
# the frame is reloaded from its CSV cache — confirm.
for description in data_description_title_clean['clean_product_description'].dropna():
    for word in description.split():
        dico_word[word] = dico_word.get(word, 0) + 1

In [17]:
# Rebuild the dictionary with its entries ordered by ascending count
# (the sort is stable, so words with equal counts keep their relative order).
dico_word = {
    word: count
    for word, count in sorted(dico_word.items(), key=lambda entry: entry[1])
}

In [18]:
import json

# Persist the word counts as a JSON object (word -> count).
with open("../data/tmp/description_words.json", "w") as words_file:
    words_file.write(json.dumps(dico_word))

In [15]:
# Training rows whose relevance score is at least 2.5.
data_train.loc[data_train['relevance'] >= 2.5]

Unnamed: 0,product_uid,product_title,search_term,relevance
0,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00
1,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.50
2,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00
4,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67
5,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,convection otr,3.00
...,...,...,...,...
74058,206627,25 in. Stainless Tip-Out Sink Front Tray,sink tip-out tray,3.00
74059,206631,Masonite New Haven Three Quarter Oval Lite Pri...,fiberglass front doors by masonite,3.00
74061,206637,Schluter Rondec Stainless Steel 3/8 in. x 1 in...,rondec stainless steel 3/8 edge protection,3.00
74063,206639,Philips 40-Watt Halogen R20 Flood Light Bulb (...,r20 halogen light,3.00


In [None]:
from collections import Counter

# Count how often each attribute name appears and keep the 20 most frequent.
# The result gets its own name: assigning it to `Counter` would shadow the
# class and break any later `Counter(...)` call.
attribute_name_counts = Counter(data_attributes['name'])
most_occur = attribute_name_counts.most_common(20)
print(most_occur)

In [16]:
# Split the labelled data into two sets (train / test).
# NOTE(review): test_size=0.2 gives an 80 % / 20 % split, while the "Démarche"
# section of this notebook announces 67 % / 33 % — confirm which one is intended.
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    data_train, test_size=0.2, random_state=42)


In [16]:
# Build a TF-IDF representation of the raw product descriptions:
# token counts first, then TF-IDF re-weighting of those counts.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# NOTE(review): csc_matrix is imported but not used in this cell.
from scipy.sparse import csc_matrix

# Learn the vocabulary and count the tokens of every description.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data_description_title['product_description'].to_list())


# Re-weight the raw counts with TF-IDF.
# NOTE(review): the MemoryError recorded below reports a dense
# (124428, 268859) float64 allocation, but nothing in this cell explicitly
# converts to a dense array — the output may come from an earlier version of
# the cell; confirm by re-running.
tfidf_transformer = TfidfTransformer()
tfidf_X = tfidf_transformer.fit_transform(X)
tfidf_X

MemoryError: Unable to allocate 249. GiB for an array with shape (124428, 268859) and data type float64

In [10]:
data_description_title['product_description'].to_list()

['Not only do angles make joints stronger, they also provide more consistent, straight corners. Simpson Strong-Tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. Some can be bent (skewed) to match the project. For outdoor projects or those where moisture is present, use our ZMAX zinc-coated connectors, which provide extra resistance against corrosion (look for a "Z" at the end of the model number).Versatile connector for various 90 connections and home repair projectsStronger than angled nailing or screw fastening aloneHelp ensure joints are consistently straight and strongDimensions: 3 in. x 3 in. x 1-1/2 in.Made from 12-Gauge steelGalvanized for extra corrosion resistanceInstall with 10d common nails or #9 x 1-1/2 in. Strong-Drive SD screws',
 'BEHR Premium Textured DECKOVER is an innovative solid color coating. It will bring your old, weathered wood or concrete back to life. The advanced