### Table of Contents

* [Chapter 1](#chapter1) Nettoyage des données


# Importation et analyse des données

In [None]:
# import des librairies
import numpy as np
import pandas as pd
import glob
import scipy.stats as stats

## Les figures sont réalisées avec Plotly
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from   plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.collections import LineCollection

from sklearn.model_selection import train_test_split

import logging as lg
import os
from PIL import Image,ImageOps,ImageFilter

import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab module);
# the data paths used below live under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Préparation des données pour le CNN

In [None]:
# Root data folder, image folder and resized-image folder on the mounted Drive.
# Each path is derived from the previous one so the root is written only once.
path_data='/content/drive/MyDrive/OC Asma - Tahri/P06/Data/'
path_image=path_data+'Flipkart/Images/'
path_res=path_image+'resized/'

In [None]:
# Load the description/image table; the first CSV column is used as the index.
desc_image_csv = os.path.join(path_data, "df_desc_image.csv")
df_desc_image = pd.read_csv(desc_image_csv, index_col=0)
# Display two random rows as a quick sanity check.
df_desc_image.sample(2)

Unnamed: 0,uniq_id,cat_level_0,product_name,description,brand,image
386,59d964c38c787f829c6cfa5629e4df90,Beauty and Personal Care,French Factor Man of the Year Deodorant Gift S...,Flipkart.com: Buy French Factor Man of the Yea...,,59d964c38c787f829c6cfa5629e4df90.jpg
578,5338c00e0ddaec0f3af737077e0bbd91,Watches,Timewel 1100-N1949_B Analog Watch - For Women,Timewel 1100-N1949_B Analog Watch - For Women...,,5338c00e0ddaec0f3af737077e0bbd91.jpg


In [None]:
# Use the image file name as the index.
# Reassigning (rather than inplace=True) together with the column check keeps
# this cell safe to re-run: once 'image' is the index it is no longer a column.
if 'image' in df_desc_image.columns:
  df_desc_image = df_desc_image.set_index('image')

In [None]:
# Distinct category names, used to label the images.
classes = pd.unique(df_desc_image['cat_level_0']).tolist()
classes

['Home Furnishing',
 'Baby Care',
 'Watches',
 'Home Decor & Festive Needs',
 'Kitchen & Dining',
 'Beauty and Personal Care',
 'Computers']

In [None]:
# Folder that receives one sub-folder per category.
path_cat = f"{path_data}Flipkart/Images/"

In [None]:
# Helper to create new folders.
def new_folder(path,name):
  """Create the directory ``path + name`` and report the outcome.

  Args:
    path: Parent location, concatenated as-is with ``name`` (so it is
      expected to end with a path separator).
    name: Name of the folder to create.

  Returns:
    None. On success a confirmation is printed. If the folder already exists
    a warning is logged. Any other OS-level failure (for example a missing
    parent folder) is printed instead of being raised.
  """
  try:
    os.mkdir(path+name)
    print('Le dossier:{}'.format(name)+' est créé')
  except FileExistsError as e:
    lg.warning('Message: {}'.format(os.strerror(e.errno)))
  except OSError as e:
    # The previous bare `except:` had no bound `e`, so reaching this branch
    # raised NameError instead of reporting the actual OS error.
    print(os.strerror(e.errno))

In [None]:
# Create one folder per category to hold that category's images.
category_root = path_data+'Flipkart/Images/'
for classe in classes:
  new_folder(category_root, classe)



In [None]:
# Save each resized image into the folder of its category.
# Iterating over (image, category) pairs gives the target class directly, so no
# inner scan over `classes` is needed, and it avoids the chained label lookup
# that returns a Series (ambiguous truth value) when an image id is duplicated.
for im, classe in df_desc_image['cat_level_0'].items():
  path_classe=path_cat+classe+'/'
  # The context manager releases the source file even if saving fails.
  with Image.open(path_res+im) as image:
    image.save(path_classe+ im.split("/")[-1])


In [None]:
# Helper building one indicator DataFrame per class.
def create_df(path,classe):
  """Build the indicator frame of the ``.jpg`` files found in one class folder.

  Args:
    path: Folder containing the class sub-folders, concatenated as-is with
      ``classe``.
    classe: Name of the class sub-folder; also used as the column name.

  Returns:
    A DataFrame indexed by file name (index name ``img_id``) with a single
    column named ``classe`` set to 1 on every row. The frame has no rows when
    the folder contains no ``.jpg`` file.
  """
  # glob gives no ordering guarantee; sorting makes the row order (and any
  # seeded split computed from it later) independent of the filesystem.
  all_files = sorted(glob.glob(os.path.join(path+classe, "*.jpg")))

  # Keep only the file name of each match.
  list_id = [os.path.basename(file_path) for file_path in all_files]

  df_classe = pd.DataFrame({'img_id': list_id})
  df_classe[classe] = 1
  return df_classe.set_index('img_id')

In [None]:
# Assemble the per-class indicator frames side by side into a single dataset.
class_frames = [create_df(path_cat, classe) for classe in classes]
data = pd.concat(class_frames, axis=1)

In [None]:
# Cells left empty by the column-wise concatenation are filled with 0.
data = data.fillna(value=0)
data

Unnamed: 0,Home Furnishing,Baby Care,Watches,Home Decor & Festive Needs,Kitchen & Dining,Beauty and Personal Care,Computers
55b85ea15a1536d46b7190ad6fff8ce7.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6325b6870c54cd47be6ebfbffa620ec7.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0
d4684dcdc759dd9cdf41504698d737d8.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0
a4b09aa7927c59fbb39960b3d7919909.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0
81e739c7f3be737152f2e6f520e3494b.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
9b805665a791f40e7946a73dcea17e35.jpg,0.0,0.0,0.0,0.0,0.0,0.0,1.0
b5834f1e97c74d2833dbcb09a3f27926.jpg,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7815fb39feb773db944c9f03bbdcbf64.jpg,0.0,0.0,0.0,0.0,0.0,0.0,1.0
d407ab825e1a266635214bfe893f29c4.jpg,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Nous avons obtenu ici une base de données de nos 1050 photos étiquetées avec 1 pour la catégorie correspondante et 0 pour les autres catégories.

In [None]:
# Class names, taken from the column labels of the dataset.
classes = list(data.columns)
classes

['Home Furnishing',
 'Baby Care',
 'Watches',
 'Home Decor & Festive Needs',
 'Kitchen & Dining',
 'Beauty and Personal Care',
 'Computers']

In [None]:
# Build three subsets: 70% for training, then the remaining 30% is split in
# half between test and validation. Both splits are shuffled with a fixed seed.
trainset, test_valid = train_test_split(
    data,
    test_size=0.3,
    random_state=1,
    shuffle=True,
)
testset, validset = train_test_split(
    test_valid,
    test_size=0.5,
    random_state=1,
    shuffle=True,
)
print('trainset size:',trainset.shape,'validset size:',validset.shape,'testset size:',testset.shape)

trainset size: (735, 7) validset size: (158, 7) testset size: (157, 7)


In [None]:
# Names of the split folders (train / test / validation).
files=['train','test','val']

In [None]:
# Write the three splits and the full dataset to CSV files in the data folder.
csv_exports = (
    ("train.csv", trainset),
    ("test.csv", testset),
    ("valid.csv", validset),
    ("data.csv", data),
)
for csv_name, frame in csv_exports:
  frame.to_csv(path_data+csv_name)

In [None]:
# Create the 'Multi_labels' folder directly under the data folder.
new_folder(path_data,'Multi_labels')

Le dossier:Multi_labels est créé


In [None]:

# Create each split folder twice: once under Multi_labels/ and once directly
# under the data folder.
multi_labels_root = path_data+'Multi_labels/'
for file in files:
  new_folder(multi_labels_root, file)
  new_folder(path_data, file)


Le dossier:train est créé
Le dossier:train est créé
Le dossier:test est créé
Le dossier:test est créé
Le dossier:val est créé
Le dossier:val est créé


In [None]:
# Helper saving the images of a dataset into a single folder.
def save_all_images(dataset,path):
  """Save every image referenced by ``dataset`` into ``path``.

  Each index label of ``dataset`` is treated as a file name relative to the
  module-level ``path_res`` folder; the image is opened from there and saved
  under ``path`` using the last ``/``-separated component of the label.

  Args:
    dataset: Object whose ``index`` holds the image file names.
    path: Destination folder, concatenated as-is with the file name.

  Returns:
    None.
  """
  for img_id in dataset.index:
    # The context manager releases the source file even if saving fails.
    with Image.open(path_res+img_id) as im:
      im.save(path+ img_id.split("/")[-1])

In [None]:
# Folders of the test, train and validation splits under Multi_labels/.
multi_labels_dir = path_data+'Multi_labels/'
path_test = multi_labels_dir+'test/'
path_train = multi_labels_dir+'train/'
path_val = multi_labels_dir+'val/'

In [None]:
# Save the images of each split (train, test, validation) into its own folder
# directly under the data folder.
for split_frame, split_dir in ((trainset, 'train/'), (testset, 'test/'), (validset, 'val/')):
  save_all_images(split_frame, path_data+split_dir)

In [None]:
# Create one sub-folder per category inside each split folder of Multi_labels/
# (train, then test, then val for every category).
split_dirs = [
    path_data+'Multi_labels/train/',
    path_data+'Multi_labels/test/',
    path_data+'Multi_labels/val/',
]
for classe in classes:
  for split_dir in split_dirs:
    new_folder(split_dir, classe)


Le dossier:Home Furnishing est créé
Le dossier:Home Furnishing est créé
Le dossier:Home Furnishing est créé
Le dossier:Baby Care est créé
Le dossier:Baby Care est créé
Le dossier:Baby Care est créé
Le dossier:Watches est créé
Le dossier:Watches est créé
Le dossier:Watches est créé
Le dossier:Home Decor & Festive Needs est créé
Le dossier:Home Decor & Festive Needs est créé
Le dossier:Home Decor & Festive Needs est créé
Le dossier:Kitchen & Dining est créé
Le dossier:Kitchen & Dining est créé
Le dossier:Kitchen & Dining est créé
Le dossier:Beauty and Personal Care est créé
Le dossier:Beauty and Personal Care est créé
Le dossier:Beauty and Personal Care est créé
Le dossier:Computers est créé
Le dossier:Computers est créé
Le dossier:Computers est créé


In [None]:
# Helper saving the resized images into per-category folders.
def save_resized_image(dataset,path):
  """Save each image of ``dataset`` into the sub-folder of every class it has.

  For every row, the image named by the index label is opened from the
  module-level ``path_res`` folder and saved to ``path + classe + '/'`` for
  each name in the module-level ``classes`` list whose column equals 1.

  Args:
    dataset: DataFrame indexed by image file name, with one indicator column
      per entry of ``classes``.
    path: Folder containing the class sub-folders, concatenated as-is.

  Returns:
    None.
  """
  for pos, img_id in enumerate(dataset.index):
    # The context manager releases the source file even if saving fails.
    with Image.open(path_res+img_id) as im:
      for classe in classes:
        # Explicit positional access: an integer key on a label-indexed Series
        # (the previous `dataset[classe][i]`) is deprecated in recent pandas.
        if dataset[classe].iloc[pos]==1:
          im.save(path+classe+'/'+ img_id.split("/")[-1])



Application de la fonction de sauvegarde pour chaque groupe d'images :

In [None]:
# Save the test-split images into the per-category sub-folders of path_test.
save_resized_image(testset,path_test)

In [None]:
# Save the train-split images into the per-category sub-folders of path_train.
save_resized_image(trainset,path_train)

In [None]:
# Save the validation-split images into the per-category sub-folders of path_val.
save_resized_image(validset,path_val)