In [1]:
import numpy as np
import pandas as pd
import altair as alt
alt.data_transformers.enable('default', max_rows=None)

from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

from warnings import filterwarnings
filterwarnings('ignore')
from scipy import stats

## Import the datasets

In [2]:
women = pd.read_csv("/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/women.csv")
men = pd.read_csv("/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/men.csv")
kids = pd.read_csv("/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/kids.csv")
bags = pd.read_csv("/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/bags.csv")
shoes= pd.read_csv("/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/shoes.csv")


## Concatenate the datasets

In [3]:
frames = [women, men, kids,bags,shoes]

fdata = pd.concat(frames)


In [4]:
fdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47193 entries, 0 to 11822
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   category               47193 non-null  object 
 1   subcategory            47193 non-null  object 
 2   name                   47193 non-null  object 
 3   current_price          47193 non-null  float64
 4   raw_price              47193 non-null  float64
 5   currency               47193 non-null  object 
 6   discount               47193 non-null  int64  
 7   likes_count            47193 non-null  int64  
 8   is_new                 47193 non-null  bool   
 9   brand                  9018 non-null   object 
 10  brand_url              5608 non-null   object 
 11  codCountry             41389 non-null  object 
 12  variation_0_color      46752 non-null  object 
 13  variation_1_color      38535 non-null  object 
 14  variation_0_thumbnail  46752 non-null  object 
 15  va

# Feature Engineering 

## Drope unrelated columns

In [5]:
#1. drop columns from df1
fdata=fdata.drop(['currency',
                    'brand_url',
                    'variation_0_thumbnail',
                    'variation_0_image',
                    'variation_1_thumbnail',
                    'variation_1_image',
                    'image_url',
                    'url',
                    'brand',
                    'is_new',
                    'codCountry'], axis =1 )

## Add two_colors column

In [6]:
fdata['two_colors']= np.where((fdata['variation_1_color'].notna()), 1,0)
fdata['two_colors'] = fdata['two_colors'].astype('bool')


### Remove the non-understandable colors

In [7]:
fdata['variation_0_color'] = fdata['variation_0_color'].str.replace('\W+','')
fdata['variation_0_color'] = fdata['variation_0_color'].str.replace('\d+','')
fdata['variation_1_color'] = fdata['variation_1_color'].str.replace('\W+','')
fdata['variation_1_color'] = fdata['variation_1_color'].str.replace('\d+','')

In [8]:
fdata = fdata[~fdata['variation_0_color'].isin(['# 01','#01','4' ,'#03', '86', '\xa0Frontière colorée','# 2','Comme le montre l&#39;image', '#1', '02', '\xa0# 24', '# 05' , '01', '1', '# 02','\xa0Point','001', '1#' ,'# 03','07','#7','03','2','# 04' , '# 07', '# 5','11','59','25','08' ,'53', '#06','#05','# 06', 'd&#39;or', '05', '8','#02','6 #','17','09','2 #', '5','23', '# 4', '\xa0Bordure bleue', 'As shown', '04', '63', '# 08','Figure 1 color','18' ,'\xa0Croix Beige','14', '#3','Or # 2',"# 04"])]
fdata = fdata[~fdata['variation_1_color'].isin(['# 01','4' ,'#03', '86', 'Jaune d&#39;armée', '\xa0Frontière colorée','# 2','Comme le montre l&#39;image', '#1', '02', '\xa0# 24', '# 05' , '01', '1', '# 02','\xa0Point','001', '1#' ,'# 03','07','#7','03','2','# 04' , '# 07', '# 5','11','59','25','08' ,'53', '#06','#05','# 06', 'd&#39;or', '05', '8','#02','6 #','17','09','2 #', '5','23', '# 4', '\xa0Bordure bleue', 'As shown', '04', '63', '# 08','Figure 1 color','18' ,'\xa0Croix Beige','14', '#3','Or # 2',"# 04"])]


## Handel the color columns

In [9]:
fdata.dropna(subset = ["variation_0_color"], inplace=True) #remove NA's
fdata = fdata[fdata.variation_0_color != ""]

## Unify colors

In [10]:
fdata['variation_0_color'] = fdata['variation_0_color'].replace(['White' , 'Blanccrème','OffWhite',  'white', 'Navetblanc',  'Blackwhite' , 'blanc' ],'White')
fdata['variation_0_color'] = fdata['variation_0_color'].replace(['Green','Commelemontrelimage' ,'chameau','Vertdherbe', 'Vert','LightGreen',  'ArmyGreen', 'DarkGreen', 'LakeGreen',   'vert' , 'PeacockGreen' ,  'Vertfoncé' ,  'arméeverte' , 'Mentheverte' ,  'Vertclair' ,   'vertjaune' , 'JauneVert' ,  'Vertfluorescent' ,  'Vertdherbe'   'Vert' ,    'OliveDrab',  'Vertgris',  'Loup',  'Camouflage',  'BrightGreen',  'Vertprofond',  'Vertjaunâtre' ,  'Vertor',  'BleuVert',  'Kakigris',  'CamouflageVert', 'CamouflageNoirVert' , 'Camouflageblanc' ,  'Oretvert' ,  'RougeVert' ,  'CamouflageBlanc' , 'Arbre', 'Jungle',  'Pommeverte' ,  'Vertetorange' , 'Serpentine' ],'Green')
fdata['variation_0_color'] = fdata['variation_0_color'].replace([ 'Red' , 'rougebrique','RED','WineRed', 'BrickRed' ,  'RustRed' , 'JujubeRouge',  'Rouge', 'banderouge',  'Rosefoncé', 'Floral' , 'Rougeorange' , 'Vinrouge' , 'Claret' , 'marron',  'Rouillerouge', 'Roserouge',  'Rougeetmarine' ,  'Lettrerouge' ,  'Bordeaux' , 'Pastèquerouge',  'roseblanche', 'Rougefoncé',  'Bourgogne',  'Camouflagerouge' ,  'Rougeclair',  'RougeBlanc',  'rougebrique' 'RED', 'Rougeàvin' , 'Melonrouge' , 'RedBrown' ],'Red')
fdata['variation_0_color'] = fdata['variation_0_color'].replace([ 'Black' ,'NoirVert', 'black', 'Noir',  'Noirrouge', 'Fleurnoir',  'NoirBlanc' ,  'Sansfilnoir',  'Blancnoir' , 'CamoNoir', 'Blackstrip' ,  'BK',  'Noiretbleu', 'rougeetnoir',  'RougeNoir' , 'lagrille', 'Noirblancrouge' , 'Feràrepasser',  'Brunfoncé' ,  'NoirRouge' , 'BlackPlushLining',  'Ornoir' ],'Black')
fdata['variation_0_color'] = fdata['variation_0_color'].replace(['Blue' ,'Bordurebleue' ,'BleuRouge','Bleuetorange','NoirBleu',  'SkyBlue' , 'Navy', 'Bleu',  'Bleufoncé', 'LightBlue',  'LakeBlue', 'DarkBlue', 'Bleublanc', 'Bleumarin',  'BleuRose', 'Bleuciel', 'Bleuprofond',  'NoirBleu' 'BleuRouge', 'Bleuviolet', 'royalBlue' , 'JauneBleu', 'RoyalBlue' , 'BleuNoir',  'BleuGray' , 'BleuFloral',  'Fleurbleue', 'NavyBlue',  'Marine' , 'Bleuclair' , 'Unjeanbleu' , 'BlancJaune' ,  'Royal' ,  'lacbleu' , 'Cyan' , 'Bleuvert', 'BleuOr', 'OrBleuocéan',  'JeanBlue' ,'Bleukaki',  'Rayurebleue',  'Bleurouge', 'blancbleu' , 'Bleuroyal' , 'Blancbleu' , 'Rayuresbleues', 
'Bleujaune' , 'Navystripes', 'Largebluebeltgreennet', 'SanBlue',  'Océanbleu', 'Paonbleu', 'blue' ,  'BluePlushLining' , 'Bulelac'],'Blue')
fdata['variation_0_color'] = fdata['variation_0_color'].replace([ 'Grey','Couleurdupistolet', 'Gray',  'LightGrey',  'DarkGray',  'Silver', 'DarkGrey' , 'Gris' ,  'gris' , 'Grisfoncé',  'Grisclair', 'argent',  'Rayuregrise', 'Grisargenté' , 'grisBleu',  'Grisnoir' , 'grisnoir' , 'Grisetvert' , 'Blancetgris', 'MarronGris' ,'NoirGrisBlanc',  'GrisBleu', 'bleuGris',  'NoirGris',  'CamouflageGris' ,  'GrisNoir' ,  'GrisRose' ,   'Éléphant',  'Grisdelarmée', 'Noirargent' ,  'GrisViolet'],'Gray')
fdata['variation_0_color'] = fdata['variation_0_color'].replace([ 'Yellow' ,'Jaunedarmée','CroixBeige','Jaunecitron', 'Gold' ,  'BlancGris',  'Blanc' , 'Jaune' ,  'Or' , 'Cameo', 'camel', 'Fleurjaune',  'Champagne',  'Jauneclair',  'EarthYellow' ,  'Jaunevif', 'LightYellow',  'Jaunemarron', 'Largerhombicgold' , 'ChampagneOr' , 'YellowBrown' ,  'BrownYellow' ,  'Crème',  'Jaunefoncé'],'Yellow')
fdata['variation_0_color'] = fdata['variation_0_color'].replace(['Orange' ,  'OrangeRed' ,'OrangeBleu', 'Nacarat', 'LumièreOrange',  'Orangesombre',  'OrangePink', 'Pêche',  'OrangeMarron' , 'NoirOrange'  ],'Orange')
fdata['variation_0_color'] = fdata['variation_0_color'].replace(['Brown','Rougenoir','Coffelégère','Rougenoir', 'Marronprofond', 'café',  'Coffee', 'Nude' ,  'Noirblanc', 'Cafénoir','Nu', 'Khaki' ,'kaki' ,'Nu',  'Apricot' , 'Abricot',  'Beige' ,  'Armée', 'Army' ,  'Rouille' ,  'Camée', 'Gingembre' , 'Sousnu',  'Bronze' ,  'Bandenoire', 'Marronfoncé',  'Taupe' ,  'Darkbrown' , 'Skin',  'Camo',  'Pente' ,  'MarronRouge' , 'Chocolat', 'Kakiclair',  'Couleurdepeau' ,'Brownsuit', 'Arméelégère',  'Orblanc',  'CaféLéger', 'VertRouge' , 'Rayureblanche' , 'Beigeclair' , 'Couleurdesabledeharicot', 'Noirkaki' , 'Marronclair' ,
'Rougeetgris' ,   'Caféprofond' , 'Marron',  'Rougemarron' ,  'Ombre' , 'Beigestitching' , 'lattice' ,  'MarronNoir',  'Roseetchocolat',  'Beanpaste' ,  'ACU' , 'bronzer', 'Bronzageléger', 'BeigeRouge', 'RougeBleu'  ,'Noiretblanc'],'Brown')
fdata['variation_0_color'] = fdata['variation_0_color'].replace(['Pink',  'LightPink' , 'Magenta' ,  'Rose', 'Roseclair', 'Roseprofond', 'rouge' , 'Crevette' , 'Fleur' , 'Floralvert', 'Fleurrosefoncé',  'fleurblanche' , 'Rosevif', 'Fleurs' ,  'Shell' ,  'Rosepourpre' , 'RoseNoir' , 'Roseetblanc' ,  'Orrose'  ],'Pink')
fdata['variation_0_color'] = fdata['variation_0_color'].replace(['MultiColor' , 'Colorful' , 'BlackWhite' ,  'Multicolore',  'MultiCouleur', 'Colourful',  'Print' , 'Imprimé',  'Striped',  'Bande' , 'Multicolor', 'Duvin' ,  'Coloré',  'PhotoColor', 'picturecolor', 'Recoloriée', 'Couleurnu', 'cem' ],'MultiColor')
fdata['variation_0_color'] = fdata['variation_0_color'].replace(['Purple',  'Violet' , 'DarkPurple', 'LightPurple',  'Violetclair','VioletGris' ,  'Violetfoncé' ,   'Grisviolet',  'vinrouge' , 'Rougeviolet',  'UNE' , 'Marguerite',  'BlancViolet' ,  'PurpleRed'],'Purple')
fdata['variation_0_color'] = fdata['variation_0_color'].replace([ 'AsPicture','Figurecolor','Asshown','Frontièrecolorée','Point', 'dix', 'dor' ,  'Commeimage' , 'Léopard',  'Leopard',  'Plaid', 'Leopardnoir',  'étoileblanche',   'arcenciel', 'Rainbow', 'Graindeléopard',  'Graffiti' ,  'Crossredcolor','LongSleeve','Transparent'],'other')
####
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['White' ,'BlancVert','Panda','ArgentWhtie', 'Blanccrème','OffWhite',  'white', 'Navetblanc',  'Blackwhite' , 'blanc','BlackandWhite' ],'White')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Green','Beigeetvert','Noirbrillant','Greenlittle','Blancetvert','Rayureverte','Oliveverte','Vertbleu' ,'VertBlanc','Lemon','DarkArmyGreen','PeaGreen','VertViolet','VertJaune','chameau','Vertdherbe', 'Vert','LightGreen',  'ArmyGreen', 'DarkGreen', 'LakeGreen',   'vert' , 'PeacockGreen' ,  'Vertfoncé' ,  'arméeverte' , 'Mentheverte' ,  'Vertclair' ,   'vertjaune' , 'JauneVert' ,  'Vertfluorescent' ,  'Vertdherbe'   'Vert' ,    'OliveDrab',  'Vertgris',  'Loup',  'Camouflage',  'BrightGreen',  'Vertprofond',  'Vertjaunâtre' ,  'Vertor',  'BleuVert',  'Kakigris',  'CamouflageVert', 'CamouflageNoirVert' , 'Camouflageblanc' ,  'Oretvert' ,  'RougeVert' ,  'CamouflageBlanc' , 'Arbre', 'Jungle',  'Pommeverte' ,  'Vertetorange' , 'Serpentine' ],'Green')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Red' ,'RD','Àpois','Jaspe','Pointnoir', 'rougebrique','RED','WineRed', 'BrickRed' ,  'RustRed' , 'JujubeRouge',  'Rouge', 'banderouge',  'Rosefoncé', 'Floral' , 'Rougeorange' , 'Vinrouge' , 'Claret' , 'marron',  'Rouillerouge', 'Roserouge',  'Rougeetmarine' ,  'Lettrerouge' ,  'Bordeaux' , 'Pastèquerouge',  'roseblanche', 'Rougefoncé',  'Bourgogne',  'Camouflagerouge' ,  'Rougeclair',  'RougeBlanc',  'rougebrique' 'RED', 'Rougeàvin' , 'Melonrouge' , 'RedBrown' ],'Red')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Black','Noiretviolet','Noirdoublureenpeluche','NoiravecRouge','Noirmarron','Largerhombicblack','B','NoirOr','Croixnoire' ,'Noirjaune','Sousvêtementnoir','NoirVert', 'black', 'Noir',  'Noirrouge', 'Fleurnoir',  'NoirBlanc' ,  'Sansfilnoir',  'Blancnoir' , 'CamoNoir', 'Blackstrip' ,  'BK',  'Noiretbleu', 'rougeetnoir',  'RougeNoir' , 'lagrille', 'Noirblancrouge' , 'Feràrepasser',  'Brunfoncé' ,  'NoirRouge' , 'BlackPlushLining',  'Ornoir' ],'Black')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Blue','Verticalbluecolor','Largegreenbeltbluenet','Merbleue','Bleusaphir','CamouflageBleu','BleuBlanc','Bleuverdâtre','Bleugris','OrLacBleu','Noirrayureblanche','rougeBleu','SansfilNu','Bleuacide' ,'BleuRouge','Bleuetorange','NoirBleu',  'SkyBlue' , 'Navy', 'Bleu',  'Bleufoncé', 'LightBlue',  'LakeBlue', 'DarkBlue', 'Bleublanc', 'Bleumarin',  'BleuRose', 'Bleuciel', 'Bleuprofond',  'NoirBleu' 'BleuRouge', 'Bleuviolet', 'royalBlue' , 'JauneBleu', 'RoyalBlue' , 'BleuNoir',  'BleuGray' , 'BleuFloral',  'Fleurbleue', 'NavyBlue',  'Marine' , 'Bleuclair' , 'Unjeanbleu' , 'BlancJaune' ,  'Royal' ,  'lacbleu' , 'Cyan' , 'Bleuvert', 'BleuOr', 'OrBleuocéan',  'JeanBlue' ,'Bleukaki',  'Rayurebleue',  'Bleurouge', 'blancbleu' , 'Bleuroyal' , 'Blancbleu' , 'Rayuresbleues', 
'Bleujaune' , 'Navystripes', 'Largebluebeltgreennet', 'SanBlue',  'Océanbleu', 'Paonbleu', 'blue' ,  'BluePlushLining' , 'Bulelac'],'Blue')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Grey','NoirLettresblanches','BeigeGrey','Grisetjaune','GrisBlanc','Grisetorange','Cameratower','Grenaille','Grisvert','Grisetbleu','GrisRouge','Grisetrouge','Couleurdupistolet', 'Gray',  'LightGrey',  'DarkGray',  'Silver', 'DarkGrey' , 'Gris' ,  'gris' , 'Grisfoncé',  'Grisclair', 'argent',  'Rayuregrise', 'Grisargenté' , 'grisBleu',  'Grisnoir' , 'grisnoir' , 'Grisetvert' , 'Blancetgris', 'MarronGris' ,'NoirGrisBlanc',  'GrisBleu', 'bleuGris',  'NoirGris',  'CamouflageGris' ,  'GrisNoir' ,  'GrisRose' ,   'Éléphant',  'Grisdelarmée', 'Noirargent' ,  'GrisViolet'],'Gray')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Yellow','CroixBeige','Jaunedarmée','NoirJaune','Couleursable','JauneOrange','Terrejaune','citron','lapin','Jauneor','OrArgent' ,'Champagna','Jaunecitron', 'Gold' ,  'BlancGris',  'Blanc' , 'Jaune' ,  'Or' , 'Cameo', 'camel', 'Fleurjaune',  'Champagne',  'Jauneclair',  'EarthYellow' ,  'Jaunevif', 'LightYellow',  'Jaunemarron', 'Largerhombicgold' , 'ChampagneOr' , 'YellowBrown' ,  'BrownYellow' ,  'Crème',  'Jaunefoncé'],'Yellow')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Orange','Peachpuff','PeachRouge' ,'OrangeRose',  'OrangeRed' ,'OrangeBleu', 'Nacarat', 'LumièreOrange',  'Orangesombre',  'OrangePink', 'Pêche',  'OrangeMarron' , 'NoirOrange'  ],'Orange')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Brown','BrownSnake','BrownPlushLining','blancrouge','Maroon','Lesable','Rougevert','CP','Caférouge','DarkCoffee','CameoBrown','Coffeecolor','DeepKhaki','Vague','DarkKhaki','LightKhaki','Rougekaki','Cafévert','Couleurdepeausombre','rougenoir','Rougeblanc','Rougenoir','Coffelégère','Rougenoir', 'Marronprofond', 'café',  'Coffee', 'Nude' ,  'Noirblanc', 'Cafénoir','Nu', 'Khaki' ,'kaki' ,'Nu',  'Apricot' , 'Abricot',  'Beige' ,  'Armée', 'Army' ,  'Rouille' ,  'Camée', 'Gingembre' , 'Sousnu',  'Bronze' ,  'Bandenoire', 'Marronfoncé',  'Taupe' ,  'Darkbrown' , 'Skin',  'Camo',  'Pente' ,  'MarronRouge' , 'Chocolat', 'Kakiclair',  'Couleurdepeau' ,'Brownsuit', 'Arméelégère',  'Orblanc',  'CaféLéger', 'VertRouge' , 'Rayureblanche' , 'Beigeclair' , 'Couleurdesabledeharicot', 'Noirkaki' , 'Marronclair' ,
'Rougeetgris' ,   'Caféprofond' , 'Marron',  'Rougemarron' ,  'Ombre' , 'Beigestitching' , 'lattice' ,  'MarronNoir',  'Roseetchocolat',  'Beanpaste' ,  'ACU' , 'bronzer', 'Bronzageléger', 'BeigeRouge', 'RougeBleu'  ,'Noiretblanc'],'Brown')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Pink','rosenoire','Rosebleu','Rosenoire','Fluorescenceverte','Feuille','RoseRougeS','BlancRose','Roseetgris','FluorescentRouge','Fleurrose','Fleurrouge','Floralrouge','NoirRose',  'LightPink' , 'Magenta' ,  'Rose', 'Roseclair', 'Roseprofond', 'rouge' , 'Crevette' , 'Fleur' , 'Floralvert', 'Fleurrosefoncé',  'fleurblanche' , 'Rosevif', 'Fleurs' ,  'Shell' ,  'Rosepourpre' , 'RoseNoir' , 'Roseetblanc' ,  'Orrose'  ],'Pink')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['MultiColor','ZebraBlanc','ZebraBlanc','Lumière','Coralred','Colorébleu','MultiColor' , 'Colorful' , 'BlackWhite' ,  'Multicolore',  'MultiCouleur', 'Colourful',  'Print' , 'Imprimé',  'Striped',  'Bande' , 'Multicolor', 'Duvin' ,  'Coloré',  'PhotoColor', 'picturecolor', 'Recoloriée', 'Couleurnu', 'cem' ],'MultiColor')
fdata['variation_1_color'] = fdata['variation_1_color'].replace(['Purple','PurplrBlanc','pourpre','Indigo','Aubergine','Lavande',  'Violet' , 'DarkPurple', 'LightPurple',  'Violetclair','VioletGris' ,  'Violetfoncé' ,   'Grisviolet',  'vinrouge' , 'Rougeviolet',  'UNE' , 'Marguerite',  'BlancViolet' ,  'PurpleRed'],'Purple')
fdata['variation_1_color'] = fdata['variation_1_color'].replace([ 'AsPicture','Asshown', 'dix', 'dor','Point','Figurecolor','Frontièrecolorée','Point','dor','Vertcoloré','Transparent','blackstrips','Blackandwhitestitching' ,'Motifléopard','Blackstripes',  'Commeimage' , 'Léopard',  'Leopard',  'Plaid', 'Leopardnoir',  'étoileblanche',   'arcenciel', 'Rainbow', 'Graindeléopard',  'Graffiti' ,  'Crossredcolor','ShortSleeve'],'other')




In [11]:
fdata['variation_0_color'].unique()

array(['White', 'Brown', 'Green', 'Red', 'Black', 'Blue', 'Gray',
       'Yellow', 'Orange', 'Pink', 'MultiColor', 'Purple', 'other'],
      dtype=object)

In [12]:
fdata.head()

Unnamed: 0,category,subcategory,name,current_price,raw_price,discount,likes_count,variation_0_color,variation_1_color,id,model,two_colors
0,women,T-shirts,T-shirt boutonné à manches courtes et imprimé ...,23.99,46.99,49,313,White,Blue,1690810,SKUF40137,True
1,women,Soutiens-gorge,Plus Soutiens-gorge avec fermeture à l'avant,15.99,40.36,60,4603,White,Gray,1533303,SKUC91583,True
2,women,Pantalons & Shorts,Pantalon décontracté à taille élastique de cou...,25.99,50.99,49,5564,Brown,Black,1661710,SKUE94621,True
3,women,Robes imprimées,Robe midi décontractée à imprimé floral,23.99,46.99,49,1262,Brown,White,1691484,SKUF41372,True
4,women,T-shirts,T-shirt brodé de fleurs,15.99,38.88,59,4485,Green,Blue,1655044,SKUE83526,True


# Export the final dataset

In [22]:
fdata.to_csv('/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/fdata.csv', index=False)

# To import the final dataset

In [25]:
fdata= pd.read_csv("/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/fdata.csv")


## labeled categorical data

In [26]:
df1=fdata

In [27]:
df1['variation_1_color'].unique()

array(['Blue', 'Gray', 'Black', 'White', 'Yellow', 'Brown', 'Green',
       'Purple', 'Red', 'Pink', 'Orange', nan, 'other'], dtype=object)

In [28]:
df1['category'] = pd.Categorical(df1['category'])
df1['subcategory'] = pd.Categorical(df1['subcategory'])
df1['variation_0_color'] = pd.Categorical(df1['variation_0_color'])
df1['variation_1_color'] = pd.Categorical(df1['variation_1_color'])

In [29]:
def label_encode_fit(data, columns):
    result = df1.copy()
    cat_col = ['category','subcategory','variation_0_color','variation_1_color','model']
    encoders = {}
    for column in cat_col:
        encoder = preprocessing.LabelEncoder()
        result[column] = encoder.fit_transform(result[column])
        encoders[column] = encoder
    return result, encoders
    

'White', 'Brown', 'Green', 'Red', 'Black', ..., 'Orange', 'Pink', 'MultiColor', 'Purple', 'other'
10,  2,  4,  9,  0,  1,  3, 11,  6,  7,  5,  8, 12

In [30]:
data1, encoders1 = label_encode_fit(df1,df1.columns)



In [31]:
data1.head()

Unnamed: 0,category,subcategory,name,current_price,raw_price,discount,likes_count,variation_0_color,variation_1_color,id,model,two_colors
0,4,153,T-shirt boutonné à manches courtes et imprimé ...,23.99,46.99,49,313,9,1,1690810,41258,True
1,4,140,Plus Soutiens-gorge avec fermeture à l'avant,15.99,40.36,60,4603,9,3,1533303,24919,True
2,4,89,Pantalon décontracté à taille élastique de cou...,25.99,50.99,49,5564,2,0,1661710,36999,True
3,4,109,Robe midi décontractée à imprimé floral,23.99,46.99,49,1262,2,9,1691484,41452,True
4,4,153,T-shirt brodé de fleurs,15.99,38.88,59,4485,4,1,1655044,36141,True


In [32]:

data1['variation_1_color'] =data1['variation_1_color'].map({0: 12, 1:0,2:1,3:2,4:3,5:4,6:5,7:6,8:7,9:8,10:9,11:10,12:11 })


In [33]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45425 entries, 0 to 45424
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category           45425 non-null  int64  
 1   subcategory        45425 non-null  int64  
 2   name               45425 non-null  object 
 3   current_price      45425 non-null  float64
 4   raw_price          45425 non-null  float64
 5   discount           45425 non-null  int64  
 6   likes_count        45425 non-null  int64  
 7   variation_0_color  45425 non-null  int64  
 8   variation_1_color  45425 non-null  int64  
 9   id                 45425 non-null  int64  
 10  model              45425 non-null  int64  
 11  two_colors         45425 non-null  bool   
dtypes: bool(1), float64(2), int64(8), object(1)
memory usage: 3.9+ MB


# Export labeled data

In [31]:
data1.to_csv('/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/data1.csv', index=False)

# Import lableled data

In [34]:
data1= pd.read_csv("/Volumes/LOVUN/George Mason Files/Fall 2021/AIT 664/Project /data/data1.csv")


# Data distribution

In [21]:
fdata.describe()

Unnamed: 0,current_price,raw_price,discount,likes_count,id
count,45425.0,45425.0,45425.0,45425.0,45425.0
mean,28.738523,60.719223,52.222609,224.296599,1465872.0
std,16.025378,39.93623,10.40863,631.966251,204527.5
min,0.14,0.0,0.0,0.0,27928.0
25%,18.04,39.27,47.0,29.0,1311385.0
50%,24.99,53.04,52.0,75.0,1506630.0
75%,35.69,73.99,59.0,189.0,1657185.0
max,314.59,5089.0,100.0,21547.0,1724666.0


## Visualization

Number of items per color

In [None]:
alt.data_transformers.enable('default', max_rows=None)
alt.Chart(fdata).mark_bar(
    cornerRadiusTopLeft=3,
    cornerRadiusTopRight=3
).encode(
    x='variation_0_color:N',
    y='count(variation_0_color):Q',
   
        )

In [None]:
alt.data_transformers.enable('default', max_rows=None)
alt.Chart(fdata).mark_bar().encode(
    x='category:N',
    y='count(category):Q'
).properties(width =200)

In [None]:
alt.data_transformers.enable('default', max_rows=None)
alt.Chart(fdata).mark_bar().encode(
    x='variation_0_color:N',
    y='current_price:Q'
).properties(width =400)

In [None]:
alt.data_transformers.enable('default', max_rows=None)
alt.Chart(fdata).mark_bar().encode(
    x='category:N',
    y='likes_count:Q'
).properties(width =300)

# The final Dataset

In [35]:
len (data1['variation_0_color'].unique())


12

In [7]:
data1.describe()

Unnamed: 0.1,Unnamed: 0,category,subcategory,current_price,raw_price,discount,likes_count,variation_0_color,variation_1_color
count,45425.0,45425.0,45425.0,45425.0,45425.0,45425.0,45425.0,45425.0,45425.0
mean,22712.0,2.561431,87.277248,28.738523,60.719223,52.222609,224.296599,4.155817,5.561695
std,13113.212326,1.346878,48.888451,16.025378,39.93623,10.40863,631.966251,3.736403,4.792106
min,0.0,0.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0
25%,11356.0,2.0,42.0,18.04,39.27,47.0,29.0,0.0,1.0
50%,22712.0,3.0,99.0,24.99,53.04,52.0,75.0,3.0,4.0
75%,34068.0,4.0,128.0,35.69,73.99,59.0,189.0,8.0,10.0
max,45424.0,4.0,176.0,314.59,5089.0,100.0,21547.0,11.0,13.0


In [8]:
data1.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45425 entries, 0 to 45424
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         45425 non-null  int64  
 1   category           45425 non-null  int64  
 2   subcategory        45425 non-null  int64  
 3   current_price      45425 non-null  float64
 4   raw_price          45425 non-null  float64
 5   discount           45425 non-null  int64  
 6   likes_count        45425 non-null  int64  
 7   variation_0_color  45425 non-null  int64  
 8   variation_1_color  45425 non-null  int64  
 9   two_colors         45425 non-null  bool   
dtypes: bool(1), float64(2), int64(7)
memory usage: 3.2 MB


## Test the data normal distribution

### to test the data we will take the mean of values
the values will group by based on subcategory column

The first test = main colors with current price 

In [36]:
groupby_color1 = fdata.groupby('variation_0_color')
for category, value in groupby_color1['current_price']:
    print((category, value.mean()))

('Black', 31.831592486037604)
('Blue', 28.02874250408874)
('Brown', 33.444498806682915)
('Gray', 29.807514450866968)
('Green', 26.985924308588114)
('Orange', 26.63955375253543)
('Pink', 25.536003394145265)
('Purple', 25.203478260869655)
('Red', 31.120098826436312)
('White', 24.08693936017869)
('Yellow', 28.271422138836858)
('other', 24.824582463465678)


In [37]:

mean_col_lable1 =data1.groupby(['variation_0_color'], as_index=False).mean()
mean_col_lable1

Unnamed: 0.2,variation_0_color,Unnamed: 0,Unnamed: 0.1,category,subcategory,current_price,raw_price,discount,likes_count,variation_1_color,two_colors
0,0,25900.036639,25900.036639,2.62261,84.481977,31.831592,66.740596,51.818751,193.039178,6.0088,0.870198
1,1,20606.35726,20606.35726,2.636198,86.840087,28.028743,59.786916,52.419226,253.534799,6.390333,0.727603
2,2,22977.220167,22977.220167,2.433174,91.964499,33.444499,70.784138,52.787888,222.444212,4.907816,0.843675
3,3,20744.662042,20744.662042,2.578035,87.290173,29.807514,61.960304,51.590751,221.099807,4.783815,0.847013
4,4,19329.666303,19329.666303,2.781659,89.925036,26.985924,57.467169,52.26492,259.100073,5.511645,0.820597
5,5,19533.981744,19533.981744,2.78499,90.880325,26.639554,57.556572,52.608519,284.63286,6.296146,0.742394
6,6,22189.978362,22189.978362,2.396266,86.883114,25.536003,55.948594,54.289563,272.483029,4.766228,0.869325
7,7,21790.8431,21790.8431,2.344991,90.876181,25.203478,56.185019,54.535917,188.417769,4.453686,0.867675
8,8,25251.51853,25251.51853,2.269302,86.832613,31.120099,66.564537,52.521618,231.139284,4.413836,0.849907
9,9,20630.982652,20630.982652,2.601305,88.784657,24.086939,49.607404,51.215502,201.796594,5.081012,0.849594
