In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

In [3]:
cd '/content/drive/My Drive/Colab Notebooks/dw_matrix'

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [0]:
df = pd.read_csv('data/women_shoes.csv')

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  X = df[feats].values
  y = df['prices_amountmin'].values

  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [108]:
df['brand'] = df['brand'].map(lambda x: str(x).lower())
df['brand_cat'] = df['brand'].map(lambda x: x).factorize()[0]
run_model(['brand_cat'])

(-51.57118869808353, 0.6390743725376244)

In [109]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-51.356432361171095, 0.7207640114801125)

In [110]:
df.head()

Unnamed: 0,id,asins,brand,categories,colors,count,dateadded,dateupdated,descriptions,dimension,ean,features,flavors,imageurls,isbn,keys,manufacturer,manufacturernumber,merchants,name,prices_amountmin,prices_amountmax,prices_availability,prices_color,prices_condition,prices_count,prices_currency,prices_dateadded,prices_dateseen,prices_flavor,prices_issale,prices_merchant,prices_offer,prices_returnpolicy,prices_shipping,prices_size,prices_source,prices_sourceurls,prices_warranty,quantities,reviews,sizes,skus,sourceurls,upc,websiteids,weight,brand_cat
0,AVpfBXx21cnluZ0-cKxs,,zoot,"Shoes,Clothing,Women's Shoes,All Women's Shoes","Blue,Multicolor",,2016-11-11 09:49:00,2016-11-11 09:49:00,"[{""dateSeen"":[""2016-11-11T09:49:00Z""],""sourceU...",,,"[{""key"":""Season"",""value"":[""All-Season""]},{""key...",,https://i5.walmartimages.com/asr/e04c07b2-5222...,,zoot/z130103201090,Zoot,Z130103201090,"[{""dateSeen"":[""2016-11-11T09:49:00Z""],""name"":""...",Zoot Tt Trainer 2.0 Round Toe Synthetic Sne...,71.99,71.99,,,,,USD,2016-11-11T09:49:00Z,2016-11-02T00:00:00Z,,True,,CLEARANCE USD 178.01,,,,,https://www.walmart.com/ip/Zoot-TT-TRAINER-2.0...,,,,109,,https://www.walmart.com/ip/Zoot-TT-TRAINER-2.0...,,,,0
1,AVpfBXx21cnluZ0-cKxs,,zoot,"Shoes,Clothing,Women's Shoes,All Women's Shoes","Blue,Multicolor",,2016-11-11 09:49:00,2016-11-11 09:49:00,"[{""dateSeen"":[""2016-11-11T09:49:00Z""],""sourceU...",,,"[{""key"":""Season"",""value"":[""All-Season""]},{""key...",,https://i5.walmartimages.com/asr/e04c07b2-5222...,,zoot/z130103201090,Zoot,Z130103201090,"[{""dateSeen"":[""2016-11-11T09:49:00Z""],""name"":""...",Zoot Tt Trainer 2.0 Round Toe Synthetic Sne...,250.0,250.0,,,new,,USD,2016-11-11T09:49:00Z,2016-10-27T00:00:00Z,,False,ApparelSave - Walmart.com,CLEARANCE USD 178.01,,,,,https://www.walmart.com/ip/Zoot-TT-TRAINER-2.0...,,,,109,,https://www.walmart.com/ip/Zoot-TT-TRAINER-2.0...,,,,0
2,AVpfBXx21cnluZ0-cKxs,,zoot,"Shoes,Clothing,Women's Shoes,All Women's Shoes","Blue,Multicolor",,2016-11-11 09:49:00,2016-11-11 09:49:00,"[{""dateSeen"":[""2016-11-11T09:49:00Z""],""sourceU...",,,"[{""key"":""Season"",""value"":[""All-Season""]},{""key...",,https://i5.walmartimages.com/asr/e04c07b2-5222...,,zoot/z130103201090,Zoot,Z130103201090,"[{""dateSeen"":[""2016-11-11T09:49:00Z""],""name"":""...",Zoot Tt Trainer 2.0 Round Toe Synthetic Sne...,75.99,75.99,,,new,,USD,2016-11-11T09:49:00Z,2016-10-27T00:00:00Z,,True,ApparelSave - Walmart.com,CLEARANCE USD 174.01,,,,,https://www.walmart.com/ip/Zoot-TT-TRAINER-2.0...,,,,109,,https://www.walmart.com/ip/Zoot-TT-TRAINER-2.0...,,,,0
3,AVpfBXx21cnluZ0-cKxs,,zoot,"Shoes,Clothing,Women's Shoes,All Women's Shoes","Blue,Multicolor",,2016-11-11 09:49:00,2016-11-11 09:49:00,"[{""dateSeen"":[""2016-11-11T09:49:00Z""],""sourceU...",,,"[{""key"":""Season"",""value"":[""All-Season""]},{""key...",,https://i5.walmartimages.com/asr/e04c07b2-5222...,,zoot/z130103201090,Zoot,Z130103201090,"[{""dateSeen"":[""2016-11-11T09:49:00Z""],""name"":""...",Zoot Tt Trainer 2.0 Round Toe Synthetic Sne...,79.98,79.98,,,new,,USD,2016-11-11T09:49:00Z,2016-11-09T00:00:00Z,,True,ApparelSave - Walmart.com,CLEARANCE USD 170.02,,,,,https://www.walmart.com/ip/Zoot-TT-TRAINER-2.0...,,,,109,,https://www.walmart.com/ip/Zoot-TT-TRAINER-2.0...,,,,0
4,AVpfEqFRLJeJML431x7w,,wild pair,"Shoes,Women's Shoes,Clothing,All Women's Shoes",Brown,,2016-11-16 12:56:36,2016-11-16 12:56:36,"[{""dateSeen"":[""2016-11-16T12:56:36Z""],""sourceU...",,763181000000.0,"[{""key"":""Heel Height"",""value"":[""High (3 in. an...",,https://i5.walmartimages.com/asr/8b2c73f5-939f...,,"wildpair/colfaxcog,0763181233352,763181233352",Wild Pair,COLFAX-COG,"[{""dateSeen"":[""2016-11-16T12:56:36Z""],""name"":""...",Wild Pair Colfax Women Peep-toe Synthetic Bro...,26.98,26.98,,,,,USD,2016-11-16T12:56:36Z,2016-11-09T00:00:00Z,,True,,CLEARANCE USD 63.02,,,,,https://www.walmart.com/ip/Wild-Pair-Colfax-Wo...,,,,"8.5,7,7.5,8",,https://www.walmart.com/ip/Wild-Pair-Colfax-Wo...,763181000000.0,,,1


In [111]:
literal_eval(df.features.head().values[0])

[{'key': 'Season', 'value': ['All-Season']},
 {'key': 'Material', 'value': ['Synthetic']},
 {'key': 'Gender', 'value': ['Women']},
 {'key': 'Shoe Size', 'value': ['9']},
 {'key': 'Size', 'value': ['9']},
 {'key': 'Color', 'value': ['Blue']},
 {'key': 'Model', 'value': ['Z130103201090']},
 {'key': 'Manufacturer Part Number', 'value': ['Z130103201090']},
 {'key': 'Brand', 'value': ['Zoot']},
 {'key': 'Age Group', 'value': ['Adult']}]

In [0]:
def parse_features(x):
  if str(x) == 'nan': return {}
  return dict({x['key'].lower().strip().replace(' ', '_'): x['value'][0].lower().strip() for x in literal_eval(x.replace('\\"', '"'))})

df['features_parsed'] = df['features'].map(parse_features)

In [113]:
df['features_parsed'].head().values

array([{'season': 'all-season', 'material': 'synthetic', 'gender': 'women', 'shoe_size': '9', 'size': '9', 'color': 'blue', 'model': 'z130103201090', 'manufacturer_part_number': 'z130103201090', 'brand': 'zoot', 'age_group': 'adult'},
       {'season': 'all-season', 'material': 'synthetic', 'gender': 'women', 'shoe_size': '9', 'size': '9', 'color': 'blue', 'model': 'z130103201090', 'manufacturer_part_number': 'z130103201090', 'brand': 'zoot', 'age_group': 'adult'},
       {'season': 'all-season', 'material': 'synthetic', 'gender': 'women', 'shoe_size': '9', 'size': '9', 'color': 'blue', 'model': 'z130103201090', 'manufacturer_part_number': 'z130103201090', 'brand': 'zoot', 'age_group': 'adult'},
       {'season': 'all-season', 'material': 'synthetic', 'gender': 'women', 'shoe_size': '9', 'size': '9', 'color': 'blue', 'model': 'z130103201090', 'manufacturer_part_number': 'z130103201090', 'brand': 'zoot', 'age_group': 'adult'},
       {'heel_height': 'high (3 in. and up)', 'material': 's

In [114]:
keys = set()
df['features_parsed'].map(lambda x: keys.update(x.keys()))
  
len(keys)

341

In [115]:
def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan)

HBox(children=(IntProgress(value=0, max=341), HTML(value='')))




In [116]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_hips', 'feat_display', 'feat_leg', 'feat_heel_type',
       'feat_authenticity', 'feat_shoe_category', 'feat_shoe_type',
       'feat_extended_calf', 'feat_exact_colour', 'feat_finish'],
      dtype='object', length=390)

In [0]:
keys_stat = {}
for key in keys:
  keys_stat[key] = df[False == df[get_name_feat(key)].isnull()].shape[0]/df.shape[0]*100

In [157]:
features_selected = {k:v for k,v in keys_stat.items() if v>40}
features_selected

{'age_group': 42.54208099550025,
 'brand': 57.22459863340925,
 'color': 53.1414921393256,
 'gender': 58.835620243319816,
 'manufacturer_part_number': 45.65301927670685,
 'material': 48.458418976723515}

In [0]:
for key in keys:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].map(lambda x: str(x).lower()).factorize()[0]

In [120]:
df[ df.brand != df.feat_brand][['brand', 'feat_brand']]

Unnamed: 0,brand,feat_brand
7,sutton studio,
8,sutton studio,
9,laleela.com,
10,laleela.com,
32,under armour,
...,...,...
17990,fuse lenses,
17991,bgood,
17992,bgood,
17993,steve madden,


In [124]:
feats = ['brand_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(feats)

(-51.57118869808353, 0.6390743725376244)

In [125]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_age_group_cat',
         'feat_color_cat', 'feat_gender_cat', 'feat_manufacturer_part_number_cat', 'feat_material_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(feats)

(-50.7704068676065, 0.9906601285965294)

In [126]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_age_group_cat',
         'feat_color_cat', 'feat_gender_cat', 'feat_manufacturer_part_number_cat', 'feat_material_cat']

X = df[feats].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X, y)

perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats)

Weight,Feature
0.6451  ± 0.0104,brand_cat
0.3329  ± 0.0276,feat_brand_cat
0.0671  ± 0.0039,feat_material_cat
0.0250  ± 0.0019,feat_gender_cat
0.0143  ± 0.0038,feat_color_cat
0.0002  ± 0.0000,feat_manufacturer_part_number_cat
0.0001  ± 0.0000,feat_age_group_cat


In [129]:
feats = ['brand_cat', 'feat_brand_cat',
         'feat_color_cat', 'feat_gender_cat', 'feat_material_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(feats)

(-50.77961613563851, 0.9551551311118129)

In [130]:
df[df['brand'] == 'nike'].features_parsed.head().values

array([{'gender': 'women', 'brand': 'nike', 'color': 'noble red/clear jade-pink blast-black'},
       {'style': 'shirts & tops', 'country/region_of_manufacture': 'sri lanka', 'sleeve_length': 'long sleeve', 'condition': 'new with tags'},
       {'style': 'shirts & tops', 'country/region_of_manufacture': 'sri lanka', 'sleeve_length': 'long sleeve', 'condition': 'new with tags'},
       {'style': 'shirts & tops', 'country/region_of_manufacture': 'sri lanka', 'sleeve_length': 'long sleeve', 'condition': 'new with tags'},
       {'material': 'synthetic', 'gender': 'women', 'size': '8.5', 'color': 'black', 'model': '698181 007', 'manufacturer_part_number': '698181 007', 'brand': 'nike', 'age_group': 'adult'}],
      dtype=object)

In [142]:
feats = ['brand_cat', 'feat_brand_cat',
         'feat_color_cat', 'feat_gender_cat', 'feat_material_cat',
         'feat_style_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(feats)

(-50.228443527597605, 0.7754478399851661)

In [143]:
feats = ['brand_cat', 'feat_brand_cat',
         'feat_color_cat', 'feat_gender_cat', 
         'feat_material_cat', 'feat_style_cat']

X = df[feats].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X, y)

perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats)

Weight,Feature
0.6095  ± 0.0114,brand_cat
0.3295  ± 0.0163,feat_brand_cat
0.0580  ± 0.0112,feat_style_cat
0.0291  ± 0.0011,feat_material_cat
0.0178  ± 0.0030,feat_gender_cat
0.0026  ± 0.0011,feat_color_cat


In [145]:
df[df['brand'] == 'nike'].features_parsed.sample(5).values

array([{'material': 'synthetic', 'gender': 'women', 'shoe_size': '9.5m', 'size': '9.5', 'shoe_category': "women's shoes", 'color': 'red', 'model': 'nikw-initiator-wolfgrey171.9.5m', 'manufacturer_part_number': 'nikw-initiator-wolfgrey171.9.5m', 'brand': 'nike', 'age_group': 'adult'},
       {'sport': 'soccer', 'type': 'cleats'}, {},
       {'season': 'all-season', 'material': 'leather', 'gender': 'women', 'shoe_size': '6', 'size': '6', 'color': 'purple', 'model': '631461 500', 'manufacturer_part_number': '631461 500', 'brand': 'nike', 'age_group': 'adult'},
       {'sport': 'running shoes', 'occasion': 'running shoes', 'material': 'mesh', 'gender': 'women', 'shoe_size': '9 us women', 'shoe_category': "women's shoes", 'assembled_product_dimensions_(l_x_w_x_h)': '12.00 x 8.00 x 6.00 inches', 'fabric_content': 'mesh', 'color': 'black/white', 'model': '833666 010', 'shoe_closure': 'lace-up', 'casual_&_dress_shoe_style': 'running shoes', 'manufacturer_part_number': '833666 010', 'brand': 'n

In [168]:
feats_cat = {k:v for k,v in keys_stat.items() if v>1}
feats_cat = [get_name_feat(x) + '_cat' for x in feats_cat] + ['brand_cat']
feats_cat

['feat_heel_height_cat',
 'feat_occasion_cat',
 'feat_material_cat',
 'feat_assembled_product_weight_cat',
 'feat_battery_type_cat',
 'feat_shipping_weight_(in_pounds)_cat',
 'feat_country_of_origin_-_components_cat',
 'feat_size_cat',
 'feat_brand_cat',
 'feat_casual_&_dress_shoe_style_cat',
 'feat_assembled_product_dimensions_(l_x_w_x_h)_cat',
 'feat_clothing_size_cat',
 'feat_juniors_shoe_cat',
 'feat_mpn_cat',
 'feat_designer_cat',
 'feat_inseam_cat',
 'feat_theme_cat',
 'feat_product_in_inches_(l_x_w_x_h)_cat',
 'feat_type_cat',
 'feat_fabric_material_cat',
 'feat_pattern_cat',
 'feat_country_of_origin_-_assembly_cat',
 'feat_assembled_in_country_of_origin_cat',
 'feat_walmart_no._cat',
 'feat_comfort_technology_cat',
 'feat_condition_cat',
 'feat_shoe_width_cat',
 'feat_upper_material_cat',
 'feat_sport_cat',
 'feat_color_cat',
 'feat_origin_of_components_cat',
 'feat_multi_pack_indicator_cat',
 'feat_shoe_size_cat',
 'feat_primary_color_cat',
 'feat_age_group_cat',
 'feat_manufa

In [169]:
df[feats_cat]

Unnamed: 0,feat_heel_height_cat,feat_occasion_cat,feat_material_cat,feat_assembled_product_weight_cat,feat_battery_type_cat,feat_shipping_weight_(in_pounds)_cat,feat_country_of_origin_-_components_cat,feat_size_cat,feat_brand_cat,feat_casual_&_dress_shoe_style_cat,feat_assembled_product_dimensions_(l_x_w_x_h)_cat,feat_clothing_size_cat,feat_juniors_shoe_cat,feat_mpn_cat,feat_designer_cat,feat_inseam_cat,feat_theme_cat,feat_product_in_inches_(l_x_w_x_h)_cat,feat_type_cat,feat_fabric_material_cat,feat_pattern_cat,feat_country_of_origin_-_assembly_cat,feat_assembled_in_country_of_origin_cat,feat_walmart_no._cat,feat_comfort_technology_cat,feat_condition_cat,feat_shoe_width_cat,feat_upper_material_cat,feat_sport_cat,feat_color_cat,feat_origin_of_components_cat,feat_multi_pack_indicator_cat,feat_shoe_size_cat,feat_primary_color_cat,feat_age_group_cat,feat_manufacturer_part_number_cat,feat_shoe_closure_cat,feat_model_cat,feat_season_cat,feat_manufacturer_cat,feat_model_no._cat,feat_fabric_content_cat,feat_style_cat,feat_gender_cat,feat_country/region_of_manufacture_cat,feat_shoe_category_cat,brand_cat
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17996,0,0,1,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,2,0,0,2,0,1,2,0,2,1,0,0,0,0,2,0,0,1953
17997,4,0,0,0,0,0,0,16,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,0,0,20,0,0,3975,0,3068,1,0,0,0,0,0,0,0,1161
17998,4,0,0,0,0,0,0,16,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,0,0,20,0,0,3975,0,3068,1,0,0,0,0,0,0,0,1161
17999,4,0,0,0,0,0,0,16,596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,0,0,20,0,0,3975,0,3068,1,0,0,0,0,0,0,0,1161


In [170]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(feats_cat)

(-50.29200262092139, 1.1058673818739577)

In [171]:
X = df[feats_cat].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X, y)

perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats_cat)

Weight,Feature
0.6186  ± 0.0133,brand_cat
0.3179  ± 0.0335,feat_brand_cat
0.0358  ± 0.0104,feat_condition_cat
0.0244  ± 0.0018,feat_material_cat
0.0163  ± 0.0028,feat_gender_cat
0.0062  ± 0.0010,feat_mpn_cat
0.0057  ± 0.0023,feat_style_cat
0.0027  ± 0.0009,feat_heel_height_cat
0.0019  ± 0.0003,feat_model_cat
0.0015  ± 0.0000,feat_shoe_size_cat


In [178]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_condition_cat', 'feat_material_cat',
         'feat_gender_cat', 'feat_heel_height_cat', 'feat_mpn_cat',
         'feat_style_cat', 'feat_model_cat', 'feat_shoe_size_cat',
         'feat_country/region_of_manufacture_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats)

(-50.36296141137174, 0.9656323223365115)

In [179]:
feats = ['brand_cat', 'feat_brand_cat', 'feat_condition_cat', 'feat_material_cat',
         'feat_gender_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats)

(-50.636100059553414, 0.7541685444460559)

In [180]:
feats = ['brand_cat']
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(feats)

(-51.57118869808353, 0.6390743725376244)

In [181]:
git add 

[0m[01;34mdata[0m/  HelloGithub.ipynb  LICENSE  [01;34mmatrix_one[0m/  README.md
