In [90]:
!pip install eli5



In [0]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

In [92]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix


In [93]:
ls data

men_shoes.csv


In [94]:
df = pd.read_csv('data/men_shoes.csv', low_memory=False)
df.shape

(18280, 48)

In [0]:
df['brand_cat']=df.brand.map(lambda x: str(x).lower()).factorize()[0]

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  X = df[feats].values
  y=df['prices_amountmin']
  scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [97]:
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [98]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-57.31783843165656, 4.181246596160967)

In [99]:
df.features.head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan': return output_dict

  features = literal_eval(x.replace('\\"','"'))
  for item in features:
    key = item['key'].lower().strip()
    value = item['value'][0].lower().strip()
    output_dict[key] = value
  
  return output_dict

df['features_parsed'] = df['features'].map(parse_features)

In [101]:
df['features_parsed'].head()

0    {'gender': 'men', 'shoe size': 'm', 'shoe cate...
1    {'gender': 'men', 'shoe size': 'm', 'shoe cate...
2    {'gender': 'men', 'color': 'black', 'shipping ...
3    {'gender': 'men', 'color': 'black', 'shipping ...
4    {'gender': 'men', 'color': 'black', 'shipping ...
Name: features_parsed, dtype: object

In [102]:
df['features_parsed'][0].keys()

dict_keys(['gender', 'shoe size', 'shoe category', 'color', 'manufacturer part number', 'brand'])

In [103]:
keyss = set()

df['features_parsed'].map(  lambda x: keyss.update(x.keys()))
len(keyss)

476

In [104]:
def get_name_feat(key):
  return 'feat_' + key

for key in tqdm_notebook(keyss):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan )


HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [105]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_work shoes', 'feat_uv protection', 'feat_thick, warm, cozy',
       'feat_fits styles', 'feat_boxed-product dimensions', 'feat_product #',
       'feat_clothing product type', 'feat_manufacturer',
       'feat_golf shoe type', 'feat_bridge width'],
      dtype='object', length=526)

In [106]:
keys_stat = {}
for key in tqdm_notebook(keyss):
  keys_stat[key] = df[False == df[get_name_feat(key)].isnull()].shape[0] / df.shape[0]*100


HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [107]:
{k:v for k,v in keys_stat.items() if v> 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [0]:
df['feat_brand_cat']=df['feat_brand'].factorize()[0]
df['feat_brand_color']=df['feat_color'].factorize()[0]
df['feat_brand_manufacturer part number']=df['feat_manufacturer part number'].factorize()[0]
df['feat_brand_gender']=df['feat_gender'].factorize()[0]
df['feat_brand_material']=df['feat_material'].factorize()[0]

df['feat_sport_cat']=df['feat_sport'].factorize()[0]
df['feat_style_cat']=df['feat_style'].factorize()[0]

for key in keyss:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]

In [0]:
df.brand = df.brand.map(lambda x : str(x).lower())

In [110]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
feats = ['brand_cat']
run_model(feats, model)

(-57.31783843165656, 4.181246596160967)

In [118]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
feats = ['brand_cat', 'feat_brand_cat', 'feat_brand_gender', 'feat_brand_material', 'feat_style_cat', 'feat_sport_cat']
run_model(feats, model)

(-57.16468774262015, 4.235810616060932)

In [119]:
X = df[feats].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(X,y)

perm = PermutationImportance(m, random_state=1).fit(X,y);
eli5.show_weights(perm, feature_names=feats)

Weight,Feature
0.2600  ± 0.0110,brand_cat
0.1047  ± 0.0084,feat_brand_material
0.0458  ± 0.0047,feat_brand_gender
0.0213  ± 0.0011,feat_brand_cat
0.0069  ± 0.0016,feat_style_cat
0.0002  ± 0.0000,feat_sport_cat


In [113]:
df['brand'].value_counts(normalize=True)

nike               0.097210
puma               0.033315
ralph lauren       0.028775
vans               0.021116
new balance        0.020295
                     ...   
iecool             0.000055
g.h. bass & co.    0.000055
kingshow           0.000055
sansha             0.000055
designer           0.000055
Name: brand, Length: 1732, dtype: float64

In [0]:
all_feats = [x for x in df.columns if 'cat' in x]

In [134]:
all_feats

['categories',
 'brand_cat',
 'feat_location - city/state',
 'feat_location - country',
 'feat_fabrication',
 'feat_clothing category',
 'feat_catalog',
 'feat_shoe category',
 'feat_recommended location',
 'feat_multi pack indicator',
 'feat_certifications and listings',
 'feat_brand_cat',
 'feat_sport_cat',
 'feat_style_cat',
 'feat_age group_cat',
 'feat_shoe size_cat',
 'feat_clothing size_cat',
 'feat_assembled product dimensions (l x w x h)_cat',
 'feat_material_cat',
 'feat_country of origin - components_cat',
 'feat_heel height_cat',
 'feat_shoe category_cat',
 'feat_shoe closure_cat',
 'feat_fabric material_cat',
 'feat_model_cat',
 'feat_gender_cat',
 'feat_pattern_cat',
 'feat_condition_cat',
 'feat_theme_cat',
 'feat_manufacturer part number_cat',
 'feat_shoe width_cat',
 'feat_country of origin - assembly_cat',
 'feat_size_cat',
 'feat_fabric content_cat',
 'feat_shipping weight (in pounds)_cat',
 'feat_manufacturer_part_number_cat',
 'feat_color_cat',
 'feat_casual & dres

In [138]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
feats = ['brand_cat', 'feat_brand_cat', 'feat_brand_gender', 'feat_brand_material', 'feat_style_cat', 'feat_sport_cat']
feats += [ 'feat_attachment_cat',
 'feat_country of origin assembly:_cat',
 'feat_removable hood_cat',
 'feat_wheeled_cat',
 'feat_mpn_cat',
 'feat_ul safety listing_cat',
 'feat_number of pockets_cat',
 'feat_date first available at amazon.ca_cat',
 'feat_hammer loop_cat',
 'feat_authenticity_cat',
 'feat_age range_cat',
 'feat_kids backpacks_cat',
 'feat_frame depth_cat',
 'feat_model no._cat',
 'feat_battery voltage_cat',
 'feat_age segment_cat',
 'feat_digital camera_cat',
 'feat_lens width_cat',
 'feat_lining material_cat',
 'feat_colour_cat',
 'feat_issued/ not-issued_cat',
 'feat_ring style_cat',
 'feat_is recyclable_cat',
 'feat_band material_cat',
 'feat_adidas_cat',
 'feat_frame type_cat',
 'feat_crown_cat',
 'feat_feature_cat',
 'feat_wind resistant_cat',
 'feat_case thickness_cat',
 'feat_rise_cat',
 'feat_mechanic_cat',
 'feat_package_cat',
 'feat_lining_cat',
 'feat_construction_cat',
 'feat_number of items_cat',
 'feat_type_cat',
 'feat_assembled in country of origin_cat',
 'feat_is waterproof_cat',
 'feat_wheel type_cat',
 'feat_clasp type_cat',
 'feat_has adaptive lenses_cat',
 'feat_domestic shipping_cat',
 'feat_sku_cat',
 'feat_reinforced knee_cat',
 'feat_style code_cat',
 'feat_batteries required?_cat',
 'feat_impact resistant_cat',
 'feat_leather grade_cat',
 'feat_box_cat',
 'feat_safety features_cat',
 'feat_ean_cat',
 'feat_retail price_cat',
 'feat_charger included_cat',
 'feat_cleaning, care & maintenance_cat',
 'feat_case type:_cat',
 'feat_adjustable_cat',
 'feat_closure_cat',
 'feat_bezel_cat',
 'feat_sub style_cat',
 'feat_item color_cat',
 'feat_vendor description_cat',
 'feat_foot arch_cat',
 'feat_display_cat',
 'feat_terrain_cat',
 'feat_count_cat',
 'feat_transactionid_cat',
 'feat_jacket length_cat',
 'feat_stability_cat',
 'feat_pattern or design_cat',
 'feat_is weather-resistant_cat',
 'feat_jewelry setting_cat',
 'feat_guaranteed authentic_cat',
 'feat_leg_cat',
 'feat_packageweight_cat',
 'feat_sub type_cat',
 'feat_year made_cat',
 'feat_fit:_cat',
 'feat_dial color_cat',
 'feat_number of compartments_cat',
 'feat_antiscratch lens coating_cat',
 'feat_recommended location_cat',
 'feat_euro size_cat',
 'feat_weather resistant_cat',
 'feat_lens type_cat',
 'feat_spikes type_cat',
 'feat_alarm_cat',
 'feat_movement_cat',
 'feat_fabric care_cat',
 'feat_multi pack indicator_cat',
 'feat_pocket_cat',
 'feat_case tone_cat',
 'feat_pant style_cat',
 'feat_outer material_cat',
 'feat_model number_cat',
 'feat_number of heat settings_cat',
 'feat_frame color_cat',
 'feat_type 2_cat',
 'feat_measurements:_cat',
 'feat_материал_cat',
 'feat_high visibility (ansi compliant)_cat',
 'feat_quantity in set_cat',
 'feat_release_cat',
 'feat_length_cat']
feats = list(set(feats))
run_model(feats, model)

(-57.53523467834916, 4.374322351702676)

In [141]:
m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
X = df[feats].values
y = df['prices_amountmin'].values
m.fit(X,y)

perm = PermutationImportance(m, random_state=1).fit(X,y);
eli5.show_weights(perm, feature_names=feats)

Weight,Feature
0.2508  ± 0.0093,brand_cat
0.1015  ± 0.0083,feat_brand_material
0.0323  ± 0.0044,feat_adjustable_cat
0.0166  ± 0.0012,feat_brand_cat
0.0067  ± 0.0008,feat_case thickness_cat
0.0049  ± 0.0009,feat_jewelry setting_cat
0.0044  ± 0.0004,feat_brand_gender
0.0038  ± 0.0003,feat_jacket length_cat
0.0029  ± 0.0003,feat_movement_cat
0.0021  ± 0.0003,feat_case type:_cat


In [0]:
!git add 