In [0]:
#!pip install --upgrade tables
#!pip install eli5

In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [4]:
cd "drive/My Drive/Colab Notebooks/matrix_one/dw_matrix"

/content/drive/My Drive/Colab Notebooks/matrix_one/dw_matrix


In [5]:
ls

[0m[01;34mdata[0m/  day3.ipynb  day4.ipynb  day5.ipynb  LICENSE  README.md


In [0]:
# Load the men's shoes dataset used by every later cell (path is relative to
# the working directory selected above).
# NOTE(review): low_memory=False presumably avoids chunked dtype inference on
# the wide, mixed-type columns -- confirm against the pandas read_csv docs.
df = pd.read_csv('data/men_shoes.csv', low_memory=False)

In [0]:
def run_model(feats, model=None):
  """Cross-validate `model` on the selected columns of the global `df`.

  Parameters
  ----------
  feats : list of str
      Column names of `df` used as the feature matrix.
  model : estimator, optional
      Estimator passed to `cross_val_score`. When omitted, a fresh
      `DecisionTreeRegressor(max_depth=5)` is created for this call.

  Returns
  -------
  tuple
      `(mean, std)` of the cross-validation scores. The scoring is
      'neg_mean_absolute_error', so the mean is a negated MAE.
  """
  # Build the default estimator per call instead of once at definition time,
  # so separate calls never share a single estimator instance.
  if model is None:
    model = DecisionTreeRegressor(max_depth=5)

  x = df[feats].values
  y = df['prices_amountmin'].values  # target column: minimum listed price

  scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [9]:
# Integer-encode the raw brand strings and score the default model on that
# single feature.
df['brand_cat'] = pd.factorize(df['brand'])[0]
run_model(feats=['brand_cat'])

(-58.38655694633361, 4.223555478221712)

In [10]:
# Same single-feature baseline, scored with a random forest instead of the
# default decision tree.
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
run_model(['brand_cat'], model=model)

(-57.47223572384038, 4.328288468270897)

In [11]:
df.head()

Unnamed: 0,id,asins,brand,categories,colors,count,dateadded,dateupdated,descriptions,dimension,ean,features,flavors,imageurls,isbn,keys,manufacturer,manufacturernumber,merchants,name,prices_amountmin,prices_amountmax,prices_availability,prices_color,prices_condition,prices_count,prices_currency,prices_dateadded,prices_dateseen,prices_flavor,prices_issale,prices_merchant,prices_offer,prices_returnpolicy,prices_shipping,prices_size,prices_source,prices_sourceurls,prices_warranty,quantities,reviews,sizes,skus,sourceurls,upc,vin,websiteids,weight,brand_cat
0,AVpfHrJ6ilAPnD_xVXOI,,Josmo,"Clothing,Shoes,Men's Shoes,All Men's Shoes",,,2016-11-07T00:45:12Z,2016-11-07T00:45:12Z,"[{""dateSeen"":[""2016-11-07T00:45:12Z""],""sourceU...",,699302000000.0,"[{""key"":""Gender"",""value"":[""Men""]},{""key"":""Shoe...",,https://i5.walmartimages.com/asr/13ac3d61-003c...,,"josmo/8190wnavy75,699302044036,0699302044036",,8190-W-NAVY-7.5,"[{""dateSeen"":[""2016-11-07T00:45:12Z""],""name"":""...","Josmo 8190 Plain Infant Walking Shoes, Navy - ...",39.89,39.89,,,,,USD,2016-11-07T00:45:12Z,2016-11-05T00:00:00Z,,True,,REDUCED USD 12.10,,,,,https://www.walmart.com/ip/Josmo-8190-Plain-In...,,,,,,https://www.walmart.com/ip/Josmo-8190-Plain-In...,699302000000.0,,,,0
1,AVpfHrJ6ilAPnD_xVXOI,,Josmo,"Clothing,Shoes,Men's Shoes,All Men's Shoes",,,2016-11-07T00:45:12Z,2016-11-07T00:45:12Z,"[{""dateSeen"":[""2016-11-07T00:45:12Z""],""sourceU...",,699302000000.0,"[{""key"":""Gender"",""value"":[""Men""]},{""key"":""Shoe...",,https://i5.walmartimages.com/asr/13ac3d61-003c...,,"josmo/8190wnavy75,699302044036,0699302044036",,8190-W-NAVY-7.5,"[{""dateSeen"":[""2016-11-07T00:45:12Z""],""name"":""...","Josmo 8190 Plain Infant Walking Shoes, Navy - ...",51.99,51.99,,,new,,USD,2016-11-07T00:45:12Z,2016-11-05T00:00:00Z,,False,UnbeatableSale - Walmart.com,REDUCED USD 12.10,,,,,https://www.walmart.com/ip/Josmo-8190-Plain-In...,,,,,,https://www.walmart.com/ip/Josmo-8190-Plain-In...,699302000000.0,,,,0
2,AVpfHsWP1cnluZ0-eVZ7,,SERVUS BY HONEYWELL,"All Men's Shoes,Shoes,Men's Shoes,Clothing",,,2016-06-14T04:29:57Z,2016-07-09T20:26:48Z,"[{""dateSeen"":[""2016-07-09T20:26:48Z""],""sourceU...",,,"[{""key"":""Gender"",""value"":[""Men""]},{""key"":""Colo...",,http://i5.walmartimages.com/dfw/dce07b8c-5844/...,,servusbyhoneywell/zsr101blmlg,,ZSR101BLMLG,"[{""dateSeen"":[""2016-06-14T04:29:57Z""],""name"":""...",Servus By Honeywell Shoe Studs Zsr101blmlg,40.02,40.02,,,new,,USD,2016-06-14T04:29:57Z,2016-03-08T00:00:00Z,,False,SIM Supply Inc - Walmart.com,,,,,,http://www.walmart.com/ip/Studs-Shoe-Large-Pr-...,,,,,,http://www.walmart.com/ip/Studs-Shoe-Large-Pr-...,,,,,1
3,AVpfHsWP1cnluZ0-eVZ7,,SERVUS BY HONEYWELL,"All Men's Shoes,Shoes,Men's Shoes,Clothing",,,2016-06-14T04:29:57Z,2016-07-09T20:26:48Z,"[{""dateSeen"":[""2016-07-09T20:26:48Z""],""sourceU...",,,"[{""key"":""Gender"",""value"":[""Men""]},{""key"":""Colo...",,http://i5.walmartimages.com/dfw/dce07b8c-5844/...,,servusbyhoneywell/zsr101blmlg,,ZSR101BLMLG,"[{""dateSeen"":[""2016-06-14T04:29:57Z""],""name"":""...",Servus By Honeywell Shoe Studs Zsr101blmlg,50.31,50.31,,,new,,USD,2016-06-14T04:29:57Z,2015-11-30T00:00:00Z,,False,SIM Supply Inc - Walmart.com,,,,,,http://www.walmart.com/ip/Studs-Shoe-Large-Pr-...,,,,,,http://www.walmart.com/ip/Studs-Shoe-Large-Pr-...,,,,,1
4,AVpfHsWP1cnluZ0-eVZ7,,SERVUS BY HONEYWELL,"All Men's Shoes,Shoes,Men's Shoes,Clothing",,,2016-06-14T04:29:57Z,2016-07-09T20:26:48Z,"[{""dateSeen"":[""2016-07-09T20:26:48Z""],""sourceU...",,,"[{""key"":""Gender"",""value"":[""Men""]},{""key"":""Colo...",,http://i5.walmartimages.com/dfw/dce07b8c-5844/...,,servusbyhoneywell/zsr101blmlg,,ZSR101BLMLG,"[{""dateSeen"":[""2016-06-14T04:29:57Z""],""name"":""...",Servus By Honeywell Shoe Studs Zsr101blmlg,46.26,46.26,,,new,,USD,2016-06-14T04:29:57Z,2016-04-29T00:00:00Z,,False,SIM Supply Inc - Walmart.com,,,,,,http://www.walmart.com/ip/Studs-Shoe-Large-Pr-...,,,,,,http://www.walmart.com/ip/Studs-Shoe-Large-Pr-...,,,,,1


In [13]:
# Re-encode the brand after lower-casing it, so spellings that differ only by
# case share one code. str() turns missing values into the text 'nan'.
df['brand_cat'] = pd.factorize(df['brand'].map(lambda raw: str(raw).lower()))[0]
run_model(feats=['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [14]:
df.features.head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [0]:
def parse_features(x):
  """Turn one raw `features` cell into a Python list.

  A value whose string form is 'nan' yields an empty list. Any other value
  has every backslash-escaped double quote replaced by a plain double quote
  and is then evaluated as a Python literal.
  """
  is_missing = str(x) == 'nan'
  if is_missing:
    return []

  unescaped = x.replace('\\"', '"')
  return literal_eval(unescaped)

# Parse every raw `features` cell and keep the result in a new column.
df['features_parsed'] = df['features'].map(parse_features) 

In [16]:
df['features_parsed'].head().values 

array([list([{'key': 'Gender', 'value': ['Men']}, {'key': 'Shoe Size', 'value': ['M']}, {'key': 'Shoe Category', 'value': ["Men's Shoes"]}, {'key': 'Color', 'value': ['Multicolor']}, {'key': 'Manufacturer Part Number', 'value': ['8190-W-NAVY-7.5']}, {'key': 'Brand', 'value': ['Josmo']}]),
       list([{'key': 'Gender', 'value': ['Men']}, {'key': 'Shoe Size', 'value': ['M']}, {'key': 'Shoe Category', 'value': ["Men's Shoes"]}, {'key': 'Color', 'value': ['Multicolor']}, {'key': 'Manufacturer Part Number', 'value': ['8190-W-NAVY-7.5']}, {'key': 'Brand', 'value': ['Josmo']}]),
       list([{'key': 'Gender', 'value': ['Men']}, {'key': 'Color', 'value': ['Black']}, {'key': 'Shipping Weight (in pounds)', 'value': ['0.45']}, {'key': 'Condition', 'value': ['New']}, {'key': 'Brand', 'value': ['SERVUS BY HONEYWELL']}, {'key': 'manufacturer_part_number', 'value': ['ZSR101BLMLG']}]),
       list([{'key': 'Gender', 'value': ['Men']}, {'key': 'Color', 'value': ['Black']}, {'key': 'Shipping Weight (in

In [0]:
def parse_features(x):
  """Turn one raw `features` cell into a flat `{key: value}` dict.

  This definition replaces the earlier list-returning `parse_features`.
  A value whose string form is 'nan' yields an empty dict. Otherwise each
  parsed item contributes its lower-cased, stripped 'key' mapped to the
  lower-cased, stripped first entry of its 'value' list. When two items
  normalise to the same key, the later one wins.
  """
  if str(x) == 'nan':
    return {}

  items = literal_eval(x.replace('\\"', '"'))
  return {
    item['key'].lower().strip(): item['value'][0].lower().strip()
    for item in items
  }

# Re-parse with the dict-returning parse_features; this overwrites the
# list-valued `features_parsed` column created earlier.
df['features_parsed'] = df['features'].map(parse_features) 

In [18]:
# Collect every distinct feature key that occurs in any parsed row.
keys = set()
for parsed in df['features_parsed']:
  keys.update(parsed.keys())

len(keys)

476

In [19]:
def get_name_feat(key):
  """Return the column name for feature `key`: the key prefixed with 'feat_'."""
  prefix = 'feat_'
  return prefix + key

# Create one 'feat_<key>' column per discovered key. Rows whose parsed dict
# does not contain the key receive NaN.
for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df['features_parsed'].map(lambda parsed: parsed.get(key, np.nan))

HBox(children=(IntProgress(value=0, max=476), HTML(value='')))




In [20]:
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_country of manufacture', 'feat_conflict', 'feat_frame shape',
       'feat_fine or fashion', 'feat_clothing type', 'feat_size/dimensions',
       'feat_technician', 'feat_lens width', 'feat_to fit', 'feat_profession'],
      dtype='object', length=526)

In [21]:
# Shape of the subset of rows that have no 'athlete' feature.
df.loc[df['feat_athlete'].isna()].shape

(18272, 526)

In [22]:
df.shape[0]

18280

In [23]:
# Percentage of rows in which the 'athlete' feature is present.
int(df['feat_athlete'].notna().sum()) / len(df) * 100

0.0437636761487965

In [0]:
# Coverage per feature key: percentage of rows with a non-null value in the
# corresponding 'feat_<key>' column.
keys_stat = {
  key: int(df[get_name_feat(key)].notna().sum()) / len(df) * 100
  for key in keys
}

In [25]:
# Keep only the feature keys that are filled in for more than 30% of rows.
{name: share for name, share in keys_stat.items() if share > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

## Feature engineering (a first pass)


In [0]:
# Integer-encode every extracted 'feat_<key>' column into 'feat_<key>_cat'.
# The loop runs over all of `keys`, which covers brand, color, gender,
# manufacturer part number, material, sport and style, so those columns need
# no separate hand-written factorize line.
for key in keys:
  df[get_name_feat(key) + '_cat'] = df[get_name_feat(key)].factorize()[0]

In [27]:
# Shape of the rows whose top-level brand differs from the brand found in
# the parsed features.
df.loc[df['brand'] != df['feat_brand']].shape

(18228, 1002)

In [28]:
# Shape of the rows whose top-level brand equals the brand found in the
# parsed features.
df.loc[df['brand'] == df['feat_brand']].shape

(52, 1002)

## After normalizing `brand` to lowercase

In [29]:
# Lower-case the top-level brand in place (str() turns missing values into
# the text 'nan'), then preview the rows that still disagree with feat_brand.
df['brand'] = df['brand'].map(lambda raw: str(raw).lower())
df.loc[df['brand'] != df['feat_brand'], ['brand', 'feat_brand']].head()

Unnamed: 0,brand,feat_brand
21,rubies,generic
22,rubies,generic
23,rubies,generic
24,unbranded,
31,american fighter,


In [30]:
# Lower-casing is idempotent, so repeating it here keeps this cell runnable
# on its own. Then take the shape of the rows where both brands agree.
df['brand'] = df['brand'].map(lambda raw: str(raw).lower())
df.loc[df['brand'] == df['feat_brand']].shape

(8846, 1002)

In [0]:
feats = ['']

In [32]:
# Random-forest score on the brand code alone, as a reference point for the
# richer feature sets tried afterwards.
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
run_model(['brand_cat'], model=model)

(-57.31783843165656, 4.181246596160967)

In [0]:
# Names of the integer-encoded columns. Match on the '_cat' suffix: a plain
# substring test for 'cat' would also pick up raw text columns such as
# 'categories', or any 'feat_*' column whose key merely contains "cat".
feats_cat = [x for x in df.columns if x.endswith('_cat')]
#feats_cat

In [34]:
df['weight'].unique()

array([nan, '3.0 lbs', '9 g', '1.45 lbs', '0.45 lbs', '1.0 lbs',
       '0.23 lbs', '5.0 lbs', '5.5 lbs', '7.45 lbs', '4.0 lbs',
       '2.7969 lbs', '3.9 lbs', '4.6 pounds', '2.1 lbs', '1.1057 lbs',
       '15.0 lbs', '2.4 ounces', '454 g', '0.105 lbs', '9.1 ounces',
       '4.8 lbs', '6.1 lbs', '6.5 lbs', '1.1041 lbs', '1.3 Kg', '91 g',
       '20.0 lbs', '6.0 lbs', '386 g', '0.81 lbs', '4.5 lbs',
       '0.5 ounces', '2.0 lbs', '3.13 lbs', '5.9 lbs', '6.15 lbs',
       '1 pounds', '1.95 lbs', '2.15 lbs', '2 pounds', '2.1 pounds',
       '14 Kg', '0.4788 lbs', '10.0 lbs', '0.38 lbs', '2.5 lbs',
       '68.912 lbs', '45 g', '13.09 lbs', '2.5 pounds', '0.21 lbs',
       '16.75 lbs', '6.3 lbs', '272 g', '1.8 Kg', '2.8 pounds', '0.1 lbs',
       '5.05 lbs', '0.28 lbs', '76.08 lbs', '0.15 lbs', '200 g',
       '7.8 pounds', '399 g', '4.95 lbs', '64.144 lbs', '24 pounds',
       '73.696 lbs', '1.6 lbs', '6.6 ounces', '5 g', '1.2 Kg', '862 g',
       '3.05 lb', '8.6 ounces', '3.6 lbs', '71.

In [0]:
# Candidate feature set: the raw brand code plus a hand-picked selection of
# encoded columns extracted from `features`.
feats = [
  'brand_cat',
  'feat_lens_cat',
  'feat_brand_cat',
  'feat_fabric material_cat',
  'feat_color_cat',
  'feat_gender_cat',
  'feat_manufacturer part number_cat',
  'feat_material_cat',
  'feat_style_cat',
  'feat_sport_cat',
]

model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
result = run_model(feats, model)

In [36]:
# Fit the forest on the full data set and inspect permutation importances
# for the current `feats`.
X = df[feats].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
m.fit(X, y)

# Show the cross-validation result computed earlier next to the importances.
print(result)
perm = PermutationImportance(m, random_state=1).fit(X, y)
eli5.show_weights(perm, feature_names=feats)

(-57.39382093676572, 4.499047943311525)


Weight,Feature
0.2541  ± 0.0062,brand_cat
0.0917  ± 0.0103,feat_material_cat
0.0449  ± 0.0040,feat_gender_cat
0.0128  ± 0.0009,feat_brand_cat
0.0073  ± 0.0017,feat_color_cat
0.0063  ± 0.0010,feat_style_cat
0.0057  ± 0.0007,feat_fabric material_cat
0.0036  ± 0.0002,feat_lens_cat
0.0005  ± 0.0002,feat_manufacturer part number_cat
0.0002  ± 0.0000,feat_sport_cat


In [0]:
# Same feature set as before, without 'feat_manufacturer part number_cat'.
feats = [
  'brand_cat',
  'feat_lens_cat',
  'feat_brand_cat',
  'feat_fabric material_cat',
  'feat_color_cat',
  'feat_gender_cat',
  'feat_material_cat',
  'feat_style_cat',
  'feat_sport_cat',
]

model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
result = run_model(feats, model)

In [38]:
df['brand'].value_counts(normalize=True)

nike                   0.097210
puma                   0.033315
ralph lauren           0.028775
vans                   0.021116
new balance            0.020295
                         ...   
augusta sportswear     0.000055
tyr                    0.000055
concepts sports        0.000055
columbia sportswear    0.000055
es                     0.000055
Name: brand, Length: 1732, dtype: float64

In [39]:
# Peek at the parsed features of the first few 'nike' rows.
df.loc[df['brand'] == 'nike', 'features_parsed'].head().values

array([{'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'soccer', 'main color': 'orange', 'type': 'cleats', 'condition': 'new without box'},
       {'style': 'athletic sneakers', 'condition': 'new with box'}, {}],
      dtype=object)