In [98]:
import pandas as pd

# read in data 
womens_df = pd.read_csv('/Users/anikamisra/Desktop/personal-projects/pink-tax/gymshark/w_df_scraped_jul_14.csv')
mens_df = pd.read_csv('/Users/anikamisra/Desktop/personal-projects/pink-tax/gymshark/m_df_scraped_jul_14.csv')
womens_df.head()

Unnamed: 0.1,Unnamed: 0,Product,Price
0,1,Silicone Grip Lifting Straps,$16
1,2,Vital Seamless 2.0 Leggings,$54
2,3,Crew Socks 3pk,$16
3,4,Crew Socks 5pk,$26
4,5,Strong Girl Lifting Club Oversized Graphic Crew,$42


In [99]:
womens_df = womens_df.drop(columns = ["Unnamed: 0"])
mens_df = mens_df.drop(columns = ["Unnamed: 0"])
print("Number of womens products: ", womens_df.shape[0], "\nNumber of mens products: ", mens_df.shape[0])

Number of womens products:  960 
Number of mens products:  960


According to the website, there are over 960 products for both mens and womens clothing. However, when I try to webscrape on a page number greater than 16 it tells me that the page does not exist, and when I try to manually load all the products it stops at page 3. So for now, we can only gain access to 960 of the products for both mens and womens products. 

However, this is good because there is no class imbalance. 

## Part 1: Create feature matrix 
BoW technique 
1. Each row is associated with a unique product description
2. Each column represents a potential word in the vocabulary 
3. If the product description contains a certain word, put 1 in that column (or >1 if it appears more than once). Otherwise, put 0. 
4. For all women's products, put 1 in the "womens" column. For all men's products, put 0 in the "mens" column. 

# finish these descrptions tomorrow babe! 

In [100]:
## create list of words 
words_womens = womens_df['Product'].str.split()
words_mens = mens_df['Product'].str.split()
unique_words = set()
for item in words_womens: 
    for word in item: 
        unique_words.add(word.lower())
for item in words_mens: 
    for word in item: 
        unique_words.add(word.lower())
print(unique_words)
print(len(unique_words))


{'flex', 'zip', 'sleek', 'peek', 'elevate', 'strappy', 'v', 'contour', 'jersey', 'glute', 'heritage', 'seamless', 'out', 'gs', 'boxer', 'leggings', 'joggers', 'tracktop', 'heart', 'fit)', '2.0', 'tank', 'cover', 'stringer', 'shorts', 'everyday', 'rest', 'muscle', 'silicone', 'gym', 'ruched', 'adapt', 'graphic', 'boo', 'high', 'crest', 'marl', 'small', 'regular', 'top', '5pk', 'fit', 'lifting', 'arm', 'sport', 'legacy', 'washed', 'american', 'club', 'extreme', 'strength', 'pump', 'minimal', 'react', 't-shirt', 'waisted', 'vital', 'long', 'pocket', 'cap', 'crop', 'sports', 'training', 'socks', 'hoodie', 'lightweight', 'studio', 'strong', 'cycling', 'day', 'sharkhead', 'a', '(reg', 'crew', 'cut', 'drop', 'bra', 'essential', 'bag', 'backpack', 'power', 'neck', 'boost', 'grip', 'arrival', 'brief', 'bandeau', 'cargo', '3pk', '5"', 'oversized', 'straps', 'fleck', 'girl', 'mesh', 'department', '7"', 'woven', 'sleeve', 'fleece'}
100


In [101]:
# remove nouns
import spacy  
nlp = spacy.load("en_core_web_sm")

no_nouns = []
for word in list(unique_words): 
    doc = nlp(word)
    is_noun = False
    for token in doc: 
        if token.pos_ == "NOUN": 
            is_noun = True 
            break 
    if is_noun == False: 
        no_nouns.append(word)

print(len(no_nouns))
            

[W095] Model 'en_core_web_sm' (3.5.0) was trained with spaCy v3.5.0 and may not be 100% compatible with the current version (3.7.4). If you see errors or degraded performance, download a newer compatible model or retrain your custom model with the current spaCy version. For more details and available updates, run: python -m spacy validate


58


In [122]:
# create feature matrix 

feat_mat = pd.DataFrame(columns=no_nouns)
feat_mat['womens'] = 0
feat_mat['Price'] = 0

# mens_df['Price'] = mens_df['Price'].str.replace('$', '')

for index, row in womens_df.iterrows(): 
    product = row['Product']
    price = row['Price']
    
    # initialize all column values to 0 
    newrow = {word: 0 for word in no_nouns}
    newrow['Price'] = float(price.replace("$", ""))

    words = set(product.lower().split())
    for word in words: 
        if word in newrow: 
            newrow[word] = 1 
    # since we are pulling from womens df add 1 to the "womens" word 
    newrow['womens'] = 7

    feat_mat.loc[index] = newrow

NEW_INDEX = feat_mat.shape[0] # now we append mens products from the bottom of previously created dataframe 
for index, row in mens_df.iterrows(): 
    product = row['Product']
    price = row['Price']
    
    # initialize all column values to 0 
    newrow = {word: 0 for word in no_nouns}
    newrow['Price'] = float(price.replace("$", ""))

    words = set(product.lower().split())
    for word in words: 
        if word in newrow: 
            newrow[word] = 1 
    newrow['womens'] = 0
    feat_mat.loc[index+NEW_INDEX] = newrow

# shuffle rows for randomness 
feat_mat = feat_mat.sample(frac=1).reset_index(drop=True)
feat_mat.tail()

Unnamed: 0,flex,zip,sleek,peek,elevate,strappy,v,contour,jersey,glute,...,3pk,"5""",oversized,fleck,"7""",woven,sleeve,fleece,womens,Price
1915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36.0
1916,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,26.0
1917,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,16.0
1918,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,40.0
1919,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24.0


In [123]:
df = feat_mat
w_df_all_data = df[df['womens'] == 7]
m_df_all_data = df[df['womens'] == 0]
print("Womens median price: ", w_df_all_data.describe().loc['50%', 'Price'],
 "\nMens median price: ", m_df_all_data.describe().loc['50%', 'Price'])

print(df.isnull().sum().sum())

Womens median price:  30.0 
Mens median price:  26.3
0


In [124]:
# test train split 
from sklearn.model_selection import train_test_split
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)
x_train.to_csv("x_train.csv", index=False)
x_test.to_csv("x_test.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)
# save these files as csv so that we use the same data for all different types of models

In [125]:
x_train = pd.read_csv("x_train.csv")
x_test = pd.read_csv("x_test.csv")
y_train = pd.read_csv("y_train.csv")
y_test = pd.read_csv("y_test.csv")

print(" X train shape: ", x_train.shape, 
      "\n Y train shape: ", y_train.shape,
       "\n X test shape: ", x_test.shape, 
         "\n Y test shape: ", y_test.shape)

y_test = y_test.values.flatten()

 X train shape:  (1440, 59) 
 Y train shape:  (1440, 1) 
 X test shape:  (480, 59) 
 Y test shape:  (480, 1)


In [126]:
# lasso (linear pricing model)
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01)
lasso.fit(x_train, y_train)
y_pred = lasso.predict(x_test)
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
print("MSE: ", mean_squared_error(y_test, y_pred))
corr, _ = pearsonr(y_test, y_pred)
print("Correlation: ", corr)

MSE:  33.146373482112296
Correlation:  0.8357839275061998


In [127]:
coefs = lasso.coef_
feature_names = x_train.columns
coefs_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefs})
womens_coef = coefs_df[coefs_df['Feature'].str.contains('womens', case=False, na=False)]
print(womens_coef)

   Feature  Coefficient
58  womens      0.08726


So according to a linear pricing model, the womens label is not influential at all. 

In [92]:
# random forest 

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(random_state=16)
rfr.fit(x_train, y_train)
y_pred = rfr.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
corr, _ = pearsonr(y_test, y_pred)
print("Pearson correlation coefficient:", corr)
print("Mean squared error: ", mse)

Pearson correlation coefficient: 0.9160473036947532
Mean squared error:  13.938015439933672


  rfr.fit(x_train, y_train)


In [93]:
import shap
import numpy as np 

x = df.iloc[:, :-1]
y = df.iloc[:, -1]

explainer = shap.Explainer(rfr)

shap_values = explainer.shap_values(x_train, check_additivity=False)

import pickle

# save SHAP values to a file
with open('shap_values.pkl', 'wb') as f:
    pickle.dump(shap_values, f)

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [97]:
# RUN THIS WHEN U WANT TO LOAD UP SHAP VALUES AGAIN 
import pickle
with open('shap_values.pkl', 'rb') as f:
    shap_values = pickle.load(f)

shap_values = shap.Explanation(values=shap_values, base_values=explainer.expected_value, data=x_train, feature_names=x_train.columns)
mean_shap_values = pd.DataFrame({
    'Feature': x_train.columns, 
    'Mean SHAP Value': np.mean(shap_values.values, axis=0)
})

mean_shap_values = mean_shap_values.sort_values(by='Mean SHAP Value', ascending=True)
mean_shap_values.head()

Unnamed: 0,Feature,Mean SHAP Value
58,womens,-0.283924
52,oversized,-0.100318
12,gs,-0.092023
42,hoodie,-0.05968
50,3pk,-0.043141
