# Possible Bias in product recommendation: 
Due to the selection of a
particular human model in a product image. These correlations may
result in the underrepresentation of particular niche markets in the
interaction data; for example, a female user who would potentially
like motorcycle products may be less likely to interact with them if
they are promoted using stereotypically ‘male’ images.

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import chi2

In [2]:
elec = pd.read_csv("electronics.csv")
mod = pd.read_csv("modcloth.csv")

# **`Market bias`**
1.   **Hypothesis H0 when there is no market bias**
2.   **Hypothesis H1 when there is market bias**

In [3]:
#Function to detect market bias between user attribute and model attribute
def marketbias(df):
    tab = pd.crosstab(df['model_attr'], df['user_attr'])
    val = chi2_contingency(tab.values)
    expected_values = val[3]
    dof = val[2]
    print("Expected values: ",expected_values)
    chi_square=sum([(o-e)**2./e for o,e in zip(tab.values,expected_values)])
    chi_square_statistic=chi_square[0]+chi_square[1]
    print( "-------------------------------------")
    print("Chi-square statistic: ",chi_square_statistic)
    print( "-------------------------------------")
    critical_value=chi2.ppf(q=1-0.05,df=dof)
    print("Critical value: ",critical_value)
    p_value=1-chi2.cdf(x=chi_square_statistic,df=dof)
    print( "-------------------------------------")
    print("P-value: ",p_value)
    if (chi_square_statistic>=critical_value) & (p_value<=0.05):
        print("Reject H0,There is Market bias")
    else:
        print("Retain H0,There is no Market bias")

## Checking Market bias in ModCloth 

In [4]:
mod.head()

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,De,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,tasha,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


In [5]:
mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99893 entries, 0 to 99892
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   item_id     99893 non-null  int64  
 1   user_id     99892 non-null  object 
 2   rating      99893 non-null  int64  
 3   timestamp   99893 non-null  object 
 4   size        78133 non-null  float64
 5   fit         81387 non-null  object 
 6   user_attr   91526 non-null  object 
 7   model_attr  99893 non-null  object 
 8   category    99893 non-null  object 
 9   brand       25913 non-null  object 
 10  year        99893 non-null  int64  
 11  split       99893 non-null  int64  
dtypes: float64(1), int64(4), object(7)
memory usage: 9.1+ MB


In [6]:
mod2 = mod[mod.user_attr.isnull() == False]

In [7]:
mod2.isna().sum()

item_id           0
user_id           1
rating            0
timestamp         0
size          13393
fit           14588
user_attr         0
model_attr        0
category          0
brand         68365
year              0
split             0
dtype: int64

In [8]:
marketbias(mod2)

Expected values:  [[ 7792.97543867 31045.02456133]
 [10572.02456133 42115.97543867]]
-------------------------------------
Chi-square statistic:  158.94978970213168
-------------------------------------
Critical value:  3.841458820694124
-------------------------------------
P-value:  0.0
Reject H0,There is Market bias


# Checking Market bias in electronics

In [13]:
elec.head()

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,0,0,5.0,1999-06-13,Female,Portable Audio & Video,,1999,,0
1,0,1,5.0,1999-06-14,Female,Portable Audio & Video,,1999,,0
2,0,2,3.0,1999-06-17,Female,Portable Audio & Video,,1999,,0
3,0,3,1.0,1999-07-01,Female,Portable Audio & Video,,1999,,0
4,0,4,2.0,1999-07-06,Female,Portable Audio & Video,,1999,,0


In [9]:
elec2 = elec[elec.user_attr.isnull() == False]

In [10]:
elec2.isna().sum()

item_id            0
user_id            0
rating             0
timestamp          0
model_attr         0
category           0
brand         128958
year               0
user_attr          0
split              0
dtype: int64

In [11]:
marketbias(elec2)

Expected values:  [[32786.10760148 33059.89239852]
 [25597.12388872 25810.87611128]
 [28316.7685098  28553.2314902 ]]
-------------------------------------
Chi-square statistic:  581.8488687434312
-------------------------------------
Critical value:  5.991464547107979
-------------------------------------
P-value:  0.0
Reject H0,There is Market bias


# Thus, Both the datasets have market bias.