In [None]:
import itertools

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.spatial import distance

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

## Load data

In [None]:
# Load the pooled feature table into the global frame `pf` used by every later cell.
# How later cells use the columns:
#   - column "0" is matched (as text) against the ids passed to check_similarity
#     (presumably article ids -- TODO confirm),
#   - column "1" is compared against product type names,
#   - every column from position 2 onward is fed to the scaler / distance computations.
pf = pd.read_csv("pooled_features.csv")
# Preview the first five rows.
pf.head(5)

In [None]:
# Rescale the feature columns (position 2 onward) with a MinMaxScaler using its
# default settings, fitted on the whole table.
scaler=MinMaxScaler()
# NOTE: this overwrites the feature columns of `pf` in place, so the raw values
# are no longer available after this cell; every later cell sees scaled features.
pf.iloc[:,2:]=scaler.fit_transform(pf.iloc[:,2:])

In [None]:
# Number of rows for each distinct value of column "1" (the product type column).
pf.groupby('1').size()

## Calculate within class distance

In [None]:
def within(product_type_name):
    """Print the mean pairwise Euclidean distance inside one product type.

    Rows of the global ``pf`` whose column "1" equals ``product_type_name``
    are selected and the columns from position 2 onward are used as feature
    vectors. Every unordered pair of distinct rows is counted exactly once,
    including pairs at distance 0 (rows with identical feature vectors).

    Parameters
    ----------
    product_type_name
        Value compared against column "1" of ``pf`` to select the category.

    Raises
    ------
    ValueError
        If no row of ``pf`` belongs to ``product_type_name``.
    """
    features = (
        pf.loc[pf["1"] == product_type_name].iloc[:, 2:].to_numpy(dtype=float)
    )
    if features.shape[0] == 0:
        raise ValueError(f"No rows found for product type {product_type_name!r}")

    # pdist returns one entry per unordered pair of distinct rows, so the
    # self-distances never appear and there is no need to filter zeros out.
    # Filtering on `!= 0` would also discard genuine zero distances between
    # duplicate rows and bias the mean upwards.
    pairwise = distance.pdist(features, metric="euclidean")

    # A single-row category has no pairs; report NaN rather than averaging
    # an empty array.
    mean_distance = pairwise.mean() if pairwise.size else float("nan")
    print(f"Within category distance for {product_type_name}", mean_distance)

In [None]:
within('Sweater')

In [None]:
within('Trousers')

In [None]:
within('Dress')

In [None]:
within('Underwear bottom')

In [None]:
within('Skirt')

In [None]:
within('T-shirt')

In [None]:
within('Shorts')

In [None]:
within('Shirt')

## Calculate between class distance

In [None]:
def between(p1, p2):
    """Print the mean cosine similarity between two product types.

    Rows of the global ``pf`` whose column "1" equals ``p1`` / ``p2`` are
    selected, with the columns from position 2 onward used as feature
    vectors. Both categories are cut to the first ``min_n`` rows (the size
    of the smaller one) and the cosine similarity ``1 - cosine distance`` is
    averaged over every (row of ``p1``, row of ``p2``) pair.

    NOTE: a later cell defines another ``between`` (Euclidean distance with
    an ``n_shuffle`` argument) that replaces this one once it has been run.

    Parameters
    ----------
    p1, p2
        Values compared against column "1" of ``pf`` to select each category.

    Raises
    ------
    ValueError
        If either category has no rows in ``pf``.
    """
    features_1 = pf.loc[pf["1"] == p1].iloc[:, 2:]
    features_2 = pf.loc[pf["1"] == p2].iloc[:, 2:]

    min_n = min(features_1.shape[0], features_2.shape[0])
    if min_n == 0:
        raise ValueError(f"No rows found for product type {p1!r} and/or {p2!r}")

    # Same number of rows from each category.
    block_1 = features_1.iloc[:min_n, :].to_numpy(dtype=float)
    block_2 = features_2.iloc[:min_n, :].to_numpy(dtype=float)

    # One vectorised call gives the full min_n x min_n cross matrix. Pair
    # (i, j) compares row i of p1 with row j of p2, which is a different pair
    # from (j, i), so all entries are needed, not only those with i <= j.
    similarity = 1.0 - distance.cdist(block_1, block_2, metric="cosine")

    # The value is a similarity (higher = closer), so label it as such.
    print(f"Between category cosine similarity for {p1}, {p2}", similarity.mean())

In [None]:
between('Sweater','Trousers')

In [None]:
def between(p1, p2, n_shuffle=1, random_state=0):
    """Print the mean Euclidean distance between two product types.

    Rows of the global ``pf`` whose column "1" equals ``p1`` / ``p2`` are
    selected, with the columns from position 2 onward used as feature
    vectors. In each of ``n_shuffle`` rounds, ``min_n`` rows (the size of the
    smaller category) are drawn without replacement from *each* category, and
    the mean of the resulting ``min_n x min_n`` distance matrix is
    accumulated. The printed value is the average over all rounds.

    Parameters
    ----------
    p1, p2
        Values compared against column "1" of ``pf`` to select each category.
    n_shuffle : int, default 1
        Number of random balanced subsamples to average over. ``0`` raises
        ``ZeroDivisionError`` when the average is taken.
    random_state : int or None, default 0
        Seed passed to ``numpy.random.default_rng``. The fixed default keeps
        notebook output reproducible; pass ``None`` for fresh randomness.

    Raises
    ------
    ValueError
        If either category has no rows in ``pf``.
    """
    features_1 = pf.loc[pf["1"] == p1].iloc[:, 2:].to_numpy(dtype=float)
    features_2 = pf.loc[pf["1"] == p2].iloc[:, 2:].to_numpy(dtype=float)

    min_n = min(features_1.shape[0], features_2.shape[0])
    if min_n == 0:
        raise ValueError(f"No rows found for product type {p1!r} and/or {p2!r}")

    rng = np.random.default_rng(random_state)
    total = 0.0

    for _ in range(n_shuffle):
        # Draw the subsample from the WHOLE category. Shuffling only the
        # first min_n rows would leave the matrix mean unchanged, because it
        # is invariant to row order, and every round would repeat the same
        # number.
        rows_1 = rng.choice(features_1.shape[0], size=min_n, replace=False)
        rows_2 = rng.choice(features_2.shape[0], size=min_n, replace=False)
        total += distance.cdist(
            features_1[rows_1], features_2[rows_2], metric="euclidean"
        ).mean()

    print(f"Between category distance for {p1}, {p2}", total / n_shuffle)

In [None]:
between('Sweater','Trousers', 5)

In [None]:
between('Sweater','T-shirt', 5)

In [None]:
between('Dress','T-shirt', 5)

In [None]:
between('Dress','Trousers', 5)

In [None]:
between('Dress','Underwear bottom', 5)

In [None]:
between('Shorts','Underwear bottom', 5)

In [None]:
between('Shorts','Skirt', 5)

In [None]:
between('Skirt','Underwear bottom', 5)

In [None]:
between('Shorts','Trousers', 5)

In [None]:
between('Shirt','T-shirt', 5)

In [None]:
between('Shirt','Trousers', 5)

## Color comparison

In [None]:
def check_similarity(a1, a2):
    """Print the Euclidean distance between the feature vectors of two ids.

    Despite the name, the printed value is a distance (``numpy.linalg.norm``
    of the difference), so smaller means more similar. Each id is looked up
    in column "0" of the global ``pf``; the feature vector is the first
    matching row, columns from position 2 onward.

    Parameters
    ----------
    a1, a2
        Ids to compare, e.g. ``"0212629047"``. Column "0" is converted to
        text and left-padded with zeros to the length of the requested id
        before matching, so ids whose leading zeros were lost on load are
        still found.

    Raises
    ------
    IndexError
        If an id is not present in column "0" of ``pf``.
    """
    def _features(item_id):
        item_id = str(item_id)
        # Pad to the requested length instead of prepending a fixed "0": this
        # matches the same rows as before and also handles ids that lost more
        # than one leading zero (or none at all).
        stored_ids = pf["0"].astype(str).str.zfill(len(item_id))
        rows = pf.loc[stored_ids == item_id]
        if rows.empty:
            raise IndexError(f"id {item_id!r} not found in column '0' of pf")
        # Slice the feature columns before taking the row so the result is a
        # float vector rather than an object-dtype row of mixed columns.
        return rows.iloc[:1, 2:].to_numpy(dtype=float)[0]

    print(np.linalg.norm(_features(a1) - _features(a2)))

### Gray dresses

In [None]:
check_similarity("0212629047","0212629036")

### Red dresses

In [None]:
check_similarity("0212629035","0212629043")

In [None]:
check_similarity("0212629040","0212629048")

### Gray and Red dresses

In [None]:
check_similarity("0212629047","0212629043")

### Gray dress and black trousers

In [None]:
check_similarity("0212629047","0186372042")

### Red dress and black trousers

In [None]:
check_similarity("0212629043","0186372042")

### Gray trousers

In [None]:
check_similarity("0326885026","0326885032")

### Gray shorts

In [None]:
check_similarity("0219075014","0219075017")

### Yellow and gray t-shirt

In [None]:
check_similarity("0203027048","0203027047")

### Gray dress and gray trousers

In [None]:
check_similarity("0212629047","0326885026")

### Yellow t-shirt and red dress

In [None]:
check_similarity("0203027048","0212629043")

## Texture comparison

### Dinosaur print sweater with same color solid pants

In [None]:
check_similarity("0311059024","0326885030")

### Dinosaur print sweater with matching color and print pants

In [None]:
check_similarity("0311059024","0331474015")

### Dinosaur print sweater with different color but same print pants

In [None]:
check_similarity("0311059024","0331474016")