## Classification Model

In [98]:
import pandas as pd
import numpy as np
from numpy import array
from numpy import asarray
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
data=pd.read_csv('full_data_joined_attr_processed_1.csv', index_col=0)

In [4]:
data.shape

(107791, 6)

### Get document vector using gensim doc2vec

In [5]:
# get unique data from our cleaned data description, details, etc.
doc=pd.Series(data['final'].unique()).str.split()
doc

0       [teah, stretch-silk, camisole, beige, stretch-...
1       [layered, velvet, mini, dress, black, velvet, ...
2       [faux-leather, pleated, midi, skirt, accordion...
3       [parker, logo-embellished, snake-effect, leath...
4       [silk-crepe, midi, dress, black, silk-crepe, c...
                              ...                        
3670    [micro, twill, pull, pant, casual, trouser, cu...
3671    [brushed, twill, crop, wide, leg, pant, brushe...
3672    [slim, crop, pant, tailored, soft, dense, knit...
3673    [camo, print, silk, skirt, flatlock-stitched, ...
3674    [stretch, pima, cotton, baby, tee, made, atm, ...
Length: 3675, dtype: object

In [6]:
# import gensim and get document vector
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(doc)]
model = Doc2Vec(documents, vector_size=50, window=4, min_count=2, workers=4)

In [7]:
doc_vec=[]

In [8]:
# get doc2vec
for sentence in doc:
    vec=model.infer_vector(sentence).reshape(1, -1)
    doc_vec.append(vec)

In [9]:
vec_df = pd.DataFrame({
            'final': data['final'].unique(),
            'doc': doc,
            'doc_vec': doc_vec
        })

In [10]:
# merge with original data to link all the rows with doc2vec
data=pd.merge(data, vec_df, on='final', how='left')

### 1. Classification model_style

In [69]:
# import one hot encoded file 
style_file=pd.read_csv('style_code.csv')

In [70]:
style_file=style_file.fillna(0)

In [71]:
# get all style tags
style_cat=['androgynous','athleisure', 'boho','businesscasual','casual','classic','edgy','glam','modern','retro','romantic']

In [72]:
style_file['doc_vec']=data['doc_vec'].map(lambda x: x[0])

In [73]:
# run logistic regression model for each tag in style
style_acc=[]
for style in style_cat:
    new_style=style_file.sort_values(by=['product_id',style], ascending=False).drop_duplicates(subset='product_id',keep='first')
    y=new_style[style]
    model=LogisticRegression()
    new_style["TARGET"] = y
    train_df, test_df = train_test_split(new_style)
    X_train=train_df['doc_vec'].tolist()
    X_test=test_df['doc_vec'].tolist()
    y_train = train_df["TARGET"]
    y_test = test_df["TARGET"]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'accuracy for {style}: {np.mean(y_pred == y_test)}')
    style_acc.append(np.mean(y_pred == y_test))

accuracy for androgynous: 0.823766364551863
accuracy for athleisure: 0.932527693856999
accuracy for boho: 0.8831822759315207
accuracy for businesscasual: 0.7109768378650554
accuracy for casual: 0.6626384692849949
accuracy for classic: 0.6354481369587109
accuracy for edgy: 0.7955689828801611
accuracy for glam: 0.9073514602215509
accuracy for modern: 0.5861027190332326
accuracy for retro: 0.9365558912386707
accuracy for romantic: 0.8660624370594159


### 2. model_embellishment

In [74]:
# import one hot encoded file 
embel_file=pd.read_csv('embel_code.csv')

In [75]:
embel_file=embel_file.fillna(0)

In [76]:
# get all embellishment tags
embel_cat=['buckles','embroidery','fringe','lace','mesh','ruffles','sequins','studs','trim']

In [77]:
embel_file['doc_vec']=data['doc_vec'].map(lambda x: x[0])

In [78]:
# run logistic regression model for each tag in embellishment
embel_acc=[]
for embel in embel_cat:
    new_embel=embel_file.sort_values(by=['product_id',embel], ascending=False).drop_duplicates(subset='product_id',keep='first')
    y=new_embel[embel]
    model=LogisticRegression()
    new_embel["TARGET"] = y
    train_df, test_df = train_test_split(new_embel)
    X_train=train_df['doc_vec'].tolist()
    X_test=test_df['doc_vec'].tolist()
    y_train = train_df["TARGET"]
    y_test = test_df["TARGET"]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'accuracy for {embel}: {np.mean(y_pred == y_test)}')
    embel_acc.append(np.mean(y_pred == y_test))

accuracy for buckles: 0.9969788519637462
accuracy for embroidery: 0.9949647532729103
accuracy for fringe: 0.998992950654582
accuracy for lace: 0.998992950654582
accuracy for mesh: 0.9979859013091642
accuracy for ruffles: 0.9909365558912386
accuracy for sequins: 1.0
accuracy for studs: 0.998992950654582
accuracy for trim: 0.998992950654582


### 3.model_occasion

In [79]:
# import one hot encoded file 
occ_file=pd.read_csv('occ_code.csv')

In [80]:
occ_file=occ_file.fillna(0)

In [81]:
# get all occasion tags
occ_cat=['coldweather','daytonight','nightout','vacation','weekend','work','workout']

In [82]:
occ_file['doc_vec']=data['doc_vec'].map(lambda x: x[0])

In [83]:
# run logistic regression model for each tag in occasion
occ_acc=[]
for occ in occ_cat:
    new_occ=occ_file.sort_values(by=['product_id',occ], ascending=False).drop_duplicates(subset='product_id',keep='first')
    y=new_occ[occ]
    model=LogisticRegression()
    new_occ["TARGET"] = y
    train_df, test_df = train_test_split(new_occ)
    X_train=train_df['doc_vec'].tolist()
    X_test=test_df['doc_vec'].tolist()
    y_test = test_df["TARGET"]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'accuracy for {occ}: {np.mean(y_pred == y_test)}')
    occ_acc.append(np.mean(y_pred == y_test))

accuracy for coldweather: 0.9355488418932527
accuracy for daytonight: 0.297079556898288
accuracy for nightout: 0.7905337361530715
accuracy for vacation: 0.837865055387714
accuracy for weekend: 0.2588116817724068
accuracy for work: 0.6525679758308157
accuracy for workout: 0.9607250755287009


### 4. model_category

In [84]:
# import one hot encoded file 
cat_file=pd.read_csv('cat_code.csv')

In [85]:
cat_file=cat_file.fillna(0)

In [86]:
# get all category tags
cat_cat=['accessory','blazerscoatsjackets','bottom','onepiece','shoe','sweater','sweatshirthoodie','top']

In [87]:
cat_file['doc_vec']=data['doc_vec'].map(lambda x: x[0])

In [88]:
# run logistic regression model for each tag in category
cat_acc=[]
for cat in cat_cat:
    new_cat=cat_file.sort_values(by=['product_id',cat], ascending=False).drop_duplicates(subset='product_id',keep='first')
    y=new_cat[cat]
    model=LogisticRegression()
    new_cat["TARGET"] = y
    train_df, test_df = train_test_split(new_cat)
    X_train=train_df['doc_vec'].tolist()
    X_test=test_df['doc_vec'].tolist()
    y_train = train_df["TARGET"]
    y_test = test_df["TARGET"]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'accuracy for {cat}: {np.mean(y_pred == y_test)}')
    cat_acc.append(np.mean(y_pred == y_test))

accuracy for accessory: 0.9144008056394763
accuracy for blazerscoatsjackets: 0.9295065458207452
accuracy for bottom: 0.7764350453172205
accuracy for onepiece: 0.9093655589123867
accuracy for shoe: 0.8157099697885196
accuracy for sweater: 0.918429003021148
accuracy for sweatshirthoodie: 0.9617321248741189
accuracy for top: 0.7492447129909365


### 5. model_dryclean

In [89]:
# import one hot encoded file 
dry_file=pd.read_csv('dry_code.csv')

In [90]:
dry_file=dry_file.fillna(0)

In [91]:
# get all dryclean tags
dry_cat=['yes','no']

In [92]:
dry_file['doc_vec']=data['doc_vec'].map(lambda x: x[0])

In [93]:
# run logistic regression model for each tag in dryclean
dry_acc=[]
for dry in dry_cat:
    new_dry=dry_file.sort_values(by=['product_id',dry], ascending=False).drop_duplicates(subset='product_id',keep='first')
    y=new_dry[dry]
    model=LogisticRegression()
    new_dry["TARGET"] = y
    train_df, test_df = train_test_split(new_dry)
    X_train=train_df['doc_vec'].tolist()
    X_test=test_df['doc_vec'].tolist()
    y_train = train_df["TARGET"]
    y_test = test_df["TARGET"]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'accuracy for {dry}: {np.mean(y_pred == y_test)}')
    dry_acc.append(np.mean(y_pred == y_test))

accuracy for yes: 0.6696878147029205
accuracy for no: 0.6696878147029205


### Get average accuracy for all tags

In [94]:
list=style_acc+embel_acc+occ_acc+cat_acc+dry_acc

In [95]:
sum(list) / len(list)

0.8314689311668164