## Import Packages

In [3]:
import os
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from glove import Corpus, Glove
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

%matplotlib inline

  from numpy.core.umath_tests import inner1d


## Multiclass Prediction: Random Forest (text)

In [4]:
import re
# Ordered (phrase, replacement) rules used by concat_words.
# Order matters: the Indonesian -> English rules run first so that the
# "join with underscore" rules further down also catch translated phrases,
# and longer phrases precede their substrings ("kapas poli" before "kapas").
# Terms spelled the same in both languages (paisley, threadwork, street style)
# need no translation rule.
_PHRASE_REPLACEMENTS = (
    # --- Indonesian -> English: pattern ---
    ("kotak-kotak", "plaid"),
    ("tambal sulam", "patchwork"),
    ("polos", "plain"),
    ("grafik", "graphic"),
    ("cetak", "print"),
    ("kamuflase", "camouflage"),
    ("titik gelombang", "wave point"),
    ("simpul", "knot"),
    ("kartun", "cartoon"),
    ("surat", "letter"),
    ("periksa", "check"),
    ("sulaman", "embroidery"),
    # --- Indonesian -> English: collar type ---
    ("berkerudung", "hooded"),
    ("leher tinggi", "high neck"),
    ("selendang kerah", "shawl collar"),
    ("leher perahu", "boat neck"),
    ("tombol bawah", "button down"),
    ("leher kuadrat", "square neck"),
    ("vagina busur", "pussy bow"),
    ("kerah kemeja", "shirt collar"),
    ("berlekuk", "notched"),
    # --- Indonesian -> English: fashion trend ---
    ("kantor", "office"),
    ("tropis", "tropical"),
    ("warisan preppy", "preppy heritage"),
    ("pesta", "party"),
    # --- Indonesian -> English: clothing material ---
    ("nilon", "nylon"),
    ("beludru", "velvet"),
    ("renda", "lace"),
    ("poliester", "polyester"),
    ("sutera", "silk"),
    ("kapas poli", "poly cotton"),
    ("wol", "wool"),
    ("kapas", "cotton"),
    # --- Indonesian -> English: sleeves ---
    ("tanpa lengan", "sleeveless"),
    ("lengan 3 4", "sleeve 3 4"),
    ("lengan pendek", "short sleeve"),
    ("lengan panjang", "long sleeve"),
    # --- Join multi-word terms into single tokens: general ---
    ("tank top", "tank_top"),
    ("t shirt", "t_shirt"),
    ("t-shirt", "t_shirt"),
    ("tshirt", "t_shirt"),
    # --- Join: pattern ---
    ("polka dot", "polka_dot"),
    ("wave point", "wave_point"),
    # --- Join: collar type ---
    ("high neck", "high_neck"),
    ("shawl collar", "shawl_collar"),
    ("o neck", "o_neck"),
    ("scoop neck", "scoop_neck"),
    ("boat neck", "boat_neck"),
    ("off the shoulder", "off_the_shoulder"),
    ("v neck", "v_neck"),
    ("button down", "button_down"),
    ("square neck", "square_neck"),
    ("pussy bow", "pussy_bow"),
    ("shirt collar", "shirt_collar"),
    ("peter pan", "peter_pan"),
    # --- Join: fashion trend ---
    ("street style", "street_style"),
    ("retro vintage", "retro_vintage"),
    ("preppy heritage", "preppy_heritage"),
    # --- Join: clothing material ---
    ("poly cotton", "poly_cotton"),
    # --- Join: sleeves ---
    ("sleeve 3 4", "sleeve_three_quarters"),
    ("short sleeve", "short_sleeve"),
    ("long sleeve", "long_sleeve"),
)


def concat_words(string):
    """Normalise a product title so multi-word fashion terms become single tokens.

    Applies every rule in ``_PHRASE_REPLACEMENTS`` in order: Indonesian terms
    are translated to English, then multi-word English terms are joined with
    underscores (e.g. ``"long sleeve"`` -> ``"long_sleeve"``).

    Matching is plain, case-sensitive substring replacement with no word
    boundaries, so a phrase is also replaced when it occurs inside a longer word.

    Parameters
    ----------
    string : str
        Title text to normalise.

    Returns
    -------
    str
        The normalised text. Text containing none of the phrases (including
        the empty string) is returned unchanged.
    """
    # Every rule is a literal phrase, so str.replace is sufficient. An empty
    # phrase must never appear in the table: it would match between every
    # pair of characters and splice the replacement in everywhere.
    for phrase, replacement in _PHRASE_REPLACEMENTS:
        string = string.replace(phrase, replacement)
    return string

In [None]:
def multiclass(filename):
    """Train, evaluate and pickle one random-forest text classifier per label column.

    The CSV at ``filename`` must contain ``title`` and ``title_image`` text
    columns; every column from the fourth one onwards (except ``title_image``)
    is treated as a label column. For each label column an 80/20 split is
    made, a bag-of-words -> TF-IDF -> random-forest pipeline is fitted on the
    training part, and its accuracy on the held-out part is printed.

    Side effects
    ------------
    Writes ``<prefix>_<column>.sav`` (pickled pipeline) for every label column
    and ``<prefix>_accuracy.csv`` (accuracy table), where ``<prefix>`` is the
    input file's directory joined with the part of its file name before the
    first underscore (e.g. ``.../fashion``).

    Parameters
    ----------
    filename : str
        Path to the training CSV.

    Returns
    -------
    None
    """
    # Dataset name (fashion, beauty or mobile) taken from the file name only,
    # so an underscore in a directory name cannot truncate the prefix.
    directory, base_name = os.path.split(filename)
    dataset = os.path.splitext(base_name)[0].split("_", 1)[0]
    df_name = os.path.join(directory, dataset)

    df = pd.read_csv(filename)
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # is chained assignment and does not reliably update the frame.
    df['title_image'] = df['title_image'].fillna(" ")
    # A missing title would otherwise turn into -1 below and crash concat_words.
    df['title'] = df['title'].fillna("") + " " + df['title_image']
    df = df.fillna(-1)  # missing labels become their own class, -1

    X = [concat_words(string) for string in df['title']]
    # Drop 1-2 character words; runs after concat_words so "v_neck" etc. survive.
    X = [re.sub(r'\b\w{1,2}\b', '', string) for string in X]

    # Label columns start at the fourth column. title_image is a text feature,
    # not a label, so it is excluded if it falls inside that slice.
    df_filt = df.iloc[:, 3:].drop(columns=['title_image'], errors='ignore')
    accuracy = []

    for column in df_filt:
        # Train-test split
        y = df_filt[column]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)
        p = Pipeline([
            ('bow', CountVectorizer(token_pattern=r'(?u)\b[A-Za-z_]+\b',
                                    stop_words='english')),
            ('tfidf', TfidfTransformer()),
            # Seeded so repeated runs give the same forest and accuracy.
            ('classifier', RandomForestClassifier(class_weight="balanced",
                                                  n_estimators=200, verbose=2,
                                                  n_jobs=-1, random_state=42)),
        ])

        # Train prediction model for this attribute
        p.fit(X_train, y_train)
        pred = p.predict(X_test)

        # Accuracy = correctly classified (diagonal) / all test samples
        conf_mat = confusion_matrix(y_test, pred)
        a = np.trace(conf_mat) / np.sum(conf_mat)
        print(column)
        print("Accuracy: ", a)
        accuracy.append(a)

        # Save prediction model; the context manager closes the file handle.
        model_path = df_name + "_" + column + '.sav'
        with open(model_path, 'wb') as model_file:
            pickle.dump(p, model_file)

    # Export accuracy table
    accuracy_df = pd.DataFrame({'Attribute': df_filt.columns, 'Accuracy': accuracy})
    accuracy_df.to_csv(df_name + "_" + 'accuracy.csv')

    return

In [None]:
# Train and pickle one classifier per label column of the fashion training set.
# Long-running: the captured log below shows ~25 minutes for 154 of 200 trees of one forest.
# NOTE(review): absolute, user-specific path - this cell only runs on the original
# author's machine; consider a configurable data directory.
multiclass("/Users/benjaminlim/Dropbox (MIT)/Shopee Data Hack 2019/Data/Fashion/fashion_data_info_train_competition_wimage2.csv")

  if self.run_code(code, result):


building tree 1 of 200building tree 3 of 200building tree 2 of 200building tree 4 of 200



building tree 5 of 200
building tree 6 of 200
building tree 7 of 200
building tree 8 of 200
building tree 9 of 200
building tree 10 of 200
building tree 11 of 200
building tree 12 of 200
building tree 13 of 200
building tree 14 of 200
building tree 15 of 200
building tree 16 of 200
building tree 17 of 200
building tree 18 of 200
building tree 19 of 200
building tree 20 of 200
building tree 21 of 200
building tree 22 of 200
building tree 23 of 200
building tree 24 of 200
building tree 25 of 200
building tree 26 of 200
building tree 27 of 200
building tree 28 of 200
building tree 29 of 200
building tree 30 of 200
building tree 31 of 200
building tree 32 of 200
building tree 33 of 200
building tree 34 of 200
building tree 35 of 200
building tree 36 of 200


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.5min


building tree 37 of 200
building tree 38 of 200
building tree 39 of 200
building tree 40 of 200
building tree 41 of 200
building tree 42 of 200
building tree 43 of 200
building tree 44 of 200
building tree 45 of 200
building tree 46 of 200
building tree 47 of 200
building tree 48 of 200
building tree 49 of 200
building tree 50 of 200
building tree 51 of 200
building tree 52 of 200
building tree 53 of 200
building tree 54 of 200
building tree 55 of 200
building tree 56 of 200
building tree 57 of 200
building tree 58 of 200
building tree 59 of 200
building tree 60 of 200
building tree 61 of 200
building tree 62 of 200
building tree 63 of 200
building tree 64 of 200
building tree 65 of 200
building tree 66 of 200
building tree 67 of 200
building tree 68 of 200
building tree 69 of 200
building tree 70 of 200
building tree 71 of 200
building tree 72 of 200
building tree 73 of 200
building tree 74 of 200
building tree 75 of 200
building tree 76 of 200
building tree 77 of 200
building tree 78

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 25.3min


building tree 159 of 200
building tree 160 of 200
building tree 161 of 200
building tree 162 of 200


In [None]:
def multiclass_subset(filename, column="Sleeves"):
    """Fit a random-forest text classifier for a single label column on ALL rows and pickle it.

    Unlike a train/test evaluation, no data is held out here: the pipeline
    (bag-of-words limited to 1000 features -> TF-IDF -> random forest) is
    fitted on the complete file.

    Side effects
    ------------
    Writes ``<prefix>_<column>.sav``, where ``<prefix>`` is the input file's
    directory joined with the part of its file name before the first
    underscore. Prints the column name.

    Parameters
    ----------
    filename : str
        Path to the training CSV; must contain ``title``, ``title_image`` and
        the requested label column (located from the fourth column onwards).
    column : str, default "Sleeves"
        Name of the label column to model.

    Returns
    -------
    None
    """
    # Dataset name (fashion, beauty or mobile) taken from the file name only,
    # so an underscore in a directory name cannot truncate the prefix.
    directory, base_name = os.path.split(filename)
    dataset = os.path.splitext(base_name)[0].split("_", 1)[0]
    df_name = os.path.join(directory, dataset)

    df = pd.read_csv(filename)
    # Assign back instead of fillna(inplace=True) on a column selection.
    df['title_image'] = df['title_image'].fillna(" ")
    # A missing title would otherwise turn into -1 below and crash concat_words.
    df['title'] = df['title'].fillna("") + " " + df['title_image']
    df = df.fillna(-1)  # missing labels become their own class, -1

    X = [concat_words(string) for string in df['title']]
    # Drop 1-2 character words; runs after concat_words so "v_neck" etc. survive.
    X = [re.sub(r'\b\w{1,2}\b', '', string) for string in X]

    # Label columns start at the fourth column.
    y = df.iloc[:, 3:][column]

    p = Pipeline([
        ('bow', CountVectorizer(token_pattern=r'(?u)\b[A-Za-z_]+\b',
                                stop_words='english',
                                max_features=1000)),
        ('tfidf', TfidfTransformer()),
        ('classifier', RandomForestClassifier(n_estimators=200, verbose=2, n_jobs=-1)),
    ])

    # Fit on the full data set (no hold-out).
    p.fit(X, y)

    print(column)

    # Save prediction model; the context manager closes the file handle.
    model_path = df_name + "_" + column + '.sav'
    with open(model_path, 'wb') as model_file:
        pickle.dump(p, model_file)

    return

In [None]:
# Fit and pickle the "Sleeves" classifier on the full fashion training set.
# NOTE(review): absolute, user-specific path - this cell only runs on the original
# author's machine; consider a configurable data directory.
multiclass_subset("/Users/benjaminlim/Dropbox (MIT)/Shopee Data Hack 2019/Data/Fashion/fashion_data_info_train_competition_wimage2.csv")

In [None]:
from fancyimpute import KNN, SoftImpute, IterativeImputer
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def impute_remaining_labels(path, method = "mode"):
    """Fill in missing label values (encoded as -1) of a training CSV.

    Reads the CSV at ``path``, keeps the columns from the fourth one onwards,
    drops ``title_image`` and imputes every -1 entry with the chosen method.

    Parameters
    ----------
    path : str
        Path to the CSV. It must contain a ``title_image`` column among the
        columns from the fourth one onwards.
    method : {"mode", "IterativeImputer", "KNN", "SoftImpute"}, default "mode"
        * ``"mode"``: replace -1 with the most frequent observed value of
          the column.
        * ``"IterativeImputer"``: factorize each column and impute the codes
          with fancyimpute's IterativeImputer. The result holds factorize
          codes, not the original label ids.
        * ``"KNN"``: fancyimpute KNN with k=10. The original author noted that
          this causes the kernel to die.
        * ``"SoftImpute"``: min-max scale, impute with fancyimpute SoftImpute,
          then undo the scaling.

    Returns
    -------
    pandas.DataFrame
        The label columns with missing entries imputed.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    train_df = pd.read_csv(path).iloc[:, 3:].drop('title_image', axis = 1)
    train_df_colnames = list(train_df)

    if method == "mode":
        for name in train_df_colnames:
            column = train_df[name]
            # Count observed values only, so the result does not depend on
            # whether -1 happens to be the most frequent entry.
            counts = column[column != -1].value_counts()
            if counts.empty:
                continue  # nothing observed in this column: leave it untouched
            train_df[name] = column.replace(-1, counts.index[0])

    elif method == "IterativeImputer":
        codes = train_df.replace(-1, np.nan).apply(lambda col: pd.factorize(col)[0])
        # pd.factorize encodes NaN as the sentinel -1; turn it back into NaN,
        # otherwise the imputer sees a complete matrix and imputes nothing.
        codes = codes.where(codes != -1)
        train_df = pd.DataFrame(IterativeImputer().fit_transform(codes.values),
                                columns = train_df_colnames)

    elif method == "KNN": # Causes kernel to die
        matrix = train_df.replace(-1, np.nan).values
        train_df = pd.DataFrame(KNN(k=10).fit_transform(matrix),
                                columns = train_df_colnames)

    elif method == "SoftImpute":
        missing = ((train_df == -1) | train_df.isna()).values
        observed = train_df.mask(missing)
        # Temporarily fill the gaps with the observed column mean so the
        # scaler can be fitted; the mean lies inside the observed range, so
        # it does not move the min/max.
        # NOTE(review): a column with no observed value at all has a NaN mean
        # and will make the scaler fail - confirm this cannot happen.
        filled = observed.fillna(observed.mean())
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(filled) # scale matrix for imputation
        scaled[missing] = np.nan # re-open the gaps so SoftImpute fills them
        completed = SoftImpute().fit_transform(scaled)
        train_df = pd.DataFrame(scaler.inverse_transform(completed), # unscale imputed matrix
                                columns = train_df_colnames)

    else:
        raise ValueError(
            "Unknown imputation method %r; expected 'mode', 'IterativeImputer', "
            "'KNN' or 'SoftImpute'" % (method,))

    return train_df

## Multiclass Prediction: CNN (text)

In [None]:
from keras.preprocessing.text import Tokenizer

# Load the fashion training table; every missing cell becomes -1.
filename = "fashion_data_info_train_competition.csv"
df_name = filename.split("_", 1)[0]  # dataset name: text before the first underscore
df = pd.read_csv(filename).fillna(-1)

# Titles are the model input; the target is the "Fashion Trend" column taken
# from the label slice (fourth column onwards).
X = df['title']
df_filt = df.iloc[:, 3:]
y = df_filt["Fashion Trend"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.3,
    random_state = 42,
)

# Build the vocabulary from the training titles only, then encode both splits
# as sequences of token ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_tokenized = tokenizer.texts_to_sequences(X_train)
X_test_tokenized = tokenizer.texts_to_sequences(X_test)
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Peek at the first training title and at its token-id encoding.
print(X_train.iloc[:1])
print(X_train_tokenized[:1])

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Bring both splits to the length of the longest training sequence,
# padding at the end ('post').
maxlen = max(len(sequence) for sequence in X_train_tokenized)
X_train_tokenized = pad_sequences(X_train_tokenized, maxlen = maxlen, padding = 'post')
X_test_tokenized = pad_sequences(X_test_tokenized, maxlen = maxlen, padding = 'post')

# First padded training row
print(X_train_tokenized[0])

In [None]:
import keras

# One-hot encode the targets with ONE shared width for train and test.
# Counting unique values per split can give the two matrices different widths
# (a class absent from one split) or a width that is too small when the label
# ids are not contiguous.
_train_ids = np.asarray(y_train).astype(int)
_test_ids = np.asarray(y_test).astype(int)
_has_missing = bool((_train_ids < 0).any() or (_test_ids < 0).any())
# Columns 0..max_id for the real classes, plus one trailing column for the
# missing-label marker (-1) when it occurs.
num_classes = int(max(_train_ids.max(), _test_ids.max())) + 1 + int(_has_missing)

# Map -1 explicitly to the trailing column instead of relying on negative
# index wrap-around inside to_categorical.
y_train_tokenized = keras.utils.to_categorical(
    np.where(_train_ids < 0, num_classes - 1, _train_ids), num_classes)
y_test_tokenized = keras.utils.to_categorical(
    np.where(_test_ids < 0, num_classes - 1, _test_ids), num_classes)

In [None]:
from keras.models import Sequential
from keras import layers

embedding_dim = 10
model = Sequential()
model.add(layers.Embedding(input_dim = vocab_size,
                          output_dim = embedding_dim,
                          input_length = maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam',
             loss = 'categorical_crossentropy',
             metrics = ['accuracy'])
model.summary()

In [None]:
X_train_tokenized

In [None]:
history = model.fit(X_train_tokenized, y_train.values,
                    epochs = 20,
                    verbose = True,
                    validation_data = (X_test_tokenized, y_test.values),
                    batch_size = 10)

In [None]:
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# NOTE(review): `glove` is not created in any visible cell - presumably a
# fitted Glove model from elsewhere in the notebook; confirm it exists on a
# fresh kernel. This cell also rebinds `tokenizer` from the earlier cell.
tokenizer = Tokenizer(num_words = glove.word_vectors.shape[0])
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
MAX_SEQUENCE_LENGTH = max(len(sequence) for sequence in sequences)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# to_categorical sizes its output as max(label) + 1 and indexes columns by
# label, so a missing-label marker of -1 would wrap around onto the column of
# the highest class. Give missing labels their own trailing column instead.
label_ids = np.asarray(y_train).astype(int)
if (label_ids < 0).any():
    label_ids = np.where(label_ids < 0, label_ids.max() + 1, label_ids)
labels = to_categorical(label_ids)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


In [None]:
from keras.layers import Conv1D, Dense, Embedding, GlobalMaxPooling1D, Input, MaxPooling1D
from keras.models import Model

# Trainable embedding with the same width as the GloVe vectors.
# NOTE(review): the layer is randomly initialised - the GloVe weights
# themselves are not loaded into it; confirm that is intended.
embedding_layer = Embedding(len(word_index) + 1,
                            np.shape(glove.word_vectors)[1],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# 'same' padding keeps the stack buildable for short inputs. With unpadded
# ('valid') width-5 convolutions, pool-5 layers and a final pool of 35, the
# network only builds when MAX_SEQUENCE_LENGTH is at least 999 tokens.
x = Conv1D(128, 5, activation='relu', padding='same')(embedded_sequences)
x = MaxPooling1D(5, padding='same')(x)
x = Conv1D(128, 5, activation='relu', padding='same')(x)
x = MaxPooling1D(5, padding='same')(x)
x = Conv1D(128, 5, activation='relu', padding='same')(x)
# True global max pooling: reduces over whatever sequence length is left and
# returns a 2-D (batch, channels) tensor, so no Flatten is needed.
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
# One softmax unit per one-hot label column of `labels`.
preds = Dense(labels.shape[1], activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

In [None]:
# Display the shape of the GloVe word-vector matrix.
# NOTE(review): `glove` is not defined in any visible cell - confirm it is created
# earlier in the notebook, otherwise this fails on a fresh kernel.
glove.word_vectors.shape

In [None]:
# Bare expression: only displays the Embedding class object.
# NOTE(review): looks like leftover interactive debugging - consider removing this cell.
Embedding