In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from scipy.optimize import minimize

In [3]:
# Label matrix: the item_id column plus the target columns used for training.
label = pd.read_csv("data/label.csv")
n_label_columns = label.shape[1]
print(n_label_columns)

28


In [4]:
# Both splits are stored as tab-separated files.
read_tsv_kwargs = dict(sep="\t")
train = pd.read_csv("data/train.tsv", **read_tsv_kwargs)
test = pd.read_csv("data/test.tsv", **read_tsv_kwargs)

In [5]:
# The "tag" column is dropped from the training features before merging with test.
train = train.drop("tag", axis=1)

In [6]:
# Stack train on top of test with a fresh 0..n-1 index so both splits are
# preprocessed together; the first train.shape[0] rows are the training rows.
full = pd.concat([train, test], axis=0, ignore_index=True)

## Prepare Seller

In [7]:
# Keep only sellers that occur more than 100 times across train + test.
seller_cnt = full["Seller"].value_counts()
frequent_seller_mask = seller_cnt > 100
selected_seller = seller_cnt.loc[frequent_seller_mask].index.tolist()

In [8]:
# Every seller outside the frequent list collapses into one "" bucket,
# then the column is replaced by integer codes.
rare_seller_rows = ~full["Seller"].isin(selected_seller)
full.loc[rare_seller_rows, "Seller"] = ""
seller_codes = LabelEncoder().fit_transform(full["Seller"])
full.loc[:, "Seller"] = seller_codes

## Prepare Actors

In [9]:
# Binary indicator: 1 where an Actors value is present, 0 where it is missing.
full.loc[:, "Actors"] = full["Actors"].notnull() * 1

In [10]:
# Number of rows whose Actors indicator is set.
print("# of Not Null:", len(full.loc[full["Actors"] == 1]))

# of Not Null: 501


## Prepare ISBN

In [11]:
# Binary indicator: 1 where an ISBN value is present, 0 where it is missing.
full.loc[:, "ISBN"] = full["ISBN"].notnull() * 1

In [12]:
# Number of rows whose ISBN indicator is set.
print("# of Not Null:", len(full.loc[full["ISBN"] == 1]))

# of Not Null: 534


## Prepare Genre

In [13]:
# Binary indicator: 1 where Genre ID equals 5065, 0 for every other value.
full.loc[:, "Genre ID"] = full["Genre ID"].eq(5065) * 1

In [14]:
# Number of rows flagged by the Genre ID indicator (i.e. Genre ID was 5065).
print("# of Not Null:", len(full.loc[full["Genre ID"] == 1]))

# of Not Null: 149


## Prepare Item Class ID

In [15]:
# Keep only item classes that occur more than 100 times across train + test.
item_class_cnt = full["Item Class ID"].value_counts()
frequent_item_class_mask = item_class_cnt > 100
selected_item_class = item_class_cnt.loc[frequent_item_class_mask].index.tolist()

In [16]:
# Every item class outside the frequent list collapses into one "" bucket,
# then the column is replaced by integer codes.
rare_item_class_rows = ~full["Item Class ID"].isin(selected_item_class)
full.loc[rare_item_class_rows, "Item Class ID"] = ""
item_class_codes = LabelEncoder().fit_transform(full["Item Class ID"])
full.loc[:, "Item Class ID"] = item_class_codes

## Prepare Recommended Location

In [17]:
# Binary indicator: 1 where a Recommended Location is present, 0 where missing.
full.loc[:, "Recommended Location"] = full["Recommended Location"].notnull() * 1

In [18]:
# Number of rows whose Recommended Location indicator is set.
print("# of Not Null:", len(full.loc[full["Recommended Location"] == 1]))

# of Not Null: 224


## Prepare MPAA Rating

In [19]:
# Keep only MPAA ratings that occur more than 100 times across train + test.
mpaa_cnt = full["MPAA Rating"].value_counts()
frequent_mpaa_mask = mpaa_cnt > 100
selected_mpaa = mpaa_cnt.loc[frequent_mpaa_mask].index.tolist()

In [20]:
# Every rating outside the frequent list collapses into one "" bucket,
# then the column is replaced by integer codes.
rare_mpaa_rows = ~full["MPAA Rating"].isin(selected_mpaa)
full.loc[rare_mpaa_rows, "MPAA Rating"] = ""
mpaa_codes = LabelEncoder().fit_transform(full["MPAA Rating"])
full.loc[:, "MPAA Rating"] = mpaa_codes

## Prepare Recommended Use

In [21]:
# "Recommended Use" values that are mapped to the TV indicator. The strings are
# matched verbatim (NOTE(review): "Televison" looks like a spelling that occurs
# in the raw data -- confirm before "correcting" it).
selected_uses = [
    "Television",
    "LCD display",
    "Televison",
    "2 LCD / plasma panels",
    "TV",
    "Flat Panel Display",
    "LCD / plasma panel",
    "LCD TV",
    "Office, Home, Televisions",
    "Plasma / LCD / TV",
]

In [22]:
# Binary indicator: 1 where "Recommended Use" is one of selected_uses, else 0.
full["TV"] = full["Recommended Use"].isin(selected_uses) * 1

In [23]:
# Number of rows flagged by the TV indicator.
print("# of Not Null:", len(full.loc[full["TV"] == 1]))

# of Not Null: 460


In [24]:
# Engineered structured features that feed the model (text features are added later).
selected_columns = [
    'Seller',
    'Actors',
    'Genre ID',
    'ISBN',
    'Item Class ID',
    'MPAA Rating',
    'Recommended Location',
    'TV',
]

In [25]:
# Split the combined frame back into its train / test parts by row label.
# `.loc` slicing is label-based and inclusive, hence the "- 1" on the train end.
# NOTE: both names are reassigned further down, where the one-hot encoded
# features are concatenated with the TF-IDF features.
n_train_rows = train.shape[0]
train_dataset = full.loc[:(n_train_rows - 1), selected_columns]
test_dataset = full.loc[n_train_rows:, selected_columns]

## One-Hot Encode

In [26]:
# Only the integer-coded categorical columns are expanded into dummy columns;
# the binary indicator columns pass through unchanged.
one_hot_columns = ["Seller", "MPAA Rating", "Item Class ID"]
full_encoded = pd.get_dummies(full[selected_columns], columns=one_hot_columns)

In [27]:
# Target label names: every column of `label` except the id column.
label_names = list(label.columns)
label_names.remove("item_id")

### Utility Function

In [28]:
def get_f1_score(threshold, y_true, y_prob):
    """Return the F1 score obtained by binarising `y_prob` at `threshold`.

    A sample is predicted positive (1) when its probability is greater than or
    equal to `threshold`, negative (0) otherwise; the result is scored against
    `y_true` with sklearn's `f1_score`.
    """
    predicted = (y_prob >= threshold).astype(int)
    return f1_score(y_true, predicted)

## Simple Text Mining

In [29]:
import re

# Non-greedy match of anything between angle brackets (HTML-like tags).
tag_pattern = re.compile(r'<.+?>')


def _clean_description(value):
    """Return `value` with tags stripped, or "" for missing / placeholder text."""
    if not isinstance(value, str) or value == "short description is not available":
        return ""
    return tag_pattern.sub('', value)


# Cleaned copies of the three free-text description columns.
for cleaned_col, raw_col in (("SD", "Short Description"),
                             ("PSD", "Product Short Description"),
                             ("PLD", "Product Long Description")):
    full[cleaned_col] = full[raw_col].apply(_clean_description)

# Blank out descriptions that merely repeat the product name, and any nulls.
for cleaned_col in ("PSD", "SD", "PLD"):
    full.loc[full["Product Name"] == full[cleaned_col], cleaned_col] = ""
    full.loc[full[cleaned_col].isnull(), cleaned_col] = ""
full.loc[full["Synopsis"].isnull(), "Synopsis"] = ""

# A missing product name must not wipe out the rest of the text: string
# concatenation with NaN yields NaN, which would discard the row's description
# and synopsis. The name is therefore treated as "" for the concatenation only;
# the "Product Name" column itself is left untouched.
# NOTE(review): "PSD" is cleaned above but not part of the combined text --
# confirm whether that omission is intentional.
full["Full Text"] = (full["Product Name"].fillna("") + " " + full["SD"] + " "
                     + full["PLD"] + " " + full["Synopsis"])

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [31]:
# Name of the text column in `full` that the TF-IDF vectorizer is fitted on.
TARGET_COLUMN = "Full Text"

In [32]:
# The vectorizer needs strings everywhere: replace missing text with "".
full[TARGET_COLUMN] = full[TARGET_COLUMN].fillna("")

In [33]:
# Word-level TF-IDF over 1- to 3-grams, English stop words removed, limited to
# 5000 features, rows L2-normalised. Fitted on train + test text together.
tfidf_params = dict(
    max_features=5000,
    ngram_range=(1,3),
    analyzer="word",
    stop_words="english",
    norm="l2",
)
vec = TfidfVectorizer(**tfidf_params).fit(full[TARGET_COLUMN])

In [34]:
# Size of the learned vocabulary (number of TF-IDF feature columns).
len(vec.vocabulary_)

5000

### Now mix with the other features

In [35]:
# `vec.vocabulary_` maps term -> column index of the transformed matrix.
# Iterating the dict directly yields terms in insertion order, which does not
# match the matrix column order and would attach the wrong name to every
# feature column. Sorting the terms by their index gives the correct header.
feature_terms = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
# Dense frame of TF-IDF weights, one row per row of `full`.
product_name_features = pd.DataFrame(
    vec.transform(full.loc[:, TARGET_COLUMN]).toarray(),
    columns=feature_terms)

### Keras Bag of Words

In [36]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalAveragePooling1D
from sklearn.preprocessing import MinMaxScaler
from keras.layers.advanced_activations import PReLU
from keras.optimizers import SGD

Using Theano backend.
Using gpu device 0: GeForce GTX 960 (CNMeM is enabled with initial size: 50.0% of memory, cuDNN 5005)


In [37]:
# Final model inputs: one-hot structured features side by side with the TF-IDF
# text features. The first `n_train` rows of both frames are the training rows.
n_train = train.shape[0]
train_dataset = pd.concat(
    [full_encoded.iloc[:n_train], product_name_features.iloc[:n_train]],
    axis=1)
test_dataset = pd.concat(
    [full_encoded.iloc[n_train:], product_name_features.iloc[n_train:]],
    axis=1)
# scaler = MinMaxScaler()
# train_dataset = pd.DataFrame(scaler.fit_transform(train_dataset),
#                              columns = train_dataset.columns)
# test_dataset = pd.DataFrame(scaler.transform(test_dataset),
#                              columns = test_dataset.columns)
train_dataset.shape

(10593, 5038)

In [38]:
def get_bow_model(input_shape, output_dim):
    """Build and compile the feed-forward multi-label classifier.

    Architecture: two hidden blocks of 768 and 512 units, each
    Dense -> PReLU -> BatchNormalization -> Dropout(0.5), followed by a Dense
    layer with `output_dim` units and a sigmoid activation.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first Dense layer.
    output_dim : int
        Number of output units.

    Returns
    -------
    The compiled Sequential model (binary cross-entropy loss, Adadelta
    optimizer, 'fbeta_score' metric).
    """
    network = Sequential()
    for position, units in enumerate((768, 512)):
        # Only the very first layer declares the input shape.
        dense_kwargs = {"input_shape": input_shape} if position == 0 else {}
        network.add(Dense(units, **dense_kwargs))
        network.add(PReLU())
        network.add(BatchNormalization())
        network.add(Dropout(0.5))
    network.add(Dense(output_dim))
    network.add(Activation('sigmoid'))
    # An SGD optimizer (lr=0.02, decay=1e-6, momentum=0.9, nesterov, clipnorm=1.)
    # was the alternative previously left commented out here.
    network.compile(
        loss='binary_crossentropy',
        optimizer="Adadelta",
        metrics=['fbeta_score'])
    return network

#### Full prediction in one Model
Do CV first:

In [39]:
from collections import defaultdict

np.random.seed(888)
# label name -> list with one entry per CV fold (best threshold / best F1).
thresholds, scores = defaultdict(list), defaultdict(list)
# Target matrix without the id column; built once because it is identical for
# every fold.
cv_targets = label.drop(["item_id"], axis=1)
skf = StratifiedKFold(n_splits=5, random_state=888, shuffle=True)
# Folds are stratified on the "4537" label column only.
for train_index, val_index in skf.split(train_dataset.values, label["4537"]):
    model = get_bow_model((train_dataset.shape[1],), cv_targets.shape[1])
    model.fit(train_dataset.iloc[train_index].values,
              cv_targets.iloc[train_index].values,
              validation_data=(
                  train_dataset.iloc[val_index].values,
                  cv_targets.iloc[val_index].values
              ),
              batch_size=32, nb_epoch=20, verbose=2)
    pred = model.predict(train_dataset.iloc[val_index].values)
    # Output column j of the network corresponds to column j of cv_targets
    # (the matrix it was trained on). Pairing through cv_targets keeps labels
    # and predictions aligned wherever "item_id" sits inside `label`.
    for col_idx, target_label in enumerate(cv_targets.columns):
        y_true = cv_targets.iloc[val_index, col_idx]
        y_prob = pred[:, col_idx]
        # Baseline: with threshold 0 every sigmoid output counts as positive.
        best_score, best_threshold = get_f1_score(0, y_true, y_prob), 0
        # Scan thresholds in 0.01 steps over the observed probability range.
        for threshold in np.arange(y_prob.min(), y_prob.max(), 0.01):
            tmp_score = get_f1_score(threshold, y_true, y_prob)
            if tmp_score > best_score:
                best_score, best_threshold = tmp_score, threshold
        thresholds[target_label].append(best_threshold)
        scores[target_label].append(best_score)
# Per-label mean / std of the best F1 across folds, then the overall figures.
for target_label in label_names:
    print(target_label, np.mean(scores[target_label]), np.std(scores[target_label]))
fold_scores = list(scores.values())
print("Overall score:", np.mean(fold_scores), np.std(fold_scores))

Train on 8474 samples, validate on 2119 samples
Epoch 1/20
2s - loss: 0.6734 - fbeta_score: 0.1710 - val_loss: 0.2050 - val_fbeta_score: nan
Epoch 2/20
2s - loss: 0.1669 - fbeta_score: 0.4929 - val_loss: 0.1172 - val_fbeta_score: nan
Epoch 3/20
2s - loss: 0.0919 - fbeta_score: 0.6314 - val_loss: 0.0676 - val_fbeta_score: 0.6536
Epoch 4/20
2s - loss: 0.0729 - fbeta_score: 0.6903 - val_loss: 0.0576 - val_fbeta_score: 0.7524
Epoch 5/20
2s - loss: 0.0642 - fbeta_score: 0.7365 - val_loss: 0.0542 - val_fbeta_score: 0.7721
Epoch 6/20
2s - loss: 0.0568 - fbeta_score: 0.7620 - val_loss: 0.0531 - val_fbeta_score: 0.7786
Epoch 7/20
2s - loss: 0.0516 - fbeta_score: 0.7841 - val_loss: 0.0509 - val_fbeta_score: 0.7869
Epoch 8/20
2s - loss: 0.0475 - fbeta_score: 0.8011 - val_loss: 0.0504 - val_fbeta_score: 0.7910
Epoch 9/20
2s - loss: 0.0442 - fbeta_score: 0.8112 - val_loss: 0.0506 - val_fbeta_score: 0.7925
Epoch 10/20
2s - loss: 0.0410 - fbeta_score: 0.8252 - val_loss: 0.0503 - val_fbeta_score: 0.79

Overall score: 0.749847965618 0.16437249334

Now fit on the full dataset:

In [40]:
# Train N_BAGGING models on the full training set with different numpy seeds
# and average their test-set probabilities.
N_BAGGING = 6
bag_targets = label.drop(["item_id"], axis=1)
pred = None
for seed in range(N_BAGGING):
    np.random.seed(seed+888)
    model = get_bow_model((train_dataset.shape[1],), label.shape[1]-1)
    model.fit(train_dataset.values,
              bag_targets.values,
              batch_size=32, nb_epoch=20, verbose=2)
    member_pred = model.predict(test_dataset.values)
    if pred is None:
        pred = member_pred
    else:
        pred = pred + member_pred
pred /= N_BAGGING
# One probability column per label, next to the test item ids.
probability_frame = pd.DataFrame(pred, columns=bag_targets.columns)
prediction = pd.concat([test[["item_id"]], probability_frame], axis=1)

Epoch 1/20
2s - loss: 0.5822 - fbeta_score: nan
Epoch 2/20
2s - loss: 0.1242 - fbeta_score: nan
Epoch 3/20
2s - loss: 0.0814 - fbeta_score: nan
Epoch 4/20
2s - loss: 0.0677 - fbeta_score: nan
Epoch 5/20
2s - loss: 0.0589 - fbeta_score: nan
Epoch 6/20
2s - loss: 0.0533 - fbeta_score: nan
Epoch 7/20
2s - loss: 0.0487 - fbeta_score: nan
Epoch 8/20
2s - loss: 0.0452 - fbeta_score: nan
Epoch 9/20
2s - loss: 0.0426 - fbeta_score: nan
Epoch 10/20
2s - loss: 0.0393 - fbeta_score: nan
Epoch 11/20
2s - loss: 0.0368 - fbeta_score: nan
Epoch 12/20
2s - loss: 0.0349 - fbeta_score: nan
Epoch 13/20
2s - loss: 0.0326 - fbeta_score: nan
Epoch 14/20
2s - loss: 0.0300 - fbeta_score: nan
Epoch 15/20
2s - loss: 0.0289 - fbeta_score: nan
Epoch 16/20
2s - loss: 0.0273 - fbeta_score: nan
Epoch 17/20
2s - loss: 0.0255 - fbeta_score: nan
Epoch 18/20
2s - loss: 0.0240 - fbeta_score: nan
Epoch 19/20
2s - loss: 0.0228 - fbeta_score: nan
Epoch 20/20
2s - loss: 0.0219 - fbeta_score: nan
Epoch 1/20
2s - loss: 0.5855 

In [41]:
def generate_prediction(row):
    """Format the labels predicted for one row as a bracketed, comma-separated string.

    A label is included when the row's probability for it is at least the
    median of the thresholds collected for that label in `thresholds`.
    A row with no label above its threshold yields "[]".
    """
    chosen = [name for name in label_names
              if row[name] >= np.median(thresholds[name])]
    return "[%s]" % ", ".join(chosen)

In [42]:
# Turn each row of probabilities into its formatted tag string.
tag_strings = prediction.apply(generate_prediction, axis=1)
prediction = prediction.assign(tag=tag_strings)
prediction["tag"]

0                                [581514]
1                                  [4537]
2                                  [4483]
3                                  [4483]
4                                  [4537]
5                                  [4537]
6                                  [4483]
7                                  [4537]
8                                [581514]
9                                  [4537]
10                               [581514]
11                [106546, 95987, 522484]
12                         [522484, 4457]
13                               [106546]
14                                 [4483]
15                               [529295]
16                                 [4483]
17                      [1229821, 447913]
18                               [581514]
19                                 [4483]
20       [106546, 95987, 522484, 3304195]
21                      [1229821, 447913]
22                               [581514]
23                                

In [43]:
# Write the id / tag pairs as a tab-separated file without the index column.
submission = prediction[["item_id", "tag"]]
submission.to_csv("tags.tsv", sep="\t", index=False)