In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Utility functions

The installed version of sklearn can't one-hot encode strings directly, so define a helper function that label-encodes them first.

In [2]:
def get_str_ohe_transform(col):
    """Takes a pandas column, and creates a one-hot-encoding function.

    The values of `col` are first mapped to integer labels by a LabelEncoder
    fitted on `col`; a OneHotEncoder is then fitted on those integer labels.
    The returned function applies both fitted steps, in that order, to any
    column passed to it.

    Params:
        col: The pandas column whose values define the categories.
    Returns:
        A function taking a pandas column and returning the output of the
        fitted OneHotEncoder's `transform` (with sklearn's defaults this is a
        scipy sparse matrix with one row per element of the column).
    Example:
        # Create the transform.
        my_ohe_transform = get_str_ohe_transform(df.my_col)
        # Apply the transform to the original column, and another one.
        my_col_ohe_transformed = my_ohe_transform(df.my_col)
        another_col_ohe_transformed = my_ohe_transform(df.another_col)
    """
    # NOTE(review): values that were not present in `col` are expected to make
    # the fitted LabelEncoder raise -- confirm against the installed sklearn.
    label_encoder_transform = LabelEncoder().fit(col.values.tolist()).transform

    # Label-encode a column and put it into the (n_rows, 1) shape needed by OneHotEncoder.
    def preprocess_for_one_hot_encoder(x):
        return label_encoder_transform(x.values.tolist()).reshape(-1, 1)

    one_hot_encoder_transform = OneHotEncoder().fit(preprocess_for_one_hot_encoder(col)).transform

    def _inner(c):
        # Label-encode exactly once; the result feeds straight into the one-hot step.
        return one_hot_encoder_transform(preprocess_for_one_hot_encoder(c))
    return _inner

In [3]:
def assign_matrix(df, m, colname_prefix):
    """Add a matrix as columns of a data frame.

    The input frame is not modified; a new frame with the extra columns is
    returned. Existing columns with the same names are overwritten.

    Params:
        df: The dataframe.
        m: The matrix, with one row per row of `df`. Either a scipy sparse
            matrix (anything exposing `toarray()`) or a dense 2-D array.
        colname_prefix: Column names will be this with suffix 0, 1, ..., number of columns - 1.
    Returns:
        A new dataframe: `df` plus one column per column of `m`.
    """
    # Densify once. Sparse matrices expose toarray(); anything else is treated as array-like.
    dense = m.toarray() if hasattr(m, 'toarray') else np.asarray(m)
    # Build every new column up front and add them in a single assign, so the
    # frame is copied once instead of once per matrix column.
    new_columns = {colname_prefix + str(i_col): dense[:, i_col]
                   for i_col in range(dense.shape[1])}
    return df.assign(**new_columns)

Define function to split category names.

In [4]:
def split_category_names(df):
    """Split `category_name` on '/' into up to three level columns.

    Adds `category_name_0`, `category_name_1` and `category_name_2`. At most
    two splits are made, so any further '/' characters stay inside
    `category_name_2`. Levels that do not exist for a row (including rows whose
    `category_name` is missing) are left as NaN.

    Params:
        df: A dataframe with a string column `category_name`.
    Returns:
        A new dataframe with the three extra columns; `df` is not modified.
    """
    # Split once and reuse the pieces. Element access via .str[i] (rather than
    # expand=True) keeps NaN for levels that are absent.
    parts = df.category_name.str.split('/', n=2)
    return df.assign(category_name_0=parts.str[0],
                     category_name_1=parts.str[1],
                     category_name_2=parts.str[2])

# Load train data

In [6]:
# Read the raw training table, then derive the modelling frame from it.
train_0 = pd.read_table("../../data/train.tsv", sep="\t")
train_1 = (
    train_0
    .assign(target=lambda frame: np.log1p(frame.price))  # target is log(1 + price)
    .drop('price', axis=1)  # To avoid accidentally training on it.
    .pipe(split_category_names)
)

Create one-hot encodings of category names. (They are matrices, not single columns.)

In [7]:
#category_name_0_ohe_transform = get_str_ohe_transform(train_1.category_name_0) # This will be used for test too.
#category_name_1_ohe_transform = get_str_ohe_transform(train_1.category_name_1) # This will be used for test too.
#category_name_2_ohe_transform = get_str_ohe_transform(train_1.category_name_2) # This will be used for test too.

In [8]:
#train = (train_1
#         .pipe(assign_matrix, category_name_0_ohe_transform(train_1.category_name_0), 'category_name_0_ohe_')
#         .pipe(assign_matrix, category_name_1_ohe_transform(train_1.category_name_1), 'category_name_1_ohe_')
#         .pipe(assign_matrix, category_name_2_ohe_transform(train_1.category_name_2), 'category_name_2_ohe_'))

Adding the one-hot columns to the data frame runs out of memory, so the two cells above are commented out.

In [10]:
# The one-hot columns are not added (that step is commented out above), so
# `train` is simply another name for the same frame as `train_1`.
train = train_1

In [11]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description,target,category_name_0,category_name_1,category_name_2
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet,2.397895,Men,Tops,T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...,3.970292,Electronics,Computers & Tablets,Components & Parts
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...,2.397895,Women,Tops & Blouses,Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...,3.583519,Home,Home Décor,Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity,3.806662,Women,Jewelry,Necklaces


Get the mean and variance of the target for each category.

In [63]:
# Per-category lookup table: index is category_name, columns are `targetmean`
# and `targetvar`. String aliases are used instead of np.mean / np.var:
# pandas runs its own groupby mean/var for those callables anyway, and passing
# the callables is deprecated in recent pandas versions.
target_by_category_name_df = (train
          .groupby('category_name')['target']
          .agg(['mean', 'var'])
          .add_prefix('target'))

In [64]:
# Attach the per-category statistics (targetmean, targetvar) to every training
# row by looking up its category_name in the index of the lookup table.
train_2 = train_1.join(target_by_category_name_df, on = 'category_name')

In [65]:
# Preview only the first rows instead of rendering the whole training frame.
train_2.head(10)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,shipping,item_description,target,category_name_0,category_name_1,category_name_2,targetmean,targetvar
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,1,No description yet,2.397895,Men,Tops,T-shirts,2.771819,0.321845
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...,3.970292,Electronics,Computers & Tablets,Components & Parts,3.336755,0.769778
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...,2.397895,Women,Tops & Blouses,Blouse,2.647801,0.277260
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,1,New with tags. Leather horses. Retail for [rm]...,3.583519,Home,Home Décor,Home Décor Accents,2.908575,0.415984
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,0,Complete with certificate of authenticity,3.806662,Women,Jewelry,Necklaces,2.729020,0.889770
5,5,Bundled items requested for Ruie,3,Women/Other/Other,,0,"Banana republic bottoms, Candies skirt with ma...",4.094345,Women,Other,Other,2.974339,0.532470
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,0,Size small but straps slightly shortened to fi...,4.174387,Women,Swimwear,Two-Piece,2.937108,0.340336
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,1,You get three pairs of Sophie cheer shorts siz...,1.945910,Sports & Outdoors,Apparel,Girls,2.723993,0.225168
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,0,Girls Size small Plus green. Three shorts total.,2.995732,Sports & Outdoors,Apparel,Girls,2.723993,0.225168
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,,0,I realized his pants are on backwards after th...,2.197225,Vintage & Collectibles,Collectibles,Doll,3.123443,0.609430


# Load test data

In [66]:
# Read the raw test table and add the three category-level columns.
test = split_category_names(pd.read_table("../../data/test.tsv"))
#test = test.pipe(assign_matrix, category_name_0_ohe_transform(test.category_name_0), 'category_name_0_ohe_')

# Train model

In [12]:
#category_name_0_ohe_cols = [x for x in train.columns if x.startswith('category_name_0_ohe_')]
# model = LinearRegression().fit(X = train.loc[:, category_name_0_ohe_cols],
#                                y = train.target)

In [13]:
#y = model.predict(X = train.loc[:, category_name_0_ohe_cols])

A 'model' that simply predicts the mean training target of each item's category.

In [87]:
# Look up each test row's category statistics; rows whose category_name has no
# match in the lookup table get NaN in targetmean / targetvar.
test_predict = test.join(target_by_category_name_df, on='category_name')

In [88]:
# Preview only the first rows instead of rendering the whole prediction frame.
test_predict.head(10)

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description,category_name_0,category_name_1,category_name_2,targetmean,targetvar
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7,Women,Jewelry,Rings,2.933243,0.871088
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined...",Other,Office supplies,Shipping Supplies,2.346270,0.277682
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...,Vintage & Collectibles,Bags and Purses,Handbag,4.183580,1.426672
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...,Women,Sweaters,Cardigan,3.042581,0.525095
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...,Other,Books,Religion & Spirituality,2.537237,0.227757
5,5,iPhone 6 Plus or 6s Plus Vodka pink case,1,"Electronics/Cell Phones & Accessories/Cases, C...",,1,One Absolut Vodka in Pink for iPhone 6 Plus an...,Electronics,Cell Phones & Accessories,"Cases, Covers & Skins",2.509256,0.258812
6,6,Vintage Cameo Pendant & Brooch Pin,3,Women/Jewelry/Necklaces,Vintage,1,Two vintage Cameo pieces. 1. Silver metal Lock...,Women,Jewelry,Necklaces,2.729020,0.889770
7,7,Rose Gold Stainless Steel Quartz Watch,1,Women/Women's Accessories/Watches,,1,Brand new Price firm No trades Box included wi...,Women,Women's Accessories,Watches,3.563318,0.965608
8,8,Daisy Marc Jacobs 3.4oz,3,Beauty/Fragrance/Women,MARC JACOBS,0,Brand new No box 100% authentic Firm price NO ...,Beauty,Fragrance,Women,2.987426,0.399767
9,9,Rose Brushes and Silicone Sponge,1,Beauty/Tools & Accessories/Makeup Brushes & Tools,,1,All new. 12 pcs makeup brushes and one Silicon...,Beauty,Tools & Accessories,Makeup Brushes & Tools,2.636882,0.381395


Probably some NaNs.

In [91]:
# Show the name and description of the test rows that got no per-category mean.
test_predict.loc[test_predict['targetmean'].isnull(), ['name', 'item_description']]

Unnamed: 0,name,item_description
217,Lularoe XS Carly Dress,XS Lularoe Carly Dress. Has peach tone color s...
555,Victoria's Secret VerySexy Touch Perfume,No description yet
702,Men's All Black Low Top Converse,"Men's Converse, size 11, all black. They have ..."
1110,92 Polly Pocket Jewel Palace/Ice Kingdom,Set of 2. In good condition. Both are complete...
1476,Bundle for Vanessa Wieseler,Avent Bottle Warmer Brand New Open Package [rm...
1586,Mens Shake Weight,No description yet
1983,Knee Sleeves Workout XS,XS brand new in package
2317,Honeywell Money Lockbox,No description yet
2376,#84 Wholesale Lot Workout Clothing 30 Pc,Bin Pickers from YT wholesale lot #84. All wom...
2508,Hollister sweatpants,"Juniors small in great condition. Very soft, c..."


NaNs because category_name is NaN.

TODO: infer the missing category from the item name and description.