In [226]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import dask.dataframe as dd
from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
from random import random,shuffle
import pickle
import sys
#from ngram import getUnigram
import string
import nltk
from nltk.util import ngrams # function for making ngrams
import re
from sklearn import preprocessing as pe
from sklearn.model_selection import StratifiedKFold
from scipy import sparse as ssp
import lightgbm as lgbm
import time
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge

In [188]:
# vectorized error calc
def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))

#looping error calc
def rmsle_loop(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'rmsle', rmsle(labels, preds), False

In [193]:
def rmse0(y, y_pred):
    return sqrt(mean_squared_error(y, y_pred))

seed = 1024
np.random.seed(seed)

In [3]:
def stem_str(x,stemmer=SnowballStemmer('english')):
    try:
        x = text.re.sub("[^a-zA-Z0-9]"," ", x)
        x = (" ").join([stemmer.stem(z) for z in x.split(" ")])
        x = " ".join(x.split())
    except: 
        print(x)
        print('\n')
        return ''
    return x

In [4]:
def remove_digits(x):
    x = re.sub(r'\d+', '', x)
    ' '.join(x.split())
    return x

In [154]:
start_time = time.time()

In [5]:
data = '../data/'
cache = '../cache/'

train = pd.read_csv(data+"train.tsv", sep='\t')
test = pd.read_csv(data+"test.tsv", sep='\t')

In [6]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [7]:
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description
0,0,"Breast cancer ""I fight like a girl"" ring",1,Women/Jewelry/Rings,,1,Size 7
1,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers",1,Other/Office supplies/Shipping Supplies,,1,"25 pcs NEW 7.5""x12"" Kraft Bubble Mailers Lined..."
2,2,Coach bag,1,Vintage & Collectibles/Bags and Purses/Handbag,Coach,1,Brand new coach bag. Bought for [rm] at a Coac...
3,3,Floral Kimono,2,Women/Sweaters/Cardigan,,0,-floral kimono -never worn -lightweight and pe...
4,4,Life after Death,3,Other/Books/Religion & Spirituality,,1,Rediscovering life after the loss of a loved o...


In [8]:
# BNWT: Brand New With Tags
# NWT: New With Tags
# BNWOT: Brand New Without Tags
# NWOT: New With Out Tags
# BNIP: Brand New In Packet
# NIP: New In Packet
# BNIB: Brand New In Box
# NIB: New In Box
# MIB: Mint In Box
# MWOB: Mint With Out Box
# MIP: Mint In Packet
# MWOP: Mint With Out Packet

In [10]:
abbr = {}
abbr['BNWT'] = ['bnwt', 'brand new with tags']
abbr['NWT'] = ['nwt', 'new with tags']
abbr['BNWOT'] = ['bnwot', 'brand new with out tags', 'brand new without tags']
abbr['NWOT'] = ['nwot', 'new with out tags', 'new without tags']
abbr['BNIP'] = ['bnip', 'brand new in packet', 'brand new in packet']
abbr['NIP'] = ['nip', 'new in packet', 'new in packet']
abbr['BNIB'] = ['bnib', 'brand new in box']
abbr['NIB'] = ['nib', 'new in box']
abbr['MIB'] = ['mib', 'mint in box']
abbr['MWOB'] = ['mwob', 'mint with out box', 'mint without box']
abbr['MIP'] = ['mip', 'mint in packet']
abbr['MWOP'] = ['mwop', 'mint with out packet', 'mint without packet']

In [11]:
# features I can think of -
# 1. no of times each of the above appear in text
# 2. Put prices in bins (not done)
# 3. Number of times each category appears in each bin
# 4. no of times each item_condition appears in each bin
# 5. min price for each category
# 6. max price for each category
# 7. average price for each category
# 8. 

In [12]:
price = train['price'].values
train_id = train['train_id'].values
test_id = test['test_id'].values

In [42]:
train_label = train['price']

In [13]:
del train['price']
del train['train_id']
del test['test_id']

In [14]:
df_all = pd.concat((train, test), axis=0, ignore_index=True)

In [15]:
df_all.isnull().sum()

name                      0
item_condition_id         0
category_name          9385
brand_name           928207
shipping                  0
item_description          4
dtype: int64

In [16]:
df_all['item_description'].fillna('Unk', inplace=True)

In [17]:
%%time
# if any(x in str for x in a):
df_all['tag'] = df_all['item_description'].map(lambda a: 'BNWT' if any(x in a.lower() for x in abbr['BNWT'])
                                               else 'NWT' if any(x in a.lower() for x in abbr['NWT'])
                                               else 'BNWOT' if any(x in a.lower() for x in abbr['BNWOT'])
                                               else 'NWOT' if any(x in a.lower() for x in abbr['NWOT'])
                                               else 'BNIP' if any(x in a.lower() for x in abbr['BNIP'])
                                               else 'NIP' if any(x in a.lower() for x in abbr['NIP'])
                                               else 'BNIB' if any(x in a.lower() for x in abbr['BNIB'])
                                               else 'NIB' if any(x in a.lower() for x in abbr['NIB'])
                                               else 'MIB' if any(x in a.lower() for x in abbr['MIB'])
                                               else 'MWOB' if any(x in a.lower() for x in abbr['MWOB'])
                                               else 'MIP' if any(x in a.lower() for x in abbr['MIP'])
                                               else 'MWOP' if any(x in a.lower() for x in abbr['MWOP'])
                                               else 'None')
                                               

CPU times: user 39.4 s, sys: 8 ms, total: 39.5 s
Wall time: 39.4 s


In [18]:
%%time
d = df_all['tag'].value_counts()

CPU times: user 104 ms, sys: 4 ms, total: 108 ms
Wall time: 106 ms


In [19]:
d

None     1975136
NWT        84313
BNWT       41212
NWOT       31096
NIB        18286
BNIB       15159
BNWOT       5919
NIP         3398
BNIP        1217
MIB          122
MIP           36
Name: tag, dtype: int64

In [20]:
d['None']

1975136

In [21]:
%%time
df_all['tag_count'] = df_all['tag'].map(lambda x: d[x])

CPU times: user 17.8 s, sys: 56 ms, total: 17.9 s
Wall time: 17.9 s


In [22]:
%%time
d = df_all['category_name'].value_counts()

CPU times: user 240 ms, sys: 4 ms, total: 244 ms
Wall time: 244 ms


In [23]:
df_all['brand_name'].fillna('UnkB', inplace=True)
df_all['category_name'].fillna('UnkC', inplace=True)

In [24]:
%%time
for c in ['brand_name', 'category_name', 'item_condition_id']:
    d = df_all[c].value_counts()
    df_all[c+'_count'] = df_all[c].map(lambda x: d[x])

CPU times: user 55.6 s, sys: 172 ms, total: 55.8 s
Wall time: 55.8 s


In [25]:
%%time
df_all['bci'] = df_all['brand_name'].astype('str') + '_' + df_all['category_name'].astype('str') + '_' + \
                df_all['item_condition_id'].astype('str')
    
df_all['bc'] = df_all['brand_name'].astype('str') + '_' + df_all['category_name'].astype('str')
    
df_all['bi'] = df_all['brand_name'].astype('str') + '_' +   df_all['item_condition_id'].astype('str')
    
df_all['ci'] = df_all['category_name'].astype('str') + '_' + df_all['item_condition_id'].astype('str')

CPU times: user 11.4 s, sys: 484 ms, total: 11.9 s
Wall time: 11.9 s


In [26]:
%%time
for c in ['bci', 'bc', 'bi', 'ci']:
    d = df_all[c].value_counts()
    df_all[c+'_count'] = df_all[c].map(lambda x: d[x])

CPU times: user 1min 14s, sys: 184 ms, total: 1min 14s
Wall time: 1min 14s


In [27]:
%%time
for c in ['brand_name', 'category_name', 'item_condition_id', 'bci', 'bc', 'bi', 'ci']:
    cardinality = len(set(df_all[c]))
    print(c, cardinality)

brand_name 5290
category_name 1311
item_condition_id 5
bci 92981
bc 50920
bi 12891
ci 4736
CPU times: user 1.34 s, sys: 4 ms, total: 1.34 s
Wall time: 1.33 s


In [28]:
import psutil
import os
def print_memory_usage():
    print('consuming {:.2f}GB RAM'.format(
           psutil.Process(os.getpid()).memory_info().rss / 1073741824),
          flush=True)

In [29]:
print_memory_usage()

consuming 1.94GB RAM


In [30]:
%%time
for c in ['brand_name', 'category_name', 'item_condition_id', 'ci', 'bi']:
    lbl = pe.LabelEncoder()
    df_all[c+'_lbl_enc'] = lbl.fit_transform(df_all[c].values)
print_memory_usage()

consuming 2.02GB RAM
CPU times: user 21.4 s, sys: 48 ms, total: 21.5 s
Wall time: 21.4 s


In [31]:
%%time
for c in ['bci', 'bc']:
    df_all = df_all.drop(c, axis=1)

CPU times: user 1.16 s, sys: 140 ms, total: 1.3 s
Wall time: 1.3 s


In [35]:
%%time
df_all['sentiment_score'] = df_all['item_description'].map(lambda x: TextBlob(x).sentiment.polarity)
print_memory_usage()

consuming 1.66GB RAM
CPU times: user 8min 41s, sys: 700 ms, total: 8min 41s
Wall time: 8min 42s


In [37]:
%%time
from textblob import TextBlob
df_all['sentiment'] = df_all['sentiment_score'].map(lambda x: 'Pos' if x > 0 else 'Neu' if  x == 0 else 'Neg')
print_memory_usage()

consuming 1.66GB RAM
CPU times: user 632 ms, sys: 36 ms, total: 668 ms
Wall time: 672 ms


In [38]:
%%time
lbl = pe.LabelEncoder()
df_all['sentiment'] = lbl.fit_transform(df_all['sentiment'].values)
print_memory_usage()

consuming 1.67GB RAM
CPU times: user 1.48 s, sys: 8 ms, total: 1.49 s
Wall time: 1.5 s


In [39]:
%%time
for c in ['name', 'item_description']:
    df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
    df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' '))) 

CPU times: user 7.94 s, sys: 40 ms, total: 7.98 s
Wall time: 7.99 s


In [70]:
%%time
lbl = pe.LabelEncoder()
df_all['tag'] = lbl.fit_transform(df_all['tag'].values)
print_memory_usage()

consuming 3.01GB RAM
CPU times: user 1.46 s, sys: 16 ms, total: 1.48 s
Wall time: 1.48 s


In [155]:
%%time
cv = CountVectorizer(min_df=10)
X_name = cv.fit_transform(df_all['name'])
print('[{}] Finished count vectorize `name`'.format(time.time() - start_time))

cv = CountVectorizer()
X_category = cv.fit_transform(df_all['category_name'])
print('[{}] Finished count vectorize `category_name`'.format(time.time() - start_time))

tv = TfidfVectorizer(max_features=2**16,
                     ngram_range=(1, 3),
                     stop_words='english')
X_description = tv.fit_transform(df_all['item_description'])
print('[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))

[39.45201539993286] Finished count vectorize `name`
[51.663666009902954] Finished count vectorize `category_name`
[316.8116579055786] Finished TFIDF vectorize `item_description`
CPU times: user 4min 49s, sys: 2.67 s, total: 4min 51s
Wall time: 4min 51s


In [195]:
%%time
svd = TruncatedSVD(n_components=300, n_iter=25)
X_desc = svd.fit_transform(X_description)
print('[{}] Finished SVD `item_description`'.format(time.time() - start_time))

[45569.725628852844] Finished SVD `item_description`
CPU times: user 29min 48s, sys: 2min 34s, total: 32min 23s
Wall time: 20min 39s


In [156]:
df_train = df_all.iloc[:len(train)]
df_test = df_all.iloc[len(train):]

In [None]:
# price = train['price'].values
# train_id = train['train_id'].values
# test_id = test['test_id'].values

In [157]:
data = '../data/'
cache = '../cache/'

df_tr = pd.read_csv(data+"train.tsv", sep='\t')
# df_test = pd.read_csv(data+"test.tsv", sep='\t')
df_tr['lp'] = df_tr['price'].map(lambda x: np.log1p(x))

In [158]:
train_label = df_tr['lp']

In [159]:
X = df_train
X_test = df_test

In [160]:
for c in ['name', 'category_name', 'brand_name', 'item_description', 'bi','ci', 'item_condition_id']:
    X = X.drop(c, axis=1)
    X_test = X_test.drop(c, axis=1)

In [168]:
X_num = df_all
for c in ['name', 'category_name', 'brand_name', 'item_description', 'bi','ci', 'item_condition_id']:
    X_num = X_num.drop(c, axis=1)

In [161]:
pd.set_option('display.max_columns', None)

In [196]:
X_num.head()

Unnamed: 0,shipping,tag,tag_count,brand_name_count,category_name_count,item_condition_id_count,bci_count,bc_count,bi_count,ci_count,brand_name_lbl_enc,category_name_lbl_enc,item_condition_id_lbl_enc,ci_lbl_enc,bi_lbl_enc,sentiment,sentiment_score,name_len,name_words,item_description_len,item_description_words
0,1,10,1975136,928207,22251,633834,2602,7630,216584,8629,4886,829,2,2808,11903,1,0.0,35,7,18,3
1,0,10,1975136,129,1419,633834,15,36,56,539,3889,86,2,351,9523,2,0.9,32,4,188,36
2,1,10,1975136,2725,29812,940630,13,59,784,6640,4588,1278,0,4580,11180,2,0.13625,14,2,124,29
3,1,9,84313,928207,19146,940630,7222,14049,493047,10905,4886,503,0,1535,11901,2,0.232121,21,3,173,32
4,0,10,1975136,928207,28926,940630,13239,19994,493047,16641,4886,1205,0,4276,11901,2,0.1,20,4,41,5


In [163]:
print(X.shape, X_test.shape)

(1482535, 21) (693359, 21)


In [164]:
del df_tr

In [217]:
%%time
# sparse_merge = hstack((X_num, X_desc, X_category, X_name)).tocsr()
sparse_merge = hstack((X_num, X_description, X_category, X_name)).tocsr()
print('[{}] Finished to create sparse merge'.format(time.time() - start_time))

[78370.98391628265] Finished to create sparse merge
CPU times: user 28.5 s, sys: 988 ms, total: 29.4 s
Wall time: 29.4 s


In [218]:
%%time
X = sparse_merge[:len(df_train)]
X_test = sparse_merge[len(df_train):]

CPU times: user 676 ms, sys: 160 ms, total: 836 ms
Wall time: 836 ms


In [219]:
X.shape

(1482535, 87836)

In [220]:
learning_rate = 0.1
num_leaves = 31
min_data_in_leaf = 2000
feature_fraction = 0.9
bagging_fraction = 0.8
num_boost_round = 2500
params = {"objective": "regression",
           "boosting_type": "gbdt",
           "learning_rate": 0.8,
           "num_leaves": num_leaves,
           "max_bin": 128,
           "feature_fraction": feature_fraction,
           'bagging_fraction': bagging_fraction,
           'metric': 'rmse',
           "verbosity": 0,
           "drop_rate": 0.1,
           "is_unbalance": False,
           "max_drop": 50,
           "min_child_samples": 10,
           "min_child_weight": 150,
           "min_split_gain": 0,
           "subsample": 0.9
          }
params1 = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

params2 = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 100,
        'verbosity': -1,
        'metric': 'RMSE',
    }

In [223]:
%%time
y = np.log1p(price)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144) 
d_train = lgbm.Dataset(train_X, label=train_y, max_bin=64)
d_valid = lgbm.Dataset(valid_X, label=valid_y, max_bin=64)
watchlist = [d_train, d_valid]

model = lgbm.train(params2, train_set=d_train, num_boost_round=3600, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=100) 
preds_lgbm3 = model.predict(X_test)

Train until valid scores didn't improve in 50 rounds.
[100]	training's rmse: 0.540822	valid_1's rmse: 0.545655
[200]	training's rmse: 0.516822	valid_1's rmse: 0.522933
[300]	training's rmse: 0.503432	valid_1's rmse: 0.511139
[400]	training's rmse: 0.494188	valid_1's rmse: 0.502513
[500]	training's rmse: 0.487246	valid_1's rmse: 0.496712
[600]	training's rmse: 0.48147	valid_1's rmse: 0.492126
[700]	training's rmse: 0.47702	valid_1's rmse: 0.488865
[800]	training's rmse: 0.472903	valid_1's rmse: 0.485927
[900]	training's rmse: 0.468986	valid_1's rmse: 0.482945
[1000]	training's rmse: 0.466211	valid_1's rmse: 0.481099
[1100]	training's rmse: 0.463512	valid_1's rmse: 0.479164
[1200]	training's rmse: 0.460789	valid_1's rmse: 0.477341
[1300]	training's rmse: 0.458041	valid_1's rmse: 0.475666
[1400]	training's rmse: 0.455807	valid_1's rmse: 0.474257
[1500]	training's rmse: 0.45396	valid_1's rmse: 0.473115
[1600]	training's rmse: 0.452003	valid_1's rmse: 0.472163
[1700]	training's rmse: 0.4502

In [224]:
%%time
y = np.log1p(price)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144) 
d_train = lgbm.Dataset(train_X, label=train_y, max_bin=1024)
d_valid = lgbm.Dataset(valid_X, label=valid_y, max_bin=1024)
watchlist = [d_train, d_valid]

model = lgbm.train(params2, train_set=d_train, num_boost_round=4000, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=100) 
preds_lgbm_1024 = model.predict(X_test)

Train until valid scores didn't improve in 50 rounds.
[100]	training's rmse: 0.539543	valid_1's rmse: 0.544382
[200]	training's rmse: 0.515865	valid_1's rmse: 0.52195
[300]	training's rmse: 0.502809	valid_1's rmse: 0.510357
[400]	training's rmse: 0.493498	valid_1's rmse: 0.502599
[500]	training's rmse: 0.486599	valid_1's rmse: 0.497057
[600]	training's rmse: 0.481094	valid_1's rmse: 0.492783
[700]	training's rmse: 0.476692	valid_1's rmse: 0.489675
[800]	training's rmse: 0.471909	valid_1's rmse: 0.485754
[900]	training's rmse: 0.467576	valid_1's rmse: 0.482353
[1000]	training's rmse: 0.46467	valid_1's rmse: 0.480561
[1100]	training's rmse: 0.461886	valid_1's rmse: 0.478773
[1200]	training's rmse: 0.459308	valid_1's rmse: 0.47724
[1300]	training's rmse: 0.456653	valid_1's rmse: 0.475319
[1400]	training's rmse: 0.454665	valid_1's rmse: 0.4742
[1500]	training's rmse: 0.452543	valid_1's rmse: 0.47309
[1600]	training's rmse: 0.450767	valid_1's rmse: 0.472103
[1700]	training's rmse: 0.448887	

In [222]:
%%time
y = np.log1p(price)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144) 
d_train = lgbm.Dataset(train_X, label=train_y, max_bin=256)
d_valid = lgbm.Dataset(valid_X, label=valid_y, max_bin=256)
watchlist = [d_train, d_valid]

model = lgbm.train(params2, train_set=d_train, num_boost_round=3600, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=100) 
preds_lgbm3 = model.predict(X_test)

Train until valid scores didn't improve in 50 rounds.
[100]	training's rmse: 0.539709	valid_1's rmse: 0.544375
[200]	training's rmse: 0.516489	valid_1's rmse: 0.522649
[300]	training's rmse: 0.503476	valid_1's rmse: 0.510628
[400]	training's rmse: 0.493245	valid_1's rmse: 0.501681
[500]	training's rmse: 0.485844	valid_1's rmse: 0.495525
[600]	training's rmse: 0.479928	valid_1's rmse: 0.491185
[700]	training's rmse: 0.475673	valid_1's rmse: 0.488227
[800]	training's rmse: 0.471837	valid_1's rmse: 0.485522
[900]	training's rmse: 0.468494	valid_1's rmse: 0.483154
[1000]	training's rmse: 0.465402	valid_1's rmse: 0.481123
[1100]	training's rmse: 0.462828	valid_1's rmse: 0.479591
[1200]	training's rmse: 0.45996	valid_1's rmse: 0.477906
[1300]	training's rmse: 0.457201	valid_1's rmse: 0.475787
[1400]	training's rmse: 0.455303	valid_1's rmse: 0.474757
[1500]	training's rmse: 0.45288	valid_1's rmse: 0.473139
[1600]	training's rmse: 0.45098	valid_1's rmse: 0.472038
[1700]	training's rmse: 0.4488

In [227]:
%%time
model = Ridge(solver="sag", fit_intercept=True, random_state=205)
model.fit(X, y)
print('[{}] Finished to train ridge'.format(time.time() - start_time))
preds_ridge = model.predict(X=X_test)
print('[{}] Finished to predict ridge'.format(time.time() - start_time))

[92301.64115858078] Finished to train ridge
[92301.74994659424] Finished to predict ridge


In [None]:
preds = preds_lgbm_1024 + preds_ridge
submission['price'] = np.expm1(preds)
submission.to_csv("submission_lgbm_ridge_6.csv", index=False)


In [221]:
%%time
y = np.log1p(price)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144) 
d_train = lgbm.Dataset(train_X, label=train_y, max_bin=128)
d_valid = lgbm.Dataset(valid_X, label=valid_y, max_bin=128)
watchlist = [d_train, d_valid]

model = lgbm.train(params2, train_set=d_train, num_boost_round=3600, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=100) 
preds_lgbm3 = model.predict(X_test)

Train until valid scores didn't improve in 50 rounds.
[100]	training's rmse: 0.540917	valid_1's rmse: 0.545491
[200]	training's rmse: 0.517184	valid_1's rmse: 0.523028
[300]	training's rmse: 0.503406	valid_1's rmse: 0.510375
[400]	training's rmse: 0.494213	valid_1's rmse: 0.502486
[500]	training's rmse: 0.486534	valid_1's rmse: 0.495604
[600]	training's rmse: 0.480763	valid_1's rmse: 0.491384
[700]	training's rmse: 0.47645	valid_1's rmse: 0.488293
[800]	training's rmse: 0.472079	valid_1's rmse: 0.484901
[900]	training's rmse: 0.46887	valid_1's rmse: 0.482681
[1000]	training's rmse: 0.465953	valid_1's rmse: 0.480774
[1100]	training's rmse: 0.46293	valid_1's rmse: 0.4787
[1200]	training's rmse: 0.460204	valid_1's rmse: 0.476919
[1300]	training's rmse: 0.45825	valid_1's rmse: 0.475736
[1400]	training's rmse: 0.45546	valid_1's rmse: 0.473774
[1500]	training's rmse: 0.453632	valid_1's rmse: 0.472741
[1600]	training's rmse: 0.451629	valid_1's rmse: 0.471403
[1700]	training's rmse: 0.449616	v

In [214]:
%%time
y = np.log1p(price)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144) 
d_train = lgbm.Dataset(train_X, label=train_y, max_bin=8192)
d_valid = lgbm.Dataset(valid_X, label=valid_y, max_bin=8192)
watchlist = [d_train, d_valid]

model = lgbm.train(params2, train_set=d_train, num_boost_round=3600, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=100) 
preds_lgbm = model.predict(X_test)

Train until valid scores didn't improve in 50 rounds.
[100]	training's rmse: 0.541047	valid_1's rmse: 0.546165
[200]	training's rmse: 0.520315	valid_1's rmse: 0.526728
[300]	training's rmse: 0.506075	valid_1's rmse: 0.514408
[400]	training's rmse: 0.497893	valid_1's rmse: 0.507885
[500]	training's rmse: 0.490743	valid_1's rmse: 0.502083
[600]	training's rmse: 0.48514	valid_1's rmse: 0.497577
[700]	training's rmse: 0.481084	valid_1's rmse: 0.494833
[800]	training's rmse: 0.476951	valid_1's rmse: 0.492012
[900]	training's rmse: 0.473465	valid_1's rmse: 0.489857
[1000]	training's rmse: 0.469566	valid_1's rmse: 0.487076
[1100]	training's rmse: 0.466513	valid_1's rmse: 0.485309
[1200]	training's rmse: 0.464431	valid_1's rmse: 0.484001
[1300]	training's rmse: 0.46173	valid_1's rmse: 0.482485
[1400]	training's rmse: 0.458997	valid_1's rmse: 0.481013
[1500]	training's rmse: 0.45649	valid_1's rmse: 0.479687
[1600]	training's rmse: 0.45467	valid_1's rmse: 0.478867
[1700]	training's rmse: 0.45270

In [216]:
%%time
y = np.log1p(price)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144) 
d_train = lgbm.Dataset(train_X, label=train_y)
d_valid = lgbm.Dataset(valid_X, label=valid_y)
watchlist = [d_train, d_valid]

model2 = lgbm.train(params, train_set=d_train, num_boost_round=3600, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=100) 
preds_lgbm2 = model2.predict(X_test)

Train until valid scores didn't improve in 50 rounds.
[100]	training's rmse: 0.541425	valid_1's rmse: 0.545742
[200]	training's rmse: 0.516507	valid_1's rmse: 0.521925
[300]	training's rmse: 0.503875	valid_1's rmse: 0.510102
[400]	training's rmse: 0.495604	valid_1's rmse: 0.502645
[500]	training's rmse: 0.489324	valid_1's rmse: 0.497096
[600]	training's rmse: 0.48439	valid_1's rmse: 0.493052
[700]	training's rmse: 0.480227	valid_1's rmse: 0.48978
[800]	training's rmse: 0.476627	valid_1's rmse: 0.487071
[900]	training's rmse: 0.473621	valid_1's rmse: 0.484926
[1000]	training's rmse: 0.47078	valid_1's rmse: 0.483084
[1100]	training's rmse: 0.46822	valid_1's rmse: 0.481473
[1200]	training's rmse: 0.465853	valid_1's rmse: 0.480102
[1300]	training's rmse: 0.463474	valid_1's rmse: 0.478795
[1400]	training's rmse: 0.461466	valid_1's rmse: 0.477776
[1500]	training's rmse: 0.459445	valid_1's rmse: 0.476811
[1600]	training's rmse: 0.457647	valid_1's rmse: 0.475983
[1700]	training's rmse: 0.45578

In [208]:
%%time
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))
cv_only = True
save_cv = True
full_train = False
NFOLDS = 5
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

for s in range(1):
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    if cv_only:
        kf = kfold.split(X, train_label)

        best_trees = []
        fold_scores = []

        for i, (train_fold, validate) in enumerate(kf):
            if i > 0:
                continue
            print(train_fold, validate)
#             train_fold = [1,2,3]
#             x1 = X[train_fold, :]
#             x2 =  X[validate, :]
#             y1 = train_label[train_fold]
#             y2 = train_label[validate]
            X_train, X_validate, label_train, label_validate = \
                X[train_fold, :], X[validate, :], train_label.ix[train_fold], train_label.ix[validate]
            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, verbose_eval=100,
                            early_stopping_rounds=50)
            best_trees.append(bst.best_iteration)
            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
            cv_train[validate] += bst.predict(X_validate)

            score = rmse0(label_validate, cv_train[validate])
            print(score)
            fold_scores.append(score)

        cv_pred /= NFOLDS
        final_cv_train += cv_train
        final_cv_pred += cv_pred

        print("cv score:")
        print(rmse0(train_label, cv_train))
        print("current score:", rmse0(train_label, final_cv_train / (s + 1.)), s+1)
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(rmse0(train_label, cv_train))

print(x_score)
# 0.443656



[      0       2       3 ..., 1482532 1482533 1482534] [      1       5      11 ..., 1482514 1482518 1482522]
Train until valid scores didn't improve in 50 rounds.
[100]	valid_0's rmse: 0.544875
[200]	valid_0's rmse: 0.520821
[300]	valid_0's rmse: 0.509097
[400]	valid_0's rmse: 0.501528
[500]	valid_0's rmse: 0.49613
[600]	valid_0's rmse: 0.492172
[700]	valid_0's rmse: 0.489076
[800]	valid_0's rmse: 0.486318
[900]	valid_0's rmse: 0.484153
[1000]	valid_0's rmse: 0.482463
[1100]	valid_0's rmse: 0.480875
[1200]	valid_0's rmse: 0.479539
[1300]	valid_0's rmse: 0.478362
[1400]	valid_0's rmse: 0.477222
[1500]	valid_0's rmse: 0.476243
[1600]	valid_0's rmse: 0.47545
[1700]	valid_0's rmse: 0.474648
[1800]	valid_0's rmse: 0.473909
[1900]	valid_0's rmse: 0.473189
[2000]	valid_0's rmse: 0.47246
[2100]	valid_0's rmse: 0.471887
[2200]	valid_0's rmse: 0.471287
[2300]	valid_0's rmse: 0.470646
[2400]	valid_0's rmse: 0.470101
[2500]	valid_0's rmse: 0.469602
0.46960205822249146
cv score:
2.7548781141907615

In [None]:
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.2l.{}.{}GMT'.format(final_score, now)
pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}).to_csv(fn, index=False)
# pd.DataFrame({'id': train_id, 'target': final_cv_train / 16.}).to_csv('../cache/lgbm3_cv_avg.csv', index=False)

In [42]:
# df_all['tag'] = df_all['item_description'].map(lambda a: 'NWT' if any(x in a.lower() for x in ['nwt', 'new with tags']) 
#                                                else 'None')

In [40]:
df_all.head(50)

Unnamed: 0,name,item_condition_id,category_name,brand_name,shipping,item_description,tag,tag_count,brand_name_count,category_name_count,...,category_name_lbl_enc,item_condition_id_lbl_enc,ci_lbl_enc,bi_lbl_enc,sentiment,sentiment_score,name_len,name_words,item_description_len,item_description_words
0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,UnkB,1,No description yet,,1975136,928207,22251,...,829,2,2808,11903,1,0.0,35,7,18,3
1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,0,This keyboard is in great condition and works ...,,1975136,129,1419,...,86,2,351,9523,2,0.9,32,4,188,36
2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,1,Adorable top with a hint of lace and a key hol...,,1975136,2725,29812,...,1278,0,4580,11180,2,0.13625,14,2,124,29
3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,UnkB,1,New with tags. Leather horses. Retail for [rm]...,NWT,84313,928207,19146,...,503,0,1535,11901,2,0.232121,21,3,173,32
4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,UnkB,0,Complete with certificate of authenticity,,1975136,928207,28926,...,1205,0,4276,11901,2,0.1,20,4,41,5
5,Bundled items requested for Ruie,3,Women/Other/Other,UnkB,0,"Banana republic bottoms, Candies skirt with ma...",,1975136,928207,10579,...,1217,2,4323,11903,2,0.5,32,5,102,15
6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,Acacia Swimwear,0,Size small but straps slightly shortened to fi...,,1975136,638,18704,...,1277,2,4577,222,2,0.245833,34,5,83,13
7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,Soffe,1,You get three pairs of Sophie cheer shorts siz...,,1975136,215,803,...,908,2,3124,10598,0,-0.177083,36,7,268,51
8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,Nike,0,Girls Size small Plus green. Three shorts total.,,1975136,79277,803,...,908,2,3124,8197,0,-0.15,21,4,48,8
9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,UnkB,0,I realized his pants are on backwards after th...,,1975136,928207,1180,...,1046,2,3658,11903,0,-0.338333,38,6,297,55


In [9]:
train.shape

(1482535, 8)

In [10]:
test.shape

(693359, 7)

In [11]:
pd.value_counts(train.name)

Bundle                                      2232
Reserved                                     453
Converse                                     445
BUNDLE                                       418
Dress                                        410
Coach purse                                  404
Lularoe TC leggings                          396
Romper                                       353
Nike                                         340
Vans                                         334
American Eagle Jeans                         325
Miss Me Jeans                                284
Lularoe OS leggings                          281
ON HOLD                                      274
Coach Purse                                  258
Lularoe Irma                                 254
Shorts                                       247
Michael Kors Purse                           243
Bundle!                                      242
Coach wallet                                 242
Miss me jeans       

In [12]:
len(np.unique(train.name))

1225273

In [13]:
len(np.unique(train.item_condition_id))

5

In [14]:
pd.value_counts(train.item_condition_id)

1    640549
3    432161
2    375479
4     31962
5      2384
Name: item_condition_id, dtype: int64

In [183]:
len(set(train.brand_name))

4810

In [None]:
len(set(train.brand_name))

In [15]:
train.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

In [16]:
test.isnull().sum()

test_id                   0
name                      0
item_condition_id         0
category_name          3058
brand_name           295525
shipping                  0
item_description          0
dtype: int64

In [17]:
train.fillna('Unk', inplace=True)
test.fillna('Unk', inplace=True)

In [18]:
len(np.unique(train.brand_name))

4810

In [19]:
len(np.unique(train.category_name))

1288

In [207]:
%%time
start = datetime.now()
sub_cat = []

for t, row in enumerate(DictReader(open('../data/train.tsv'), delimiter='\t')): 
    cat = str(row['category_name']).lower().split('/')
    nom = str(row['name']).lower().split()
    item_desc = str(row['item_description']).lower().split()

    if (t == 0) | (t==5000):
        print(cat)
        print(nom)
        print(item_desc)
    for sc in cat:
        if sc in nom:
            sub_cat.append(sc)
            break
        elif sc in item_desc:
            sub_cat.append(sc)
            break
        else:
            sub_cat.append('None')
            break
end = datetime.now()
print('times:',end-start)

['men', 'tops', 't-shirts']
['mlb', 'cincinnati', 'reds', 't', 'shirt', 'size', 'xl']
['no', 'description', 'yet']
['women', 'athletic apparel', 'shorts']
['hold', 'for', 'thewaxjunkie']
['merona', 'short', 'bundle,', 'pinstripe', 'shorts', 'and', 'inspire', 'shirt.']
times: 0:00:18.674790
CPU times: user 18.6 s, sys: 132 ms, total: 18.7 s
Wall time: 18.7 s


In [None]:
stops = ["http","www","img","border","home","body","a","about","above","after","again","against","all","am","an",
"and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't",
"cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from",
"further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers",
"herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its",
"itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought",
"our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such",
"than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're",
"they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were",
"weren't","what","what's","when","when's""where","where's","which","while","who","who's","whom","why","why's","with","won't","would",
"wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves" ]


In [None]:

def prepare_ngram_interaction(path,out):
    print path
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('count_of_stop_name,ratio_of_stop_name,count_of_stop_item_desc,ratio_of_stop_item_desc\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print 'finished',c
            q1_ngram = str(row['question1'].lower()).split()
            q2_ngram = str(row['question2'].lower()).split()

            count_of_stop_question1 = get_count_q1_in_q2(q1_ngram,stops)
            ratio_of_stop_question1 = get_ratio_q1_in_q2(q1_ngram,stops)

            count_of_stop_question2 = get_count_q1_in_q2(q2_ngram,stops)
            ratio_of_stop_question2 = get_ratio_q1_in_q2(q2_ngram,stops)


            outfile.write('%s,%s,%s,%s\n' % (
                count_of_stop_question1,
                ratio_of_stop_question1,
                count_of_stop_question2,
                ratio_of_stop_question2,
                ))
            c+=1
        end = datetime.now()

    print 'times:',end-start



In [None]:
prepare_ngram_interaction(path+'train_porter.csv',path+'train_porter_stop_features.csv')
prepare_ngram_interaction(path+'test_porter.csv',path+'test_porter_stop_features.csv')

In [196]:
sum(pd.Series(sub_cat) == 'None')

1464176

In [20]:
train.shipping.value_counts()

0    819435
1    663100
Name: shipping, dtype: int64

In [21]:
porter = PorterStemmer()
# snowball = SnowballStemmer('english')

In [22]:
train['DL'] = train['item_description'].map(lambda x: len(x))
test['DL'] = test['item_description'].map(lambda x: len(x))

In [21]:
train.iloc[18]

train_id                                                            18
name                               Too Faced Limited "Merry Macaroons"
item_condition_id                                                    1
category_name                            Beauty/Makeup/Makeup Palettes
brand_name                                                   Too Faced
price                                                               25
shipping                                                             1
item_description     This AUTHENTIC pallete by Too Faced is brand n...
DL                                                                 307
Name: 18, dtype: object

In [23]:
train[train['DL'] == 1].head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,DL
2730,2730,Vs lip gloss set!,1,Beauty/Makeup/Lips,Victoria's Secret,14.0,0,.,1
3552,3552,Strappy Black Lingerie,1,Women/Underwear/Panties,Unk,8.0,1,❤,1
9101,9101,2 half zips bundle,3,Women/Athletic Apparel/Jackets,Victoria's Secret,46.0,0,2,1
9404,9404,Carter's Valentines Onesie NWT NB,1,Kids/Girls 0-24 Mos/One-Pieces,Carter's,10.0,0,.,1
16733,16733,LuLaRoe CarlyDress size Xs,1,Women/Dresses/Knee-Length,Unk,36.0,0,-,1


In [24]:
train[train['DL'] == 2].head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,DL
9979,9979,200 cat collars and 30 dog collars :),1,Handmade/Pets/Collar,Unk,165.0,0,:),2
10429,10429,Old navy cute sundress,3,"Women/Dresses/Above Knee, Mini",Old Navy,8.0,0,:),2
12278,12278,Outcast 1-12 Comics,3,Vintage & Collectibles/Book/Comics,Unk,44.0,0,NM,2
14405,14405,Victoria's Secret PINK Glitter Leggings,2,"Women/Athletic Apparel/Pants, Tights, Leggings",Victoria's Secret,16.0,0,XS,2
14672,14672,Black fur coat,2,Women/Coats & Jackets/Other,Old Navy,16.0,0,XL,2


In [25]:
train[train['DL'] == 3].head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,DL
58,58,New yokai watch backpack,1,Kids/Gear/Backpacks & Carriers,Unk,10.0,0,New,3
140,140,Zelda Link Amiibo.,1,Electronics/Video Games & Consoles/Video Gamin...,Nintendo,40.0,1,New,3
768,768,Sale! [rm] For 2 morphe brushes,1,Beauty/Tools & Accessories/Makeup Brushes & Tools,Unk,12.0,1,New,3
920,920,"Lularoe hula dancers, os.",1,"Women/Athletic Apparel/Pants, Tights, Leggings",Unk,30.0,1,New,3
964,964,Doc McStuffins Activity Set Kit Toy,1,Kids/Toys/Dolls & Accessories,Unk,7.0,1,New,3


In [26]:
train[train['DL'] == 4].head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,DL
684,684,Splat Pink Fetish Rose Hair Dye,2,Beauty/Hair Care/Hair Color,Unk,7.0,0,Full,4
1253,1253,AF tank,2,"Women/Tops & Blouses/Tank, Cami",Abercrombie & Fitch,8.0,1,Nwot,4
1965,1965,"Burberry, baby boy, 18months",3,Kids/Boys 0-24 Mos/Tops & T-Shirts,Unk,22.0,0,Used,4
1978,1978,LuLaRoe Sarah Large black thin ribbed,1,Women/Sweaters/Cardigan,Unk,86.0,0,BNWT,4
2178,2178,Yoga Mat,3,Sports & Outdoors/Exercise/Yoga & Pilates,Unk,12.0,0,Used,4


In [31]:
%%time

print('Clean digits')
train['item_description'] = train['item_description'].astype('str').apply(lambda x:remove_digits(x.lower()))
test['item_description'] = test['item_description'].astype('str').apply(lambda x:remove_digits(x.lower()))

Clean digits
CPU times: user 11.7 s, sys: 164 ms, total: 11.9 s
Wall time: 11.9 s


In [32]:
%%time

print('Generate porter')
train['ide_p'] = train['item_description'].astype('str').apply(lambda x:stem_str(x.lower(),porter))
test['ide_p'] = test['item_description'].astype('str').apply(lambda x:stem_str(x.lower(),porter))

Generate porter
metal   sterling silver stone  black onyx style  celtic eing  vintage ring  boho ring size    rafaella jewelry


brand new   authentic  tarteist pro palette by tarte  high performance naturals   code on back of palette is hard to see it s oed  palette has never been used  swatched or took out of the box besides for this picture session     nib   retails higher  still has plastic over the colors and a  makeup  of the day if you d like a guide     price is firm as this is an expensive piece   no low balling  it will get you blocked  and i dont like blocking people so let s have some fun and get this brand new beauty of a palette to a new home  ty  happy shopping  


used for my first aid college course  purchased new  first aid  cpr    aed advanced textbook  th edition access code never scratched off  isbn       no free shipping  i ship priority mail   bundle with my other nursing books to save on shipping costs 


advanced first aid  cpr   aed   american govt textbook bu

In [33]:
%%time

print('Clean digits')
train['name'] = train['name'].astype('str').apply(lambda x:remove_digits(x.lower()))
test['name'] = test['name'].astype('str').apply(lambda x:remove_digits(x.lower()))

Clean digits
CPU times: user 5.16 s, sys: 72 ms, total: 5.23 s
Wall time: 5.23 s


In [34]:
%%time

print('Generate porter')
train['n_p'] = train['name'].astype('str').apply(lambda x:stem_str(x.lower(),porter))
test['n_p'] = test['name'].astype('str').apply(lambda x:stem_str(x.lower(),porter))

Generate porter
first aid cpr   aed advanced textbook


CPU times: user 3min 26s, sys: 200 ms, total: 3min 26s
Wall time: 3min 26s


In [35]:
train.to_csv('../cache/train_porter.csv', index=False)

In [36]:
test.to_csv('../cache/test_porter.csv', index=False)

In [37]:
#path = '../cache/'


string.punctuation.__add__('!!')
string.punctuation.__add__('(')
string.punctuation.__add__(')')
string.punctuation.__add__('?')
string.punctuation.__add__('.')
string.punctuation.__add__(',')


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~,'

In [138]:
def remove_punctuation(x):
    new_line = [ w for w in list(x) if w not in string.punctuation]
    new_line = ''.join(new_line)
#     print(new_line)
    return new_line

In [140]:
def prepare_unigram(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_unigram,item_desc_unigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            a1 = remove_punctuation(str(row['n_p']).lower()).split(' ')
            a2 = remove_punctuation(str(row['ide_p']).lower()).lower().split(' ')
            if c==0:
                print('1')
                print(a1)
            a1_bigram = ' '.join(a1)
            a2_bigram = ' '.join(a2)
            if c==0:
                print('3')
                print(a1_bigram)
            outfile.write('%s,%s\n' % (a1_bigram, a2_bigram))
            c+=1
        end = datetime.now()


    print('times:{}'.format(end-start))


In [153]:
def getUnigram(words):
    """
        Input: a list of words, e.g., ['I', 'am', 'Denny']
        Output: a list of unigram
    """
    assert type(words) == list
    return words


In [154]:
def getBigram(words, join_string="_", skip=0):
    """
       Input: a list of words, e.g., ['I', 'am', 'Denny']
       Output: a list of bigram, e.g., ['I_am', 'am_Denny']
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for k in range(1,skip+2):
                if i+k < L:
                    lst.append( join_string.join([words[i], words[i+k]]) )
    else:
        # set it as unigram
        lst = getUnigram(words)
    return lst


In [155]:
def prepare_bigram(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_bigram,item_desc_bigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            
            a1 = remove_punctuation(str(row['n_p']).lower()).split(' ')
            a2 = remove_punctuation(str(row['ide_p']).lower()).lower().split(' ')
            
            a1_bigram = getBigram(a1)
            a2_bigram = getBigram(a2)
            if c==0:
                print('----')
                print(row['n_p'])
                print(row['ide_p'])
                print(a1_bigram)
                print('----')
            a1_bigram = ' '.join(a1_bigram)
            a2_bigram = ' '.join(a2_bigram)
            if c==0:
                print(a1_bigram)
                print('-----------------')
            outfile.write('%s,%s\n' % (a1_bigram, a2_bigram))
            c+=1
        end = datetime.now()


    print('times:{}'.format(end-start))

In [156]:
def distinct_terms(lst1, lst2):
    lst1 = lst1.split(" ")
    lst2 = lst2.split(" ")
    common = set(lst1).intersection(set(lst2))
    new_lst1 = ' '.join([w for w in lst1 if w not in common])
    new_lst2 = ' '.join([w for w in lst2 if w not in common])
    
    return (new_lst1,new_lst2)

In [177]:
def prepare_distinct(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_distinct_unigram,item_desc_distinct_unigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            a1 = str(row['name_unigram'])
            a2 = str(row['item_desc_unigram'])
            coo_terms = distinct_terms(a1,a2)
            if c==0:
                print(coo_terms)
            outfile.write('%s,%s\n' % coo_terms)
            c+=1
        end = datetime.now()
    print('times:',end-start)
    
def prepare_distinct_bi(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_distinct_bigram,item_desc_distinct_bigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            a1 = str(row['name_bigram'])
            a2 = str(row['item_desc_bigram'])
            coo_terms = distinct_terms(a1,a2)
            if c==0:
                print(coo_terms)
            outfile.write('%s,%s\n' % coo_terms)
            c+=1
        end = datetime.now()
    print('times:',end-start)

In [166]:
def cooccurrence_terms(lst1, lst2, join_str="__"):
    lst1 = lst1.split(" ")
    lst2 = lst2.split(" ")
    terms = [""] * len(lst1) * len(lst2)
    cnt =  0
    for item1 in lst1:
        for item2 in lst2:
            terms[cnt] = item1 + join_str + item2
            cnt += 1
    res = " ".join(terms)
    return res

In [173]:
def prepare_cooccurrence(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_unigram_item_desc_unigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            q1 = str(row['name_distinct_unigram'])
            q2 = str(row['item_desc_distinct_unigram'])
            coo_terms = cooccurrence_terms(q1,q2)
            if c==0:
                print(coo_terms)
            outfile.write('%s\n' % coo_terms)
            c+=1
        end = datetime.now()
    print('times:',end-start)

In [168]:
def prepare_cooccurrence_bi(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_distinct_bigram_item_desc_distinct_bigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            q1 = str(row['name_distinct_bigram'])
            q2 = str(row['item_desc_distinct_bigram'])
            coo_terms = cooccurrence_terms(q1,q2)
            outfile.write('%s\n' % coo_terms)
            if c==0:
                print(coo_terms)
            c+=1
        end = datetime.now()
    print('times:',end-start)

In [142]:
%%time
prepare_unigram('../cache/train_porter.csv','../cache/train_unigram.csv')


../cache/train_porter.csv
finished 0
1
['mlb', 'cincinnati', 'red', 't', 'shirt', 'size', 'xl']
3
mlb cincinnati red t shirt size xl
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times:0:00:52.615828
CPU times: user 52.1 s, sys: 484 ms, total: 52.6 s
Wall time: 52.6 s


In [143]:
%%time
prepare_unigram('../cache/test_porter.csv','../cache/test_unigram.csv')

../cache/test_porter.csv
finished 0
1
['breast', 'cancer', 'i', 'fight', 'like', 'a', 'girl', 'ring']
3
breast cancer i fight like a girl ring
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times:0:00:23.973272
CPU times: user 23.8 s, sys: 180 ms, total: 24 s
Wall time: 24.2 s


In [161]:
%%time
prepare_bigram('../cache/train_porter.csv','../cache/train_bigram.csv')

../cache/train_porter.csv
finished 0
----
mlb cincinnati red t shirt size xl
no descript yet
['mlb_cincinnati', 'cincinnati_red', 'red_t', 't_shirt', 'shirt_size', 'size_xl']
----
mlb_cincinnati cincinnati_red red_t t_shirt shirt_size size_xl
-----------------
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times:0:01:28.207023
CPU times: user 1min 27s, sys: 776 ms, total: 1min 28s
Wall time: 1min 28s


In [162]:
%%time
prepare_bigram('../cache/test_porter.csv','../cache/test_bigram.csv')

../cache/test_porter.csv
finished 0
----
breast cancer i fight like a girl ring
size
['breast_cancer', 'cancer_i', 'i_fight', 'fight_like', 'like_a', 'a_girl', 'girl_ring']
----
breast_cancer cancer_i i_fight fight_like like_a a_girl girl_ring
-----------------
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times:0:00:41.940817
CPU times: user 40.1 s, sys: 392 ms, total: 40.5 s
Wall time: 41.9 s


In [163]:
%%time 
prepare_distinct('../cache/train_unigram.csv','../cache/train_distinct_unigram.csv')

../cache/train_unigram.csv
finished 0
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:23.690776
CPU times: user 23.3 s, sys: 312 ms, total: 23.6 s
Wall time: 23.7 s


In [169]:
%%time
prepare_distinct('../cache/test_unigram.csv','../cache/test_distinct_unigram.csv')

../cache/test_unigram.csv
finished 0
('breast cancer i fight like a girl ring', 'size')
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:10.425988
CPU times: user 10.3 s, sys: 148 ms, total: 10.4 s
Wall time: 10.4 s


In [170]:
%%time
prepare_cooccurrence('../cache/train_unigram.csv','../cache/train_cooccurrence_unigram.csv')


../cache/train_unigram.csv
finished 0
mlb__no mlb__descript mlb__yet cincinnati__no cincinnati__descript cincinnati__yet red__no red__descript red__yet t__no t__descript t__yet shirt__no shirt__descript shirt__yet size__no size__descript size__yet xl__no xl__descript xl__yet
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:56.872758
CPU times: user 52.5 s, sys: 1.53 s, total: 54 s
Wall time: 56.9 s


In [171]:
%%time
prepare_cooccurrence('../cache/test_unigram.csv','../cache/test_cooccurrence_unigram.csv')

../cache/test_unigram.csv
finished 0
breast__size cancer__size i__size fight__size like__size a__size girl__size ring__size
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:25.222971
CPU times: user 24.6 s, sys: 660 ms, total: 25.2 s
Wall time: 25.2 s


In [174]:
%%time
prepare_cooccurrence('../cache/train_distinct_unigram.csv','../cache/train_cooccurrence_distinct.csv')


../cache/train_distinct_unigram.csv
finished 0
mlb__no mlb__descript mlb__yet cincinnati__no cincinnati__descript cincinnati__yet red__no red__descript red__yet t__no t__descript t__yet shirt__no shirt__descript shirt__yet size__no size__descript size__yet xl__no xl__descript xl__yet
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:32.927723
CPU times: user 30.5 s, sys: 652 ms, total: 31.1 s
Wall time: 33.4 s


In [175]:
%%time
prepare_cooccurrence('../cache/test_distinct_unigram.csv','../cache/test_cooccurrence_distinct.csv')

../cache/test_distinct_unigram.csv
finished 0
breast__size cancer__size i__size fight__size like__size a__size girl__size ring__size
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:13.972935
CPU times: user 13.7 s, sys: 252 ms, total: 14 s
Wall time: 14 s


In [178]:
%%time 
prepare_distinct_bi('../cache/train_bigram.csv','../cache/train_distinct_bigram.csv')

../cache/train_bigram.csv
finished 0
('mlb_cincinnati cincinnati_red red_t t_shirt shirt_size size_xl', 'no_descript descript_yet')
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:25.199160
CPU times: user 24.8 s, sys: 440 ms, total: 25.2 s
Wall time: 25.2 s


In [179]:
%%time 
prepare_distinct_bi('../cache/test_bigram.csv','../cache/test_distinct_bigram.csv')

../cache/test_bigram.csv
finished 0
('breast_cancer cancer_i i_fight fight_like like_a a_girl girl_ring', 'size')
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:12.053643
CPU times: user 11.8 s, sys: 296 ms, total: 12.1 s
Wall time: 12.1 s


In [180]:
%%time
prepare_cooccurrence_bi('../cache/train_distinct_bigram.csv','../cache/train_cooccurrence_distinct_bigram.csv')

../cache/train_distinct_bigram.csv
finished 0
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:40.712241


In [181]:
%%time
prepare_cooccurrence_bi('../cache/test_distinct_bigram.csv','../cache/test_cooccurrence_distinct_bigram.csv')

../cache/test_distinct_bigram.csv
finished 0
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:18.645566


In [36]:
# %%time

# print('Generate snowball')
# train['de_sb'] = train['item_description'].astype('str').apply(lambda x:stem_str(x.lower(),snowball) if len(x) > 1 else x)
# test['de_sb'] = test['item_description'].astype('str').apply(lambda x:stem_str(x.lower(),snowball) if len(x) > 1 else x)

Generate snowball
CPU times: user 12min 37s, sys: 700 ms, total: 12min 38s
Wall time: 12min 38s


In [37]:
# %%time

# print('Generate snowball')
# train['name_sb'] = train['name'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
# test['name_sb'] = test['name'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))

Generate snowball
CPU times: user 5min 42s, sys: 416 ms, total: 5min 43s
Wall time: 5min 43s


In [38]:
# print('Generate porter')
# train['name_p'] = train['name'].astype(str).apply(lambda x:stem_str(x.lower(),porter))
# test['name_p'] = test['name'].astype(str).apply(lambda x:stem_str(x.lower(),porter))

Generate porter
metal  925 sterling silver stone  black onyx style  celtic eing  vintage ring  boho ring size 6   rafaella jewelry


used for my first aid college course  purchased new  first aid  cpr    aed advanced textbook  6th edition access code never scratched off  isbn 978 1 4496 3505 3  no free shipping  i ship priority mail   bundle with my other nursing books to save on shipping costs 


advanced first aid  cpr   aed   american govt textbook bundle  great condition minus for bend in cover  cd included  isbn     978 0 87912 341 3


cute black   white polka dot skort size 4 in very good ued condition 


set of 2 coordinating designs   summer sunburst and promised  these were purchased as second quality wraps  so there may be some minor color design flaws  wraps will ship in original packaging that provides application instructions   sheets were only removed for photos  one sheet has enough wraps for 2 4 mani pedis  depending on size of nails  selling only as set  3  rm  offer d