In [94]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import text
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import dask.dataframe as dd
from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
from random import random,shuffle
import pickle
import sys
#from ngram import getUnigram
import string
import nltk
from nltk.util import ngrams # function for making ngrams
import re


In [3]:
seed = 1024
np.random.seed(seed)

In [4]:
def stem_str(x,stemmer=SnowballStemmer('english')):
    try:
        x = text.re.sub("[^a-zA-Z0-9]"," ", x)
        x = (" ").join([stemmer.stem(z) for z in x.split(" ")])
        x = " ".join(x.split())
    except: 
        print(x)
        print('\n')
        return ''
    return x

In [28]:
def remove_digits(x):
    x = re.sub(r'\d+', '', x)
    ' '.join(x.split())
    return x

In [5]:
data = '../data/'
cache = '../cache/'

train = pd.read_csv(data+"train.tsv", sep='\t')
test = pd.read_csv(data+"test.tsv", sep='\t')

In [6]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [7]:
# BNWT: Brand New With Tags
# NWT: New With Tags
# BNWOT: Brand New Without Tags
# NWOT: New With Out Tags
# BNIP: Brand New In Packet
# NIP: New In Packet
# BNIB: Brand New In Box
# NIB: New In Box
# MIB: Mint In Box
# MWOB: Mint With Out Box
# MIP: Mint In Packet
# MWOP: Mint With Out Packet

In [8]:
# features I can think of -
# 1. no of times each of the above appear in text
# 2. Put prices in bins
# 3. Number of times each category appears in each bin
# 4. no of times each item_condition appears in each bin
# 5. min price for each category
# 6. max price for each category
# 7. average price for each category
# 8. 

In [9]:
train.shape

(1482535, 8)

In [10]:
test.shape

(693359, 7)

In [11]:
pd.value_counts(train.name)

Bundle                                      2232
Reserved                                     453
Converse                                     445
BUNDLE                                       418
Dress                                        410
Coach purse                                  404
Lularoe TC leggings                          396
Romper                                       353
Nike                                         340
Vans                                         334
American Eagle Jeans                         325
Miss Me Jeans                                284
Lularoe OS leggings                          281
ON HOLD                                      274
Coach Purse                                  258
Lularoe Irma                                 254
Shorts                                       247
Michael Kors Purse                           243
Bundle!                                      242
Coach wallet                                 242
Miss me jeans       

In [12]:
len(np.unique(train.name))

1225273

In [13]:
len(np.unique(train.item_condition_id))

5

In [14]:
pd.value_counts(train.item_condition_id)

1    640549
3    432161
2    375479
4     31962
5      2384
Name: item_condition_id, dtype: int64

In [183]:
len(set(train.brand_name))

4810

In [None]:
len(set(train.brand_name))

In [15]:
train.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

In [16]:
test.isnull().sum()

test_id                   0
name                      0
item_condition_id         0
category_name          3058
brand_name           295525
shipping                  0
item_description          0
dtype: int64

In [17]:
train.fillna('Unk', inplace=True)
test.fillna('Unk', inplace=True)

In [18]:
len(np.unique(train.brand_name))

4810

In [19]:
len(np.unique(train.category_name))

1288

In [207]:
%%time
start = datetime.now()
sub_cat = []

for t, row in enumerate(DictReader(open('../data/train.tsv'), delimiter='\t')): 
    cat = str(row['category_name']).lower().split('/')
    nom = str(row['name']).lower().split()
    item_desc = str(row['item_description']).lower().split()

    if (t == 0) | (t==5000):
        print(cat)
        print(nom)
        print(item_desc)
    for sc in cat:
        if sc in nom:
            sub_cat.append(sc)
            break
        elif sc in item_desc:
            sub_cat.append(sc)
            break
        else:
            sub_cat.append('None')
            break
end = datetime.now()
print('times:',end-start)

['men', 'tops', 't-shirts']
['mlb', 'cincinnati', 'reds', 't', 'shirt', 'size', 'xl']
['no', 'description', 'yet']
['women', 'athletic apparel', 'shorts']
['hold', 'for', 'thewaxjunkie']
['merona', 'short', 'bundle,', 'pinstripe', 'shorts', 'and', 'inspire', 'shirt.']
times: 0:00:18.674790
CPU times: user 18.6 s, sys: 132 ms, total: 18.7 s
Wall time: 18.7 s


In [None]:
stops = ["http","www","img","border","home","body","a","about","above","after","again","against","all","am","an",
"and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't",
"cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from",
"further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers",
"herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its",
"itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought",
"our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such",
"than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're",
"they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were",
"weren't","what","what's","when","when's""where","where's","which","while","who","who's","whom","why","why's","with","won't","would",
"wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves" ]


In [None]:

def prepare_ngram_interaction(path,out):
    print path
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('count_of_stop_name,ratio_of_stop_name,count_of_stop_item_desc,ratio_of_stop_item_desc\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print 'finished',c
            q1_ngram = str(row['question1'].lower()).split()
            q2_ngram = str(row['question2'].lower()).split()

            count_of_stop_question1 = get_count_q1_in_q2(q1_ngram,stops)
            ratio_of_stop_question1 = get_ratio_q1_in_q2(q1_ngram,stops)

            count_of_stop_question2 = get_count_q1_in_q2(q2_ngram,stops)
            ratio_of_stop_question2 = get_ratio_q1_in_q2(q2_ngram,stops)


            outfile.write('%s,%s,%s,%s\n' % (
                count_of_stop_question1,
                ratio_of_stop_question1,
                count_of_stop_question2,
                ratio_of_stop_question2,
                ))
            c+=1
        end = datetime.now()

    print 'times:',end-start



In [None]:
prepare_ngram_interaction(path+'train_porter.csv',path+'train_porter_stop_features.csv')
prepare_ngram_interaction(path+'test_porter.csv',path+'test_porter_stop_features.csv')


In [196]:
sum(pd.Series(sub_cat) == 'None')

1464176

In [20]:
train.shipping.value_counts()

0    819435
1    663100
Name: shipping, dtype: int64

In [21]:
porter = PorterStemmer()
# snowball = SnowballStemmer('english')

In [22]:
train['DL'] = train['item_description'].map(lambda x: len(x))
test['DL'] = test['item_description'].map(lambda x: len(x))

In [21]:
train.iloc[18]

train_id                                                            18
name                               Too Faced Limited "Merry Macaroons"
item_condition_id                                                    1
category_name                            Beauty/Makeup/Makeup Palettes
brand_name                                                   Too Faced
price                                                               25
shipping                                                             1
item_description     This AUTHENTIC pallete by Too Faced is brand n...
DL                                                                 307
Name: 18, dtype: object

In [23]:
train[train['DL'] == 1].head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,DL
2730,2730,Vs lip gloss set!,1,Beauty/Makeup/Lips,Victoria's Secret,14.0,0,.,1
3552,3552,Strappy Black Lingerie,1,Women/Underwear/Panties,Unk,8.0,1,❤,1
9101,9101,2 half zips bundle,3,Women/Athletic Apparel/Jackets,Victoria's Secret,46.0,0,2,1
9404,9404,Carter's Valentines Onesie NWT NB,1,Kids/Girls 0-24 Mos/One-Pieces,Carter's,10.0,0,.,1
16733,16733,LuLaRoe CarlyDress size Xs,1,Women/Dresses/Knee-Length,Unk,36.0,0,-,1


In [24]:
train[train['DL'] == 2].head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,DL
9979,9979,200 cat collars and 30 dog collars :),1,Handmade/Pets/Collar,Unk,165.0,0,:),2
10429,10429,Old navy cute sundress,3,"Women/Dresses/Above Knee, Mini",Old Navy,8.0,0,:),2
12278,12278,Outcast 1-12 Comics,3,Vintage & Collectibles/Book/Comics,Unk,44.0,0,NM,2
14405,14405,Victoria's Secret PINK Glitter Leggings,2,"Women/Athletic Apparel/Pants, Tights, Leggings",Victoria's Secret,16.0,0,XS,2
14672,14672,Black fur coat,2,Women/Coats & Jackets/Other,Old Navy,16.0,0,XL,2


In [25]:
train[train['DL'] == 3].head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,DL
58,58,New yokai watch backpack,1,Kids/Gear/Backpacks & Carriers,Unk,10.0,0,New,3
140,140,Zelda Link Amiibo.,1,Electronics/Video Games & Consoles/Video Gamin...,Nintendo,40.0,1,New,3
768,768,Sale! [rm] For 2 morphe brushes,1,Beauty/Tools & Accessories/Makeup Brushes & Tools,Unk,12.0,1,New,3
920,920,"Lularoe hula dancers, os.",1,"Women/Athletic Apparel/Pants, Tights, Leggings",Unk,30.0,1,New,3
964,964,Doc McStuffins Activity Set Kit Toy,1,Kids/Toys/Dolls & Accessories,Unk,7.0,1,New,3


In [26]:
train[train['DL'] == 4].head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,DL
684,684,Splat Pink Fetish Rose Hair Dye,2,Beauty/Hair Care/Hair Color,Unk,7.0,0,Full,4
1253,1253,AF tank,2,"Women/Tops & Blouses/Tank, Cami",Abercrombie & Fitch,8.0,1,Nwot,4
1965,1965,"Burberry, baby boy, 18months",3,Kids/Boys 0-24 Mos/Tops & T-Shirts,Unk,22.0,0,Used,4
1978,1978,LuLaRoe Sarah Large black thin ribbed,1,Women/Sweaters/Cardigan,Unk,86.0,0,BNWT,4
2178,2178,Yoga Mat,3,Sports & Outdoors/Exercise/Yoga & Pilates,Unk,12.0,0,Used,4


In [31]:
%%time

print('Clean digits')
train['item_description'] = train['item_description'].astype('str').apply(lambda x:remove_digits(x.lower()))
test['item_description'] = test['item_description'].astype('str').apply(lambda x:remove_digits(x.lower()))

Clean digits
CPU times: user 11.7 s, sys: 164 ms, total: 11.9 s
Wall time: 11.9 s


In [32]:
%%time

print('Generate porter')
train['ide_p'] = train['item_description'].astype('str').apply(lambda x:stem_str(x.lower(),porter))
test['ide_p'] = test['item_description'].astype('str').apply(lambda x:stem_str(x.lower(),porter))

Generate porter
metal   sterling silver stone  black onyx style  celtic eing  vintage ring  boho ring size    rafaella jewelry


brand new   authentic  tarteist pro palette by tarte  high performance naturals   code on back of palette is hard to see it s oed  palette has never been used  swatched or took out of the box besides for this picture session     nib   retails higher  still has plastic over the colors and a  makeup  of the day if you d like a guide     price is firm as this is an expensive piece   no low balling  it will get you blocked  and i dont like blocking people so let s have some fun and get this brand new beauty of a palette to a new home  ty  happy shopping  


used for my first aid college course  purchased new  first aid  cpr    aed advanced textbook  th edition access code never scratched off  isbn       no free shipping  i ship priority mail   bundle with my other nursing books to save on shipping costs 


advanced first aid  cpr   aed   american govt textbook bu

In [33]:
%%time

print('Clean digits')
train['name'] = train['name'].astype('str').apply(lambda x:remove_digits(x.lower()))
test['name'] = test['name'].astype('str').apply(lambda x:remove_digits(x.lower()))

Clean digits
CPU times: user 5.16 s, sys: 72 ms, total: 5.23 s
Wall time: 5.23 s


In [34]:
%%time

print('Generate porter')
train['n_p'] = train['name'].astype('str').apply(lambda x:stem_str(x.lower(),porter))
test['n_p'] = test['name'].astype('str').apply(lambda x:stem_str(x.lower(),porter))

Generate porter
first aid cpr   aed advanced textbook


CPU times: user 3min 26s, sys: 200 ms, total: 3min 26s
Wall time: 3min 26s


In [35]:
train.to_csv('../cache/train_porter.csv', index=False)

In [36]:
test.to_csv('../cache/test_porter.csv', index=False)

In [37]:
#path = '../cache/'


string.punctuation.__add__('!!')
string.punctuation.__add__('(')
string.punctuation.__add__(')')
string.punctuation.__add__('?')
string.punctuation.__add__('.')
string.punctuation.__add__(',')


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~,'

In [138]:
def remove_punctuation(x):
    new_line = [ w for w in list(x) if w not in string.punctuation]
    new_line = ''.join(new_line)
#     print(new_line)
    return new_line

In [140]:
def prepare_unigram(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_unigram,item_desc_unigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            a1 = remove_punctuation(str(row['n_p']).lower()).split(' ')
            a2 = remove_punctuation(str(row['ide_p']).lower()).lower().split(' ')
            if c==0:
                print('1')
                print(a1)
            a1_bigram = ' '.join(a1)
            a2_bigram = ' '.join(a2)
            if c==0:
                print('3')
                print(a1_bigram)
            outfile.write('%s,%s\n' % (a1_bigram, a2_bigram))
            c+=1
        end = datetime.now()


    print('times:{}'.format(end-start))


In [153]:
def getUnigram(words):
    """
        Input: a list of words, e.g., ['I', 'am', 'Denny']
        Output: a list of unigram
    """
    assert type(words) == list
    return words


In [154]:
def getBigram(words, join_string="_", skip=0):
    """
       Input: a list of words, e.g., ['I', 'am', 'Denny']
       Output: a list of bigram, e.g., ['I_am', 'am_Denny']
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for k in range(1,skip+2):
                if i+k < L:
                    lst.append( join_string.join([words[i], words[i+k]]) )
    else:
        # set it as unigram
        lst = getUnigram(words)
    return lst


In [155]:
def prepare_bigram(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_bigram,item_desc_bigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            
            a1 = remove_punctuation(str(row['n_p']).lower()).split(' ')
            a2 = remove_punctuation(str(row['ide_p']).lower()).lower().split(' ')
            
            a1_bigram = getBigram(a1)
            a2_bigram = getBigram(a2)
            if c==0:
                print('----')
                print(row['n_p'])
                print(row['ide_p'])
                print(a1_bigram)
                print('----')
            a1_bigram = ' '.join(a1_bigram)
            a2_bigram = ' '.join(a2_bigram)
            if c==0:
                print(a1_bigram)
                print('-----------------')
            outfile.write('%s,%s\n' % (a1_bigram, a2_bigram))
            c+=1
        end = datetime.now()


    print('times:{}'.format(end-start))

In [156]:
def distinct_terms(lst1, lst2):
    lst1 = lst1.split(" ")
    lst2 = lst2.split(" ")
    common = set(lst1).intersection(set(lst2))
    new_lst1 = ' '.join([w for w in lst1 if w not in common])
    new_lst2 = ' '.join([w for w in lst2 if w not in common])
    
    return (new_lst1,new_lst2)

In [177]:
def prepare_distinct(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_distinct_unigram,item_desc_distinct_unigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            a1 = str(row['name_unigram'])
            a2 = str(row['item_desc_unigram'])
            coo_terms = distinct_terms(a1,a2)
            if c==0:
                print(coo_terms)
            outfile.write('%s,%s\n' % coo_terms)
            c+=1
        end = datetime.now()
    print('times:',end-start)
    
def prepare_distinct_bi(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_distinct_bigram,item_desc_distinct_bigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            a1 = str(row['name_bigram'])
            a2 = str(row['item_desc_bigram'])
            coo_terms = distinct_terms(a1,a2)
            if c==0:
                print(coo_terms)
            outfile.write('%s,%s\n' % coo_terms)
            c+=1
        end = datetime.now()
    print('times:',end-start)

In [166]:
def cooccurrence_terms(lst1, lst2, join_str="__"):
    lst1 = lst1.split(" ")
    lst2 = lst2.split(" ")
    terms = [""] * len(lst1) * len(lst2)
    cnt =  0
    for item1 in lst1:
        for item2 in lst2:
            terms[cnt] = item1 + join_str + item2
            cnt += 1
    res = " ".join(terms)
    return res

In [173]:
def prepare_cooccurrence(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_unigram_item_desc_unigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            q1 = str(row['name_distinct_unigram'])
            q2 = str(row['item_desc_distinct_unigram'])
            coo_terms = cooccurrence_terms(q1,q2)
            if c==0:
                print(coo_terms)
            outfile.write('%s\n' % coo_terms)
            c+=1
        end = datetime.now()
    print('times:',end-start)

In [168]:
def prepare_cooccurrence_bi(path,out):
    print(path)
    c = 0
    start = datetime.now()
    with open(out, 'w') as outfile:
        outfile.write('name_distinct_bigram_item_desc_distinct_bigram\n')
        for t, row in enumerate(DictReader(open(path), delimiter=',')): 
            if c%100000==0:
                print('finished',c)
            q1 = str(row['name_distinct_bigram'])
            q2 = str(row['item_desc_distinct_bigram'])
            coo_terms = cooccurrence_terms(q1,q2)
            outfile.write('%s\n' % coo_terms)
            if c==0:
                print(coo_terms)
            c+=1
        end = datetime.now()
    print('times:',end-start)

In [142]:
%%time
prepare_unigram('../cache/train_porter.csv','../cache/train_unigram.csv')


../cache/train_porter.csv
finished 0
1
['mlb', 'cincinnati', 'red', 't', 'shirt', 'size', 'xl']
3
mlb cincinnati red t shirt size xl
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times:0:00:52.615828
CPU times: user 52.1 s, sys: 484 ms, total: 52.6 s
Wall time: 52.6 s


In [143]:
%%time
prepare_unigram('../cache/test_porter.csv','../cache/test_unigram.csv')

../cache/test_porter.csv
finished 0
1
['breast', 'cancer', 'i', 'fight', 'like', 'a', 'girl', 'ring']
3
breast cancer i fight like a girl ring
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times:0:00:23.973272
CPU times: user 23.8 s, sys: 180 ms, total: 24 s
Wall time: 24.2 s


In [161]:
%%time
prepare_bigram('../cache/train_porter.csv','../cache/train_bigram.csv')

../cache/train_porter.csv
finished 0
----
mlb cincinnati red t shirt size xl
no descript yet
['mlb_cincinnati', 'cincinnati_red', 'red_t', 't_shirt', 'shirt_size', 'size_xl']
----
mlb_cincinnati cincinnati_red red_t t_shirt shirt_size size_xl
-----------------
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times:0:01:28.207023
CPU times: user 1min 27s, sys: 776 ms, total: 1min 28s
Wall time: 1min 28s


In [162]:
%%time
prepare_bigram('../cache/test_porter.csv','../cache/test_bigram.csv')

../cache/test_porter.csv
finished 0
----
breast cancer i fight like a girl ring
size
['breast_cancer', 'cancer_i', 'i_fight', 'fight_like', 'like_a', 'a_girl', 'girl_ring']
----
breast_cancer cancer_i i_fight fight_like like_a a_girl girl_ring
-----------------
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times:0:00:41.940817
CPU times: user 40.1 s, sys: 392 ms, total: 40.5 s
Wall time: 41.9 s


In [163]:
%%time 
prepare_distinct('../cache/train_unigram.csv','../cache/train_distinct_unigram.csv')

../cache/train_unigram.csv
finished 0
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:23.690776
CPU times: user 23.3 s, sys: 312 ms, total: 23.6 s
Wall time: 23.7 s


In [169]:
%%time
prepare_distinct('../cache/test_unigram.csv','../cache/test_distinct_unigram.csv')

../cache/test_unigram.csv
finished 0
('breast cancer i fight like a girl ring', 'size')
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:10.425988
CPU times: user 10.3 s, sys: 148 ms, total: 10.4 s
Wall time: 10.4 s


In [170]:
%%time
prepare_cooccurrence('../cache/train_unigram.csv','../cache/train_cooccurrence_unigram.csv')


../cache/train_unigram.csv
finished 0
mlb__no mlb__descript mlb__yet cincinnati__no cincinnati__descript cincinnati__yet red__no red__descript red__yet t__no t__descript t__yet shirt__no shirt__descript shirt__yet size__no size__descript size__yet xl__no xl__descript xl__yet
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:56.872758
CPU times: user 52.5 s, sys: 1.53 s, total: 54 s
Wall time: 56.9 s


In [171]:
%%time
prepare_cooccurrence('../cache/test_unigram.csv','../cache/test_cooccurrence_unigram.csv')

../cache/test_unigram.csv
finished 0
breast__size cancer__size i__size fight__size like__size a__size girl__size ring__size
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:25.222971
CPU times: user 24.6 s, sys: 660 ms, total: 25.2 s
Wall time: 25.2 s


In [174]:
%%time
prepare_cooccurrence('../cache/train_distinct_unigram.csv','../cache/train_cooccurrence_distinct.csv')


../cache/train_distinct_unigram.csv
finished 0
mlb__no mlb__descript mlb__yet cincinnati__no cincinnati__descript cincinnati__yet red__no red__descript red__yet t__no t__descript t__yet shirt__no shirt__descript shirt__yet size__no size__descript size__yet xl__no xl__descript xl__yet
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:32.927723
CPU times: user 30.5 s, sys: 652 ms, total: 31.1 s
Wall time: 33.4 s


In [175]:
%%time
prepare_cooccurrence('../cache/test_distinct_unigram.csv','../cache/test_cooccurrence_distinct.csv')

../cache/test_distinct_unigram.csv
finished 0
breast__size cancer__size i__size fight__size like__size a__size girl__size ring__size
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:13.972935
CPU times: user 13.7 s, sys: 252 ms, total: 14 s
Wall time: 14 s


In [178]:
%%time 
prepare_distinct_bi('../cache/train_bigram.csv','../cache/train_distinct_bigram.csv')

../cache/train_bigram.csv
finished 0
('mlb_cincinnati cincinnati_red red_t t_shirt shirt_size size_xl', 'no_descript descript_yet')
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:25.199160
CPU times: user 24.8 s, sys: 440 ms, total: 25.2 s
Wall time: 25.2 s


In [179]:
%%time 
prepare_distinct_bi('../cache/test_bigram.csv','../cache/test_distinct_bigram.csv')

../cache/test_bigram.csv
finished 0
('breast_cancer cancer_i i_fight fight_like like_a a_girl girl_ring', 'size')
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:12.053643
CPU times: user 11.8 s, sys: 296 ms, total: 12.1 s
Wall time: 12.1 s


In [180]:
%%time
prepare_cooccurrence_bi('../cache/train_distinct_bigram.csv','../cache/train_cooccurrence_distinct_bigram.csv')

../cache/train_distinct_bigram.csv
finished 0
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
finished 700000
finished 800000
finished 900000
finished 1000000
finished 1100000
finished 1200000
finished 1300000
finished 1400000
times: 0:00:40.712241


In [181]:
%%time
prepare_cooccurrence_bi('../cache/test_distinct_bigram.csv','../cache/test_cooccurrence_distinct_bigram.csv')

../cache/test_distinct_bigram.csv
finished 0
finished 100000
finished 200000
finished 300000
finished 400000
finished 500000
finished 600000
times: 0:00:18.645566


In [36]:
# %%time

# print('Generate snowball')
# train['de_sb'] = train['item_description'].astype('str').apply(lambda x:stem_str(x.lower(),snowball) if len(x) > 1 else x)
# test['de_sb'] = test['item_description'].astype('str').apply(lambda x:stem_str(x.lower(),snowball) if len(x) > 1 else x)

Generate snowball
CPU times: user 12min 37s, sys: 700 ms, total: 12min 38s
Wall time: 12min 38s


In [37]:
# %%time

# print('Generate snowball')
# train['name_sb'] = train['name'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))
# test['name_sb'] = test['name'].astype(str).apply(lambda x:stem_str(x.lower(),snowball))

Generate snowball
CPU times: user 5min 42s, sys: 416 ms, total: 5min 43s
Wall time: 5min 43s


In [38]:
# print('Generate porter')
# train['name_p'] = train['name'].astype(str).apply(lambda x:stem_str(x.lower(),porter))
# test['name_p'] = test['name'].astype(str).apply(lambda x:stem_str(x.lower(),porter))

Generate porter
metal  925 sterling silver stone  black onyx style  celtic eing  vintage ring  boho ring size 6   rafaella jewelry


used for my first aid college course  purchased new  first aid  cpr    aed advanced textbook  6th edition access code never scratched off  isbn 978 1 4496 3505 3  no free shipping  i ship priority mail   bundle with my other nursing books to save on shipping costs 


advanced first aid  cpr   aed   american govt textbook bundle  great condition minus for bend in cover  cd included  isbn     978 0 87912 341 3


cute black   white polka dot skort size 4 in very good ued condition 


set of 2 coordinating designs   summer sunburst and promised  these were purchased as second quality wraps  so there may be some minor color design flaws  wraps will ship in original packaging that provides application instructions   sheets were only removed for photos  one sheet has enough wraps for 2 4 mani pedis  depending on size of nails  selling only as set  3  rm  offer d