In [10]:
# Based on Bojan's -> https://www.kaggle.com/tunguz/more-effective-ridge-lgbm-script-lb-44944
#
import gc
import time
import numpy as np
import pandas as pd

from joblib import Parallel, delayed

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import SGDRegressor
# from textblob import TextBlob
import lightgbm as lgb
import os, psutil
from multiprocessing import Pool
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from scipy.spatial.distance import pdist, squareform
from collections import Counter
import re
import lzma
# import Levenshtein
from numba import jit
import spacy

try:
    import lzma
    import Levenshtein
except:
    pass
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity



In [11]:
NUM_BRANDS = 4000
NUM_CATEGORIES = 1000
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 2 ** 14
NUM_PARTITIONS = 12 #number of partitions to split dataframe
NUM_CORES = 4 #number of cores on your machine

###################################################################################


In [12]:
def rmsle(y, y0):
    assert len(y) == len(y0)
    return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))

def split_cat(text):
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label")

def handle_missing_inplace(dataset):
    dataset['category_name'].fillna(value='missing', inplace=True)
    dataset['brand_name'].fillna(value='missing', inplace=True)
    dataset['item_description'].fillna(value='missing', inplace=True)

def cutting(dataset):
    pop_brand = dataset['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
    dataset.loc[~dataset['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
    pop_category = dataset['category_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
    dataset.loc[~dataset['category_name'].isin(pop_category), 'category_name'] = 'missing'

def to_categorical(dataset):
    dataset['category_name'] = dataset['category_name'].astype('category')
    dataset['brand_name'] = dataset['brand_name'].astype('category')
    dataset['item_condition_id'] = dataset['item_condition_id'].astype('category')

def print_memory_usage():
    print('cpu: {}'.format(psutil.cpu_percent()))
    print('consuming {:.2f}GB RAM'.format(
           psutil.Process(os.getpid()).memory_info().rss / 1073741824),
          flush=True)


In [13]:
def _edit_dist(str1, str2):
    try:
        # very fast
        # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2)))
    except:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x==" ", str1, str2).ratio()
    return d
def _entropy(proba):
    entropy = -np.sum(proba*np.log(proba))
    return entropy


def _try_divide(x, y, val=0.0):
    """try to divide two numbers"""
    if y != 0.0:
        val = float(x) / y
    return val

def _jaccard_coef(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return _try_divide(float(len(A.intersection(B))), len(A.union(B)))


def _dice_dist(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return _try_divide(2.*float(len(A.intersection(B))), (len(A) + len(B)))

   
def entropy(obs, token_pattern=' '):
    obs_tokens = obs.split(token_pattern)
    counter = Counter(obs_tokens)
    count = np.asarray(list(counter.values()))
    proba = count/np.sum(count)
    del obs_tokens
    return _entropy(proba)

def digit_count(obs):
    return len(re.findall(r"\d", obs))

def digit_ratio(obs, token_pattern = ' '):
    obs_tokens = obs.split(token_pattern)
    dr = _try_divide(len(re.findall(r"\d", obs)), len(obs_tokens))
    del obs_tokens
    return dr

def _entropy(proba):
    entropy = -np.sum(proba*np.log(proba))
    return entropy


def _try_divide(x, y, val=0.0):
    """try to divide two numbers"""
    if y != 0.0:
        val = float(x) / y
    return val


def _jaccard_coef(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return _try_divide(float(len(A.intersection(B))), len(A.union(B)))


def _dice_dist(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return _try_divide(2.*float(len(A.intersection(B))), (len(A) + len(B)))

 
def entropy(obs, token_pattern=' '):
    obs_tokens = obs.split(token_pattern)
    counter = Counter(obs_tokens)
    count = np.asarray(list(counter.values()))
    proba = count/np.sum(count)
    del obs_tokens
    return _entropy(proba)

def digit_count(obs):
    return len(re.findall(r"\d", obs))

def digit_ratio(obs, token_pattern = ' '):
    obs_tokens = obs.split(token_pattern)
    return _try_divide(len(re.findall(r"\d", obs)), len(obs_tokens))


def _unigrams(words):
    """
        Input: a list of words, e.g., ["I", "am", "Denny"]
        Output: a list of unigram
    """
    assert type(words) == list
    return words

def _bigrams(words, join_string, skip=0):
    """
       Input: a list of words, e.g., ["I", "am", "Denny"]
       Output: a list of bigram, e.g., ["I_am", "am_Denny"]
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
                for k in range(1,skip+2):
                        if i+k < L:
                                lst.append( join_string.join([words[i], words[i+k]]) )
    else:
        # set it as unigram
        lst = _unigrams(words)
    return lst

def _trigrams(words, join_string, skip=0):
    """
       Input: a list of words, e.g., ["I", "am", "Denny"]
       Output: a list of trigram, e.g., ["I_am_Denny"]
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L-2):
                for k1 in range(1,skip+2):
                        for k2 in range(1,skip+2):
                                if i+k1 < L and i+k1+k2 < L:
                                        lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) )
    else:
        # set it as bigram
        lst = _bigrams(words, join_string, skip)
    return lst


def _ngrams(words, ngram, join_string=" "):
    """wrapper for ngram"""
    if ngram == 1:
        return _unigrams(words)
    elif ngram == 2:
        return _bigrams(words, join_string)
    elif ngram == 3:
        return _trigrams(words, join_string)
    elif ngram == 4:
        return _fourgrams(words, join_string)
    elif ngram == 12:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram

def _ngrams(words, ngram, join_string=" "):
    """wrapper for ngram"""
    if ngram == 1:
        return _unigrams(words)
    elif ngram == 2:
        return _bigrams(words, join_string)
    elif ngram == 3:
        return _trigrams(words, join_string)
    elif ngram == 4:
        return _fourgrams(words, join_string)
    elif ngram == 12:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram

def UniqueCount_Ngram(obs, count, token_pattern=' '):
    obs_tokens = obs.lower().split(token_pattern)
    obs_ngrams = _ngrams(obs_tokens, count)
    l = len(set(obs_ngrams))
    del obs_tokens
    del obs_ngrams
    return l

def UniqueRatio_Ngram(obs, count, token_pattern=' '):
    obs_tokens = obs.lower().split(token_pattern)
    obs_ngrams = _ngrams(obs_tokens, count)
    r = _try_divide(len(set(obs_ngrams)), len(obs_ngrams))
    del obs_tokens
    del obs_ngrams
    return r


def parallelize_dataframe(df, func):
    df_split = np.array_split(df, NUM_PARTITIONS)
    pool = Pool(NUM_CORES)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

def get_sentiment_score(df):
    df['sentiment_score'] = df['item_description'].map(lambda x: TextBlob(x).sentiment.polarity)
    return df



In [14]:
# def main():
start_time = time.time()


In [21]:
train = pd.read_table('../input/train.tsv', engine='c')
test = pd.read_table('../input/test.tsv', engine='c')
train = train[train.price != 0] 
print('[{}] Finished to load data'.format(time.time() - start_time))
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)



[333.3775749206543] Finished to load data
Train shape:  (1481661, 8)
Test shape:  (693359, 7)


In [22]:
train.nunique()

train_id             1481661
name                 1224596
item_condition_id          5
category_name           1287
brand_name              4807
price                    827
shipping                   2
item_description     1280671
dtype: int64

In [23]:
train['IDL'] = train['item_description'].map(lambda x: len(str(x).lower().split(' ')))

In [25]:
np.max(train['IDL'].values)

245

In [None]:
nrow_test = test.shape[0]

test_id = test['test_id'].values
submission = pd.DataFrame(test[['test_id']])

if nrow_test < 700000:
    test = pd.concat([test,test,test,test,test])
    print('Test shape ', test.shape)


nrow_train = train.shape[0]
y = np.log1p(train["price"])
del train['price']
merge= pd.concat([train, test])

train_cols = set(train.columns)
# del train
del test
gc.collect()


handle_missing_inplace(merge)
print('[{}] Handle missing completed.'.format(time.time() - start_time))

print_memory_usage()

In [18]:
len(merge)

4948456

NameError: name 'train' is not defined

In [17]:
merge.nunique()

brand_name              5288
category_name           1311
item_condition_id          5
item_description     1861289
name                 1749956
shipping                   2
test_id               693359
train_id             1481661
dtype: int64

In [7]:
cutting(merge)
print('[{}] Finished to cut'.format(time.time() - start_time))

to_categorical(merge)
print('[{}] Finished to convert categorical'.format(time.time() - start_time))

[70.99395394325256] Finished to cut
[72.0034282207489] Finished to convert categorical


In [9]:
merge.head(100)

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,shipping,test_id,train_id
0,missing,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,1,,2.0
3,missing,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,1,,3.0
4,missing,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,0,,4.0
5,missing,Women/Other/Other,3,"Banana republic bottoms, Candies skirt with ma...",Bundled items requested for Ruie,0,,5.0
6,Acacia Swimwear,Women/Swimwear/Two-Piece,3,Size small but straps slightly shortened to fi...,Acacia pacific tides santorini top,0,,6.0
7,Soffe,Sports & Outdoors/Apparel/Girls,3,You get three pairs of Sophie cheer shorts siz...,Girls cheer and tumbling bundle of 7,1,,7.0
8,Nike,Sports & Outdoors/Apparel/Girls,3,Girls Size small Plus green. Three shorts total.,Girls Nike Pro shorts,0,,8.0
9,missing,Vintage & Collectibles/Collectibles/Doll,3,I realized his pants are on backwards after th...,Porcelain clown doll checker pants VTG,0,,9.0


In [8]:
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

In [9]:
def get_tokens(x):
    doc = nlp(x)
    tokens = [i.lemma_ for i in doc]
    return tokens

In [10]:
def get_tokens_name(df):
    df['name1'] = df['name'].map(lambda x: get_tokens(x))
    return df

def get_tokens_item_desc(df):
    df['item_description1'] = df['item_description'].map(lambda x: get_tokens(x))
    return df

In [11]:
%%time
merge = parallelize_dataframe(merge, get_tokens_name)
merge = parallelize_dataframe(merge, get_tokens_item_desc)

CPU times: user 1min 6s, sys: 8.71 s, total: 1min 15s
Wall time: 9min 13s


In [12]:
merge = merge.drop('name', axis=1)
merge = merge.drop('item_description', axis=1)

In [13]:
merge.rename(columns={'item_description1': 'item_description', 'name1': 'name'}, inplace=True)

In [14]:
merge.head()

Unnamed: 0,brand_name,category_name,item_condition_id,shipping,test_id,train_id,name,item_description
0,missing,Men/Tops/T-shirts,3,1,,0.0,"[MLB, Cincinnati, Reds, T, Shirt, Size, XL]","[No, description, yet]"
1,Razer,Electronics/Computers & Tablets/Components & P...,3,0,,1.0,"[Razer, BlackWidow, Chroma, Keyboard]","[This, keyboard, be, in, great, condition, and..."
2,Target,Women/Tops & Blouses/Blouse,1,1,,2.0,"[AVA, -, VIV, Blouse]","[Adorable, top, with, a, hint, of, lace, and, ..."
3,missing,Home/Home Décor/Home Décor Accents,1,1,,3.0,"[Leather, Horse, Statues]","[New, with, tag, ., Leather, horse, ., Retail,..."
4,missing,Women/Jewelry/Necklaces,1,0,,4.0,"[24, K, GOLD, plate, rise]","[Complete, with, certificate, of, authenticity]"


In [15]:
# def get_doclen_name(df):
#     df['name_doclen'] = df['name'].map(lambda x: len(str(x).lower().split(' ')))
#     return df

# def get_doclen_itemdesc(df):
#     df['item_description_doclen'] = df['item_description'].map(lambda x: len(str(x).lower().split(' ')))
#     return df

# def get_doclen_brand_name(df):
#     df['brand_name_doclen'] = df['brand_name'].map(lambda x: len(str(x).lower().split(' ')))
#     return df

# def get_entropy_name(df):
#     df['name_entropy'] = df['name'].map(lambda x: entropy(str(x).lower(), ' '))
#     return df

# def get_entropy_itemdesc(df):
#     df['item_description_entropy'] = \
#         df['item_description'].map(lambda x: entropy(str(x).lower(), ' '))
#     return df

# def get_entropy_brand_name(df):
#     df['brand_name_entropy'] = \
#         df['brand_name'].map(lambda x: entropy(str(x).lower(), ' '))
#     return df

# def get_digit_count_name(df):
#     df['name_dc'] = df['name'].map(lambda x: digit_count(str(x).lower()))
#     return df

# def get_digit_count_itemdesc(df):
#     df['item_description_dc'] = \
#         df['item_description'].map(lambda x: digit_count(str(x).lower()))
#     return df

# def get_digit_count_brand_name(df):
#     df['brand_name_dc'] = \
#         df['brand_name'].map(lambda x: digit_count(str(x).lower()))
#     return df

# def get_digit_ratio_name(df):
#     df['name_dr'] = df['name'].map(lambda x: digit_ratio(str(x).lower()))
#     return df

# def get_digit_ratio_itemdesc(df):
#     df['item_description_dr'] = \
#         df['item_description'].map(lambda x: digit_ratio(str(x).lower()))
#     return df

# def get_digit_ratio_brand_name(df):
#     df['brand_name_dr'] = \
#         df['brand_name'].map(lambda x: digit_ratio(str(x).lower()))
#     return df

# def get_emoji_count_name(df):
#     df['name_ec'] = df['name'].map(lambda x: emoji_count(str(x).lower()))
#     return df

# def get_emoji_count_itemdesc(df):
#     df['item_description_ec'] = \
#         df['item_description'].map(lambda x: emoji_count(str(x).lower()))
#     return df

# def get_emoji_count_brand_name(df):
#     df['brand_name_ec'] = \
#         df['brand_name'].map(lambda x: emoji_count(str(x).lower()))
#     return df

# def get_emoji_ratio_name(df):
#     df['name_er'] = df['name'].map(lambda x: emoji_ratio(str(x).lower()))
#     return df

# def get_emoji_ratio_itemdesc(df):
#     df['item_description_er'] = \
#         df['item_description'].map(lambda x: emoji_ratio(str(x).lower()))
#     return df

# def get_emoji_ratio_brand_name(df):
#     df['brand_name_er'] = \
#         df['brand_name'].map(lambda x: emoji_ratio(str(x).lower()))
#     return df


In [16]:
# cols1 = set(merge.columns)
# cols = []
# obs_fields = ['name', 'brand_name', 'item_description']
# merge = parallelize_dataframe(merge, get_doclen_name)
# merge = parallelize_dataframe(merge, get_doclen_itemdesc)
# merge = parallelize_dataframe(merge, get_doclen_brand_name)

# merge = parallelize_dataframe(merge, get_entropy_name)
# merge = parallelize_dataframe(merge, get_entropy_itemdesc)
# merge = parallelize_dataframe(merge, get_entropy_brand_name)

# merge = parallelize_dataframe(merge, get_digit_count_name)
# merge = parallelize_dataframe(merge, get_digit_count_itemdesc)
# merge = parallelize_dataframe(merge, get_digit_count_brand_name)

# merge = parallelize_dataframe(merge, get_digit_ratio_name)
# merge = parallelize_dataframe(merge, get_digit_ratio_itemdesc)
# merge = parallelize_dataframe(merge, get_digit_ratio_brand_name)

# # merge = parallelize_dataframe(merge, get_emoji_count_name)
# # merge = parallelize_dataframe(merge, get_emoji_count_itemdesc)
# # merge = parallelize_dataframe(merge, get_emoji_count_brand_name)

# # merge = parallelize_dataframe(merge, get_emoji_ratio_name)
# # merge = parallelize_dataframe(merge, get_emoji_ratio_itemdesc)
# # merge = parallelize_dataframe(merge, get_emoji_ratio_brand_name)

# print('[{}] Finished basic creation for name, bn, item_desc'.format(time.time() - start_time))
# print_memory_usage()

# for f in obs_fields:
#     counter = Counter(merge[f].values)
#     merge[f+'_docfreq'] = merge[f].map(lambda x: counter[x])

#     cols.append(f+'_doclen')
#     cols.append(f+'_docfreq')
#     cols.append(f+'_docEntropy')
#     cols.append(f+'_digitCount')
#     cols.append(f+'_digitRatio')
#     # cols.append(f+'_emojiCount')
#     # cols.append(f+'_emojiRatio')

# f = 'category_name'
# def get_category_name_doclen(df):
#     df[f+'_doclen'] = df[f].map(lambda x: len(str(x).lower().split('/')))
#     return df

# merge = parallelize_dataframe(merge, get_category_name_doclen)

# counter = Counter(merge[f].values)
# merge[f+'_docfreq'] = merge[f].map(lambda x: counter[x])

# token_pattern = '/'

# def get_category_name_entropy(df):
#         df[f+'_docEntropy'] = df[f].map(lambda x: entropy(str(x).lower(),token_pattern))
#         return df
# merge = parallelize_dataframe(merge, get_category_name_entropy)

# def get_category_name_dc(df):
#         df[f+'_dc'] = df[f].map(lambda x: digit_count(str(x).lower()))
#         return df
# merge = parallelize_dataframe(merge, get_category_name_dc)

# def get_category_name_dr(df):
#         df[f+'_dr'] = df[f].map(lambda x: digit_ratio(str(x).lower(), token_pattern))
#         return df
# merge = parallelize_dataframe(merge, get_category_name_dr)

# def get_category_name_ec(df):
#         df[f+'_emojiCount'] = df[f].map(lambda x: emoji_count(str(x).lower()))
#         return df
# # merge = parallelize_dataframe(merge, get_category_name_ec)

# def get_category_name_er(df):
#         df[f+'_emojiRatio'] = df[f].map(lambda x: emoji_ratio(str(x).lower()))
#         return df
# # merge = parallelize_dataframe(merge, get_category_name_er)

# cols.append(f+'_doclen')
# cols.append(f+'_docfreq')
# cols.append(f+'_docEntropy')
# cols.append(f+'_digitCount')
# cols.append(f+'_digitRatio')
# # cols.append(f+'_emojiCount')
# # cols.append(f+'_emojiRatio')

# print('[{}] Finished basic creation for category_name'.format(time.time() - start_time))

# obs_fields = ["name", "item_description"]

# print_memory_usage()

In [17]:
# def get_bigram_uc_name(df):
#     df['name_2_uc'] = df['name'].map(lambda x: UniqueCount_Ngram(str(x), 2))
#     return df
# merge = parallelize_dataframe(merge, get_bigram_uc_name)

# def get_bigram_uc_item_desc(df):
#     df['item_desc_2_uc'] = \
#             df['item_description'].map(lambda x: UniqueCount_Ngram(str(x), 2))
#     return df
# merge = parallelize_dataframe(merge, get_bigram_uc_item_desc)

# def get_bigram_ur_name(df):
#     df['name_2_ur'] = df['name'].map(lambda x: UniqueRatio_Ngram(str(x), 2))
#     return df
# merge = parallelize_dataframe(merge, get_bigram_ur_name)

# def get_bigram_ur_item_desc(df):
#     df['item_desc_2_ur'] = \
#             df['item_description'].map(lambda x: UniqueRatio_Ngram(str(x), 2))
#     return df
# merge = parallelize_dataframe(merge, get_bigram_ur_item_desc)


In [18]:
# merge =  merge.loc[:, (merge != merge.iloc[0]).any()]
# print(len(cols))
# del cols
# cols = list(set(merge.columns) - cols1)
# print(len(cols))

# X_b = merge[cols]

# print('[{}] Finished X_basic1'.format(time.time() - start_time))

# scaler = MinMaxScaler()
# X_b = scaler.fit_transform(X_b)
# X_basic = csr_matrix(X_b)
# print('basic: ', X_basic.data.nbytes)
# print('[{}] Finished X_basic2'.format(time.time() - start_time))
# del X_b
# for c in cols:
#     merge = merge.drop(c, axis=1)
# print_memory_usage()



In [19]:
# jaccard and dice
merge['n_id'] = merge['name'].astype('str') + '+++++_____+++++' + merge['item_description'].astype('str')

In [20]:
%load_ext cython

In [21]:
def _jaccard_coef(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return _try_divide(float(len(A.intersection(B))), len(A.union(B)))


def _dice_dist(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return _try_divide(2.*float(len(A.intersection(B))), (len(A) + len(B)))


In [22]:
%%cython
def cyjaccard1(seq1, seq2):
    cdef set set1 = set(seq1)
    cdef set set2 = set()

    cdef Py_ssize_t length_intersect = 0

    for char in seq2:
        if char not in set2:
            if char in set1:
                length_intersect += 1
            set2.add(char)

    return 1 - (length_intersect / float(len(set1) + len(set2) - length_intersect))

def cydice1(seq1, seq2):
    cdef set set1 = set(seq1)
    cdef set set2 = set()

    cdef Py_ssize_t length_intersect = 0

    for char in seq2:
        if char not in set2:
            if char in set1:
                length_intersect += 1
            set2.add(char)

    return ((2. * length_intersect) / float(len(set1) + len(set2)))

In [23]:
def jaccard_c(x):
#     print('---------------------------------')
#     print(x)
    x1, x2 = x.split('+++++_____+++++')
    x1_tokens = list(x1)
    x2_tokens = list(x2)
    j = 1-cyjaccard1(x1_tokens, x2_tokens)
    return j

def dice_c(x):
#     print('---------------------------------')
#     print(x)
    x1, x2 = x.split('+++++_____+++++')
    x1_tokens = list(x1)
    x2_tokens = list(x2)
    j = cydice1(x1_tokens, x2_tokens)
    return j

In [24]:
def get_jaccard(df):
    df['j_n_id'] = df['n_id'].map(lambda x: jaccard_c(str(x)))
    return df

def get_dice(df):
    df['d_n_id'] = df['n_id'].map(lambda x: dice_c(str(x)))
    return df

def _is_str_match(str1, str2, threshold=1.0):
    assert threshold >= 0.0 and threshold <= 1.0, "Wrong threshold."
    if float(threshold) == 1.0:
        return str1 == str2
    else:
        return (1. - _edit_dist(str1, str2)) >= threshold

    
def _get_match_count(obs, target, idx):
    cnt = 0
    if (len(obs) != 0) and (len(target) != 0):
        for word in target:
            if _is_str_match(word, obs[idx], 0.85):
                cnt += 1
    return cnt

def firstIntersectionNGram(obs_tokens, target_tokens, count):
    
#     obs_tokens = _tokenize(obs, token_pattern)
#     target_tokens = _tokenize(target, token_pattern)
    obs_ngrams = _ngrams(obs_tokens, count)
    target_ngrams = _ngrams(target_tokens, count)
    return _get_match_count(obs_ngrams, target_ngrams, 0)

def lastIntersectionNGram(obs_tokens, target_tokens, count):
#     obs_tokens = _tokenize(obs, token_pattern)
#     target_tokens = _tokenize(target, token_pattern)
    obs_ngrams = _ngrams(obs_tokens, count)
    target_ngrams = _ngrams(target_tokens, count)
    return _get_match_count(obs_ngrams, target_ngrams, -1)

def get_first_i_n(x):
    x1, x2 = x.split('+++++_____+++++')
    x1_tokens = list(x1)
    x2_tokens = list(x2)
    return firstIntersectionNGram(x1_tokens, x2_tokens, 1)

def get_last_i_n(x):
    x1, x2 = x.split('+++++_____+++++')
    x1_tokens = list(x1)
    x2_tokens = list(x2)
    return lastIntersectionNGram(x1_tokens, x2_tokens, 1)

def get_first_int_ngram(df):
    df['f_i_n'] = df['n_id'].map(lambda x: get_first_i_n(str(x)))
    return df

def get_last_int_ngram(df):
    df['l_i_n'] = df['n_id'].map(lambda x: get_last_i_n(str(x)))
    return df

In [25]:
%%time
merge = parallelize_dataframe(merge, get_jaccard)
# merge = parallelize_dataframe(merge, get_dice)

CPU times: user 1min 20s, sys: 11.3 s, total: 1min 32s
Wall time: 1min 38s


In [38]:
merge.head()

NameError: name 'merge' is not defined

In [27]:
%%time
merge = parallelize_dataframe(merge, get_first_int_ngram)
merge = parallelize_dataframe(merge, get_last_int_ngram)

CPU times: user 2min 56s, sys: 31.2 s, total: 3min 27s
Wall time: 1h 56min 33s


In [28]:
X_j = merge[['j_n_id', 'f_i_n', 'l_i_n']]
del merge['n_id']
print(np.min(X_j))
print(np.max(X_j))
print('[{}] Finished X_j'.format(time.time() - start_time))
print_memory_usage()


j_n_id    0.056338
f_i_n     1.000000
l_i_n     1.000000
dtype: float64
j_n_id     1.0
f_i_n     43.0
l_i_n     43.0
dtype: float64
[7690.919023275375] Finished X_j
cpu: 34.0
consuming 27.67GB RAM


In [29]:
X_j = csr_matrix(X_j)
print('[{}] Finished X_j csr'.format(time.time() - start_time))
print_memory_usage()

[7691.370595693588] Finished X_j csr
cpu: 9.8
consuming 27.83GB RAM


In [30]:
merge['name2'] = merge['name'].map(lambda x: ' '.join(x))

In [31]:
merge['item_description2'] = merge['item_description'].map(lambda x: ' '.join(x))

In [32]:
abbr = {}
abbr['BNWT'] = ['bnwt', 'brand new with tags']
abbr['NWT'] = ['nwt', 'new with tags']
abbr['BNWOT'] = ['bnwot', 'brand new with out tags', 'brand new without tags']
abbr['NWOT'] = ['nwot', 'new with out tags', 'new without tags']
abbr['BNIP'] = ['bnip', 'brand new in packet', 'brand new in packet']
abbr['NIP'] = ['nip', 'new in packet', 'new in packet']
abbr['BNIB'] = ['bnib', 'brand new in box']
abbr['NIB'] = ['nib', 'new in box']
abbr['MIB'] = ['mib', 'mint in box']
abbr['MWOB'] = ['mwob', 'mint with out box', 'mint without box']
abbr['MIP'] = ['mip', 'mint in packet']
abbr['MWOP'] = ['mwop', 'mint with out packet', 'mint without packet']

merge['tag'] = merge['item_description2'].map(lambda a: 'BNWT' if any(x in a.lower() for x in abbr['BNWT'])
                                                                                   else 'NWT' if any(x in a.lower() for x in abbr['NWT'])
                                                                                   else 'BNWOT' if any(x in a.lower() for x in abbr['BNWOT'])
                                                                                   else 'NWOT' if any(x in a.lower() for x in abbr['NWOT'])
                                                                                   else 'BNIP' if any(x in a.lower() for x in abbr['BNIP'])
                                                                                   else 'NIP' if any(x in a.lower() for x in abbr['NIP'])
                                                                                   else 'BNIB' if any(x in a.lower() for x in abbr['BNIB'])
                                                                                   else 'NIB' if any(x in a.lower() for x in abbr['NIB'])
                                                                                   else 'MIB' if any(x in a.lower() for x in abbr['MIB'])
                                                                                   else 'MWOB' if any(x in a.lower() for x in abbr['MWOB'])
                                                                                   else 'MIP' if any(x in a.lower() for x in abbr['MIP'])
                                                                                   else 'MWOP' if any(x in a.lower() for x in abbr['MWOP'])
                                                                                   else 'None')
print('[{}] Finished tag'.format(time.time() - start_time))
del abbr
print_memory_usage()


[8235.72233915329] Finished tag
cpu: 2.7
consuming 29.00GB RAM


In [33]:
merge['bci'] = merge['brand_name'].astype('str') + ' ' + merge['category_name'].astype('str') + ' ' + \
                        merge['item_condition_id'].astype('str')

merge['bc'] = merge['brand_name'].astype('str') + ' ' + merge['category_name'].astype('str')

merge['bcis'] = merge['brand_name'].astype('str') + ' ' \
                                + merge['category_name'].astype('str') + ' ' + \
                                merge['item_condition_id'].astype('str') + ' ' + \
                                merge['shipping'].astype('str')

merge['bcs'] = merge['brand_name'].astype('str') + ' ' + \
                                merge['category_name'].astype('str') + ' ' + \
                                merge['shipping'].astype('str')

# merge['bi'] = merge['brand_name'].astype('str') + '_' +   merge['item_condition_id'].astype('str')

# merge['ci'] = merge['category_name'].astype('str') + '_' + merge['item_condition_id'].astype('str')

print('[{}] Finished creating bci bc bi ci bcs bcis'.format(time.time() - start_time))
print_memory_usage()


# merge.drop(['bci', 'bc'], axis=1, inplace=True)

# merge = parallelize_dataframe(merge, get_sentiment_score)
# merge['sentiment_score'] = merge['item_description'].map(lambda x: TextBlob(x).sentiment.polarity)

# print('[{}] Finished sentiment score'.format(time.time() - start_time))
# a = merge['sentiment_score'].values
# print(np.min(a))
# print(np.max(a))

# print_memory_usage()
# merge['sentiment'] = merge['sentiment_score'].map(lambda x: 'VPos' if x > 0.5
                                                                                                        # else 'Pos' if (x <= 0.5) and (x > 0)
                                                                                                        # else 'Neu' if  x == 0
                                                                                                        # else 'Neg' if (x < 0) and (x >= -0.5)
                                                                                                        # else 'VNeg')

# print('[{}] Finished sentiment'.format(time.time() - start_time))

cutting(merge)
print('[{}] Finished to cut'.format(time.time() - start_time))

to_categorical(merge)
print('[{}] Finished to convert categorical'.format(time.time() - start_time))



[8284.722241163254] Finished creating bci bc bi ci bcs bcis
cpu: 8.7
consuming 30.94GB RAM
[8305.616286754608] Finished to cut
[8307.865171909332] Finished to convert categorical


In [34]:
cv = CountVectorizer(min_df=NAME_MIN_DF, stop_words='english')
X_name = cv.fit_transform(merge['name2'])
norm = Normalizer()
X_name = norm.fit_transform(X_name)
print('[{}] Finished count vectorize `name`'.format(time.time() - start_time))
print(X_name.shape)
print(np.min(X_name))
print(np.max(X_name))
del merge['name']
print_memory_usage()

cv = CountVectorizer()
X_category = cv.fit_transform(merge['category_name'])
norm = Normalizer()
X_category = norm.fit_transform(X_category)
print('[{}] Finished count vectorize `category_name`'.format(time.time() - start_time))
print(X_category.shape)
print(np.min(X_category))
print(np.max(X_category))
del merge['category_name']
gc.collect()
print_memory_usage()

tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                                         ngram_range=(1, 3),
                                         stop_words='english')
X_description = tv.fit_transform(merge['item_description2'])
print('[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))
print(X_description.shape)
print(np.min(X_description))
print(np.max(X_description))
del merge['item_description']
print_memory_usage()

lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])
print('[{}] Finished label binarize `brand_name`'.format(time.time() - start_time))
print(X_brand.shape)
del merge['brand_name']
print_memory_usage()

lb = LabelBinarizer(sparse_output=True)
X_bci = lb.fit_transform(merge['bci'])
print('[{}] Finished label binarize `bci`'.format(time.time() - start_time))
print(X_bci.shape)
del merge['bci']
print_memory_usage()

lb = LabelBinarizer(sparse_output=True)
X_bcis = lb.fit_transform(merge['bcis'])
print('[{}] Finished label binarize `bcis`'.format(time.time() - start_time))
print(X_bcis.shape)
del merge['bcis']
gc.collect()
print_memory_usage()

lb = LabelBinarizer(sparse_output=True)
X_bcs = lb.fit_transform(merge['bcs'])
print('[{}] Finished label binarize `bcs`'.format(time.time() - start_time))
print(X_bcs.shape)
del merge['bcs']
gc.collect()
print_memory_usage()



[8341.990030288696] Finished count vectorize `name`
(4949330, 32989)
0.0
1.0
cpu: 8.7
consuming 31.47GB RAM
[8371.144346475601] Finished count vectorize `category_name`
(4949330, 1022)
0.0
1.0
cpu: 8.7
consuming 31.70GB RAM
[8764.98168349266] Finished TFIDF vectorize `item_description`
(4949330, 16384)
0.0
1.0
cpu: 8.9
consuming 36.67GB RAM
[8795.687213897705] Finished label binarize `brand_name`
(4949330, 4001)
cpu: 8.9
consuming 36.83GB RAM
[8888.251164674759] Finished label binarize `bci`
(4949330, 92981)
cpu: 9.0
consuming 36.90GB RAM
[8982.92055773735] Finished label binarize `bcis`
(4949330, 127794)
cpu: 9.0
consuming 36.95GB RAM
[9081.005019426346] Finished label binarize `bcs`
(4949330, 71722)
cpu: 9.0
consuming 37.03GB RAM


In [35]:
X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping',
                                                                                        'tag']], sparse=True).values)
print('[{}] Finished to get dummies on `item_condition_id` and `shipping`'.format(time.time() - start_time))
print(X_dummies.shape)
print_memory_usage()

del merge
gc.collect()
print_memory_usage()


[9107.689462423325] Finished to get dummies on `item_condition_id` and `shipping`
(4949330, 17)
cpu: 8.7
consuming 37.24GB RAM
cpu: 8.9
consuming 26.07GB RAM


In [36]:
print('j: ', X_j.data.nbytes)
# print('basic: ', X_basic.data.nbytes)
print('bcis: ', X_bcis.data.nbytes)
print('bci: ', X_bci.data.nbytes)
print('dummies: ', X_dummies.data.nbytes)
print('description: ', X_description.data.nbytes)
print('brand: ', X_brand.data.nbytes)
print('category: ', X_category.data.nbytes)
print('name: ', X_name.data.nbytes)
# print('name1: ', X_name1.data.nbytes)

sparse_merge = hstack((X_j, X_bci, X_bcis, X_bcs, X_dummies, X_description, X_brand, X_category, X_name)).tocsr()
print('[{}] Finished to create sparse merge'.format(time.time() - start_time))

del X_j, X_bcis, X_bci, X_bcs, X_dummies, X_description, X_brand, X_category, X_name
gc.collect()
print_memory_usage()

X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_train:]

print(X.shape)
print_memory_usage()

del sparse_merge
gc.collect()
print_memory_usage()




j:  118783920
bcis:  39594640
bci:  39594640
dummies:  96911280
description:  763011728
brand:  39594640
category:  158521288
name:  153527288
[9135.312714576721] Finished to create sparse merge
cpu: 8.9
consuming 26.01GB RAM
(1482535, 346913)
cpu: 8.7
consuming 28.04GB RAM
cpu: 8.9
consuming 26.01GB RAM


In [37]:
np.random.seed(0)
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.01, random_state = 0)
print_memory_usage()
# d_train = lgb.Dataset(X, label=y, max_bin=8192)
d_train = lgb.Dataset(train_X, label=train_y, max_bin=8192)
d_valid = lgb.Dataset(valid_X, label=valid_y, max_bin=8192)
watchlist = [d_train, d_valid]
print_memory_usage()

params = {
        'learning_rate': 0.75,
        'application': 'regression',
        'max_depth': 3,
        'num_leaves': 100,
        'verbosity': -1,
        'metric': 'RMSE',
        'num_threads': 4
}


model = lgb.train(params, train_set=d_train, valid_sets=watchlist,
                                        num_boost_round=5000,early_stopping_rounds=100,verbose_eval=500)
print('[{}] Finished to train lgbm'.format(time.time() - start_time))


cpu: 9.1
consuming 26.41GB RAM
cpu: 0.0
consuming 26.41GB RAM
Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 0.489008	valid_1's rmse: 0.494383
[1000]	training's rmse: 0.46791	valid_1's rmse: 0.476431
[1500]	training's rmse: 0.456376	valid_1's rmse: 0.467865
[2000]	training's rmse: 0.448011	valid_1's rmse: 0.464196
[2500]	training's rmse: 0.441391	valid_1's rmse: 0.461019
[3000]	training's rmse: 0.436142	valid_1's rmse: 0.458279
Early stopping, best iteration is:
[3182]	training's rmse: 0.434581	valid_1's rmse: 0.457517
[9594.305319786072] Finished to train lgbm


In [None]:
print(X.shape)
print(X_test.shape)

In [None]:
print('predicting..')
preds = model.predict(X_test)
print('[{}] Finished to train predict lgbm'.format(time.time() - start_time))


In [None]:
#print('Deleting mem..')
#del model, d_train, d_valid
print_memory_usage()

In [None]:

# submission=pd.DataFrame()
# submission['test_id'] = test_id
# submission['price'] = np.expm1(preds)
# submission.to_csv("submission_lgbm_nlp2.csv", index=False)
preds *= 0.6
# print('[{}] Finished submission lgbm'.format(time.time() - start_time))
if nrow_test < 700000:
        preds = preds[:nrow_test]


In [None]:
model = Ridge(solver="saga", fit_intercept=True, random_state=205)
model.fit(X, y)
print('[{}] Finished to train ridge'.format(time.time() - start_time))
preds1 = model.predict(X=X_test)
print('[{}] Finished to predict ridge'.format(time.time() - start_time))
# submission['price'] = np.expm1(preds1)
# submission.to_csv("submission_ridge_nlp2.csv", index=False)
print_memory_usage()
if nrow_test < 700000:
        preds1 = preds1[:nrow_test]

preds += 0.4*preds1
submission['price'] = np.expm1(preds)
# submission.to_csv("submission_lgbm_ridge_nlp2.csv", index=False)
