In [None]:
import random, copy, struct
from hashlib import sha1
import numpy as np

# The size of a hash value in number of bytes
hashvalue_byte_size = len(bytes(np.int64(42).data))

# http://en.wikipedia.org/wiki/Mersenne_prime
_mersenne_prime = (1 << 61) - 1
_max_hash = (1 << 32) - 1
_hash_range = (1 << 32)

class MinHash(object):
    '''MinHash is a probabilistic data structure for computing 
    `Jaccard similarity`_ between sets.
 
    Args:
        num_perm (int, optional): Number of random permutation functions.
            It will be ignored if `hashvalues` is not None.
        seed (int, optional): The random seed controls the set of random 
            permutation functions generated for this MinHash.
        hashobj (optional): The hash function used by this MinHash. 
            It must implements
            the `digest()` method similar to hashlib_ hash functions, such
            as `hashlib.sha1`.
        hashvalues (`numpy.array` or `list`, optional): The hash values is 
            the internal state of the MinHash. It can be specified for faster 
            initialization using the existing state from another MinHash. 
        permutations (optional): The permutation function parameters. This argument
            can be specified for faster initialization using the existing
            state from another MinHash.
    
    Note:
        To save memory usage, consider using :class:`datasketch.LeanMinHash`.
        
    Note:
        Since version 1.1.1, MinHash will only support serialization using 
        `pickle`_. ``serialize`` and ``deserialize`` methods are removed, 
        and are supported in :class:`datasketch.LeanMinHash` instead. 
        MinHash serialized before version 1.1.1 cannot be deserialized properly 
        in newer versions (`need to migrate? <https://github.com/ekzhu/datasketch/issues/18>`_). 
    Note:
        Since version 1.1.3, MinHash uses Numpy's random number generator 
        instead of Python's built-in random package. This change makes the 
        hash values consistent across different Python versions.
        The side-effect is that now MinHash created before version 1.1.3 won't
        work (i.e., ``jaccard``, ``merge`` and ``union``)
        with those created after. 
    .. _`Jaccard similarity`: https://en.wikipedia.org/wiki/Jaccard_index
    .. _hashlib: https://docs.python.org/3.5/library/hashlib.html
    .. _`pickle`: https://docs.python.org/3/library/pickle.html
    '''

    def __init__(self, num_perm=128, seed=1, hashobj=sha1,
            hashvalues=None, permutations=None):
        if hashvalues is not None:
            num_perm = len(hashvalues)
        if num_perm > _hash_range:
            # Because 1) we don't want the size to be too large, and
            # 2) we are using 4 bytes to store the size value
            raise ValueError("Cannot have more than %d number of\
                    permutation functions" % _hash_range)
        self.seed = seed
        self.hashobj = hashobj
        # Initialize hash values
        if hashvalues is not None:
            self.hashvalues = self._parse_hashvalues(hashvalues)
        else:
            self.hashvalues = self._init_hashvalues(num_perm)
        # Initalize permutation function parameters
        if permutations is not None:
            self.permutations = permutations
        else:
            generator = np.random.RandomState(self.seed)
            # Create parameters for a random bijective permutation function
            # that maps a 32-bit hash value to another 32-bit hash value.
            # http://en.wikipedia.org/wiki/Universal_hashing
            self.permutations = np.array([(generator.randint(1, _mersenne_prime, dtype=np.uint64),
                                           generator.randint(0, _mersenne_prime, dtype=np.uint64))
                                          for _ in range(num_perm)], dtype=np.uint64).T
        if len(self) != len(self.permutations[0]):
            raise ValueError("Numbers of hash values and permutations mismatch")

    def _init_hashvalues(self, num_perm):
        return np.ones(num_perm, dtype=np.uint64)*_max_hash

    def _parse_hashvalues(self, hashvalues):
        return np.array(hashvalues, dtype=np.uint64)

    def update(self, b):
        '''Update this MinHash with a new value.
        
        Args:
            b (bytes): The value of type `bytes`.
            
        Example:
            To update with a new string value:
            
            .. code-block:: python
                minhash.update("new value".encode('utf-8'))
        '''
        hv = struct.unpack('<I', self.hashobj(b).digest()[:4])[0]
        a, b = self.permutations
        phv = np.bitwise_and((a * hv + b) % _mersenne_prime, np.uint64(_max_hash))
        self.hashvalues = np.minimum(phv, self.hashvalues)

    def jaccard(self, other):
        '''Estimate the `Jaccard similarity`_ (resemblance) between the sets
        represented by this MinHash and the other.
        
        Args:
            other (datasketch.MinHash): The other MinHash.
            
        Returns:
            float: The Jaccard similarity, which is between 0.0 and 1.0.
        '''
        if other.seed != self.seed:
            raise ValueError("Cannot compute Jaccard given MinHash with\
                    different seeds")
        if len(self) != len(other):
            raise ValueError("Cannot compute Jaccard given MinHash with\
                    different numbers of permutation functions")
        return np.float(np.count_nonzero(self.hashvalues==other.hashvalues)) /\
                np.float(len(self))

    def count(self):
        '''Estimate the cardinality count based on the technique described in
        `this paper <http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=365694>`_.
        
        Returns:
            int: The estimated cardinality of the set represented by this MinHash.
        '''
        k = len(self)
        return np.float(k) / np.sum(self.hashvalues / np.float(_max_hash)) - 1.0

    def merge(self, other):
        '''Merge the other MinHash with this one, making this one the union
        of both.
        
        Args:
            other (datasketch.MinHash): The other MinHash.
        '''
        if other.seed != self.seed:
            raise ValueError("Cannot merge MinHash with\
                    different seeds")
        if len(self) != len(other):
            raise ValueError("Cannot merge MinHash with\
                    different numbers of permutation functions")
        self.hashvalues = np.minimum(other.hashvalues, self.hashvalues)

    def digest(self):
        '''Export the hash values, which is the internal state of the
        MinHash.
        
        Returns:
            numpy.array: The hash values which is a Numpy array.
        '''
        return copy.copy(self.hashvalues)

    def is_empty(self):
        '''
        Returns: 
            bool: If the current MinHash is empty - at the state of just
                initialized.
        '''
        if np.any(self.hashvalues != _max_hash):
            return False
        return True

    def clear(self):
        '''
        Clear the current state of the MinHash.
        All hash values are reset.
        '''
        self.hashvalues = self._init_hashvalues(len(self))

    def copy(self):
        '''
        Returns:
            datasketch.MinHash: A copy of this MinHash by exporting its
                state.
        '''
        return MinHash(seed=self.seed, hashvalues=self.digest(),
                permutations=self.permutations)

    def __len__(self):
        '''
        Returns:
            int: The number of hash values.
        '''
        return len(self.hashvalues)

    def __eq__(self, other):
        '''
        Returns:
            bool: If their seeds and hash values are both equal then two
                are equivalent.
        '''
        return self.seed == other.seed and \
                np.array_equal(self.hashvalues, other.hashvalues)

    @classmethod
    def union(cls, *mhs):
        '''Create a MinHash which is the union of the MinHash objects passed as arguments.
        Args:
            *mhs: The MinHash objects to be united. The argument list length is variable,
                but must be at least 2.
        
        Returns:
            datasketch.MinHash: A new union MinHash.
        '''
        if len(mhs) < 2:
            raise ValueError("Cannot union less than 2 MinHash")
        num_perm = len(mhs[0])
        seed = mhs[0].seed
        if any((seed != m.seed or num_perm != len(m)) for m in mhs):
            raise ValueError("The unioning MinHash must have the\
                    same seed and number of permutation functions")
        hashvalues = np.minimum.reduce([m.hashvalues for m in mhs])
        permutations = mhs[0].permutations
        return cls(num_perm=num_perm, seed=seed, hashvalues=hashvalues,
                permutations=permutations)


In [6]:
import pandas as pd
data = '../data/'
cache = '../cache/'

train = pd.read_csv(data+"train.tsv", sep='\t')
test = pd.read_csv(data+"test.tsv", sep='\t')

In [36]:
df_all = pd.concat([train, test], axis=0)

In [3]:
X = train[['item_description']]

In [4]:
%%time
X['sets'] = X['item_description'].apply(lambda x: set(str(x).lower().split(' ')))

CPU times: user 15.7 s, sys: 884 ms, total: 16.6 s
Wall time: 16.6 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [6]:
import numpy as np

In [10]:
len(X)

1482535

In [11]:

D = X['set'].values

MemoryError: 

In [8]:
%%time
for i in range(len(X)):
    min_jac = np.empty(1, len(X))
    d = list(D[i])
    m = MinHash()
    m.update(d.encode('utf8'))
    for j in range((i+1), len(X)):
        d = list(D[j])
        m1 = MinHash()
        m1.update(d.encode('utf8'))
        min_jac[i][j] = m.jaccard(m1)
        mini_jac[j][i] = jac[i][j]

MemoryError: 

In [22]:
import sys

import numpy as np
from scipy.stats import pearsonr
from collections import Counter

sys.path.append("..")
# import config


def _sigmoid(score):
    p = 1. / (1. + np.exp(-score))
    return p


def _logit(p):
    return np.log(p/(1.-p))


def _softmax(score):
    score = np.asarray(score, dtype=float)
    score = np.exp(score - np.max(score))
    score /= np.sum(score, axis=1)[:,np.newaxis]
    return score


def _cast_proba_predict(proba):
    N = proba.shape[1]
    w = np.arange(1,N+1)
    pred = proba * w[np.newaxis,:]
    pred = np.sum(pred, axis=1)
    return pred


def _one_hot_label(label, n_classes):
    num = label.shape[0]
    tmp = np.zeros((num, n_classes), dtype=int)
    tmp[np.arange(num),label.astype(int)] = 1
    return tmp


def _majority_voting(x, weight=None):
    ## apply weight
    if weight is not None:
        assert len(weight) == len(x)
        x = np.repeat(x, weight)
    c = Counter(x)
    value, count = c.most_common()[0]
    return value


def _voter(x, weight=None):
    idx = np.isfinite(x)
    if sum(idx) == 0:
        value = config.MISSING_VALUE_NUMERIC
    else:
        if weight is not None:
            value = _majority_voting(x[idx], weight[idx])
        else:
            value = _majority_voting(x[idx])
    return value


def _array_majority_voting(X, weight=None):
    y = np.apply_along_axis(_voter, axis=1, arr=X, weight=weight)
    return y


def _mean(x):
    idx = np.isfinite(x)
    if sum(idx) == 0:
        value = float(config.MISSING_VALUE_NUMERIC) # cast it to float to accommodate the np.mean
    else:
        value = np.mean(x[idx]) # this is float!
    return value


def _array_mean(X):
    y = np.apply_along_axis(_mean, axis=1, arr=X)
    return y


def _corr(x, y_train):
    if _dim(x) == 1:
        corr = pearsonr(x.flatten(), y_train)[0]
        if str(corr) == "nan":
            corr = 0.
    else:
        corr = 1.
    return corr


def _dim(x):
    d = 1 if len(x.shape) == 1 else x.shape[1]
    return d


def _entropy(proba):
    entropy = -np.sum(proba*np.log(proba))
    return entropy


def _try_divide(x, y, val=0.0):
    """try to divide two numbers"""
    if y != 0.0:
        val = float(x) / y
    return val

In [23]:
import pickle


def _save(fname, data, protocol=3):
    with open(fname, "wb") as f:
        pickle.dump(data, f, protocol)

def _load(fname):
    with open(fname, "rb") as f:
        return pickle.load(f)

In [24]:
# Since we have many features that measure the correlation/similarity/distance
# between category_name and name, name and item_description, we implement this base class.
class BaseEstimator:
    def __init__(self, obs_corpus, target_corpus, aggregation_mode, id_list=None, aggregation_mode_prev=""):
        self.obs_corpus = obs_corpus
        self.N = len(obs_corpus)
        # for standalone feature, we use the same interface, so better take care of it
        self.target_corpus = range(self.N) if target_corpus is None else target_corpus
        # id_list is used for group based relevance/distance features
        self.id_list = range(self.N) if id_list is None else id_list
        # aggregation for list features, e.g., intersect positions
        self.aggregation_mode, self.aggregator = self._check_aggregation_mode(aggregation_mode)
        self.aggregation_mode_prev, self.aggregator_prev = self._check_aggregation_mode(aggregation_mode_prev)
        self.double_aggregation = False
        if self.aggregator_prev != [None]:
            # the output of transform_one is a list of list, i.e., [[...], [...], [...]]
            # self.aggregator_prev is used to aggregate the inner list
            # This is used for the following features:
            # 1. EditDistance_Ngram
            # 2. CompressionDistance_Ngram
            # 3. Word2Vec_CosineSim
            # 4. WordNet_Path_Similarity, WordNet_Lch_Similarity, WordNet_Wup_Similarity
            # which are very time consuming to compute the inner list
            self.double_aggregation = True

    def _check_aggregation_mode(self, aggregation_mode):
        valid_aggregation_modes = ["", "size", "mean", "std", "max", "min", "median"]
        if isinstance(aggregation_mode, str):
            assert aggregation_mode.lower() in valid_aggregation_modes, "Wrong aggregation_mode: %s"%aggregation_mode
            aggregation_mode = [aggregation_mode.lower()]
        elif isinstance(aggregation_mode, list):
            for m in aggregation_mode:
                assert m.lower() in valid_aggregation_modes, "Wrong aggregation_mode: %s"%m
            aggregation_mode = [m.lower() for m in aggregation_mode]

        aggregator = [None if m == "" else getattr(np, m) for m in aggregation_mode]

        return aggregation_mode, aggregator

    def transform(self):
        # original score
        score = list(map(self.transform_one, self.obs_corpus, self.target_corpus, self.id_list))
        # aggregation
        if isinstance(score[0], list):
            if self.double_aggregation:
                # double aggregation
                res = np.zeros((self.N, len(self.aggregator_prev) * len(self.aggregator)), dtype=float)
                for m,aggregator_prev in enumerate(self.aggregator_prev):
                    for n,aggregator in enumerate(self.aggregator):
                        idx = m * len(self.aggregator) + n
                        for i in range(self.N):
                            # process in a safer way
                            try:
                                tmp = []
                                for l in score[i]:
                                    try:
                                        s = aggregator_prev(l)
                                    except:
                                        s = config.MISSING_VALUE_NUMERIC
                                    tmp.append(s)
                            except:
                                tmp = [ config.MISSING_VALUE_NUMERIC ]
                            try:
                                s = aggregator(tmp)
                            except:
                                s = config.MISSING_VALUE_NUMERIC
                            res[i,idx] = s
            else:
                # single aggregation
                res = np.zeros((self.N, len(self.aggregator)), dtype=float)
                for m,aggregator in enumerate(self.aggregator):
                    for i in range(self.N):
                        # process in a safer way
                        try:
                            s = aggregator(score[i])
                        except:
                            s = config.MISSING_VALUE_NUMERIC
                        res[i,m] = s
        else:
            res = np.asarray(score, dtype=float)
        return res


# Wrapper for generating standalone feature, e.g., 
# count of words in search_term
class StandaloneFeatureWrapper:
    def __init__(self, generator, dfAll, obs_fields, param_list, force_corr=False):
        self.generator = generator
        self.dfAll = dfAll
        self.obs_fields = obs_fields
        self.param_list = param_list
#         self.feat_dir = feat_dir
#         self.logger = logger
        self.force_corr = force_corr

    def go(self):
        y_train = self.dfAll["price"].values[:TRAIN_SIZE]
        for obs_field in self.obs_fields:
            if obs_field not in self.dfAll.columns:
                self.logger.info("Skip %s"%obs_field)
                continue
            obs_corpus = self.dfAll[obs_field].values
            ext = self.generator(obs_corpus, None, *self.param_list)
            x = ext.transform()
            if isinstance(ext.__name__(), list):
                for i,feat_name in enumerate(ext.__name__()):
                    dim = 1
                    fname = "%s_%s_%dD"%(feat_name, obs_field, dim)
#                     pkl_utils._save(os.path.join(self.feat_dir, fname+config.FEAT_FILE_SUFFIX), x[:,i])
                    corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train)
                    print("%s (%dD): corr = %.6f"%(fname, dim, corr))
            else:
                dim = np_utils._dim(x)
                fname = "%s_%s_%dD"%(ext.__name__(), obs_field, dim)
                pkl_utils._save(os.path.join(self.feat_dir, fname+config.FEAT_FILE_SUFFIX), x)
                if dim == 1:
                    corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                    self.logger.info("%s (%dD): corr = %.6f"%(fname, dim, corr))
                elif self.force_corr:
                    for j in range(dim):
                        corr = np_utils._corr(x[:TRAIN_SIZE,j], y_train)
                        print("%s (%d/%dD): corr = %.6f"%(fname, j+1, dim, corr))


# Wrapper for generating pairwise feature, e.g., 
# intersect count of words between search_term and product_title
class PairwiseFeatureWrapper:
    def __init__(self, generator, dfAll, obs_fields, target_fields, param_list, feat_dir, logger, force_corr=False):
        self.generator = generator
        self.dfAll = dfAll
        self.obs_fields = obs_fields
        self.target_fields = target_fields
        self.param_list = param_list
#         self.feat_dir = feat_dir
#         self.logger = logger
        self.force_corr = force_corr

    def go(self):
        y_train = self.dfAll["price"].values[:TRAIN_SIZE]
        for obs_field in self.obs_fields:
            if obs_field not in self.dfAll.columns:
                print("Skip %s"%obs_field)
                continue
            obs_corpus = self.dfAll[obs_field].values
            for target_field in self.target_fields:
                if target_field not in self.dfAll.columns:
                    self.logger.info("Skip %s"%target_field)
                    continue
                target_corpus = self.dfAll[target_field].values
                ext = self.generator(obs_corpus, target_corpus, *self.param_list)
                x = ext.transform()
                if isinstance(ext.__name__(), list):
                    for i,feat_name in enumerate(ext.__name__()):
                        dim = 1
                        fname = "%s_%s_x_%s_%dD"%(feat_name, obs_field, target_field, dim)
                        pkl_utils._save(os.path.join(self.feat_dir, fname+config.FEAT_FILE_SUFFIX), x[:,i])
                        corr = np_utils._corr(x[:TRAIN_SIZE,i], y_train)
                        print("%s (%dD): corr = %.6f"%(fname, dim, corr))
                else:
                    dim = np_utils._dim(x)
                    fname = "%s_%s_x_%s_%dD"%(ext.__name__(), obs_field, target_field, dim)
                    pkl_utils._save(os.path.join(self.feat_dir, fname+config.FEAT_FILE_SUFFIX), x)
                    if dim == 1:
                        corr = np_utils._corr(x[:TRAIN_SIZE], y_train)
                        print("%s (%dD): corr = %.6f"%(fname, dim, corr))
                    elif self.force_corr:
                        for j in range(dim):
                            corr = np_utils._corr(x[:TRAIN_SIZE,j], y_train)
                            print("%s (%d/%dD): corr = %.6f"%(fname, j+1, dim, corr))

In [29]:
class DocLen(BaseEstimator):
    """Length of document"""
    def __init__(self, obs_corpus, target_corpus, aggregation_mode=""):
        super().__init__(obs_corpus, target_corpus, aggregation_mode)

    def __name__(self):
        return "DocLen"

    def transform_one(self, obs, target, id):
        obs_tokens = _tokenize(obs, token_pattern=' ')
        return len(obs_tokens)


class DocFreq(BaseEstimator):
    """Frequency of the document in the corpus"""
    def __init__(self, obs_corpus, target_corpus, aggregation_mode=""):
        super().__init__(obs_corpus, target_corpus, aggregation_mode)
        self.counter = Counter(obs_corpus)

    def __name__(self):
        return "DocFreq"

    def transform_one(self, obs, target, id):
        return self.counter[obs]


class DocEntropy(BaseEstimator):
    """Entropy of the document"""
    def __init__(self, obs_corpus, target_corpus, aggregation_mode=""):
        super().__init__(obs_corpus, target_corpus, aggregation_mode)

    def __name__(self):
        return "DocEntropy"

    def transform_one(self, obs, target, id):
        obs_tokens = _tokenize(obs, token_pattern)
        counter = Counter(obs_tokens)
        count = np.asarray(list(counter.values()))
        proba = count/np.sum(count)
        return np_utils._entropy(proba)


class DigitCount(BaseEstimator):
    """Count of digit in the document"""
    def __init__(self, obs_corpus, target_corpus, aggregation_mode=""):
        super().__init__(obs_corpus, target_corpus, aggregation_mode)

    def __name__(self):
        return "DigitCount"

    def transform_one(self, obs, target, id):
        return len(re.findall(r"\d", obs))


class DigitRatio(BaseEstimator):
    def __init__(self, obs_corpus, target_corpus, aggregation_mode=""):
        super().__init__(obs_corpus, target_corpus, aggregation_mode)

    def __name__(self):
        return "DigitRatio"

    def transform_one(self, obs, target, id):
        obs_tokens = _tokenize(obs, token_pattern = ' ')
        return np_utils._try_divide(len(re.findall(r"\d", obs)), len(obs_tokens))


In [30]:
TRAIN_SIZE = len(train)

In [31]:
import re


def _tokenize(text, token_pattern=" "):
    # token_pattern = r"(?u)\b\w\w+\b"
    # token_pattern = r"\w{1,}"
    # token_pattern = r"\w+"
    # token_pattern = r"[\w']+"
    if token_pattern == " ":
        # just split the text into tokens
        return text.split(" ")
    else:
        token_pattern = re.compile(token_pattern, flags = re.UNICODE | re.LOCALE)
        group = token_pattern.findall(text)
        return group

In [39]:
df_all.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0
3,,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0
4,,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0


In [37]:
from collections import Counter

In [41]:
counter = Counter(df_all['name'].values)

In [42]:
counter

Counter({'MLB Cincinnati Reds T Shirt Size XL': 1,
         'Razer BlackWidow Chroma Keyboard': 1,
         'AVA-VIV Blouse': 1,
         'Leather Horse Statues': 1,
         '24K GOLD plated rose': 1,
         'Bundled items requested for Ruie': 1,
         'Acacia pacific tides santorini top': 1,
         'Girls cheer and tumbling bundle of 7': 1,
         'Girls Nike Pro shorts': 2,
         'Porcelain clown doll checker pants VTG': 1,
         'Smashbox primer': 41,
         'New vs pi k body mists': 1,
         'Black Skater dress': 1,
         'Sharpener and eraser': 1,
         'HOLD for Dogs2016 Minnetonka boots': 1,
         'Sephora tarte birthday gift': 2,
         'Glitter Eyeshadow': 6,
         "New: Baby K'tan active baby carrier": 1,
         'Too Faced Limited "Merry Macaroons"': 1,
         'Cream/ Beige Front Cross Shirt': 1,
         'Torrid Nautical Peplum Tube Top': 1,
         'NWT VS ULTIMATE SPORTS BRA 34ddd': 1,
         'Galaxy S7 Edge (Unlocked) 32GB': 1,
  

In [53]:
def entropy(obs, token_pattern=' '):
    obs_tokens = obs.split(token_pattern)
    counter = Counter(obs_tokens)
    count = np.asarray(list(counter.values()))
    proba = count/np.sum(count)
    return _entropy(proba)

def digit_count(obs):
    return len(re.findall(r"\d", obs))

def digit_ratio(obs, token_pattern = ' '):
    obs_tokens = obs.split(token_pattern)
    return _try_divide(len(re.findall(r"\d", obs)), len(obs_tokens))

def emoji_count(obs):
    return len(re.findall(r'[^\w\s,]', obs))
    
def emoji_ratio(obs, token_pattern = ' '):
    obs_tokens = obs.split(token_pattern)
    return _try_divide(len(re.findall(r'[^\w\s,]', obs)), len(obs_tokens))

In [55]:
obs_fields = ["name", "item_description", 'brand_name']
def get_doc_len(df):
    for f in obs_fields:
        df[f+'_doclen'] = df[f].map(lambda x: len(str(x).lower().split(' ')))
    return df
    
def get_doc_entropy(df):
    for f in obs_fields:
        df[f+'_docEntropy'] = df[f].map(lambda x: entropy(str(x).lower(), token_pattern))
    return df

def get_doc_digit_count(df):
    for f in obs_fields:
        df[f+'_digitCount'] = df[f].map(lambda x: digit_count(str(x).lower()))
    return df

def get_doc_digit_ratio(df):
    for f in obs_fields:
        df[f+'_digitRatio'] = df[f].map(lambda x: digit_ratio(str(x).lower()))
    return df

def get_doc_emoji_count(df):
    for f in obs_fields:
        df[f+'_emojiCount'] = df[f].map(lambda x: emoji_count(str(x).lower()))
    return df

def get_doc_emoji_ratio(df):
    for f in obs_fields:
        df[f+'_emojiRatio'] = df[f].map(lambda x: emoji_ratio(str(x).lower()))
    return df

In [87]:
%%time
## basic
generators = ['DocLen', 'DocFreq', 'DocEntropy', 'DigitCount', 'DigitRatio']
obs_fields = ["name", "item_description", 'brand_name']

for f in obs_fields:
    df_all[f+'_doclen'] = df_all[f].map(lambda x: len(str(x).lower().split(' ')))
    counter = Counter(df_all[f].values)
    df_all[f+'_docfreq'] = df_all[f].map(lambda x: counter[x])
    token_pattern = ' '
    df_all[f+'_docEntropy'] = df_all[f].map(lambda x: entropy(str(x).lower(), token_pattern))
    df_all[f+'_digitCount'] = df_all[f].map(lambda x: digit_count(str(x).lower()))
    df_all[f+'_digitRatio'] = df_all[f].map(lambda x: digit_ratio(str(x).lower()))
    df_all[f+'_emojiCount'] = df_all[f].map(lambda x: emoji_count(str(x).lower()))
    df_all[f+'_emojiRatio'] = df_all[f].map(lambda x: emoji_ratio(str(x).lower()))

CPU times: user 4min 14s, sys: 208 ms, total: 4min 15s
Wall time: 4min 15s


In [88]:
for f in obs_fields:
    df_all[f+'_doclen'] = df_all[f].map(lambda x: len(str(x).lower().split(' ')))
    counter = Counter(df_all[f].values)
    df_all[f+'_docfreq'] = df_all[f].map(lambda x: counter[x])
    token_pattern = ' '
    df_all[f+'_docEntropy'] = df_all[f].map(lambda x: entropy(str(x).lower(), token_pattern))
    df_all[f+'_digitCount'] = df_all[f].map(lambda x: digit_count(str(x).lower()))
    df_all[f+'_digitRatio'] = df_all[f].map(lambda x: digit_ratio(str(x).lower()))
    df_all[f+'_emojiCount'] = df_all[f].map(lambda x: emoji_count(str(x).lower()))
    df_all[f+'_emojiRatio'] = df_all[f].map(lambda x: emoji_ratio(str(x).lower()))

In [89]:
%%time
f = 'category_name'
df_all[f+'_doclen'] = df_all[f].map(lambda x: len(str(x).lower().split('/')))
counter = Counter(df_all[f].values)
df_all[f+'_docfreq'] = df_all[f].map(lambda x: counter[x])
token_pattern = '/'
df_all[f+'_docEntropy'] = df_all[f].map(lambda x: entropy(str(x).lower(),token_pattern))
df_all[f+'_digitCount'] = df_all[f].map(lambda x: digit_count(str(x).lower()))
df_all[f+'_digitRatio'] = df_all[f].map(lambda x: digit_ratio(str(x).lower(), token_pattern))
df_all[f+'_emojiCount'] = df_all[f].map(lambda x: emoji_count(str(x).lower()))
df_all[f+'_emojiRatio'] = df_all[f].map(lambda x: emoji_ratio(str(x).lower()))

CPU times: user 1min 12s, sys: 32 ms, total: 1min 12s
Wall time: 1min 12s


In [90]:
df_all.shape

(2175894, 59)

In [91]:
sum((df_all != 0).any(axis=0) == True)

59

In [92]:
def _unigrams(words):
    """
        Input: a list of words, e.g., ["I", "am", "Denny"]
        Output: a list of unigram
    """
    assert type(words) == list
    return words


def _bigrams(words, join_string, skip=0):
    """
       Input: a list of words, e.g., ["I", "am", "Denny"]
       Output: a list of bigram, e.g., ["I_am", "am_Denny"]
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 1:
        lst = []
        for i in range(L-1):
            for k in range(1,skip+2):
                if i+k < L:
                    lst.append( join_string.join([words[i], words[i+k]]) )
    else:
        # set it as unigram
        lst = _unigrams(words)
    return lst


def _trigrams(words, join_string, skip=0):
    """
       Input: a list of words, e.g., ["I", "am", "Denny"]
       Output: a list of trigram, e.g., ["I_am_Denny"]
       I use _ as join_string for this example.
    """
    assert type(words) == list
    L = len(words)
    if L > 2:
        lst = []
        for i in range(L-2):
            for k1 in range(1,skip+2):
                for k2 in range(1,skip+2):
                    if i+k1 < L and i+k1+k2 < L:
                        lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) )
    else:
        # set it as bigram
        lst = _bigrams(words, join_string, skip)
    return lst

def UniqueCount_Ngram(obs, count, token_pattern=' '):
    obs_tokens = obs.lower().split(token_pattern)
    obs_ngrams = _ngrams(obs_tokens, count)
    return len(set(obs_ngrams))

def UniqueRatio_Ngram(obs, count, token_pattern=' '):
    obs_tokens = obs.lower().split(token_pattern)
    obs_ngrams = _ngrams(obs_tokens, count)
    return _try_divide(len(set(obs_ngrams)), len(obs_ngrams))

def _ngrams(words, ngram, join_string=" "):
    """wrapper for ngram"""
    if ngram == 1:
        return _unigrams(words)
    elif ngram == 2:
        return _bigrams(words, join_string)
    elif ngram == 3:
        return _trigrams(words, join_string)
    elif ngram == 4:
        return _fourgrams(words, join_string)
    elif ngram == 12:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        return unigram + bigram
    elif ngram == 123:
        unigram = _unigrams(words)
        bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2]
        trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3]
        return unigram + bigram + trigram

In [93]:
%%time
## basic
# generators = [UniqueCount_Ngram, UniqueRatio_Ngram]
obs_fields = ["name", "item_description", 'brand_name']
ngrams = [1,2,3]
token_pattern =' '
for f in obs_fields:
    for n in ngrams:
        df_all[f+'_{}_uc'.format(n)] = df_all[f].map(lambda x: UniqueCount_Ngram(str(x), n))
        df_all[f+'_{}_ur'.format(n)] = df_all[f].map(lambda x: UniqueRatio_Ngram(str(x), n))

CPU times: user 5min 53s, sys: 128 ms, total: 5min 53s
Wall time: 5min 53s


In [94]:
%%time
f = 'category_name'
for n in ngrams:
    df_all[f+'_{}_uc'.format(n)] = df_all[f].map(lambda x: UniqueCount_Ngram(str(x), n, '/'))
    df_all[f+'_{}_ur'.format(n)] = df_all[f].map(lambda x: UniqueRatio_Ngram(str(x), n, '/'))

CPU times: user 40.9 s, sys: 56 ms, total: 40.9 s
Wall time: 40.9 s


In [95]:
cols1 = set(df_all.columns)

In [96]:
df_all.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,name_doclen,...,brand_name_emojiRatio,category_name_emojiCount,category_name_emojiRatio,category_name_1_uc,category_name_1_ur,category_name_2_uc,category_name_2_ur,category_name_3_uc,brand_name_3_ur,category_name_3_ur
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,7,...,0.0,3,3.0,3,1.0,2,1.0,1,1.0,1.0
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,4,...,0.0,4,0.8,3,1.0,2,1.0,1,1.0,1.0
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,2,...,0.0,3,1.0,3,1.0,2,1.0,1,1.0,1.0
3,,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0,3,...,0.0,2,0.5,3,1.0,2,1.0,1,1.0,1.0
4,,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0,4,...,0.0,2,2.0,3,1.0,2,1.0,1,1.0,1.0


In [97]:
sum((df_all != df_all.iloc[0]).any() == False) # any constant cols?

  result = func(values, other)


2

In [98]:
df_all =  df_all.loc[:, (df_all != df_all.iloc[0]).any()] 

  result = func(values, other)


In [100]:
cols2= set(df_all.columns)

In [101]:
cols1 - cols2

{'brand_name_3_ur', 'category_name_3_ur'}

In [102]:
df_all.head()

Unnamed: 0,brand_name,category_name,item_condition_id,item_description,name,price,shipping,test_id,train_id,name_doclen,...,item_description_emojiRatio,brand_name_emojiCount,brand_name_emojiRatio,category_name_emojiCount,category_name_emojiRatio,category_name_1_uc,category_name_1_ur,category_name_2_uc,category_name_2_ur,category_name_3_uc
0,,Men/Tops/T-shirts,3,No description yet,MLB Cincinnati Reds T Shirt Size XL,10.0,1,,0.0,7,...,0.0,0,0.0,3,3.0,3,1.0,2,1.0,1
1,Razer,Electronics/Computers & Tablets/Components & P...,3,This keyboard is in great condition and works ...,Razer BlackWidow Chroma Keyboard,52.0,0,,1.0,4,...,0.083333,0,0.0,4,0.8,3,1.0,2,1.0,1
2,Target,Women/Tops & Blouses/Blouse,1,Adorable top with a hint of lace and a key hol...,AVA-VIV Blouse,10.0,1,,2.0,2,...,0.068966,0,0.0,3,1.0,3,1.0,2,1.0,1
3,,Home/Home Décor/Home Décor Accents,1,New with tags. Leather horses. Retail for [rm]...,Leather Horse Statues,35.0,1,,3.0,3,...,0.28125,0,0.0,2,0.5,3,1.0,2,1.0,1
4,,Women/Jewelry/Necklaces,1,Complete with certificate of authenticity,24K GOLD plated rose,44.0,0,,4.0,4,...,0.0,0,0.0,2,2.0,3,1.0,2,1.0,1
