# Imports

In [1]:
import pathlib

# Constants

In [2]:
PROJECT_DIR = pathlib.Path('~/work').expanduser()
DATA_DIR = PROJECT_DIR / 'data'

In [3]:
# https://rstudio-pubs-static.s3.amazonaws.com/120883_c8123ff272164b2a94be097a6237150b.html
YELP_DATA_DIR = DATA_DIR / 'yelp' / 'v6' / 'yelp_dataset_challenge_academic_dataset'

# Example 5-3

Feature hashing for word features

In [4]:
def hash_features(word_list, m):
    output = [0] * m
    for word in word_list:
        index = hash_fcn(word) % m
        output[index] += 1
    return output

# Example 5-4

Signed feature hashing

In [5]:
def hash_features(word_list, m):
    output = [0] * m
    for word in word_list:
        index = hash_fcn(word) % m
        sign_bit = sign_hash(word) % 2
        if sign_bit == 0:
            output[index] -= 1
        else:
            output[index] += 1
    return output

Feature hashing can be used for models that involve the inner product of feature vectors
and coefficients, such as linear models and kernel methods.

# Example 5-5

Feature hashing (a.k.a. “the hashing trick”)

In [6]:
import json
import pandas as pd

In [7]:
# Load the first 10,000 reviews
with open(YELP_DATA_DIR / 'yelp_academic_dataset_review.json') as f:
    review_df = pd.DataFrame([
        json.loads(f.readline()) for i in range(10000)
    ])

In [8]:
# Define m as equal to the unique number of business_ids
m = len(review_df['business_id'].unique())
m

528

In [9]:
from sklearn.feature_extraction import FeatureHasher

h = FeatureHasher(n_features=m, input_type='string')
f = h.transform(review_df['business_id'])

In [10]:
# How does this affect feature interpretability?
review_df['business_id'].unique().tolist()[0:5]

['vcNAWiLM4dR7D2nwwJ7nCA',
 'UsFtqoBl7naz8AVUBZMjQQ',
 'cE27W9VPgO88Qxe4ol6y_g',
 'HZdLhv6COCleJMo7nPl-RA',
 'mVHrayjG3uZ_RLHkLj-AMg']

In [11]:
f.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Not great. BUT, let's see the storage size of our features.
from sys import getsizeof

print('Our pandas Series, in bytes: ', getsizeof(review_df['business_id']))
print('Our hashed numpy array, in bytes: ', getsizeof(f))

Our pandas Series, in bytes:  790160
Our hashed numpy array, in bytes:  64
