# Imports

In [1]:
import pathlib

# Constants

In [2]:
# Root of the working tree; the leading "~" is expanded to the current user's home.
PROJECT_DIR = pathlib.Path('~/work').expanduser()
# Every dataset used by the notebook lives under <project>/data.
DATA_DIR = PROJECT_DIR.joinpath('data')

In [3]:
# Avazu click-through-rate dataset, downloaded from
# https://www.kaggle.com/c/avazu-ctr-prediction/data
AVAZU_DATA_DIR = DATA_DIR.joinpath('avazu')

# Example 5-6

Bin-counting example

In [4]:
import numpy as np
import pandas as pd

In [5]:
# train_subset.csv holds the first 100K rows of the full (6+ GB) training set
df = pd.read_csv(AVAZU_DATA_DIR.joinpath('train_subset.csv'))

In [6]:
# Number of distinct device_id values = number of bins (rows) we expect
# the bin-counting table to contain afterwards.
df['device_id'].nunique(dropna=False)

7202

Features are $\theta$ = [$N^+$, $N^-$, $\log(N^+)-\log(N^-)$, isRest]

$N^+$ = $p(+)$ = $n^+/(n^+ + n^-)$

$N^-$ = $p(-)$ = $n^-/(n^+ + n^-)$

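$\log(N^+)-\log(N^-)$ = $\log\left(\frac{p(+)}{p(-)}\right)$ (the code below computes the add-one smoothed version, $\log(n^+ + 1) - \log(n^- + 1)$, which stays finite when a bin has zero clicks or zero non-clicks)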


isRest = back-off bin (not shown here)

In [7]:
def click_counting(x, bin_column):
    """Tally clicked and non-clicked rows for every value of ``bin_column``.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain a ``'click'`` column and the column named by
        ``bin_column``.
    bin_column : str
        Name of the categorical column whose values define the bins.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of ``bin_column`` with columns
        ``'clicks'`` (rows where ``click > 0``), ``'no_clicks'`` (rows where
        ``click < 1``) and ``'total'`` (their sum). A bin that is absent
        from one of the two tallies gets 0 for that column.
    """
    clicked_mask = x['click'] > 0
    not_clicked_mask = x['click'] < 1

    tallies = []
    for label, mask in (('clicks', clicked_mask),
                        ('no_clicks', not_clicked_mask)):
        per_bin = x.loc[mask, bin_column].value_counts()
        tallies.append(per_bin.rename(label))

    # Align both tallies on the bin value; bins seen on only one side
    # come out as NaN and are replaced by 0.
    counts = pd.concat(tallies, axis=1).fillna(0)
    counts['total'] = counts['clicks'] + counts['no_clicks']

    return counts

In [8]:
def bin_counting(counts):
    """Derive bin-counting statistics from per-bin click tallies.

    Parameters
    ----------
    counts : pandas.DataFrame
        One row per bin with numeric columns ``'clicks'``, ``'no_clicks'``
        and ``'total'``. The frame passed in is left unmodified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``counts``: a copy of the input extended with the columns below.
        ``bin_counts``: only the derived columns
        ``['N+', 'N-', 'odds_ratio', 'log_odds_ratio']``.

        * ``N+`` = clicks / total
        * ``N-`` = no_clicks / total
        * ``odds_ratio`` = (clicks + 1) / (no_clicks + 1)
        * ``log_odds_ratio`` = log(clicks + 1) - log(no_clicks + 1)
    """
    # Work on a copy: assigning new columns directly on the argument would
    # silently change the caller's DataFrame.
    counts = counts.copy()

    counts['N+'] = counts['clicks'] / counts['total']
    counts['N-'] = counts['no_clicks'] / counts['total']

    # Infinity-proof formulas: adding one to both tallies keeps the ratio
    # and its log finite when a bin has zero clicks or zero non-clicks.
    counts['odds_ratio'] = (counts['clicks'] + 1) / (counts['no_clicks'] + 1)
    counts['log_odds_ratio'] = np.log1p(counts['clicks']) - np.log1p(counts['no_clicks'])

    # If we wanted to only return bin-counting
    # properties, we would filter here
    bin_counts = counts[['N+', 'N-', 'odds_ratio', 'log_odds_ratio']]
    return counts, bin_counts

In [9]:
# Worked example: bin-count the click label by device_id
bin_column = 'device_id'

# Step 1: per-device click / no-click tallies, using only the two columns needed
device_clicks = click_counting(df.loc[:, [bin_column, 'click']], bin_column)

# Step 2: derive the bin-counting statistics; keep both the full table and
# the feature-only table
device_all, device_bin_counts = bin_counting(device_clicks)

In [10]:
# Sanity check: the row count should match the number of distinct device_ids
device_bin_counts.shape[0]

7202

In [11]:
# Five devices with the most rows (clicks + no_clicks)
(device_all
 .sort_values('total', ascending=False)
 .head(5))

Unnamed: 0,clicks,no_clicks,total,N+,N-,odds_ratio,log_odds_ratio
a99f214a,15729.0,71206.0,86935.0,0.180928,0.819072,0.220905,-1.510021
c357dbff,33.0,134.0,167.0,0.197605,0.802395,0.251852,-1.378914
31da1bd0,0.0,62.0,62.0,0.0,1.0,0.015873,-4.143135
936e92fb,5.0,54.0,59.0,0.084746,0.915254,0.109091,-2.215574
a167aa83,0.0,55.0,55.0,0.0,1.0,0.017857,-4.025352


In [12]:
# Compare the in-memory size of the raw columns with the bin-counting
# features; the author's point is that the smaller table can change model
# evaluation time.
from sys import getsizeof

# Raw input: the ('device_id', 'click') columns of df.
# NOTE(review): the label says "Series", but selecting a list of columns
# yields a DataFrame — the label is kept so the recorded output still matches.
print('Our pandas Series, in bytes: ', getsizeof(df[['device_id', 'click']]))
# Encoded features: the four derived columns returned by bin_counting.
print('Our bin-counting feature, in bytes: ', getsizeof(device_bin_counts))

Our pandas Series, in bytes:  7300160
Our bin-counting feature, in bytes:  698626
