## Encoding Categorical Variables Using Statistics
[Source](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/discussion/32123)

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load the raw train and test listings from the JSON files in the working directory.
train, test = (pd.read_json(json_path) for json_path in ('train.json', 'test.json'))

# Report (rows, columns) of each frame, one per line.
for frame in (train, test):
    print(frame.shape)

(49352, 15)
(74659, 14)


In [2]:
# Tag every row with its position inside its own frame (row_id) and with the
# frame it came from (train: 1 = train, 0 = test), so the two frames can be
# stacked and later separated and re-ordered.
for frame, is_train in ((train, 1), (test, 0)):
    frame['row_id'] = range(len(frame))
    frame['train'] = is_train

In [3]:
# Numeric column to summarise and categorical column to summarise it by.
target_column = 'price'
group_column = 'manager_id'

# Stack the train and test rows (only the columns needed here) so the group
# statistics are computed over both sets. pd.concat is used because
# DataFrame.append was removed in pandas 2.0; like append, it keeps each
# frame's original index labels.
stat_columns = ['row_id', 'train', target_column, group_column]
data = pd.concat([train[stat_columns], test[stat_columns]])
data.head(2)

Unnamed: 0,row_id,train,price,manager_id
10,0,1,3000,5ba989232d0489da1b5f2c45f6688adc
10000,1,1,5465,7533621a882f71e25173b27e3139d83d


In [4]:
# (rows, columns) of the stacked train + test frame built above.
data.shape

(124011, 4)

In [5]:
# Group the target column by the categorical column; every statistic below is
# derived from this one groupby object.
grouped = data.groupby(group_column)[[target_column]]

In [6]:
# Spot check: number of rows belonging to one specific manager_id.
len(data.loc[data.manager_id == '0000abd7518b94c35a90d64b56fbf3e6', 'price'])

35

In [7]:
# Rows per group, as a two-column frame: [group_column, '<target>_size'].
the_size = grouped.size().reset_index(name='%s_size' % target_column)
the_size.head(2)

Unnamed: 0,manager_id,price_size
0,0000abd7518b94c35a90d64b56fbf3e6,35
1,001ce808ce1720e24a9510e014c69707,28


In [8]:
# Spot check: average price for one specific manager_id, computed by hand as
# sum / count.
manager_prices = data.price[data.manager_id=='0000abd7518b94c35a90d64b56fbf3e6']
sum(manager_prices) / len(manager_prices)

2765.1714285714284

In [9]:
# Mean of the target column per group, as [group_column, '<target>_mean'].
the_mean = grouped.mean().reset_index().rename(
    columns={target_column: '%s_mean' % target_column})
the_mean.head(2)

Unnamed: 0,manager_id,price_mean
0,0000abd7518b94c35a90d64b56fbf3e6,2765.171429
1,001ce808ce1720e24a9510e014c69707,3170.535714


In [10]:
# Standard deviation per group; missing values are replaced with 0.
the_std = grouped.std().reset_index().fillna(0).rename(
    columns={target_column: '%s_std' % target_column})

# Median, maximum and minimum per group, each as [group_column, '<target>_<stat>'].
the_median = grouped.median().reset_index().rename(
    columns={target_column: '%s_median' % target_column})
the_max = grouped.max().reset_index().rename(
    columns={target_column: '%s_max' % target_column})
the_min = grouped.min().reset_index().rename(
    columns={target_column: '%s_min' % target_column})

In [11]:
# Join all per-group statistics into one frame. Each merge uses the columns the
# two frames have in common, which here is only the group column.
the_stats = the_size
for stat_frame in (the_mean, the_std, the_median, the_max, the_min):
    the_stats = the_stats.merge(stat_frame)
the_stats.head(2)

Unnamed: 0,manager_id,price_size,price_mean,price_std,price_median,price_max,price_min
0,0000abd7518b94c35a90d64b56fbf3e6,35,2765.171429,532.174976,2695.0,4295,1995
1,001ce808ce1720e24a9510e014c69707,28,3170.535714,1116.447192,3000.0,6495,1795


In [12]:
# Compare sizes: one row per group in the_stats versus one row per listing in data.
for frame in (the_stats, data):
    print(frame.shape)

(4399, 7)
(124011, 4)


In [13]:
# Attach the per-group statistics to every listing row. With no explicit key the
# merge joins on the columns common to both frames.
data = data.merge(the_stats)
data.head()

Unnamed: 0,row_id,train,price,manager_id,price_size,price_mean,price_std,price_median,price_max,price_min
0,0,1,3000,5ba989232d0489da1b5f2c45f6688adc,235,2541.914894,524.703572,2600.0,4400,1700
1,107,1,2250,5ba989232d0489da1b5f2c45f6688adc,235,2541.914894,524.703572,2600.0,4400,1700
2,152,1,2200,5ba989232d0489da1b5f2c45f6688adc,235,2541.914894,524.703572,2600.0,4400,1700
3,572,1,1950,5ba989232d0489da1b5f2c45f6688adc,235,2541.914894,524.703572,2600.0,4400,1700
4,1198,1,2000,5ba989232d0489da1b5f2c45f6688adc,235,2541.914894,524.703572,2600.0,4400,1700


In [14]:
# Separate the stacked frame back into its train and test rows using the flag column.
selected_train = data.loc[data['train'] == 1]
selected_test = data.loc[data['train'] == 0]

for frame in (selected_train, selected_test):
    print(frame.shape)

(49352, 10)
(74659, 10)


In [15]:
# Order both frames by row_id, the per-frame row position assigned earlier.
selected_train = selected_train.sort_values('row_id')
selected_test = selected_test.sort_values('row_id')

In [16]:
# Preview the train rows after sorting by row_id.
selected_train.head()

Unnamed: 0,row_id,train,price,manager_id,price_size,price_mean,price_std,price_median,price_max,price_min
0,0,1,3000,5ba989232d0489da1b5f2c45f6688adc,235,2541.914894,524.703572,2600.0,4400,1700
235,1,1,5465,7533621a882f71e25173b27e3139d83d,194,5419.93299,1905.188512,4700.0,12995,1995
429,2,1,2850,d9039c43983f6e564b1482b273bd7b01,314,3618.716561,1318.133209,3295.0,8795,1650
743,3,1,3275,1067e078446a7897d2da493d2f741316,470,3701.768085,1392.090938,3450.0,11100,1650
1213,4,1,3350,98e13ad4b495b9613cef886d79a6291f,51,3114.901961,1328.458765,2870.0,7995,1495


In [17]:
# Show the leading rows of the train frame; row_id and manager_id can be
# compared by eye with the selected_train preview above.
train[['row_id', 'manager_id', 'interest_level']].head()

Unnamed: 0,row_id,manager_id,interest_level
10,0,5ba989232d0489da1b5f2c45f6688adc,medium
10000,1,7533621a882f71e25173b27e3139d83d,low
100004,2,d9039c43983f6e564b1482b273bd7b01,high
100007,3,1067e078446a7897d2da493d2f741316,low
100013,4,98e13ad4b495b9613cef886d79a6291f,low


In [18]:
# Keep only the statistic columns: drop the raw target, the group key and the
# two bookkeeping columns.
helper_columns = [target_column, group_column, 'row_id', 'train']
selected_train = selected_train.drop(helper_columns, axis=1)
selected_test = selected_test.drop(helper_columns, axis=1)

In [19]:
# Give all four frames a fresh 0..n-1 index so rows can be matched by position.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
selected_train = selected_train.reset_index(drop=True)
selected_test = selected_test.reset_index(drop=True)

# Append the statistic columns to train by joining on the index.
train = train.merge(selected_train, left_index=True, right_index=True)
train.shape

(49352, 23)

In [28]:
# Replace each manager_id with the integer code pd.factorize assigns to it
# (element [0] of the result is the array of codes).
train['manager_id'] = pd.factorize(train['manager_id'])[0]

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Re-read both JSON files, rebinding train and test to freshly loaded frames.
# NOTE(review): the execution counter restarts at In [1] here, so this looks
# like the start of a second, function-based pass — confirm.
train, test = (pd.read_json(json_path) for json_path in ('train.json', 'test.json'))

In [34]:
def get_stats(train, test, target_column, group_column='manager_id'):
    '''
    Encode a categorical column by summary statistics of a numeric column.

    The statistics (size, mean, std, median, max, min) of `target_column` are
    computed per `group_column` value over the train and test rows combined,
    then looked up for every row of each frame.

    target_column: numeric column to summarise (e.g. price, bedrooms, bathrooms)
    group_column: categorical column to group on (e.g. manager_id, building_id)

    Returns (selected_train, selected_test): two DataFrames with a fresh
    0..n-1 index, one row per input row in the input's order, and the columns
    '<target>_size', '<target>_mean', '<target>_std', '<target>_median',
    '<target>_max', '<target>_min'.

    Notes:
    - `train` and `test` are not modified.
    - A group with a single row has an undefined standard deviation; it is
      reported as 0.
    - Rows whose group value is missing are kept, with NaN statistics, so the
      output always lines up row-for-row with the input.
    '''
    needed = [target_column, group_column]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([train[needed], test[needed]], ignore_index=True)

    grouped = combined.groupby(group_column)[target_column]
    stats = grouped.agg(['mean', 'std', 'median', 'max', 'min'])
    stats['std'] = stats['std'].fillna(0)
    # size counts every row of the group, including rows with a missing target.
    stats.insert(0, 'size', grouped.size())
    stats.columns = ['%s_%s' % (target_column, stat) for stat in stats.columns]

    def _lookup(frame):
        # One statistics row per input row, in input order; group values that
        # are absent from `stats` (missing keys) yield NaN rows.
        return stats.reindex(frame[group_column].values).reset_index(drop=True)

    return _lookup(train), _lookup(test)

In [35]:
# Per-row price statistics grouped by manager_id, for train and test.
selected_train, selected_test = get_stats(
    train, test, target_column='price', group_column='manager_id')

In [36]:
# Preview the train-side statistics returned by get_stats.
selected_train.head()

Unnamed: 0,price_size,price_mean,price_std,price_median,price_max,price_min
0,235,2541.914894,524.703572,2600.0,4400,1700
1,194,5419.93299,1905.188512,4700.0,12995,1995
2,314,3618.716561,1318.133209,3295.0,8795,1650
3,470,3701.768085,1392.090938,3450.0,11100,1650
4,51,3114.901961,1328.458765,2870.0,7995,1495


In [37]:
# Leading rows of train, for comparison with the statistics preview above.
train[['manager_id', 'interest_level']].head()

Unnamed: 0,manager_id,interest_level
10,5ba989232d0489da1b5f2c45f6688adc,medium
10000,7533621a882f71e25173b27e3139d83d,low
100004,d9039c43983f6e564b1482b273bd7b01,high
100007,1067e078446a7897d2da493d2f741316,low
100013,98e13ad4b495b9613cef886d79a6291f,low


In [38]:
# Numeric columns to summarise per manager_id.
selected_manager_id_proj = ['bathrooms', 'bedrooms', 'listing_id']

# Start from the manager_id column alone, re-indexed 0..n-1 so it lines up with
# the frames returned by get_stats.
train_stack = train[['manager_id']].reset_index(drop=True)

# Add one set of statistic columns per target column, joining on the index.
for target_col in selected_manager_id_proj:
    tmp_train, tmp_test = get_stats(train, test, target_column=target_col)
    train_stack = train_stack.merge(tmp_train, left_index=True, right_index=True)

In [40]:
# Preview manager_id next to the stacked statistic columns.
train_stack.head()

Unnamed: 0,manager_id,bathrooms_size,bathrooms_mean,bathrooms_std,bathrooms_median,bathrooms_max,bathrooms_min,bedrooms_size,bedrooms_mean,bedrooms_std,bedrooms_median,bedrooms_max,bedrooms_min,listing_id_size,listing_id_mean,listing_id_std,listing_id_median,listing_id_max,listing_id_min
0,5ba989232d0489da1b5f2c45f6688adc,235,1.106383,0.305508,1.0,2.0,1.0,235,1.502128,0.935412,1.0,4,0,235,7031766,125976.544428,7050454,7226750,6814711
1,7533621a882f71e25173b27e3139d83d,194,1.360825,0.544603,1.0,3.5,0.0,194,1.623711,0.831787,1.0,5,0,194,7120468,74552.553594,7146956,7231089,7011029
2,d9039c43983f6e564b1482b273bd7b01,314,1.136943,0.402117,1.0,4.0,1.0,314,1.646497,1.196055,1.0,5,0,314,7038196,152151.539263,7015751,7714547,6813410
3,1067e078446a7897d2da493d2f741316,470,1.157447,0.387295,1.0,3.5,1.0,470,1.485106,0.976696,1.0,6,0,470,6945694,95778.089245,6931886,7220934,6820184
4,98e13ad4b495b9613cef886d79a6291f,51,1.176471,0.385013,1.0,2.0,1.0,51,1.54902,1.316263,2.0,4,0,51,6945288,89410.430369,6923845,7220248,6828038


In [None]:
# NOTE(review): train_df, test_df, train_stack_list and test_stack_list are not
# defined in the visible part of this notebook, and several of the columns named
# below are not created in it either — confirm they exist before running.

# Numeric columns to summarise per manager_id.
selected_manager_id_proj = [
    'bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'bad_addr', 'listing_id',
    'month', 'day', 'weekday', 'day_of_year', 'hour', 'num_features', 'num_desc',
    'bed_to_bath', 'price_per_bed', 'price_per_bath', 'bldg_count', 'zero_bldg',
    'total_room', 'room_diff', 'photo_count', 'latitude_grid', 'longitude_grid',
    'lat_long_grid',
]

# Numeric columns to summarise per bedrooms value.
selected_bedrooms_proj = [
    'price', 'listing_id', 'month', 'day', 'weekday', 'day_of_year', 'hour',
    'num_features', 'bldg_count', 'zero_bldg',
]

# Collect one train frame and one test frame of statistics per
# (group column, target column) pair: all manager_id pairs first, then bedrooms.
for group_col, target_cols in (('manager_id', selected_manager_id_proj),
                               ('bedrooms', selected_bedrooms_proj)):
    for target_col in target_cols:
        tmp_train, tmp_test = get_stats(
            train_df, test_df, target_column=target_col, group_column=group_col)
        train_stack_list.append(tmp_train)
        test_stack_list.append(tmp_test)