In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
df_train = pd.read_json('data/train.json')
df_test = pd.read_json('data/test.json')

In [3]:
df_train.interest_level.value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

In [4]:
df_train['train'] = 1
df_test['train'] = 0

In [5]:
df_train.reset_index(inplace=1)
df_test.reset_index(inplace=1)

df_all = pd.concat([df_train, df_test])

In [6]:
df_all.interest_level.fillna('', inplace=1)

In [8]:
re_unsplit = re.compile('([a-z])([A-Z])')
re_html = re.compile(r'<.{1,5}>')
re_tokens = re.compile(r'\w+')

stopwords = {'a', 'an', 'these', 'as', 'you', 'w', 'in', 'the', 'do', 'don', 't', 
             'we', 'have', 'in', 'for', 'to', 'lot', 'lots', 'of', 'it', 's', 'and',
             'by', 'is', 'are', 'been', 'or', 'any', 'me', 'at', 'that', 'via', 
             'this', 'also', 'has', 'very', 'many', 'your', 'i', 'forward', 'into', 
             'up', 'then', 'p'}

def unsplit(s):
    return re_unsplit.sub(r'\1 \2', s)

def remove_html(s):
    return re_html.sub(' ', s)

def prepare_text(s):
    s = unsplit(s)
    s = remove_html(s)
    s = s.lower()
    tokens = re_tokens.findall(s)
    tokens = [t for t in tokens if t not in stopwords]
    return ' '.join(tokens)

In [9]:
from tqdm import tqdm
tqdm.pandas(desc="pd")

In [10]:
df_all['description_cleaned'] = df_all.description.progress_apply(prepare_text)

pd: 100%|██████████| 124011/124011 [00:09<00:00, 12634.44it/s]


In [11]:
df_all.display_address = df_all.display_address.str.lower()
df_all.street_address = df_all.street_address.str.lower()

In [12]:
df_all.num_features = df_all.features.progress_apply(len)

pd: 100%|██████████| 124011/124011 [00:00<00:00, 556443.90it/s]


In [13]:
def str_features(f):
    return ' '.join(s.replace(' ', '_').lower() for s in f)

In [14]:
df_all.features = df_all.features.progress_apply(str_features)

pd: 100%|██████████| 124011/124011 [00:00<00:00, 167266.90it/s]


In [15]:
interest_level_map = {'low': 0, 'medium': 1, 'high': 2}
df_all.interest_level = df_all.interest_level.apply(lambda x: interest_level_map.get(x, -1))

In [16]:
df_all['num_photos'] = df_all.photos.apply(len)

In [21]:
train_b = set(df_all[df_all.train == 1].building_id)
test_b = set(df_all[df_all.train == 0].building_id)

In [31]:
b_cnt = df_all.building_id.value_counts()

df_all['building_cnt'] = b_cnt[df_all.building_id].reset_index(drop=1)
df_all.loc[df_all.building_cnt == 20664, 'building_cnt'] = -1

In [36]:
m_cnt = df_all.manager_id.value_counts()
df_all['manager_cnt'] = m_cnt[df_all.manager_id].reset_index(drop=1)

In [40]:
df_all['building_cnt'] = b_cnt[df_all.building_id].reset_index(drop=1)

In [60]:
def normalize_address(s):
    s = s.replace('.', '')
    s = s.replace(',', '')
    s = s.replace('\r', '')
    s = s.replace('\t', '')
    s = s.replace('avenue', 'av')
    s = s.replace('ave', 'av')
    s = s.replace('street', 'st')
    s = s.replace('east', 'e')
    s = s.replace('west', 'w')
    return s

In [61]:
normalized = df_all.street_address.apply(normalize_address)
normalized

0          792 metropolitan av
1              808 columbus av
2                  241 w 13 st
3                333 e 49th st
4               500 w 143rd st
5                350 w 18th st
6               210 w 107th st
7                155 w 21st st
8          63 hamilton terrace
9                   522 e 11th
10                1661 york av
11              644 w 173rd st
12               137 e 38th st
13                30 w 63rd st
14               315 e 56th st
15               340 e 34th st
16                  214 1st av
17                98 thayer st
18              127 w 106th st
19                  272 1st av
20              121 madison av
21               210 e 30th st
22               326 e 35th st
23               10 liberty st
24                  30 w 18 st
25                  310 e 56th
26               444 e 81st st
27               601 w 57th st
28              1215 morris av
29               435 e 79th st
                 ...          
74629             257  gold st
74630   

In [62]:
normalized.value_counts()

1 w st                  527
505 w 37th st           516
200 water st            504
90 washington st        455
3333 broadway           451
100 john st             437
401 e 34th st           401
2 gold st               396
95 wall st              356
100 maiden lane         340
8 spruce st             321
45 wall st              291
95 christopher st       291
145 e 16th st           281
200 e 72nd st           277
340 e 29th st           277
50 w 34th st            269
116 john st             259
550 w 54th st           258
620 w 42nd st           245
301 e 47th st           245
236 e 36th st           244
140 e 46th st           242
360 w 34th st           240
10 hanover square       238
560 w 43rd st           236
150 w 47th st           224
435 e 79th st           220
260 w 54th st           220
150 e 39th st           211
                       ... 
e 102nd st                1
338 e 70th st             1
313 w 75th st             1
351 e 83rd                1
410 w 24th st       

In [56]:
'19 monitor st\r', '19 monitor st\r'.replace('\r', '')

('19 monitor st\r', '19 monitor st')

In [21]:
def image_path_full(url):
    path = url[url.rfind('/')+1:]
    return 'data/images/' + path[0:7] + '/' + path

In [14]:
first = df_all.iloc[0]

In [17]:
url = first.photos[0]

In [22]:
image_path_full(url)

'data/images/7211212/7211212_1ed4542ec81621d70d1061aa833e669c.jpg'

In [20]:
url[url.rfind('/')+1:], url

('7211212_1ed4542ec81621d70d1061aa833e669c.jpg',
 'https://photos.renthop.com/2/7211212_1ed4542ec81621d70d1061aa833e669c.jpg')

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,index,interest_level,latitude,...,longitude,manager_id,photos,price,street_address,train,description_cleaned,num_photos,building_cnt,manager_cnt
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,metropolitan avenue,,10,1,40.7145,...,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 metropolitan avenue,1,brand new 3 bedroom 1 5 bath apartment enjoy f...,5,5,235
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,columbus avenue,doorman elevator fitness_center cats_allowed d...,10000,0,40.7947,...,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 columbus avenue,1,,11,51,194
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",w 13 street,laundry_in_building dishwasher hardwood_floors...,100004,2,40.7388,...,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 w 13 street,1,top top west village location beautiful pre wa...,8,131,314
3,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,east 49th street,hardwood_floors no_fee,100007,0,40.7539,...,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 east 49th street,1,building amenities garage garden fitness room ...,3,236,470
4,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,west 143rd street,pre-war,100013,0,40.8241,...,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 west 143rd street,1,beautifully renovated 3 bedroom flex 4 bedroom...,3,20664,51
5,2.0,4,38a913e46c94a7f46ddf19b756a9640c,2016-04-19 04:24:47,,west 18th street,,100014,1,40.7429,...,-74.0028,b209e2c4384a64cc307c26759ee0c651,[https://photos.renthop.com/2/6894514_9abb8592...,7995,350 west 18th street,1,,5,34,538
6,1.0,2,3ba49a93260ca5df92fde024cb4ca61f,2016-04-27 03:19:56,Stunning unit with a great location and lots o...,west 107th street,prewar elevator dogs_allowed cats_allowed lowr...,100016,0,40.8012,...,-73.9660,01287194f20de51872e81f660def4784,[https://photos.renthop.com/2/6930771_7e3622b6...,3600,210 west 107th street,1,stunning unit with great location natural ligh...,10,40,20
7,2.0,1,0372927bcb6a0949613ef5bf893bbac7,2016-04-13 06:01:42,"This huge sunny ,plenty of lights 1 bed/2 bath...",west 21st street,doorman elevator pre-war terrace laundry_in_un...,100020,0,40.7427,...,-73.9957,e6472c7237327dd3903b3d6f6a94515a,[https://photos.renthop.com/2/6867392_b18283f6...,5645,155 west 21st street,1,huge sunny plenty lights 1 bed 2 bath offers b...,5,74,6387
8,1.0,1,a7efbeb58190aa267b4a9121cd0c88c0,2016-04-20 02:36:35,<p><a website_redacted,hamilton terrace,cats_allowed dogs_allowed elevator laundry_in_...,100026,1,40.8234,...,-73.9457,c1a6598437b7db560cde66e5a297a53f,[https://photos.renthop.com/2/6898799_3759be4c...,1725,63 hamilton terrace,1,website_redacted,5,37,207
9,2.0,4,0,2016-04-02 02:58:15,This is a spacious four bedroom with every bed...,522 e 11th,dishwasher hardwood_floors,100027,0,40.7278,...,-73.9808,23a01ea7717b38875f5b070282d1b9d2,[https://photos.renthop.com/2/6814332_e19a8552...,5800,522 e 11th,1,spacious four bedroom with every bedroom able ...,9,20664,21
