In [242]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

import requests as re
from html.parser import HTMLParser

# Plot appearance for the whole notebook: 'poster' scaling on a 'whitegrid' style.
sns.set_context(context='poster')
sns.set_style(style='whitegrid')

In [243]:
# Read train_data.json into the main working frame.
df = pd.read_json(path_or_buf='./rawData/train_data.json')

In [244]:
# Read test_data.json into its own frame.
test = pd.read_json(path_or_buf='./rawData/test_data.json')

In [245]:
# Flag every row of this frame so it can be told apart once combined with df.
test = test.assign(test=True)

In [246]:
# Stack the rows of `test` under `df` and renumber the index 0..n-1.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent, and ignore_index=True replaces the separate reset_index call.
df = pd.concat([df, test], ignore_index=True)

In [247]:
# Rows with no flag get False. Assign the result back instead of calling
# fillna(inplace=True) on the df['test'] selection, which may act on a
# temporary copy, and cast to a real boolean dtype so the column can be used
# directly as a mask.
df['test'] = df['test'].fillna(False).astype(bool)

In [248]:
# Normalise column labels: spaces, hyphens and slashes all become underscores.
df.columns = [
    name.replace(' ', '_').replace('-', '_').replace('/', '_')
    for name in df.columns
]

In [249]:
# Bedrooms as floats, with missing values counted as 0.
# Results are assigned back rather than using fillna(inplace=True) on a column
# selection, which may modify a temporary copy instead of df.
df['bedrooms'] = df['bedrooms'].astype(float).fillna(0)
# Missing bathroom counts are likewise treated as 0.
df['bathrooms'] = df['bathrooms'].fillna(0)

In [250]:
# Remove the 'index' and 'level_0' columns.
df = df.drop(columns=['index', 'level_0'])
# Numeric version of interest_level: high -> 1, medium -> 0.5, low -> 0.
df['interestVal'] = df['interest_level'].map(
    {'high': 1, 'medium': 0.5, 'low': 0}
)

In [251]:
class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text content."""

    def __init__(self):
        # Run the base initialiser instead of calling reset() by hand, so every
        # attribute HTMLParser relies on is set up regardless of Python version.
        super().__init__(convert_charrefs=True)
        # Retained from the original implementation; HTMLParser itself no
        # longer uses a `strict` attribute.
        self.strict = False
        # Text fragments in the order the parser reports them.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

In [252]:
def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Parameters
    ----------
    html : str
        Text that may contain HTML tags and character references.

    Returns
    -------
    str
        The concatenated text reported by the parser.
    """
    s = MLStripper()
    s.feed(html)
    # close() makes the parser process text it is still buffering. Without it,
    # input ending in something that looks like an unfinished character
    # reference (e.g. "... AT&T") never reaches handle_data and is lost.
    s.close()
    return s.get_data()

In [253]:
def descrClean(x):
    """Strip HTML markup from ``x`` and return the remaining text lower-cased."""
    return strip_tags(x).lower()

In [254]:
# Replace each description with its tag-free, lower-cased form.
df['description'] = df['description'].map(descrClean)

In [255]:
# Value counts for the capital-"In" spelling, to compare against the
# lower-case "in" spelling of the same feature.
df.loc[:, 'Laundry_In_Building'].value_counts()

0    46759
1     2593
Name: Laundry_In_Building, dtype: int64

In [256]:
# Value counts for the lower-case "in" spelling, to compare against the
# capital-"In" spelling of the same feature.
df.loc[:, 'Laundry_in_Building'].value_counts()

0    33008
1    16344
Name: Laundry_in_Building, dtype: int64

In [257]:
def combine_laundry(col1, col2):
    """Return ``col1`` unless it equals 0, in which case return ``col2``."""
    return col1 if col1 != 0 else col2

In [258]:
# Merge the two differently-cased laundry columns into one: keep the
# 'Laundry_in_Building' value unless it is 0, otherwise fall back to
# 'Laundry_In_Building'. Series.where does this for the whole column at once
# instead of calling a Python function for every row.
df['laundry_in_building'] = df['Laundry_in_Building'].where(
    df['Laundry_in_Building'] != 0, df['Laundry_In_Building']
)

# Drop old laundry in building columns
df = df.drop(columns=['Laundry_in_Building', 'Laundry_In_Building'])

In [259]:
# The photos column only holds links to images, so remove it.
df = df.drop(columns=['photos'])

In [260]:
# Write the cleaned frame to disk.
df.to_json(path_or_buf='./cleaningTest/cleaned.json')

To determine the type of rental unit, we apply a basic NLP step: keyword matching on each listing's description.

In [261]:
# Keyword groups for each unit type. The first entry of every group is the
# label that unitType returns when any keyword in the group matches.
apt = ['apartment', 'apt']
condo = ['condominium', 'condo']
walkUp = ['walk_up', 'walk-up', 'walkup', 'walk up']
studio = ['studio']
# 'penthouse' added: the list previously held only the misspelling 'penhouse',
# which is kept so descriptions containing that typo still match.
ph = ['ph', 'penthouse', 'penhouse']
townhome = ['townhome', 'duplex', 'townhouse']
loft = ['loft']

# Groups are checked in this order; the first match wins.
types = [apt, condo, walkUp, studio, ph, townhome, loft]

In [262]:
# Function to Classify Unit Types
def unitType(x, unit_types=None):
    """Classify a listing description by the first keyword group it mentions.

    Parameters
    ----------
    x : str
        Description text to search.
    unit_types : list of list of str, optional
        Keyword groups to check, in priority order. The first entry of each
        group is the label returned for a match. Defaults to the notebook's
        module-level ``types`` list.

    Returns
    -------
    str
        The label of the first group with a matching keyword, or ``'other'``
        when no keyword is found.

    Notes
    -----
    Keywords match as whole words (an optional trailing "s" is allowed), so
    "photos" is not read as "ph" and "captivating" is not read as "apt".
    """
    # Local import under an alias: this notebook binds the name `re` to the
    # `requests` package, so the regex module is not reachable as `re`.
    import re as _re

    groups = types if unit_types is None else unit_types
    for group in groups:
        for keyword in group:
            pattern = r'\b' + _re.escape(keyword) + r's?\b'
            if _re.search(pattern, x):
                return group[0]
    return 'other'

In [263]:
# Classify every listing from the keywords in its description.
df['type'] = df['description'].apply(unitType)
# True when the assigned label does not contain 'other', i.e. a type was found.
df['foundType'] = ~df['type'].str.contains('other')

# One indicator column per detected type.
df = pd.concat([df, pd.get_dummies(df['type'])], axis=1)

# Fold the existing 'Loft' column into the new 'loft' indicator, then drop it.
# Done column-wise with an explicit 0/1 integer result, so the outcome does
# not depend on the dtype get_dummies produces, and assigned back rather than
# filled in place on a column selection.
df['loft'] = (
    df['loft'].fillna(0).astype(bool) | df['Loft'].fillna(0).astype(bool)
).astype('int64')
df = df.drop(columns='Loft')

In [264]:
# Write the frame, now including the unit-type columns, to disk.
df.to_json(path_or_buf='./cleaningTest/cleanedTyped.json')

In [265]:
# Luxury score: the sum of the ten luxury amenity columns divided by 10, so a
# higher score means more of these amenities are present.
# skipna=False keeps a missing value in any column as a missing score.
df['lux_score'] = df[[
    'Exclusive', 'Doorman', 'Outdoor_Space', 'New_Construction', 'Roof_Deck',
    'Fitness_Center', 'Swimming_Pool', 'Elevator', 'Laundry_in_Unit',
    'Hardwood_Floors',
]].sum(axis=1, skipna=False) / 10

In [266]:
# Per-manager and per-(building, manager) averages of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where mean() raises on
# non-numeric columns instead of silently dropping them.
agentGroup = df.groupby(['manager_id']).mean(numeric_only=True)
buildingGroup = df.groupby(['building_id', 'manager_id']).mean(numeric_only=True)

In [267]:
# Mean interestVal per (building_id, manager_id), as a flat table with the
# group keys as ordinary columns.
buildingAvg = (
    buildingGroup[['interestVal']]
    .rename(columns={'interestVal': 'prob_interest_building'})
    .reset_index()
)

In [268]:
# Mean interestVal per manager_id, as a flat table with the group key as an
# ordinary column.
managerAvg = (
    agentGroup[['interestVal']]
    .rename(columns={'interestVal': 'prob_interest_manager'})
    .reset_index()
)

In [269]:
# Attach the manager-level and building-level averages to every listing;
# left joins keep all rows of df.
df = (
    df
    .merge(managerAvg, on='manager_id', how='left')
    .merge(buildingAvg, on=['building_id', 'manager_id'], how='left')
)

In [270]:
# Average of the building-level and manager-level interest values.
df['prob_buildManager'] = (
    df['prob_interest_building'].add(df['prob_interest_manager']).div(2)
)

In [271]:
# Total room count: bedrooms plus bathrooms.
df['rooms'] = df['bedrooms'].add(df['bathrooms'])

In [272]:
# Number of luxury features: the sum of the ten luxury amenity columns.
# skipna=False keeps a missing value in any column as a missing total.
df['num_luxury'] = df[[
    'Exclusive', 'Doorman', 'Outdoor_Space', 'New_Construction', 'Roof_Deck',
    'Fitness_Center', 'Swimming_Pool', 'Elevator', 'Laundry_in_Unit',
    'Hardwood_Floors',
]].sum(axis=1, skipna=False)

In [273]:
# Length of each listing's 'features' entry.
df['num_features'] = df['features'].map(len)

In [274]:
# Each interaction term below is the product of two columns, so for 0/1 flags
# it is 1 only when both are set.

# ADA-compatible: elevator and wheelchair access.
df['ada'] = df['Elevator'].mul(df['Wheelchair_Access'])

# Outdoor score: sum of the six outdoor-space columns divided by 6; a higher
# score means more of these features are present.
df['outdoor_score'] = df[[
    'Outdoor_Space', 'Balcony', 'Common_Outdoor_Space',
    'Garden_Patio', 'Roof_Deck', 'Terrace',
]].sum(axis=1, skipna=False) / 6

# Fitness oriented: fitness center and swimming pool.
df['fitness_oriented'] = df['Fitness_Center'].mul(df['Swimming_Pool'])

# Doorman and exclusive.
df['door_excl'] = df['Doorman'].mul(df['Exclusive'])

# Pets allowed: cats and dogs.
df['pets_allowed'] = df['Cats_Allowed'].mul(df['Dogs_Allowed'])

In [None]:
# Price divided by the number of listed features. Listings with zero features
# produce an infinite ratio, which is recorded as missing instead.
# The replacement is assigned back rather than applied in place to a column
# selection, which may modify a temporary copy instead of df.
df['price_per_feature'] = (
    (df['price'] / df['num_features']).replace([np.inf, -np.inf], np.nan)
)

In [None]:
# Per-type averages of the numeric columns, with 'type' as an ordinary column.
# numeric_only=True is required on pandas >= 2.0, where mean() raises on
# non-numeric columns.
# NOTE(review): `interaction_typed` is not defined in any cell visible here;
# confirm where it comes from before running this cell.
g1 = interaction_typed.groupby(['type']).mean(numeric_only=True).reset_index()

In [None]:
# Keep the type label plus the per-type averages of interest.
# NOTE(review): 'price_per_num_lux' is not created in any cell visible here;
# confirm that g1 actually carries this column.
avgs = g1.loc[:, [
    'type',
    'lux_score',
    'num_features',
    'num_luxury',
    'outdoor_score',
    'price_per_num_lux',
    'price_per_feature',
]]

In [None]:
# Prefix every column except the 'type' join key with 'avg_'.
avgs = avgs.rename(columns=lambda col: col if col == 'type' else 'avg_' + col)
# Attach the per-type averages to each listing, joining on 'type'.
avgAdded = df.merge(avgs, on='type')

In [275]:
# Training rows: everything not flagged as test, with rows containing any
# missing value removed and the helper flag dropped.
train = df[df['test'].eq(False)].dropna().drop(columns='test')

# Processed test rows (flag column retained), re-indexed from 0.
testTest = df[df['test'].eq(True)].reset_index(drop=True)

# Rebind `test` to the processed test rows without the helper flag.
# Dropping the flag from the raw frame loaded at the top left `test` without
# any of the cleaning or engineered columns that `train` has, so the file
# later written from `test` did not match the training data.
test = testTest.drop(columns='test')

In [276]:
# How many rows are flagged as test versus not.
df.loc[:, 'test'].value_counts()

False    41918
True      7434
Name: test, dtype: int64

In [277]:
# Write both splits to disk.
train.to_json(path_or_buf='./cleaningTest/train.json')
test.to_json(path_or_buf='./cleaningTest/test.json')