In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline

import requests as re

sns.set_context('poster')
sns.set_style('whitegrid')


from sklearn.tree import export_graphviz


#Trees
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

#Preprocessing packages
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer #One hot encoding
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

#Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

#Metrics
from sklearn.metrics import confusion_matrix, auc, roc_curve, classification_report

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image 

#Bootstrap
from sklearn.utils import resample

%matplotlib inline

This file explores the NLP used to classify listings based on the type of home. In this file, we first split up the dataset (that was successfully classified) into many smaller datasets based on type (partition the data), and then train a Random Forest Classifier on each partition. The classification accuracy for each was then compared to two more models. The first model it is compared to is a basic Random Forest Classifier trained without 'typed' data. The second is a Random Forest Classifier trained with all the 'typed' data (binary columns of type). The results of partitioned Forests were then compared, to see if a partitioned Random Forest performed better than the others. I.e. are the features that are important for listing interest different for a different type of home?

In [3]:
#train = pd.read_json("typedData/train_typed.json")
#test = pd.read_json("typedData/testX.json")
data = pd.read_json("rawData/cleanedTyped.json")

In [4]:
# Split data by type
train, test, y_train, y_test = train_test_split(data, data['interest_level'], test_size=0.3, random_state=42)

train_orig = train.copy()

y_train = train['interest_level']

y_train_stud = train[train['studio'] == 1]['interest_level']
y_train_town = train[train['townhome'] == 1]['interest_level']
y_train_appt = train[train['apartment'] == 1]['interest_level']
y_train_loft = train[train['loft'] == 1]['interest_level']
y_train_cond = train[train['condominium'] == 1]['interest_level']
y_train_ph = train[train['ph'] == 1]['interest_level']
y_train_walk = train[train['walk_up'] == 1]['interest_level']


train = train.drop(['interest_level','interestVal','description','type','display_address','created','building_id','features', 'manager_id', 'street_address'], axis=1)

x_train_stud = train[train['studio'] == 1]
x_train_town = train[train['townhome'] == 1]
x_train_appt = train[train['apartment'] == 1]
x_train_loft = train[train['loft'] == 1]
x_train_cond = train[train['condominium'] == 1]
x_train_ph = train[train['ph'] == 1]
x_train_walk = train[train['walk_up'] == 1]

# Tree for studio, townhome, apartment, condominium, loft, pd

In [5]:
y_test_stud = test[test['studio'] == 1]['interest_level']
y_test_town = test[test['townhome'] == 1]['interest_level']
y_test_appt = test[test['apartment'] == 1]['interest_level']
y_test_loft = test[test['loft'] == 1]['interest_level']
y_test_cond = test[test['condominium'] == 1]['interest_level']
y_test_ph = test[test['ph'] == 1]['interest_level']
y_test_walk = test[test['walk_up'] == 1]['interest_level']

y_test = test['interest_level']
test = test.drop(['interest_level','interestVal','type','description','display_address','created','building_id','features', 'manager_id','street_address'], axis=1)


x_test_stud = test[test['studio'] == 1]
x_test_town = test[test['townhome'] == 1]
x_test_appt = test[test['apartment'] == 1]
x_test_loft = test[test['loft'] == 1]
x_test_cond = test[test['condominium'] == 1]
x_test_ph = test[test['ph'] == 1]
x_test_walk = test[test['walk_up'] == 1]

In [73]:
# Random Forest on Studio Listings
sqrt_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

sqrt_forest.set_params(n_estimators=170)
sqrt_forest.fit(x_train_stud, y_train_stud)

print("The score for a tree trained with studio listings only (and predicted for studio listings) is: {} ".format(sqrt_forest.score(x_test_stud, y_test_stud)))
print(sqrt_forest.oob_score_)

The score for a tree trained with studio listings only (and predicted for studio listings) is: 0.6727748691099477 
0.691932864104


In [74]:
# Townhouse listings only
sqrt_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

sqrt_forest.set_params(n_estimators=170)
sqrt_forest.fit(x_train_town, y_train_town)

print("The score for a tree trained and tested with townhouse listings only is: {} ".format(sqrt_forest.score(x_test_town, y_test_town)))
print(sqrt_forest.oob_score_)

The score for a tree trained and tested with townhouse listings only is: 0.7192982456140351 
0.703832752613


In [75]:
# Apartment Listings Only
sqrt_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

sqrt_forest.set_params(n_estimators=170)
sqrt_forest.fit(x_train_appt, y_train_appt)

print("The score for a tree trained and tested with apartment listings only is: {}".format(sqrt_forest.score(x_test_appt, y_test_appt)))
print(sqrt_forest.oob_score_)


# Tree for studio, townhome, apartment, condominium, loft, pd

The score for a tree trained and tested with apartment listings only is: 0.7049906090689563
0.690433783396


In [76]:
# Condominium Listings Only
sqrt_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

sqrt_forest.set_params(n_estimators=170)
sqrt_forest.fit(x_train_cond, y_train_cond)

print("The score for a tree trained and tested with condominium listings only is: {}.".format(sqrt_forest.score(x_test_cond, y_test_cond)))
print(sqrt_forest.oob_score_)

The score for a tree trained and tested with condominium listings only is: 0.7384615384615385.
0.667938931298


In [77]:
# Loft listings only
sqrt_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

sqrt_forest.set_params(n_estimators=170)
sqrt_forest.fit(x_train_loft, y_train_loft)
print("The score for a tree trained and tested with loft listings only is: {}.".format(sqrt_forest.score(x_test_loft, y_test_loft)))
print(sqrt_forest.oob_score_)

The score for a tree trained and tested with loft listings only is: 0.652317880794702.
0.672740524781


In [78]:
# Penthouse listings only
sqrt_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

sqrt_forest.set_params(n_estimators=170)
sqrt_forest.fit(x_train_ph, y_train_ph)

print("The score for a tree trained and tested with penthouse listings only is: {}.".format(sqrt_forest.score(x_test_ph, y_test_ph)))
print(sqrt_forest.oob_score_)

The score for a tree trained and tested with penthouse listings only is: 0.6415929203539823.
0.67941712204


In [80]:
# Walk in listings only
sqrt_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

sqrt_forest.set_params(n_estimators=170)
sqrt_forest.fit(x_train_walk, y_train_walk)

print("The score for a tree trained and tested with walk-in listings only is: {}.".format(sqrt_forest.score(x_test_walk, y_test_walk)))
print(sqrt_forest.oob_score_)

The score for a tree trained and tested with walk-in listings only is: 0.6666666666666666.
0.702702702703


In [120]:
# First, investigate differences in loft and Loft
loft_descrepancies = train['loft'] == train['Loft']

num_desc = [True for t in loft_descrepancies if t == False]
print(len(num_desc))
print(len(num_desc)/len(train['loft']))

brad_loft_num = train[train['loft'] == 1]
print(len(brad_loft_num))
print(len(brad_loft_num[brad_loft_num['Loft'] == 1]))

given_loft_num = len(train[train['Loft'] == 1])

print(given_loft_num)

1088
0.05499671435070515
156
28
988


In [82]:
train_untyped = train.drop(['loft','ph','condominium','apartment','townhome','studio','walk_up'],axis=1)
test_untyped = test.drop(['loft','ph','condominium','apartment','townhome','studio','walk_up'],axis=1)

In [83]:
# Tree WITHOUT type labels
sqrt_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

sqrt_forest.set_params(n_estimators=170)
sqrt_forest.fit(train_untyped, y_train)

print(sqrt_forest.score(test_untyped,y_test))
print(sqrt_forest.oob_score_)

0.720976463104
0.71157385318


In [84]:
# single tree including type labels
gen_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

gen_forest.set_params(n_estimators=170)
gen_forest.fit(train, y_train)

print(gen_forest.score(test,y_test))
print(gen_forest.oob_score_)

0.720499363868
0.712289550815


In [None]:
# The true indicator is seeing the difference in score for the 
# combined typed tree and the partitioned typed tree

In [85]:
#Partitioned Studio score: 0.6911595866819747 
print(gen_forest.score(x_test_stud,y_test_stud))

0.692408376963


In [86]:
#Partitioned Townhome score: 0.71875 
print(gen_forest.score(x_test_town,y_test_town))

0.710526315789


In [87]:
#Partitioned apartment score: 0.7007227734901385
print(gen_forest.score(x_test_appt, y_test_appt))

0.706198014489


In [88]:
#Partitioned condominium score: 0.742647058823529
print(gen_forest.score(x_test_cond, y_test_cond))

0.784615384615


In [89]:
# Partitioned Loft score: 0.6470588235294118
print(gen_forest.score(x_test_loft, y_test_loft))

0.662251655629


In [90]:
# Partitioned Penthouse score: 0.6334661354581673
print(gen_forest.score(x_test_ph, y_test_ph))

0.663716814159


In [91]:
# Partitioned Walk In score: 0.66666
print(gen_forest.score(x_test_walk, y_test_walk))

0.691358024691


In [152]:
# Notable changes: Condominium increased with general tree, Loft increased with general tree
# Minor changes: penthouse, loft, apartment did better on general tree by a very small amount

Conclusion: housing type does not seem to play a significant role in creating a better model to predict interest level, nor does it seem to play a significant role in interest level at all.

In [6]:
data = pd.read_json("rawData/cleanedTyped.json")

In [7]:
data['interest_level'].value_counts()

low       29147
medium     9491
high       3280
Name: interest_level, dtype: int64

In [8]:
train, test = train_test_split(data, test_size=0.3, random_state=42)

train['hi_other'] = ["high" if interest == "high" else "other" for interest in train['interest_level']]
train['low_other'] = ["low" if interest == "low" else "other" for interest in train['interest_level']]

hi_other = train['hi_other']
low_other = train['low_other']

train_ = train.drop(['interest_level','interestVal','type','hi_other','low_other','description','display_address','created','building_id','features', 'manager_id', 'street_address'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [9]:
test['hi_other'] = ["high" if interest == "high" else "other" for interest in test['interest_level']]
test['low_other'] = ["low" if interest == "low" else "other" for interest in test['interest_level']]

hi_other_t = test['hi_other']
low_other_t = test['low_other']

test_ = test.drop(['interest_level','interestVal','type','hi_other','low_other','description','display_address','created','building_id','features', 'manager_id', 'street_address'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [108]:
# single tree including type labels
gen_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

gen_forest.set_params(n_estimators=170)
gen_forest.fit(train_, hi_other)

print(gen_forest.score(test_,hi_other_t))
print(gen_forest.oob_score_)

0.92183524173
0.923829323155


In [115]:
print(hi_other_t.value_counts())
print(classification_report(hi_other_t,gen_forest.predict(test_)))
print(confusion_matrix(hi_other_t,gen_forest.predict(test_),labels=['high','other']))

other    11574
high      1002
Name: hi_other, dtype: int64
             precision    recall  f1-score   support

       high       0.53      0.16      0.25      1002
      other       0.93      0.99      0.96     11574

avg / total       0.90      0.92      0.90     12576

[[  161   841]
 [  142 11432]]


In [11]:
# single tree including type labels
gen_forest = RandomForestClassifier(warm_start=True, 
                             oob_score=True,
                             max_features="sqrt",
                             random_state = 1)

gen_forest.set_params(n_estimators=170)
gen_forest.fit(train_, low_other)
predicted1 = gen_forest.predict(test_)


print(gen_forest.score(test_,low_other_t))
print(gen_forest.oob_score_)
print(predicted1)

0.770038167939
0.761809010974
['low' 'other' 'low' ..., 'low' 'low' 'other']


In [117]:
print(low_other_t.value_counts())
print(classification_report(low_other_t,gen_forest.predict(test_)))
print(confusion_matrix(low_other_t,gen_forest.predict(test_),labels=['low','other']))

low      8780
other    3796
Name: low_other, dtype: int64
             precision    recall  f1-score   support

        low       0.80      0.90      0.85      8780
      other       0.67      0.47      0.55      3796

avg / total       0.76      0.77      0.76     12576

[[7907  873]
 [2019 1777]]
