# SVM Exploration

In [1]:
# Necessary Imports
import pandas as pd
import numpy as np

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## SVM on Full Data Set

In [2]:
# Load Data non interaction data
df = pd.read_json('./rawData/cleaned.json')
df.columns

Index(['Balcony', 'Cats_Allowed', 'Common_Outdoor_Space', 'Dining_Room',
       'Dishwasher', 'Dogs_Allowed', 'Doorman', 'Elevator', 'Exclusive',
       'Fitness_Center', 'Garden_Patio', 'Hardwood_Floors',
       'High_Speed_Internet', 'Laundry_in_Unit', 'Loft', 'New_Construction',
       'No_Fee', 'Outdoor_Space', 'Pre_War', 'Roof_Deck', 'Swimming_Pool',
       'Terrace', 'Wheelchair_Access', 'bathrooms', 'bedrooms', 'building_id',
       'created', 'description', 'display_address', 'features', 'interestVal',
       'interest_level', 'latitude', 'laundry_in_building', 'listing_id',
       'longitude', 'manager_id', 'price', 'street_address'],
      dtype='object')

In [3]:
# Split into Training and Test Sets
X_tr, X_test, Y_tr, Y_test = train_test_split(df.drop(['interest_level'],axis=1), df['interest_level'], test_size=0.3, random_state=42)
X_tr_wo_obj = X_tr.select_dtypes(exclude=['object'])
X_tr_wo_obj = X_tr_wo_obj.drop(['interestVal'],axis=1)
X_test_wo_obj = X_test.select_dtypes(exclude=['object'])
X_test_wo_obj = X_test_wo_obj.drop(['interestVal'],axis=1)

# Fit SVM to Data with Interaction Terms
svm_model = SVC()
svm_fit = svm_model.fit(X_tr_wo_obj, Y_tr)

In [4]:
X_test_wo_obj.columns

Index(['Balcony', 'Cats_Allowed', 'Common_Outdoor_Space', 'Dining_Room',
       'Dishwasher', 'Dogs_Allowed', 'Doorman', 'Elevator', 'Exclusive',
       'Fitness_Center', 'Garden_Patio', 'Hardwood_Floors',
       'High_Speed_Internet', 'Laundry_in_Unit', 'Loft', 'New_Construction',
       'No_Fee', 'Outdoor_Space', 'Pre_War', 'Roof_Deck', 'Swimming_Pool',
       'Terrace', 'Wheelchair_Access', 'bathrooms', 'bedrooms', 'latitude',
       'laundry_in_building', 'listing_id', 'longitude', 'price'],
      dtype='object')

In [5]:
# Predict Test Data with SVM
print('Score of SVM with no interaction terms = %s' % svm_model.score(X_test_wo_obj, Y_test))

Score of SVM with no interaction terms = 0.698075699746


This is a better percent accuracy than linear regression.  Let's see if interaction terms can further improve this.

In [6]:
# Load Data with Interaction Terms
df2 = pd.read_json('./typedData/withInteraction.json')
df2.columns

Index(['Balcony', 'Cats_Allowed', 'Common_Outdoor_Space', 'Dining_Room',
       'Dishwasher', 'Dogs_Allowed', 'Doorman', 'Elevator', 'Exclusive',
       'Fitness_Center', 'Garden_Patio', 'Hardwood_Floors',
       'High_Speed_Internet', 'Laundry_in_Unit', 'Loft', 'New_Construction',
       'No_Fee', 'Outdoor_Space', 'Pre_War', 'Roof_Deck', 'Swimming_Pool',
       'Terrace', 'Wheelchair_Access', 'ada', 'bathrooms', 'bedrooms',
       'building_id', 'created', 'description', 'display_address', 'door_excl',
       'features', 'fitness_oriented', 'interestVal', 'interest_level',
       'latitude', 'laundry_in_building', 'listing_id', 'longitude',
       'lux_score', 'manager_id', 'num_features', 'num_luxury',
       'outdoor_score', 'pets_allowed', 'price', 'street_address'],
      dtype='object')

In [8]:
# Split into Training and Test Sets
X_tr_2, X_test_2, Y_tr_2, Y_test_2 = train_test_split(df2.drop(['interest_level'],axis=1), df2['interest_level'], test_size=0.3, random_state=42)
X_tr_wo_obj_2 = X_tr_2.select_dtypes(exclude=['object'])
X_tr_wo_obj_2 = X_tr_wo_obj_2.drop(['interestVal'],axis=1)
X_test_wo_obj_2 = X_test_2.select_dtypes(exclude=['object'])
X_test_wo_obj_2 = X_test_wo_obj_2.drop(['interestVal'],axis=1)

# Fit SVM to Data with Interaction Terms
svm_model_2 = SVC()
svm_fit_2 = svm_model_2.fit(X_tr_wo_obj_2, Y_tr_2)

In [9]:
# Predict Test Data with SVM
print('Score of SVM with no interaction terms = %s' % svm_model_2.score(X_test_wo_obj_2, Y_test_2))

Score of SVM with no interaction terms = 0.697996183206


Including interaction terms does not improve the classification accuracy of SVM.  With and without interaction terms, the percent accuracy is around 69.8%.  I will now investigate if breaking out the listings by home type (apartment, penthouse, townhome, etc.) will improve the percent accuracy.

## SVM on Typed Data Set

In [10]:
# Load Typed Data without Interaction Terms
df3 = pd.read_json('./rawData/cleanedTyped.json')
df3.columns

Index(['Balcony', 'Cats_Allowed', 'Common_Outdoor_Space', 'Dining_Room',
       'Dishwasher', 'Dogs_Allowed', 'Doorman', 'Elevator', 'Exclusive',
       'Fitness_Center', 'Garden_Patio', 'Hardwood_Floors',
       'High_Speed_Internet', 'Laundry_in_Unit', 'New_Construction', 'No_Fee',
       'Outdoor_Space', 'Pre_War', 'Roof_Deck', 'Swimming_Pool', 'Terrace',
       'Wheelchair_Access', 'apartment', 'bathrooms', 'bedrooms',
       'building_id', 'condominium', 'created', 'description',
       'display_address', 'features', 'foundType', 'interestVal',
       'interest_level', 'latitude', 'laundry_in_building', 'listing_id',
       'loft', 'longitude', 'manager_id', 'other', 'ph', 'price',
       'street_address', 'studio', 'townhome', 'type', 'walk_up'],
      dtype='object')

In [17]:
df3['type'].value_counts()

apartment      24859
other          12390
studio          2611
ph               775
townhome         401
condominium      392
walk_up          266
loft             224
Name: type, dtype: int64

In [28]:
grouped = df3.groupby(['type'])
grouped.groups

{'apartment': Int64Index([    0, 10002, 10003, 10005, 10006, 10008, 10009,  1001, 10010,
             10012,
             ...
              9987,  9988,  9991,  9992,  9993,  9994,  9995,  9996,  9997,
              9998],
            dtype='int64', length=24859),
 'condominium': Int64Index([10001, 10022, 10167,  1036, 10360, 10540, 10813, 10964, 10974,
             11334,
             ...
              8958,  8977,  9006,   903,  9226,  9394,    95,  9697,  9768,
              9911],
            dtype='int64', length=392),
 'loft': Int64Index([10179, 10197, 10227, 10357, 10405, 10671, 10701, 10864,  1090,
             11093,
             ...
               745,  7881,   859,  8618,  8679,  8752,  8966,  9385,  9896,
               998],
            dtype='int64', length=224),
 'other': Int64Index([    1,    10,   100, 10000, 10007, 10011, 10015, 10016, 10017,
             10019,
             ...
              9956,  9957,  9963,  9968,  9970,  9971,  9978,  9979,  9986,
              

In [41]:
# Load Typed Data with Interaction Terms
df4 = pd.read_json('./typedData/compsTypedInteraction.json')
df4.columns

Index(['Balcony', 'Cats_Allowed', 'Common_Outdoor_Space', 'Dining_Room',
       'Dishwasher', 'Dogs_Allowed', 'Doorman', 'Elevator', 'Exclusive',
       'Fitness_Center', 'Garden_Patio', 'Hardwood_Floors',
       'High_Speed_Internet', 'Laundry_in_Unit', 'New_Construction', 'No_Fee',
       'Outdoor_Space', 'Pre_War', 'Roof_Deck', 'Swimming_Pool', 'Terrace',
       'Wheelchair_Access', 'ada', 'apartment', 'avg_lux_score',
       'avg_num_features', 'avg_num_luxury', 'avg_outdoor_score',
       'avg_price_per_feature', 'avg_price_per_num_lux', 'bathrooms',
       'bedrooms', 'building_id', 'condominium', 'created', 'description',
       'display_address', 'door_excl', 'features', 'fitness_oriented',
       'foundType', 'interestVal', 'interest_level', 'latitude',
       'laundry_in_building', 'listing_id', 'loft', 'longitude',
       'lux_per_dollar', 'lux_ratio', 'lux_score', 'manager_id', 'numPhotos',
       'num_features', 'num_luxury', 'other', 'outdoor_ratio', 'outdoor_score',
    

In [44]:
grouped_2 = df4.groupby(['type'])
grouped_2.groups

{'apartment': Int64Index([    0,     1,    10,   100,  1000, 10000, 10001, 10002, 10003,
             10004,
             ...
              9990,  9991,  9992,  9993,  9994,  9995,  9996,  9997,  9998,
              9999],
            dtype='int64', length=24859),
 'condominium': Int64Index([39860, 39861, 39862, 39863, 39864, 39865, 39866, 39867, 39868,
             39869,
             ...
             40242, 40243, 40244, 40245, 40246, 40247, 40248, 40249, 40250,
             40251],
            dtype='int64', length=392),
 'loft': Int64Index([41694, 41695, 41696, 41697, 41698, 41699, 41700, 41701, 41702,
             41703,
             ...
             41908, 41909, 41910, 41911, 41912, 41913, 41914, 41915, 41916,
             41917],
            dtype='int64', length=224),
 'other': Int64Index([24859, 24860, 24861, 24862, 24863, 24864, 24865, 24866, 24867,
             24868,
             ...
             37239, 37240, 37241, 37242, 37243, 37244, 37245, 37246, 37247,
             3

### SVM on studios only

In [37]:
studios = grouped.get_group('studio').copy()
studios.drop(['other','ph','apartment','townhome','type','condominium','loft','walk_up','foundType'],axis=1, inplace=True)
studios.columns

Index(['Balcony', 'Cats_Allowed', 'Common_Outdoor_Space', 'Dining_Room',
       'Dishwasher', 'Dogs_Allowed', 'Doorman', 'Elevator', 'Exclusive',
       'Fitness_Center', 'Garden_Patio', 'Hardwood_Floors',
       'High_Speed_Internet', 'Laundry_in_Unit', 'New_Construction', 'No_Fee',
       'Outdoor_Space', 'Pre_War', 'Roof_Deck', 'Swimming_Pool', 'Terrace',
       'Wheelchair_Access', 'bathrooms', 'bedrooms', 'building_id', 'created',
       'description', 'display_address', 'features', 'interestVal',
       'interest_level', 'latitude', 'laundry_in_building', 'listing_id',
       'longitude', 'manager_id', 'price', 'street_address', 'studio'],
      dtype='object')

In [39]:
# Split into Training and Test Sets
X_tr_studio, X_test_studio, Y_tr_studio, Y_test_studio = train_test_split(studios.drop(['interest_level'],axis=1), studios['interest_level'], test_size=0.3, random_state=42)
X_tr_studio_wo_obj = X_tr_studio.select_dtypes(exclude=['object'])
X_tr_studio_wo_obj = X_tr_studio_wo_obj.drop(['interestVal'],axis=1)
X_test_studio_wo_obj = X_test_studio.select_dtypes(exclude=['object'])
X_test_studio_wo_obj = X_test_studio_wo_obj.drop(['interestVal'],axis=1)

# Fit SVM to Studio Data with Interaction Terms
svm_model_studio = SVC()
svm_fit_studio = svm_model_studio.fit(X_tr_studio_wo_obj, Y_tr_studio)

In [67]:
# Predict Studio Test Data with SVM
print('Score of SVM on studios with no interaction terms = %s' % svm_model_studio.score(X_test_studio_wo_obj, Y_test_studio))

Score of SVM on studios with no interaction terms = 0.633928571429


SVM on only studio listings is 63.4% classification accuracy.  This is less than the training SVM on all the listings.

In [46]:
studios2 = grouped_2.get_group('studio').copy()
studios2.drop(['other','ph','apartment','townhome','type','condominium','loft','walk_up','foundType'],axis=1, inplace=True)
studios2.columns

Index(['Balcony', 'Cats_Allowed', 'Common_Outdoor_Space', 'Dining_Room',
       'Dishwasher', 'Dogs_Allowed', 'Doorman', 'Elevator', 'Exclusive',
       'Fitness_Center', 'Garden_Patio', 'Hardwood_Floors',
       'High_Speed_Internet', 'Laundry_in_Unit', 'New_Construction', 'No_Fee',
       'Outdoor_Space', 'Pre_War', 'Roof_Deck', 'Swimming_Pool', 'Terrace',
       'Wheelchair_Access', 'ada', 'avg_lux_score', 'avg_num_features',
       'avg_num_luxury', 'avg_outdoor_score', 'avg_price_per_feature',
       'avg_price_per_num_lux', 'bathrooms', 'bedrooms', 'building_id',
       'created', 'description', 'display_address', 'door_excl', 'features',
       'fitness_oriented', 'interestVal', 'interest_level', 'latitude',
       'laundry_in_building', 'listing_id', 'longitude', 'lux_per_dollar',
       'lux_ratio', 'lux_score', 'manager_id', 'numPhotos', 'num_features',
       'num_luxury', 'outdoor_ratio', 'outdoor_score', 'pets_allowed', 'price',
       'price_feature_ratio', 'price_lux_rat

In [47]:
# Split into Training and Test Sets
X_tr_studio2, X_test_studio2, Y_tr_studio2, Y_test_studio2 = train_test_split(studios2.drop(['interest_level'],axis=1), studios2['interest_level'], test_size=0.3, random_state=42)
X_tr_studio2_wo_obj = X_tr_studio2.select_dtypes(exclude=['object'])
X_tr_studio2_wo_obj = X_tr_studio2_wo_obj.drop(['interestVal'],axis=1)
X_test_studio2_wo_obj = X_test_studio2.select_dtypes(exclude=['object'])
X_test_studio2_wo_obj = X_test_studio2_wo_obj.drop(['interestVal'],axis=1)

# Fit SVM to Studio Data with Interaction Terms
svm_model_studio2 = SVC()
svm_fit_studio2 = svm_model_studio2.fit(X_tr_studio2_wo_obj, Y_tr_studio2)

In [68]:
# Predict Studio Test Data with SVM
print('Score of SVM on studios with interaction terms = %s' % svm_model_studio2.score(X_test_studio2_wo_obj, Y_test_studio2))

Score of SVM on studios with interaction terms = 0.635204081633


Can see that adding interaction terms does not improve the fit that much.

### SVM on all other types of listings with interaction terms

In [74]:
for name, group in grouped:
    if name == 'studio':
        continue
    typed = grouped.get_group(name).copy()
    typed.drop(['other','ph','apartment','townhome','type','condominium','loft','walk_up','foundType'],axis=1, inplace=True)
    # Split into Training and Test Sets
    X_tr_typed, X_test_typed, Y_tr_typed, Y_test_typed = train_test_split(typed.drop(['interest_level'],axis=1), typed['interest_level'], test_size=0.3, random_state=42)
    X_tr_typed_wo_obj = X_tr_typed.select_dtypes(exclude=['object'])
    X_tr_typed_wo_obj = X_tr_typed_wo_obj.drop(['interestVal'],axis=1)
    X_test_typed_wo_obj = X_test_typed.select_dtypes(exclude=['object'])
    X_test_typed_wo_obj = X_test_typed_wo_obj.drop(['interestVal'],axis=1)

    # Fit SVM to Specific Lisitng Type with Interaction Terms
    svm_model_typed = SVC()
    svm_fit_typed = svm_model_typed.fit(X_tr_typed_wo_obj, Y_tr_typed)
    
    # Predict Studio Test Data with SVM
    print('Score of SVM on %s with interaction terms = %s' % (name, svm_model_typed.score(X_test_typed_wo_obj, Y_test_typed)))

Score of SVM on apartment with interaction terms = 0.675918476803
Score of SVM on condominium with interaction terms = 0.694915254237
Score of SVM on loft with interaction terms = 0.75
Score of SVM on other with interaction terms = 0.756793112725
Score of SVM on ph with interaction terms = 0.643776824034
Score of SVM on townhome with interaction terms = 0.677685950413
Score of SVM on walk_up with interaction terms = 0.575


Using SVM, our model is pretty good at classifying lofts.  Surprisingly, our model is also good at classifying listings we were not able to type.  Our SVM model is mediocre at classifying interest level in apartments, condos, studios, penthouses, and townhomes.  Contrarily, our SVM model performs poorly on classifying interest level in walk ups.  We do not get that much improvement by fitting a different SVM model by home type, thus we are better off having one SVM model for all home types.

## SVM on Reduced Feature Data

In [6]:
# Load Typed Data with Building Manager Probability and Building Interest Probability of Interaction Terms
df5 = pd.read_json('./typedData/compsTypedInteraction.json')
grouped3 = df5.groupby(['type'])
df5.select_dtypes(exclude=['object']).columns

Index(['Balcony', 'Cats_Allowed', 'Common_Outdoor_Space', 'Dining_Room',
       'Dishwasher', 'Dogs_Allowed', 'Doorman', 'Elevator', 'Exclusive',
       'Fitness_Center', 'Garden_Patio', 'Hardwood_Floors',
       'High_Speed_Internet', 'Laundry_in_Unit', 'New_Construction', 'No_Fee',
       'Outdoor_Space', 'Pre_War', 'Roof_Deck', 'Swimming_Pool', 'Terrace',
       'Wheelchair_Access', 'ada', 'apartment', 'avg_lux_score',
       'avg_num_features', 'avg_num_luxury', 'avg_outdoor_score',
       'avg_price_per_feature', 'avg_price_per_num_lux', 'bathrooms',
       'bedrooms', 'condominium', 'door_excl', 'fitness_oriented', 'foundType',
       'interestVal', 'latitude', 'laundry_in_building', 'listing_id', 'loft',
       'longitude', 'lux_per_dollar', 'lux_ratio', 'lux_score', 'numPhotos',
       'num_features', 'num_luxury', 'other', 'outdoor_ratio', 'outdoor_score',
       'pets_allowed', 'ph', 'price', 'price_feature_ratio', 'price_lux_ratio',
       'price_per_feature', 'price_per_num

In [7]:
for name, group in grouped3:
    typed = grouped3.get_group(name).copy()
    typed.drop(['other','ph','apartment','townhome','type','condominium','loft','walk_up','foundType','listing_id'],axis=1, inplace=True)
    # Split into Training and Test Sets
    X_tr_typed, X_test_typed, Y_tr_typed, Y_test_typed = train_test_split(typed.drop(['interest_level'],axis=1), typed['interest_level'], test_size=0.3, random_state=42)
    X_tr_typed_wo_obj = X_tr_typed.select_dtypes(exclude=['object'])
    X_tr_typed_wo_obj = X_tr_typed_wo_obj.drop(['interestVal'],axis=1)
    X_test_typed_wo_obj = X_test_typed.select_dtypes(exclude=['object'])
    X_test_typed_wo_obj = X_test_typed_wo_obj.drop(['interestVal'],axis=1)

    # Fit SVM to Specific Lisitng Type with Interaction Terms
    svm_model_typed = SVC()
    svm_fit_typed = svm_model_typed.fit(X_tr_typed_wo_obj, Y_tr_typed)
    
    # Predict Studio Test Data with SVM
    print('Score of SVM on %s with interaction terms = %s' % (name, svm_model_typed.score(X_test_typed_wo_obj, Y_test_typed)))

Score of SVM on apartment with interaction terms = 0.666264414052
Score of SVM on condominium with interaction terms = 0.686440677966
Score of SVM on loft with interaction terms = 0.735294117647
Score of SVM on other with interaction terms = 0.751950497713
Score of SVM on ph with interaction terms = 0.630901287554
Score of SVM on studio with interaction terms = 0.645408163265
Score of SVM on townhome with interaction terms = 0.669421487603
Score of SVM on walk_up with interaction terms = 0.6125


With a reduced number of features, we get the following improvements/non-improvements in percent accuracy by type:

Apartment: -0.97%

Condominium: -0.85%

Loft: -1.47%

Other: -0.48%

Penthouse: -1.29%

Studio: 1.02%

Townhome: -0.83%

Walk up: 3.75%

In general, reducing the number of features causes a regression in percent accuracy.  Thus, using all the interaction terms keeps the percent accuracy higher.