In [1]:
import pandas as pd
import sklearn
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn import linear_model
from sklearn.kernel_approximation import Nystroem

## Method Testing

In [3]:
sales = pd.read_csv('sales_mod.csv')

  sales = pd.read_csv('sales_mod.csv')


In [4]:
sales.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
sales.set_index('sale_key', inplace=True)

In [6]:
sales.columns

Index(['township_code', 'class', 'pin', 'year', 'sale_date', 'sale_price',
       'sale_price_log10', 'doc_no', 'deed_type', 'seller_name',
       'is_multisale', 'num_parcels_sale', 'buyer_name', 'sale_type',
       'sale_filter_lower_limit', 'sale_filter_upper_limit',
       'sale_filter_count', 'property_advertised',
       'is_installment_contract_fulfilled',
       'is_sale_between_related_individuals_or_corporate_affiliates',
       'is_transfer_of_less_than_100_percent_interest',
       'is_court_ordered_sale', 'is_sale_in_lieu_of_foreclosure',
       'is_condemnation', 'is_short_sale', 'is_bank_reo_real_estate_owned',
       'is_auction_sale', 'is_seller_buyer_a_relocation_company',
       'is_seller_buyer_a_financial_institution_or_government_agency',
       'is_buyer_a_real_estate_investment_trust', 'is_buyer_a_pension_fund',
       'is_buyer_an_adjacent_property_owner',
       'is_buyer_exercising_an_option_to_purchase',
       'is_simultaneous_trade_of_property', 'is_sale_l

In [7]:
feed_data = sales[['sale_price_log10zscore',
       'price_per_sqft_log10zscore', 'pctzscore', 'countszscore',
       'days_since_last_transactionzscore']]

In [8]:
feed_data.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feed_data.fillna(0, inplace=True)


## Isolation Forest

In [9]:
isof = IsolationForest()
prediction = isof.fit_predict(feed_data)

In [10]:
# -1 is outlier and 1 is normal
# Looking at human readable CSV, this looks to be about similar to (1,2) in terms of outlier detections just from raw numbers.
np.unique(prediction, return_counts=True)

(array([-1,  1]), array([ 25741, 192398]))

## Logical Outlier Factor

In [11]:
# -1 is outlier and 1 is normal
# Looking at human readable CSV, this looks to be about similar to (1,2) in terms of outlier detections just from raw numbers.
for val in [1,2,3,4,5,10,15,20,25,30,35,40,45,50,55,60]:
    lof = LocalOutlierFactor(n_neighbors = val)
    prediction = lof.fit_predict(feed_data)
    print(f'{val} nearest neighbors:', np.unique(prediction, return_counts=True))


1 nearest neighbors: (array([-1,  1]), array([ 50284, 167855]))
2 nearest neighbors: (array([-1,  1]), array([ 36734, 181405]))
3 nearest neighbors: (array([-1,  1]), array([ 25378, 192761]))
4 nearest neighbors: (array([-1,  1]), array([ 17821, 200318]))
5 nearest neighbors: (array([-1,  1]), array([ 12840, 205299]))
10 nearest neighbors: (array([-1,  1]), array([  3669, 214470]))
15 nearest neighbors: (array([-1,  1]), array([  2050, 216089]))
20 nearest neighbors: (array([-1,  1]), array([  1795, 216344]))
25 nearest neighbors: (array([-1,  1]), array([  1905, 216234]))
30 nearest neighbors: (array([-1,  1]), array([  2075, 216064]))
35 nearest neighbors: (array([-1,  1]), array([  2233, 215906]))
40 nearest neighbors: (array([-1,  1]), array([  2382, 215757]))
45 nearest neighbors: (array([-1,  1]), array([  2558, 215581]))
50 nearest neighbors: (array([-1,  1]), array([  2710, 215429]))
55 nearest neighbors: (array([-1,  1]), array([  2909, 215230]))
60 nearest neighbors: (array([

In [12]:
# Random Forest feature importance

## SVM

In [20]:
feature_map_nystroem = Nystroem(random_state=1,
                                n_components=300)
data_transformed = feature_map_nystroem.fit_transform(feed_data)

In [23]:
for val in [.25,.5,.75]:
    svm = linear_model.SGDOneClassSVM(nu=val)
    prediction = svm.fit_predict(data_transformed)
    print(f'nu is {val}', np.unique(prediction, return_counts=True))

nu is 0.25 (array([-1,  1], dtype=int32), array([ 54368, 163771]))
nu is 0.5 (array([-1,  1], dtype=int32), array([109216, 108923]))
nu is 0.75 (array([-1,  1], dtype=int32), array([163539,  54600]))
