In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import auc, roc_curve, confusion_matrix

In [2]:
df = pd.read_csv("C:/Users/Avisia/Downloads/AB_NYC_20192.csv")
print("Format: {}".format(df.shape))
df.head(5)

Format: (48895, 16)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
df = pd.concat([
    df,
    pd.get_dummies(df[['neighbourhood','neighbourhood_group','room_type']])
], axis=1)

df = df.drop(['neighbourhood','neighbourhood_group','room_type'], axis=1)

In [4]:
df['price_cat'] = df['price'] < df['price'].median()

In [5]:
X, y = df.drop('price_cat', axis=1), df.price_cat
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Model 1

In [6]:
# Fit & predict
neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(X_train[['latitude','longitude']], y_train)
y_pred = neigh.predict(X_test[['latitude','longitude']])

In [7]:
# Evaluation
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=True)
auc_score = round(auc(fpr, tpr), 2)
threshold = 0.5
print("AUC: {}".format(auc_score))

print(confusion_matrix(y_test, y_pred>threshold))

AUC: 0.72
[[5639 2396]
 [2133 5968]]


# Model 2

In [8]:
categorical_variables = [
    col for col in df if col.startswith('neighbourhood') or col.startswith('room_type')
]
variables = [
    'latitude', 'longitude',
    'number_of_reviews', 'reviews_per_month'
]

variables.extend(categorical_variables)
scaler = StandardScaler().fit(X_train[variables])
X_train_norm = np.nan_to_num(scaler.transform(X_train[variables]))
X_test_norm = np.nan_to_num(scaler.transform(X_test[variables]))

In [9]:
# Fit & predict
neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(X_train_norm, y_train)
y_pred = neigh.predict(X_test_norm)

In [10]:
# Evaluation
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=True)
auc_score = round(auc(fpr, tpr), 2)
threshold = 0.5
print("AUC: {}".format(auc_score))

print(confusion_matrix(y_test, y_pred>threshold))

AUC: 0.82
[[6694 1341]
 [1531 6570]]


In [11]:
for dist in ['manhattan', 'euclidean']:
    print("Distance metric: {}".format(dist))
    for k in [2, 4, 10, 50, 100, 500]:
        # Fit & predict
        neigh = KNeighborsClassifier(n_neighbors=k, metric=dist)
        neigh.fit(X_train_norm, y_train)
        y_pred = neigh.predict(X_test_norm)

        # Evaluation
        fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=True)
        auc_score = round(auc(fpr, tpr), 2)
        print("For K = {}, AUC: {}".format(k, auc_score))

Distance metric: manhattan
For K = 2, AUC: 0.77
For K = 4, AUC: 0.8
For K = 10, AUC: 0.83
For K = 50, AUC: 0.82
For K = 100, AUC: 0.82
For K = 500, AUC: 0.81
Distance metric: euclidean
For K = 2, AUC: 0.77
For K = 4, AUC: 0.8
For K = 10, AUC: 0.83
For K = 50, AUC: 0.82
For K = 100, AUC: 0.82
For K = 500, AUC: 0.81
