# Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.model_selection import GridSearchCV

  import pandas.util.testing as tm


# Importo dataset de Airbnb 
Desde Inside Airbnb obtenemos el siguiente dataset para entrenar nuestros modelos de regresión: http://insideairbnb.com/get-the-data.html (dataset "listings.csv" de London).

Objetivo: Vamos a querer predecir el precio dadas ciertas features.

In [3]:
# Load the Airbnb London listings CSV into the "london" dataframe.
path_to_csv = r'clusterai_regresion_dataset_airbnb_london.csv'
london = pd.read_csv(
    path_to_csv,
    sep=',',
    parse_dates=True,
)

# Data cleaning

In [4]:
# Drop the rows whose "reviews_per_month" is NaN (other options, such as
# imputing the missing values, could be considered instead).
london = london.dropna(subset=["reviews_per_month"])

# Remove the columns that are not used further on.
# errors="ignore" keeps this cell re-runnable: `london` is reassigned here, so
# on a second execution the columns are already gone and a plain drop would
# raise KeyError.
london = london.drop(
    columns=['id', 'name', 'host_id', 'host_name',
             'neighbourhood_group', 'last_review', 'latitude', 'longitude'],
    errors="ignore",
)

# Outlier filtering: keep only the listings strictly below the 97.5th
# percentile of both "price" and "minimum_nights".
price_q97 = london.price.quantile(0.975)
min_nights_q97 = london.minimum_nights.quantile(0.975)
london_filt = london.loc[(london.price < price_q97) & (london.minimum_nights < min_nights_q97)]

## Feature Engineering: 

### Se generan variables dummies para las features categóricas (`neighbourhood` y `room_type`)

In [5]:
# One-hot (dummy) encode the two categorical features:
# "neighbourhood" and "room_type".
neighs_dummie = pd.get_dummies(london_filt["neighbourhood"])
room_dummie = pd.get_dummies(london_filt["room_type"])

## Se agregan las nuevas variables dummies creadas al dataframe de trabajo 'london_filt'

In [6]:
# Append the dummy columns to the working dataframe, aligned on its index.
london_filt = london_filt.join(
    other=[neighs_dummie, room_dummie],
    how='left',
)

## Preparación de variables para el modelado:

In [7]:
# Dependent variable (label) y: the "price" column as a 2-D NumPy array.
y = london_filt[["price"]].to_numpy(copy=True)
# Independent variables X: everything except the target and the two raw
# categorical columns.
x = london_filt.drop(columns=['price', 'neighbourhood', 'room_type'])

# Train/test split.
# NOTE(review): test_size=0.90 leaves only 10% of the rows for training --
# confirm this is intentional (e.g. to keep KNN fitting fast).
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.90,
    random_state=42,
)

# Scaling: the scaler is fitted on the training split only and then applied
# to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(xtrain)
xtrain_scal, xtest_scal = (scaler.transform(split) for split in (xtrain, xtest))

# Modelado: K Neighbors Scalar Regression

In [24]:
# Cross-validated grid search over the number of neighbours of a KNN regressor.
n_folds = 5

# Candidate values for k: 20, 25, 30.
parameters_k = np.arange(20, 31, 5)
parameters_knn = [{'n_neighbors': parameters_k}]

knn = KNeighborsRegressor()

gs = GridSearchCV(
    estimator=knn,
    param_grid=parameters_knn,
    scoring="neg_mean_squared_error",
    cv=n_folds,
    refit=True,
    n_jobs=3,
    verbose=3,
)

In [25]:
# Run the grid search on the scaled training split.
gs.fit(X=xtrain_scal, y=ytrain)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed:    5.1s finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=3,
             param_grid=[{'n_neighbors': array([20, 25, 30])}],
             scoring='neg_mean_squared_error', verbose=3)

In [26]:
# Display the parameter setting selected by the grid search.
gs.best_params_

{'n_neighbors': 20}

In [27]:
# Predict on the scaled test split with the refitted best estimator, then
# compute R2, MSE and MAE against the true test labels.
knn_prediction = gs.best_estimator_.predict(xtest_scal)
knn_r2 = r2_score(ytest, knn_prediction)
knn_mse = mean_squared_error(ytest, knn_prediction)
knn_mae = mean_absolute_error(ytest, knn_prediction)

In [28]:
# Report the three test-set metrics, one per line, with six decimals.
print(
    f'R2 score: {knn_r2:.6f}',
    f'MSE: {knn_mse:.6f}',
    f'MAE: {knn_mae:.6f}',
    sep='\n',
)

R2 score: 0.493432
MSE: 1586.600613
MAE: 28.067338


In [23]:
# NOTE(review): this cell repeats the metric report of the cell right before
# it. Its execution count (In [23]) is lower than the preceding In [28] and its
# saved output shows different values, so it looks like a stale leftover from
# an earlier run -- confirm and consider removing it.
print(
    f'R2 score: {knn_r2:.6f}',
    f'MSE: {knn_mse:.6f}',
    f'MAE: {knn_mae:.6f}',
    sep='\n',
)

R2 score: 0.487836
MSE: 1604.128026
MAE: 28.179976
