# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.model_selection import GridSearchCV

  import pandas.util.testing as tm


# Importo dataset de Airbnb 
Desde Inside Airbnb obtenemos el siguiente dataset para entrenar nuestros modelos de regresión: http://insideairbnb.com/get-the-data.html (dataset "listings.csv" de Londres).

Objetivo: queremos predecir el precio a partir de ciertas features.

In [2]:
# Load the Airbnb London dataset and keep it in the "london" dataframe.
path_to_csv = r'clusterai_regresion_dataset_airbnb_london.csv'
london = pd.read_csv(
    path_to_csv,
    sep=',',
    parse_dates=True,
)

# Data cleaning

In [3]:
# Drop the rows that have NaNs in the "reviews_per_month" column (what other
# options exist? e.g. imputing a value instead of discarding the row).
london = london.dropna(subset=["reviews_per_month"])

# Remove the identifier / free-text / location columns; they are dropped here
# and therefore never reach the feature matrix.
# errors='ignore' keeps this cell re-runnable: `london` is overwritten in
# place, so on a second execution these columns are already gone and a plain
# drop would raise KeyError.
unused_columns = ['id', 'name', 'host_id', 'host_name',
                  'neighbourhood_group', 'last_review', 'latitude', 'longitude']
london = london.drop(columns=unused_columns, errors='ignore')

# Outlier filtering: keep only the rows strictly below the 0.975 quantile of
# both "price" and "minimum_nights" (the *_q97 names hold the 97.5% quantile).
price_q97 = london.price.quantile(0.975)
min_nights_q97 = london.minimum_nights.quantile(0.975)
london_filt = london.loc[(london.price < price_q97) & (london.minimum_nights < min_nights_q97)]

## Feature Engineering: 

### Se generan variables dummy para las features categóricas (Neighbourhood y Room Type)

In [4]:
# One-hot (dummy) encoding of the two categorical features:
# one indicator column per distinct "neighbourhood" value ...
neighs_dummie = pd.get_dummies(london_filt["neighbourhood"])
# ... and one indicator column per distinct "room_type" value.
room_dummie = pd.get_dummies(london_filt["room_type"])

## Se agregan las nuevas variables dummies creadas al dataframe de trabajo 'london_filt'

In [5]:
# Append the dummy columns to the working dataframe.
# The result overwrites `london_filt`, so re-running this cell would try to
# join columns that are already present and fail with an overlapping-columns
# error. Skip the join when every dummy column has already been added; a
# partial overlap (a genuine name clash) still raises as before.
dummy_columns = neighs_dummie.columns.union(room_dummie.columns)
if not dummy_columns.isin(london_filt.columns).all():
    london_filt = london_filt.join([neighs_dummie, room_dummie])

## Preparación de variables para el modelado:

In [6]:
# Dependent variable (label) y: the "price" column as a 2-D (n_rows, 1) array.
y = np.array(london_filt[["price"]])

# Independent variables X: every remaining column except the target and the
# two raw categorical columns (their dummy columns stay in the frame).
x = london_filt.drop(columns=['price', 'neighbourhood', 'room_type'])

# Train / test split.
# NOTE(review): test_size=0.90 leaves only 10% of the rows for training --
# presumably to keep the SVR grid search fast; confirm this is intended.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.90, random_state=42
)

# Scaling: the scaler is fitted on the training split only and then applied
# to both splits.
scaler = preprocessing.StandardScaler()
xtrain_scal = scaler.fit_transform(xtrain)
xtest_scal = scaler.transform(xtest)

# Modelado: Support Vector Regression

In [12]:
# Grid search over C and gamma for an RBF-kernel SVR, using 5-fold
# cross-validation scored by negative mean squared error.
n_folds = 5
parameters = {
    'C': [1, 100, 300],
    'kernel': ['rbf'],
    'gamma': [0.1, 0.5],
}
est = SVR(max_iter=35000)  # cap the solver iterations per fit
gs = GridSearchCV(
    estimator=est,
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    cv=n_folds,
    refit=True,
    n_jobs=3,
    verbose=3,
)

In [13]:
# Run the grid search on the scaled training data; the target is flattened
# to 1-D before fitting.
gs.fit(X=xtrain_scal, y=ytrain.ravel())

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  30 out of  30 | elapsed:  1.1min finished


GridSearchCV(cv=5, estimator=SVR(max_iter=35000), n_jobs=3,
             param_grid={'C': [1, 100, 300], 'gamma': [0.1, 0.5],
                         'kernel': ['rbf']},
             scoring='neg_mean_squared_error', verbose=3)

In [14]:
# Show the hyper-parameter combination selected by the grid search.
gs.best_params_

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}

In [15]:
# Predict on the scaled test split with the best estimator found by the grid
# search, then score the predictions against the true prices.
best_svr = gs.best_estimator_
svr_prediction = best_svr.predict(xtest_scal)
svr_r2, svr_mse, svr_mae = (
    metric(ytest, svr_prediction)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)

In [16]:
# Report the three test-set metrics, one per line, with six decimals.
print(
    f'R2 score: {svr_r2:.6f}',
    f'MSE: {svr_mse:.6f}',
    f'MAE: {svr_mae:.6f}',
    sep='\n',
)

R2 score: 0.479678
MSE: 1629.680183
MAE: 27.193622
