# Modelo de Regresión

Tenemos el conjunto de datos regression_data.csv con una serie de variables:


The dataset contains information on 22,000 properties: historic data of houses sold between May 2014 and May 2015.

The goal is to predict the house price

These are the definitions of data points provided:

Note: Some of the variables are self-explanatory, so no definition has been provided for them.

 - Id: Unique identification number for the property

 - date: the date the house was sold

 - price: the price of the house

 - waterfront: whether the house has a view of a waterfront

 - condition: How good the condition is (Overall). 1 indicates worn-out property and 5 excellent.

 - grade: Overall grade given to the housing unit, based on the King County grading system. 1 poor, 13 excellent.

 - Sqft_above: square footage of house apart from the basement

 - Sqft_living15: Living room area in 2015 (implies some renovations). This might or might not have affected the lot size area.

 - Sqft_lot15: lotSize area in 2015(implies-- some renovations)

## Librerías

In [None]:
#!pip install catboost
#!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from tqdm.notebook import tqdm

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
# Load the raw data; the CSV file is semicolon-delimited (sep=';').
df = pd.read_csv('regression_data.csv', sep=';')
# Preview the first rows of the loaded frame.
df.head()

In [None]:
df.info()

### Limpieza y transformaciones

Comenzamos por la columna date, vamos a transformarla y quedarnos solo con el año

In [None]:
def _sale_year(date_text):
    """Return the four-digit year encoded in the last two characters of a date string.

    Two-digit years below 20 are expanded to 20xx, all others to 19xx.
    """
    two_digits = date_text[-2:]
    if int(two_digits) < 20:
        return int('20' + two_digits)
    return int('19' + two_digits)


# Replace the date strings by the year only.
df.date = df.date.apply(_sale_year)

In [None]:
df.date.unique()

In [None]:
# Age of the house when it was sold: sale year minus construction year.
df['years_old'] = df['date'].sub(df['yr_built'])

Como ya tenemos una columna con los años de antigüedad de la casa, podemos eliminar las columnas date y yr_built.

In [None]:
# Both columns are now summarised by `years_old`; remove them from the frame in place.
df.drop(columns=['date', 'yr_built'], inplace=True)

In [None]:
df.head()

In [None]:
# Inspect the rows that report fewer than 0.75 bathrooms.
df.loc[df['bathrooms'] < 0.75]

In [None]:
# 1 when the living area differs from the value recorded in sqft_living15, else 0.
# Vectorised column comparison: the previous per-row Python loop did a label lookup
# (`series[i]`) for every row, which is slow and requires the index to be exactly 0..n-1.
df['living_renovated'] = df['sqft_living'].ne(df['sqft_living15']).astype(int)

In [None]:
# 1 when the lot area differs from the value recorded in sqft_lot15, else 0.
# Vectorised column comparison: the previous per-row Python loop did a label lookup
# (`series[i]`) for every row, which is slow and requires the index to be exactly 0..n-1.
df['lot_renovated'] = df['sqft_lot'].ne(df['sqft_lot15']).astype(int)

In [None]:
# Signed differences between the current areas and their *15 counterparts, plus a 1/0
# flag telling whether a renovation year is recorded (yr_renovated > 0).
# Column arithmetic replaces the per-row Python loops with `series[i]` label lookups,
# which were slow and only worked with a 0..n-1 index.
df['dif_living'] = df['sqft_living'] - df['sqft_living15']
df['dif_lot'] = df['sqft_lot'] - df['sqft_lot15']
df['year_renovated'] = df['yr_renovated'].gt(0).astype(int)

In [None]:
df.head()

In [None]:
# Working frame: drop the identifier and the raw area / renovation-year columns that the
# derived features computed from them replace. `price` is still part of X at this point.
X = df.drop(
    columns=['id', 'sqft_living', 'sqft_living15', 'sqft_lot', 'sqft_lot15', 'yr_renovated']
)

In [None]:
# How many houses fall in each class of the living-area flag.
X['living_renovated'].value_counts()

In [None]:
# How many houses fall in each class of the lot-area flag.
X['lot_renovated'].value_counts()

In [None]:
# 3D bar chart: one thin bar per house located at (long, lat) whose height is the price.
fig = plt.figure(figsize=(20, 15))
ax = fig.add_subplot(projection='3d')

bar_floor = np.zeros_like(X.price)  # every bar starts at z = 0
ax.bar3d(x=X.long, y=X.lat, z=bar_floor, dx=0.001, dy=0.001, dz=X.price);

In [None]:
# Scatter of the houses by coordinates; colour encodes the zipcode, marker size the price.
plt.figure(figsize=(20, 15))
sns.scatterplot(x='long', y='lat', hue='zipcode', size='price', data=X)
plt.show();

In [None]:
# Price distribution for each number of bedrooms.
plt.figure(figsize=(20, 15))
sns.boxplot(x='bedrooms', y='price', data=X);

In [None]:
X.info()

In [None]:
#!pip install geopandas


In [None]:
#!pip install keplergl

In [None]:
#!jupyter nbextension install --py --sys-prefix keplergl
#!jupyter nbextension enable --py --sys-prefix keplergl

In [None]:
import geopandas as gpd
from keplergl import KeplerGl

In [None]:
# Build a GeoDataFrame with one point geometry per house from its long/lat columns.
# NOTE(review): a later cell drops a 'geometry' column from X itself, which suggests this
# constructor call also adds that column to X in the environment used — confirm; whether
# the input frame is modified may depend on the installed pandas/geopandas versions.
geo = gpd.GeoDataFrame(X, geometry=gpd.points_from_xy(X.long, X.lat))

geo.head()

In [None]:
# Write the point data to 'geo_data.geojson' using the GeoJSON driver.
geo.to_file('geo_data.geojson', driver='GeoJSON')

In [None]:
# Configuration dictionary passed to KeplerGl below. Every entry refers to the dataset id
# 'Prueba' and describes:
#   - visState.filters: two range filters, on 'zipcode' and on 'price';
#   - visState.layers: a heatmap layer weighted by 'price' and a point layer that uses
#     'price' as altitude and as text label;
#   - visState.interactionConfig: the fields listed in the tooltip;
#   - mapState / mapStyle: initial camera position and the 'satellite' base style.
cfg_map = {'version': 'v1',
 'config': {'visState': {'filters': [{'dataId': ['Prueba'],
     'id': '7q3p00ilg',
     'name': ['zipcode'],
     'type': 'range',
     'value': [98001, 98199],
     'enlarged': False,
     'plotType': 'histogram',
     'animationWindow': 'free',
     'yAxis': None,
     'speed': 1},
    {'dataId': ['Prueba'],
     'id': 'xs6ilv2ju3',
     'name': ['price'],
     'type': 'range',
     'value': [78000, 7700000],
     'enlarged': False,
     'plotType': 'histogram',
     'animationWindow': 'free',
     'yAxis': None,
     'speed': 1}],
   'layers': [{'id': 'ghxo1m',
     'type': 'heatmap',
     'config': {'dataId': 'Prueba',
      'label': 'Prueba',
      'color': [218, 0, 0],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'lat': 'lat', 'lng': 'long'},
      'isVisible': True,
      'visConfig': {'opacity': 0.8,
       'colorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'radius': 16},
      'hidden': False,
      'textLabel': [{'field': None,
        'color': [255, 255, 255],
        'size': 18,
        'offset': [0, 0],
        'anchor': 'start',
        'alignment': 'center'}]},
     'visualChannels': {'weightField': {'name': 'price', 'type': 'integer'},
      'weightScale': 'linear'}},
    {'id': 'brb3upi',
     'type': 'point',
     'config': {'dataId': 'Prueba',
      'label': 'new layer',
      'color': [130, 154, 227],
      'highlightColor': [252, 242, 26, 255],
      'columns': {'lat': 'lat', 'lng': 'long', 'altitude': 'price'},
      'isVisible': True,
      'visConfig': {'radius': 10,
       'fixedRadius': False,
       'opacity': 0.8,
       'outline': False,
       'thickness': 2,
       'strokeColor': None,
       'colorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'strokeColorRange': {'name': 'Global Warming',
        'type': 'sequential',
        'category': 'Uber',
        'colors': ['#5A1846',
         '#900C3F',
         '#C70039',
         '#E3611C',
         '#F1920E',
         '#FFC300']},
       'radiusRange': [0, 50],
       'filled': True},
      'hidden': False,
      'textLabel': [{'field': {'name': 'price', 'type': 'integer'},
        'color': [255, 255, 255],
        'size': 18,
        'offset': [0, 0],
        'anchor': 'start',
        'alignment': 'center'}]},
     'visualChannels': {'colorField': None,
      'colorScale': 'quantile',
      'strokeColorField': None,
      'strokeColorScale': 'quantile',
      'sizeField': None,
      'sizeScale': 'linear'}}],
   'interactionConfig': {'tooltip': {'fieldsToShow': {'Prueba': [{'name': 'bedrooms',
        'format': None},
       {'name': 'bathrooms', 'format': None},
       {'name': 'floors', 'format': None},
       {'name': 'waterfront', 'format': None},
       {'name': 'view', 'format': None},
       {'name': 'price', 'format': None}]},
     'compareMode': False,
     'compareType': 'absolute',
     'enabled': True},
    'brush': {'size': 0.5, 'enabled': False},
    'geocoder': {'enabled': True},
    'coordinate': {'enabled': False}},
   'layerBlending': 'normal',
   'splitMaps': [],
   'animationConfig': {'currentTime': None, 'speed': 1}},
  'mapState': {'bearing': -17.394846207465637,
   'dragRotate': True,
   'latitude': 47.474522237255556,
   'longitude': -122.19059602024436,
   'pitch': 53.61835852608127,
   'zoom': 9.096117833833905,
   'isSplit': False},
  'mapStyle': {'styleType': 'satellite',
   'topLayerGroups': {},
   'visibleLayerGroups': {},
   'threeDBuildingColor': [3.7245996603793508,
    6.518049405663864,
    13.036098811327728],
   'mapStyles': {}}}}



In [None]:
# Create the Kepler.gl widget with the configuration defined in `cfg_map`.
mapa = KeplerGl(height=600, width=800, config= cfg_map)

# Register a copy of the GeoDataFrame under the dataset id 'Prueba', the id that
# `cfg_map` refers to in its filters, layers and tooltip.
mapa.add_data(geo.copy(), 'Prueba')

# Also export the map to an HTML file.
mapa.save_to_html(file_name='prueba.html')

# Last expression of the cell: render the widget in the notebook.
mapa

In [None]:
X.head()

In [None]:
# Mean price per zipcode, sorted from the cheapest to the most expensive zipcode.
# NOTE(review): the means use the prices of every row in X, i.e. they are computed before
# the train/test split done further down, so the resulting zipcode ranking carries target
# information from rows that later end up in the test set.
price_per_zipcode = X.groupby('zipcode')['price'].mean().sort_values()

In [None]:
# Map each zipcode to its 1-based position in the price-sorted series
# (1 = lowest mean price).
zp = {
    zipcode: rank
    for rank, zipcode in enumerate(price_per_zipcode.index, start=1)
}

In [None]:
zp

In [None]:
# Translate every zipcode into its rank from the `zp` lookup table.
zp_encoded = X['zipcode'].apply(lambda code: zp[code])

zp_encoded

In [None]:
# Overwrite the raw zipcodes with their rank encoding.
X['zipcode'] = zp_encoded

In [None]:
X.head()

In [None]:
# Total area: above-ground plus basement square footage. The variable keeps the name
# `sqrt_tot` because later cells refer to a column with that name.
sqrt_tot = X['sqft_above'].add(X['sqft_basement'])

sqrt_tot

In [None]:
# Store the total area and a 1/0 flag for houses whose basement area is non-zero.
X['sqrt_tot'] = sqrt_tot

X['basement'] = X['sqft_basement'].ne(0).astype(int)

In [None]:
X.head()

In [None]:
X.info()

In [None]:
# Feature matrix: remove the target, the coordinates and the two area columns that
# `sqrt_tot` / `basement` were derived from.
# 'geometry' is only present in X when building the GeoDataFrame added it to X as a side
# effect; it is therefore dropped if present instead of raising KeyError when it is not,
# so the cell also works on a fresh run where X was left untouched.
X_final = (
    X.drop(columns=['sqft_above', 'sqft_basement', 'lat', 'long', 'price'])
     .drop(columns=['geometry'], errors='ignore')
)
# Target vector.
y = X.price

In [None]:
# Annotated heatmap of the pairwise correlations between the features.
plt.figure(figsize=(15, 15))
feature_corr = X_final.corr()
sns.heatmap(feature_corr, annot=True);

In [None]:
# Distribution of the target (price).
plt.figure(figsize=(16, 6))
sns.histplot(data=y);

## Train test split

In [None]:
# Hold out 20% of the rows as test set; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Min-max scaling: the scaler learns the ranges from the training split only and the same
# transformation is then applied to both splits.
mm = MinMaxScaler()
mm.fit(X_train)

X_train_mm = mm.transform(X_train)
X_test_mm = mm.transform(X_test)

In [None]:
X_train.columns

## Modelos

In [None]:
# Candidate regressors compared in the loop below.
ln = LinearRegression()
# Fixed seed: without it the forest (and therefore its metrics and feature importances)
# changes on every run of the notebook.
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor()
cat = CatBoostRegressor(verbose=0)  # verbose=0 silences the per-iteration training log

models = [ln, rf, xgb, cat]

In [None]:
# Fit every model on the min-max scaled training data and collect its test metrics.
res = {}
for m in models:
    # Short label taken from the estimator's text representation: the text before the
    # first '(' and before the first '.', without a leading '<'.
    name = str(m).split('(')[0]
    name = name.split('.')[0].replace('<','')

    print(f'Entrenando {name} \n')

    m.fit(X_train_mm, y_train)

    pred = m.predict(X_test_mm)

    r2 = r2_score(y_test, pred)
    # RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error is
    # deprecated and no longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    mae = mean_absolute_error(y_test, pred)

    res[name]= {'r2':round(r2,2),
                'rmse':round(rmse, 2),
                'mae':round(mae,2)}

# One column per model, one row per metric.
res_df = pd.DataFrame(res)

res_df

In [None]:
# Random-forest importance of each feature, keyed by column name.
{column: importance for column, importance in zip(X_train.columns, rf.feature_importances_)}

In [None]:
X.head()

### Filtramos los datos y quitamos "outliers": casas con un precio igual o superior a 1.9 millones

In [None]:
# Keep only the houses priced below 1.9 million.
# `.copy()` makes X_clean an independent frame: a later cell assigns into it with `.loc`,
# which on a bare boolean-mask slice triggers SettingWithCopyWarning.
X_clean = X[X.price < 1.9e+6].copy()
X_clean.info()

In [None]:
X_final.info()

In [None]:
# Rows reporting 33 bedrooms are rewritten as 3 bedrooms.
has_33_bedrooms = X_clean['bedrooms'] == 33
X_clean.loc[has_33_bedrooms, 'bedrooms'] = 3

In [None]:
# Rebuild the feature matrix and the target from the filtered data.
# 'geometry' is only present when building the GeoDataFrame added it to the source frame
# as a side effect; it is dropped if present instead of raising KeyError when it is not.
X_final = (
    X_clean.drop(columns=['sqft_above', 'sqft_basement', 'lat', 'long', 'price'])
           .drop(columns=['geometry'], errors='ignore')
)
y = X_clean.price

In [None]:
X_final.info()

In [None]:
# New 80/20 split on the filtered data (same seed as before) and a scaler fitted on the
# training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, random_state=42, test_size=0.2
)
mm = MinMaxScaler().fit(X_train)

X_train_mm = mm.transform(X_train)
X_test_mm = mm.transform(X_test)

In [None]:
# Fresh, unfitted estimators for the run on the filtered data.
ln = LinearRegression()
# Fixed seed: without it the forest (and therefore its metrics and feature importances)
# changes on every run of the notebook.
rf = RandomForestRegressor(random_state=42)
xgb = XGBRegressor()
cat = CatBoostRegressor(verbose=0)  # verbose=0 silences the per-iteration training log

models = [ln, rf, xgb, cat]

In [None]:
# Fit every model on the filtered data and collect its test metrics.
# This run uses the unscaled X_train / X_test DataFrames rather than the min-max scaled
# arrays, so the fitted estimators see the column names.
res = {}
for m in models:
    # Short label taken from the estimator's text representation: the text before the
    # first '(' and before the first '.', without a leading '<'.
    name = str(m).split('(')[0]
    name = name.split('.')[0].replace('<','')

    print(f'Entrenando {name} \n')

    m.fit(X_train, y_train)

    pred = m.predict(X_test)

    r2 = r2_score(y_test, pred)
    # RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error is
    # deprecated and no longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    mae = mean_absolute_error(y_test, pred)

    res[name]= {'r2':round(r2,2),
                'rmse':round(rmse, 2),
                'mae':round(mae,2)}

# One column per model, one row per metric.
res_df = pd.DataFrame(res)

res_df

In [None]:
# Compare the real test prices with `pred`, i.e. the predictions left behind by the last
# model fitted in the loop above.
prueba = pd.DataFrame()
prueba['real_price'] = y_test
prueba['pred'] = pred
# Absolute error, signed percentage deviation from the real price and its absolute value.
prueba['dif'] = abs(prueba.pred - prueba.real_price)
prueba['rate_dif'] = 100 - ((prueba.pred / prueba.real_price) * 100)
prueba['rate_dif_abs'] = abs(prueba.rate_dif)

plt.figure(figsize=(16, 8))
# Each line carries its own label. The legend used to receive all five column names for
# three plotted lines and only matched them through their implicit order.
for column, colour in [('real_price', 'r'), ('pred', 'blue'), ('dif', 'green')]:
    plt.plot(prueba[column].values, c=colour, label=column)
plt.title('Prediction vs Real Price')
plt.legend();


In [None]:
# Histogram of the signed percentage deviation.
plt.figure(figsize=(16, 8))
sns.histplot(x='rate_dif', data=prueba);

In [None]:
# Histogram of the absolute percentage deviation.
plt.figure(figsize=(16, 8))
sns.histplot(x='rate_dif_abs', data=prueba);

In [None]:
# Signed percentage deviation plotted against the row index of the test set.
plt.figure(figsize=(16, 8))
sns.lineplot(x=prueba.index, y='rate_dif', data=prueba);

In [None]:
# Absolute percentage deviation plotted against the row index of the test set.
plt.figure(figsize=(16, 8))
sns.lineplot(x=prueba.index, y='rate_dif_abs', data=prueba);

In [None]:
# Summary of the predictions that deviate more than 15% from the real price.
prueba.loc[prueba['rate_dif_abs'] > 15].info()

In [None]:
y_test[:20].values

In [None]:
len(pred), len(y_test)

In [None]:
# One-row frame with the random-forest importance of every feature, rounded to 3 decimals.
# The rounding is applied to `ft` itself: the previous `ft.T[0] = round(...)` wrote into a
# temporary transposed frame, so the displayed values were never rounded.
ft = pd.DataFrame(dict(zip(rf.feature_names_in_, rf.feature_importances_)), index=[0]).round(3)
# Transposed for display: one row per feature.
ft.T

In [None]:
# Reduced feature set used for the last experiment, and the matching target.
selected_features = ['grade', 'zipcode', 'sqrt_tot', 'dif_living', 'dif_lot', 'years_old']
X2 = X_clean[selected_features]
y = X_clean.price

In [None]:
# Same experiment on the reduced feature set X2.
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=.2, random_state=42)
# The scaled arrays are computed here, but the loop below fits on the unscaled frames.
mm = MinMaxScaler()
X_train_mm = mm.fit_transform(X_train)
X_test_mm = mm.transform(X_test)
res = {}
# The estimators in `models` are fitted again here, on the new training data.
for m in models:
    # Short label taken from the estimator's text representation: the text before the
    # first '(' and before the first '.', without a leading '<'.
    name = str(m).split('(')[0]
    name = name.split('.')[0].replace('<','')

    print(f'Entrenando {name} \n')

    m.fit(X_train, y_train)

    pred = m.predict(X_test)

    r2 = r2_score(y_test, pred)
    # RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error is
    # deprecated and no longer accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    mae = mean_absolute_error(y_test, pred)

    res[name]= {'r2':round(r2,2),
                'rmse':round(rmse, 2),
                'mae':round(mae,2)}

# One column per model, one row per metric.
res_df = pd.DataFrame(res)

res_df