In [196]:
# Data handling
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd

# Other purposes
# -----------------------------------------------------------------------
import math

from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
import time

# Plotting
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
import sys
# Add the directory two levels up to the module search path so that the
# project-local `src` package imported right below can be resolved.
sys.path.append("../../")
from src.SupportPreProcesamiento import (
    exploracion_dataframe,
    separarar_df,
    plot_numericas,
    plot_categoricas,
    relacion_vr_categoricas,
    relacion_vr_numericas,
    matriz_correlacion,
    detectar_outliers,
    diferencia_tras_rellenar_nulos
)

# Display floats with two decimal places in pandas output.
pd.set_option('display.float_format', '{:.2f}'.format)


# Null imputation using advanced statistical methods
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
# Experimental opt-in import; keep it ahead of the IterativeImputer import
# (scikit-learn documents it as a prerequisite).
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor

import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation notices — confirm that is intended.
warnings.filterwarnings("ignore")
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns',100)


In [197]:
# Load the rental listings CSV and preview one randomly chosen row.
df = pd.read_csv(filepath_or_buffer="../../datos/02_api_rent_limpieza_cols.csv")
df.sample(n=1)

Unnamed: 0,propertyType,status,price,priceByArea,size,rooms,bathrooms,floor,neighborhood,address,district,province,municipality,exterior,hasLift,hasPlan,has3DTour,has360,distance
160,flat,good,610.0,8.0,72.0,1,1,bj,,Calle de Pedro Faura,San José - Buenos Aires,Madrid,Pinto,False,True,True,False,True,19544


# Gestión de Valores erróneos
Vamos a revisarlos en:
- price
- priceByArea
- size
- rooms
- bathrooms


In [198]:
# Distinct values of the province column.
pd.unique(df["province"])

array(['Toledo', 'Madrid', 'Guadalajara', 'Segovia', 'Ávila'],
      dtype=object)

In [199]:
# Distinct prices, inspected by eye for implausible values.
pd.unique(df["price"])

array([550., 750., 400., 450., 590., 684., 600., 700., 695., 650., 675.,
       500., 680., 747., 730., 640., 530., 625., 720., 699., 620., 595.,
       725., 666., 630., 635., 740., 690., 624., 633., 645., 735., 705.,
       710., 610., 580., 560., 670., 723., 660., 692., 609., 718., 694.,
       658., 728., 495., 715., 475., 470., 749., 667., 340., 525., 685.,
       460., 350., 430., 380., 733., 605., 480., 745.])

No se observan valores anómalos

### Price by Area

In [200]:
# Distinct price-per-area values (NaN included).
pd.unique(df["priceByArea"])

array([ 1., 13., 11.,  6.,  5.,  8., nan, 10., 25., 16., 23., 20.,  3.,
        7., 17.,  4., 18., 15., 30., 19., 12.,  9., 14., 26., 21., 24.,
       22., 29., 33., 31., 28.,  2.])

Parecen anómalos los valores inferiores a 5 €/m², teniendo en cuenta que los datos corresponden a las provincias de Toledo, Madrid, Guadalajara, Segovia y Ávila.
- Veamos cuántos hay por debajo de 5

In [201]:
# Boolean mask of rows whose priceByArea is strictly below 5 (NaN rows are
# False), followed by the (rows, columns) shape of the matching subset.
cond = df["priceByArea"].lt(5)
df.loc[cond].shape

(15, 19)

Los sustituimos por nulos (NaN)

In [202]:
# Blank out (set to NaN) the priceByArea values selected by the `cond` mask.
df["priceByArea"] = df["priceByArea"].mask(cond)

### size

In [203]:
# Five largest sizes, to spot implausibly big values.
df["size"].nlargest(n=5)

0     371.00
407   341.00
260   238.00
71    230.00
237   216.00
Name: size, dtype: float64

In [204]:
# Five smallest sizes, to spot implausibly small values.
df["size"].nsmallest(n=5)

335   20.00
169   23.00
280   23.00
34    25.00
133   25.00
Name: size, dtype: float64

No parecen erróneos

### rooms

In [205]:
# Distinct room counts.
df.loc[:, "rooms"].unique()

array([6, 1, 2, 3, 0, 4])

Un valor de 0 indica que se trata de estudios

### bathrooms

In [206]:
# Distinct bathroom counts.
df.loc[:, "bathrooms"].unique()

array([3, 1, 2])

No se ven valores erróneos

# Corregir "floor"
- Vamos a pasar todo a texto para que sea una categórica

In [207]:
# Distinct raw floor codes before they are translated to labels.
pd.unique(df["floor"])

array([nan, '3', 'bj', '2', '1', '5', 'en', '4', 'st', '8', '7', '6',
       '14', 'ss'], dtype=object)

### Generamos un diccionario para aplicar los cambios

In [208]:
# Translation table from the raw floor codes present in the data to readable
# Spanish labels, so the column can be handled as a plain categorical.
# NOTE(review): "ss" and "st" are both collapsed into "sotano" — confirm that
# merging the two codes is intended.
diccionario_pisos = {
    "1": "primero",
    "2": "segundo",
    "3": "tercero",
    "4": "cuarto",
    "5": "quinto",
    "6": "sexto",
    "7": "septimo",
    "8": "octavo",
    "14": "decimo cuarto",
    "bj": "bajo",
    "en": "entreplanta",
    "ss": "sotano",
    "st": "sotano",
}

# Series.replace (instead of Series.map) leaves values that are not keys of
# the dictionary untouched. With map, a floor code missing from the dictionary
# would silently become NaN, and re-running this cell would turn the whole
# column into NaN because the translated labels are not keys either.
df["floor"] = df["floor"].replace(diccionario_pisos)
df["floor"].unique()


array([nan, 'tercero', 'bajo', 'segundo', 'primero', 'quinto',
       'entreplanta', 'cuarto', 'sotano', 'octavo', 'septimo', 'sexto',
       'decimo cuarto'], dtype=object)

# Reducción / Simplificación de columnas
- Vamos a quitarnos todo lo que:
    - Especifique demasiado para el precio
- Vamos a simplificar todo lo que:
    - Especifique demasiado, pero al generalizarlo, aporte valor

In [209]:
# Preview one randomly chosen row.
df.sample(n=1)

Unnamed: 0,propertyType,status,price,priceByArea,size,rooms,bathrooms,floor,neighborhood,address,district,province,municipality,exterior,hasLift,hasPlan,has3DTour,has360,distance
330,flat,good,600.0,9.0,70.0,3,1,segundo,,Colonia Esperanza,,Segovia,San Ildefonso o la Granja,True,False,False,False,False,59919


### Dropeamos
- address: No se puede generalizar

In [210]:
# Remove the address column from df in place.
df.drop(labels="address", axis="columns", inplace=True)

### Generalizamos
- distance: Lo haremos por rangos de distancia al centro

In [211]:
# Five largest distances (upper end of the range to bucket).
df["distance"].nlargest(n=5)

330    59919
359    59674
361    59601
293    59351
352    59106
Name: distance, dtype: int64

In [212]:
# Five smallest distances (lower end of the range to bucket).
df["distance"].nsmallest(n=5)

133    183
154    470
34     533
168    625
394    691
Name: distance, dtype: int64

In [213]:
# Bucket `distance` into labelled ranges stored in `distancia_centro`.
# Rows that match no range keep the "unknown" placeholder.
# NOTE(review): the km labels suggest `distance` is in metres — confirm.
df["distancia_centro"] = "unknown"

# Each entry: (lower bound, upper bound, which bounds are inclusive, label).
# Only the last range closes its upper bound, so exactly 50000 lands in it.
rangos_distancia = [
    (0, 1000, "left", "Menos de 1 km"),
    (1000, 5000, "left", "Entre 1 y 5 km"),
    (5000, 10000, "left", "Entre 5 y 10 km"),
    (10000, 20000, "left", "Entre 10 y 20 km"),
    (20000, 30000, "left", "Entre 20 y 30 km"),
    (30000, 40000, "left", "Entre 30 y 40 km"),
    (40000, 50000, "both", "Entre 40 y 50 km"),
]
for minimo, maximo, cierre, etiqueta in rangos_distancia:
    en_rango = df["distance"].between(minimo, maximo, inclusive=cierre)
    df.loc[en_rango, "distancia_centro"] = etiqueta

# Open-ended top bucket: everything strictly above 50000.
df.loc[df["distance"].gt(50000), "distancia_centro"] = "Mas de 50 km"

In [214]:
# Number of rows per distance bucket, most frequent first.
df.loc[:, "distancia_centro"].value_counts()

distancia_centro
Entre 5 y 10 km     82
Entre 20 y 30 km    74
Entre 10 y 20 km    65
Entre 30 y 40 km    62
Entre 1 y 5 km      62
Entre 40 y 50 km    45
Mas de 50 km        40
Menos de 1 km       12
Name: count, dtype: int64

# Dropeamos distance

In [215]:
# Remove the raw distance column in place; distancia_centro replaces it.
df.drop(labels="distance", axis="columns", inplace=True)

# Gestionar nulos variables categóricas
- Las booleanas las volvemos categóricas
- Los nans los pondremos en desconocido

In [216]:
# Print each column's non-null count and dtype to see where nulls remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   propertyType      442 non-null    object 
 1   status            424 non-null    object 
 2   price             442 non-null    float64
 3   priceByArea       409 non-null    float64
 4   size              442 non-null    float64
 5   rooms             442 non-null    int64  
 6   bathrooms         442 non-null    int64  
 7   floor             361 non-null    object 
 8   neighborhood      174 non-null    object 
 9   district          333 non-null    object 
 10  province          442 non-null    object 
 11  municipality      442 non-null    object 
 12  exterior          442 non-null    bool   
 13  hasLift           411 non-null    object 
 14  hasPlan           424 non-null    object 
 15  has3DTour         424 non-null    object 
 16  has360            424 non-null    object 
 1

# Columnas Categóricas

- hasPlan
- has3DTour
- has360
- distancia_centro

### propertyType

In [217]:
# Count of missing values in propertyType.
df["propertyType"].isna().sum()

np.int64(0)

### status

### floor

### neighborhood 

### district

### province

### municipality

###  exterior

### hasLift