In [1]:
# types
from numpy import array
# Imports 
import time
import datetime as dt
# Math
import pandas as pd
import numpy as np
# Plot
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Ml
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
# Save Models
import joblib
# Directories
from cleansting.cleansting import CleanSting
from cleansting.express import express_work_setup, rm_outliers_by_colonia, set_dummies_features, colonies_encoder, identify_skew_features
from cleansting.ml.select_model import compare_ml_models, per_error_predicts, error_resume, hpt_random_forest, hpt_ridge, hpt_linear, compare_model_error, compare_score_models
from cleansting.data_visualization.plots import heatmap_corr

In [2]:
d1 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\enero\lamudi_22_01_2022.json'
d2 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\febrero\01_02_2022.json'
d3 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\febrero\lamudi_15_02_22.json'
d4 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\marzo\lamudi_28_02_22.json'
d5 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\marzo\lamudi_12_03_22.json'
d6 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\marzo\lamudi_18_03_22.json'
d7 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\marzo\lamudi_26_03_2022.json'
d8 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\abril\lamudi_01_04_22.json'
d9 = r'C:\Users\albin\PycharmProjects\webscraper_lamudi\lamudi\json_raw\abril\lamudi_08_04_22.json'

In [3]:
cl = CleanSting()
cl.fit_json(d1, d2, d3,d4,d5,d6,d7,d8, d9)
# Clean Information and add new columns
df  =  cl.factory()
# Main Columns
columns = cl.COLUMNS[0:-1]

In [4]:
columns

['price', 'm2_const', 'm2_terreno', 'habitaciones', 'banos', 'autos']

In [5]:
cl = CleanSting()
cl = cl.setup_model_data(d1, d2, d3,d4,d5,d6,d7,d8, d9, filter_by = {'tipo_inmueble': 'Casa', 'tipo_oferta': 'Venta'})

In [6]:
zona = cl.df['filtered']

In [7]:
above_50_properties = zona.colonia.value_counts().pipe(lambda serie_ : serie_[serie_ > 50]).index

In [8]:
len(above_50_properties)

33

# Test Models

In [9]:
def qualify_each_zone(data, colonies: list, **kwargs):
    
    df = data
    
    results = list()
    
    for colonie in colonies:
        
        df_colonia = df.loc[df.colonia == colonie]
        
        # Split the Data
        X = df_colonia.loc[:, columns].values
    
        y = df_colonia.price.values

        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 72, test_size=.30)
        

        model = hpt_random_forest(X_train, y_train, max_depth= [4, 6, 8])
        y_pred = model.predict(X_test)
        score_pred = model.score(X_test, y_test)


        diff = np.abs(((y_pred - y_test) / y_test) * 100)
        error_per = 100 - np.mean(diff)
        
        result = {'colonia': colonie,'score': score_pred ,'error' : np.mean(diff), 'diff_score':  error_per, 'Good': True if score_pred >= .70 else False}
        print(result, '\n')
        results.append(result)
    
    return pd.DataFrame(results)

In [10]:
result = qualify_each_zone(zona, above_50_properties)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
{'colonia': 'Valle Imperial', 'score': 0.9984938113410328, 'error': 0.49304447369068416, 'diff_score': 99.50695552630931, 'Good': True} 

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
{'colonia': 'Zapopan Centro', 'score': 0.9978927385182632, 'error': 2.187164837029583, 'diff_score': 97.81283516297042, 'Good': True} 

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
{'colonia': 'Solares', 'score': 0.9932391494222591, 'error': 0.6228641645096346, 'diff_score': 99.37713583549036, 'Good': True} 

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
{'colonia': 'Valle Real', 'score': 0.9975908698637271, 'error': 1.4190554975782486, 'diff_score': 98.58094450242176, 'Good': True} 

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
{'colonia': 'Bosques de Santa Anita', 'score': 0.9989775238107967, 'error': 0.9225476703752399, 'diff_score': 99.07745232962476, 'Good': True} 

In [11]:
result.Good.sum() / len(result)

0.8787878787878788

In [30]:
result = pd.read_json('casa_venta_by_colonia')

In [65]:
result[result.Good == True]

Unnamed: 0,colonia,score,error,diff_score,Good
0,Valle Imperial,0.998494,0.493044,99.506956,True
1,Zapopan Centro,0.997893,2.187165,97.812835,True
2,Solares,0.993239,0.622864,99.377136,True
3,Valle Real,0.997591,1.419055,98.580945,True
4,Bosques de Santa Anita,0.998978,0.922548,99.077452,True
5,Bugambilias,0.998105,1.269194,98.730806,True
6,Nuevo Vallarta,0.989307,9.634431,90.365569,True
7,Puerta de Hierro,0.995137,1.214185,98.785815,True
8,Tlajomulco de Zuñiga,0.988652,5.623222,94.376778,True
10,Virreyes Residencial,0.987341,2.059438,97.940562,True


In [93]:
#
true_price = 13900000
#
m2_terreno = 530                  
m2_const= 420                  
habitaciones=4
banos=4
autos=4
to_predict = [m2_terreno, m2_const, habitaciones, banos, autos, endoder]

In [33]:
def compare(data, train_model, encoder,   colonie, to_predict):
    
    single = np.array([to_predict[:-1]])
    general = np.array([to_predict])
    columns = ['m2_terreno', 'm2_const', 'habitaciones', 'banos', 'autos']
    
    df = data
    
    select_colonie = df[df.colonia == colonie]
    
    # Select Features
    X = zona.loc[:, columns].values
    # Select Target
    y = zona.price.values
    # Split Data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 72, test_size=.30)
    # Select Best Stimator
    result_model = hpt_random_forest(X_train, y_train, n_estimators= [int(x) for x in np.linspace(80, 100, 4)])
    # Predict 
    result_single = result_model.predict(single)
    result_general = train_model.predict(general)
    
    
    return result_single, result_general
    

In [90]:
colonia = 'Las Cañadas'
encoder = zona[zona.colonia == colonia].endocer_colonia.unique()[0]

In [91]:
encoder

468

In [36]:
columns = ['m2_terreno', 'm2_const', 'habitaciones', 'banos', 'autos', 'endocer_colonia']
# Select Features
X = zona.loc[:, columns].values

# Select Target
y = zona.price.values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 72, test_size=.30)

In [14]:
rf = hpt_random_forest(X_train, y_train, n_estimators= [int(x) for x in np.linspace(80, 100, 4)])

Fitting 3 folds for each of 192 candidates, totalling 576 fits


In [94]:
single, general = compare(zona, rf, encoder,  colonia, to_predict)
dif_single = (single - true_price) /true_price * 100
dif_general = (general - true_price) /true_price * 100

print(f'\nColonia: {colonia.title()}\n')
print(f'Valor Real {true_price:,.2f}')
print(f'Resultadon individual: {single[0]:,.0f}')
print(f'Error individual: {np.abs(dif_single[0]):.2f}\n')
print('#####################################')
print(f'Resultadon General: {general[0]:,.0f}')
print(f'Error General: {np.abs(dif_general[0]):.2f}')

Fitting 3 folds for each of 192 candidates, totalling 576 fits
Colonia: Las Cañadas

Valor Real 13,900,000.00
Resultadon individual: 13,723,663
Error individual: 1.27

#####################################
Resultadon General: 15,241,823
Error General: 9.65
