In [None]:
import warnings
from pathlib import Path

import pandas as pd

# Silence every warning for the whole session (no category filter is given)
warnings.filterwarnings("ignore")


In [None]:
from src.preprocessing.data_extraction import extract_initial_data, get_info_from_polygons_and_ine
from src.preprocessing.preprocessing import (remove_duplicated_assets_id, 
                                             find_single_value_columns,
                                             treatment_missing_values, 
                                             feature_engineering, 
                                             imputation_values_not_nulls,
                                             detect_outliers_by_percentile,
                                             add_aggregated_features,
                                             correlation_values)
from sklearn.model_selection import train_test_split
from src.constants import NEW_COLUMNS_NAMES, REMOVE_COLUMNS_BY_INPUT, REMOVE_COLUMNS_BY_CORRELATIONS

Extracción de los datasets disponibles

In [None]:
# Load the raw inputs: extract_initial_data is called on the "input_data"
# folder and its result is unpacked into five objects.
# NOTE(review): presumably these are the assets table (df) plus INE, OSM, POIs
# and polygons tables, judging by the names — confirm against
# src.preprocessing.data_extraction.
# df_osm and df_pois are not referenced again in the cells shown here.
df, df_ine, df_osm, df_pois, df_polygons = extract_initial_data(
    root_dir="input_data"
)

División en train y test: el conjunto de test se deja sin modificar para probar después el pipeline y los modelos

In [None]:
# Separate the target column from the predictors
y = df['PRICE']
X = df.drop(columns=['PRICE'])

# Hold out 20% of the rows as the test partition (fixed seed so the split is
# reproducible across runs)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Re-attach the target as the FIRST column of each partition so the
# preprocessing steps can work on predictors and target together
df_train, df_test = (
    pd.concat([target, predictors], axis=1)
    for target, predictors in ((y_train, X_train), (y_test, X_test))
)

Empezar el preprocesado solo con df_train

In [None]:
# Change columns names to friendly ones.
# The assignment is positional: NEW_COLUMNS_NAMES must provide one name per
# column, in df_train's current column order. df_train was built with
# pd.concat([y_train, X_train], axis=1), so the target ('PRICE') is column 0.
# NOTE(review): confirm NEW_COLUMNS_NAMES is ordered with the target first —
# otherwise columns would be silently mislabeled. TODO verify in src.constants.
df_train.columns = NEW_COLUMNS_NAMES

In [None]:
# Enrich the training frame using the polygons and INE tables.
# Per the original note, this adds the columns 'geometry',
# 'precio_logaritmico', 'cusec', 'barrio_id' and 'barrio'.
# NOTE(review): how rows are matched to polygons / INE sections is decided
# inside the helper and is not visible from this notebook.
df_train = get_info_from_polygons_and_ine(df_polygons=df_polygons, df_ine=df_ine, df=df_train)

In [None]:
# Remove duplicate assets.
# NOTE(review): criteria="last" presumably keeps the last occurrence of each
# repeated asset id — confirm against the helper's implementation.
df_train = remove_duplicated_assets_id(df_assets=df_train, criteria="last")

In [None]:
# Feature engineering step. Per the original note it:
#   - adds the variables interior (1/0) and antiguidade
#   - removes assets that are "nueva_construccion"
# The helper returns three values: the first is discarded here, the second is
# the list of columns dropped later ("by creation of new ones"), and the third
# is the updated training frame.
_, columns_to_drop_by_creation_of_new_ones, df_train = feature_engineering(df=df_train)

In [None]:
# Missing-value treatment for the training frame.
# NOTE(review): the parameter name suggests that rows with nulls in 'n_piso',
# 'cat_calidad' or 'interior' are dropped — confirm in src.preprocessing.
df_train = treatment_missing_values(df=df_train, columns_to_drop_null_values=['n_piso', 'cat_calidad', 'interior'])

In [None]:
# Impute number of bathrooms and price_parking when asset does not have parking.
# The helper also returns median_bathrooms_per_sqm, which is not used again in
# the cells shown here.
# NOTE(review): presumably kept so the same imputation value can be applied to
# the test set later — confirm.
df_train, median_bathrooms_per_sqm = imputation_values_not_nulls(df=df_train)

In [20]:
# Find the columns that hold a single distinct value
remove_unique_value_columns = find_single_value_columns(df=df_train)

# Drop the unwanted columns one group at a time, in this order:
#   1. single-value columns found above
#   2. columns removed by input (team decision)
#   3. columns removed because new ones were created from them (team decision)
#   4. columns removed by correlations (team decision)
for columns_group in (
    remove_unique_value_columns,
    REMOVE_COLUMNS_BY_INPUT,
    columns_to_drop_by_creation_of_new_ones,
    REMOVE_COLUMNS_BY_CORRELATIONS,
):
    df_train = df_train.drop(columns=columns_group)

Columns with only one distinct value: ['tipologia_imueble', 'operacion', 'ciudad', 'ADTYPOLOGY', 'ADOPERATION']


In [None]:
# Identify and remove outliers by percentile 0.995 for the variables most
# correlated with the target
outlier_check_columns = [
    "n_banos",
    "n_habitaciones",
    "area_construida",
    "distancia_castellana",
]
df_train = detect_outliers_by_percentile(
    df=df_train,
    percentile=0.995,
    variables_most_correlated_w_target=outlier_check_columns,
)

In [None]:
# Compute correlations on the cleaned training frame with threshold=0.8.
# Only the first returned value (correlation_matrix) is kept; the second is
# discarded.
# NOTE(review): presumably the second value lists the pairs above the
# threshold — confirm in src.preprocessing.
correlation_matrix, _=correlation_values(df=df_train, threshold=0.8)

In [None]:
# Add aggregated features keyed on the "barrio" column.
# NOTE(review): which aggregates are computed is defined inside the helper. If
# they involve the target, they must be derived from training rows only to
# avoid leakage into the test set — confirm.
df_train = add_aggregated_features(df=df_train, variable="barrio")

In [None]:
# Persist the preprocessed training set.
# The output folder is created first (no-op when it already exists) so that a
# fresh checkout / Restart & Run All does not fail because 'output_data' is
# missing. The DataFrame index is written as the first CSV column, as before.
train_output_path = Path("output_data") / "df_train_util_v2.csv"
train_output_path.parent.mkdir(parents=True, exist_ok=True)
df_train.to_csv(train_output_path)