In [126]:
import pandas as pd # type: ignore

# Read the CSV file
data = pd.read_csv('train.csv')

# Display the first few rows of the data
data.head()
data.columns = data.columns.str.replace("'", "", regex=False)

In [127]:
def extract_Location(data_set: pd.DataFrame) -> pd.DataFrame:
    data_set['Location'] = data_set['Location'].str.extract(r'([^-\s]+)')
    data_set['Location'].fillna('NA', inplace=True)
    return data_set
data= extract_Location(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Location'].fillna('NA', inplace=True)


In [128]:
def clean_beach_resort(dataset):
    allowed_values = [ 'Nulo' ,'Not specified', '0' , '1']
    dataset['Beach_Resort'] = dataset['Beach_Resort'].apply(lambda x: 'NA' if pd.isna(x) or x in allowed_values else x)
    return dataset

In [129]:
data = clean_beach_resort(data)

In [130]:
def extract_date(data_set: pd.DataFrame) -> pd.DataFrame:
    #data_set['Publication_Date'] = data_set['Publication_Date'].str.extract(r'Publicado el (.+)')
    #data_set['Publication_Date'] = pd.to_datetime(data_set['Publication_Date'], errors='coerce')
    #data_set['Publication_Date'] = data_set['Publication_Date'].fillna(data_set['Publication_Date'].mode()[0])
    #data_set['Publication_Year'] = data_set['Publication_Date'].dt.year
    #data_set['Publication_Month'] = data_set['Publication_Date'].dt.month
    #data_set['Publication_Day'] = data_set['Publication_Date'].dt.day
    data_set.drop(columns='Publication_Date', inplace=True)
    return data_set

In [131]:
data = extract_date(data)

In [132]:
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

In [133]:
data.groupby('Total_Area_m2').size()

Total_Area_m2
0                 43
1                  2
100               31
1000             143
1002               5
                ... 
99                 3
990                4
997                1
999                2
Not specified      1
Length: 1121, dtype: int64

In [134]:
for column in data.columns:
    print('--------------------------------')
    print(data.groupby(column).size())

--------------------------------
Id
0       1
1       1
2       1
3       1
4       1
       ..
6995    1
6996    1
6997    1
6998    1
6999    1
Length: 7000, dtype: int64
--------------------------------
Gas_Connection
0                3175
1                 484
Not specified    3341
dtype: int64
--------------------------------
Fireplace
0                3270
1                1319
Not specified    2411
dtype: int64
--------------------------------
Entrance_Hall
0                1864
1                2695
Not specified    2441
dtype: int64
--------------------------------
Kitchenette
0                6791
Not specified     209
dtype: int64
--------------------------------
Equipped
0                4161
1                 428
Not specified    2411
dtype: int64
--------------------------------
Garage_Type
Lineales          604
NoTiene           835
Not specified    2892
Paralelas        2505
Separadas         164
dtype: int64
--------------------------------
Furnished
0                3

In [135]:
# Remove Construction_Area column, its redundant with construction area
data.drop(columns=['Construction_Area'], inplace=True)


In [136]:
data['Total_Area_m2'] = pd.to_numeric(data['Total_Area_m2'], errors='coerce')

In [137]:
dict_bedrooms = {
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5+': 5
}
def preprocess_data(data_set):
    data_set['Bedrooms'] = data_set['Bedrooms'].apply(lambda x: dict_bedrooms.get(x, x))
    data_set['Bedrooms'].fillna(data_set['Bedrooms'].mode()[0], inplace=True)
    return data_set

In [138]:
data = preprocess_data(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Bedrooms'].fillna(data_set['Bedrooms'].mode()[0], inplace=True)


In [139]:
numeric_columns = ['Number_Floors', 'Age']
def fill_numeric_column(column_name, dataset):
    dataset[column_name] = pd.to_numeric(dataset[column_name], errors='coerce')
    dataset[column_name].fillna(dataset[column_name].mode()[0], inplace=True)
    return dataset

In [140]:

for column in numeric_columns:
    data = fill_numeric_column(column, data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column_name].fillna(dataset[column_name].mode()[0], inplace=True)


In [141]:
allowed_values = ['0', '1', 'Nulo', 'Not specified', 'Si', 'No']
def remove_trash_data(column,dataset):
    print('--------------------------------')
    print('Removing trash data from column:', column)
    print(dataset.groupby(column).size())
    dataset = dataset[(dataset[column].isin(allowed_values)) | (dataset[column].isnull())]
    print(dataset.groupby(column).size())
    return dataset

In [142]:
trash_columns = ['Internet', 'Sauna_Area', 'Air_Conditioning', 'Independent_Entrance', 'Jacuzzi', 'Service_Bathroom'
                 ,'Cable', 'Service_Room', 'Internet_Room', 'Daycare', 'Daily_Dining_Room', 'Heating', 'Pets']
for column in trash_columns:
    data = remove_trash_data(column, data)

--------------------------------
Removing trash data from column: Internet
Internet
0                                                                2105
1                                                                1194
Not specified                                                    3160
Panamericana Sur Km 94.5 Asia                                       1
Panamericana Sur Km 94.5 Club Playa Las Arenas Asia                 1
Panamericana Sur Km 94.5 Club Playa Las Arenas Casa B_22 Asia       1
Publicado el 05.09.19                                               1
Publicado el 10.09.19                                               1
Publicado el 12.08.19                                               2
Publicado el 13.04.19                                               5
Publicado el 13.09.19                                               1
Publicado el 14.09.19                                               1
Publicado el 17.09.19                                               6
Public

In [143]:
data = data[data['Age']!= 'Not specified']
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

In [144]:
import numpy as np
def replace_not_specified_with_mode(column, dataset):
    dataset[column] = dataset[column].apply(lambda x: np.nan if (x == 'Not specified' or x == 'Nulo') else
                                      '1' if (x == '1' or x == 'Si')
                                      else '0' if (x == '0' or x == 'No') else x)
    #dataset[column+'_missing'] = dataset[column].isna().astype(int)
    #dataset[column].fillna(None, inplace=True)
    dataset[column].fillna('NA', inplace=True)
    dataset[column].replace('NA', None, inplace=True)
    dataset[column] = dataset[column].astype('category')
    print(dataset.groupby(column).size())
    return dataset

In [145]:
# Posibles columnas repetidas en el dataset
# Sauna, Sauna_area,
# Oceanfront, Near_Sea
# Kitchenette, Kitchen_with_Cabinets


columns_replace_with_mode = ['Gas_Connection', 'Fireplace', 'Entrance_Hall', 'Kitchenette',
           'Equipped', 'Furnished', 'Drainage', 'Telephone', 'Guest_Bathroom', 'BBQ_Area',
           'Living_Room', 'Nearby_Parks', 'Solarium', 'Commercial_Use', 'Internal_Garden',
           'Electricity', 'Patio', 'Children_Playground', 'Green_Areas', 'Electric_Doorman',
           'Intercom', 'Near_Sea', 'Sauna', 'Cinema_Room', 'Cleaning_Service', 'Terrace',
           'Sports_Area', 'Security_System', 'Water_Heater', 'Professional_Use', 'Club_House',
           'Internal_Park', 'Laundry_Room', 'Nearby_Schools', 'Balcony', 'Attic', 'Oceanfront', 'Security_Guard',
           'Swimming_Pool', 'Electric_Fence', 'Air_Conditioning', 'Hall', 'Nearby_Shopping_Centers', 'Kitchen',
           'Water', 'Basement', 'Independent_Bathroom', 'Walk_in_Closet', 'Grill', 'Closet',
           'Internet', 'Sauna_Area', 'Kitchen_with_Cabinets', 'Gym', 'Handicap_Access', 'Dining_Room',
           'Office', 'Service_Bathroom', 'Storage_Room', 'Cable', 'Jacuzzi', 'Independent_Entrance', 'Service_Room',
           'Internet_Room', 'Daycare', 'Daily_Dining_Room', 'Heating', 'Match', 'Pets', 'Garden']
for column in columns_replace_with_mode:
    print(f'Processing column: {column}')
    data = replace_not_specified_with_mode(column, data)

Processing column: Gas_Connection
Gas_Connection
0    3139
1     477
dtype: int64
Processing column: Fireplace
Fireplace
0    3240
1    1311
dtype: int64
Processing column: Entrance_Hall
Entrance_Hall
0    1856
1    2690
dtype: int64
Processing column: Kitchenette
Kitchenette
0    6748
dtype: int64
Processing column: Equipped
Equipped
0    4134
1     417
dtype: int64
Processing column: Furnished
Furnished
0    3738
1     813
dtype: int64
Processing column: Drainage
Drainage
0    3277
1    1274
dtype: int64
Processing column: Telephone
Telephone
0    2352
1    1264
dtype: int64
Processing column: Guest_Bathroom
Guest_Bathroom
0    3167
1    3581
dtype: int64
Processing column: BBQ_Area
BBQ_Area
0    3745
1     801
dtype: int64
Processing column: Living_Room
Living_Room
0    3834
1    2914
dtype: int64
Processing column: Nearby_Parks
Nearby_Parks
0     672
1    2184
dtype: int64
Processing column: Solarium
Solarium
0    4538
1       8
dtype: int64
Processing column: Commercial_Use
Commer

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will n

Terrace
0    3081
1    3667
dtype: int64
Processing column: Sports_Area
Sports_Area
0    3994
1     552
dtype: int64
Processing column: Security_System
Security_System
0    2669
1     947
dtype: int64
Processing column: Water_Heater
Water_Heater
0    3182
1     992
dtype: int64
Processing column: Professional_Use
Professional_Use
0    273
1    386
dtype: int64
Processing column: Club_House
Club_House
0    4218
1      32
dtype: int64
Processing column: Internal_Park
Internal_Park
0    4134
1     116
dtype: int64
Processing column: Laundry_Room
Laundry_Room
0    2458
1    3814
dtype: int64
Processing column: Nearby_Schools
Nearby_Schools
0     718
1    1874
dtype: int64
Processing column: Balcony
Balcony
0    5413
1     859
dtype: int64
Processing column: Attic
Attic
0    6084
1     188
dtype: int64
Processing column: Oceanfront
Oceanfront
0    2337
1     255
dtype: int64
Processing column: Security_Guard
Security_Guard
0    1795
1    1505
dtype: int64
Processing column: Swimming_Pool
Sw

  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change i

Handicap_Access
0    2187
1     405
dtype: int64
Processing column: Dining_Room
Dining_Room
0     531
1    5739
dtype: int64
Processing column: Office
Office
0    4217
1    2053
dtype: int64
Processing column: Service_Bathroom
Service_Bathroom
0    1390
1    4880
dtype: int64
Processing column: Storage_Room
Storage_Room
0    4821
1    1449
dtype: int64
Processing column: Cable
Cable
0    1997
1    1302
dtype: int64
Processing column: Jacuzzi
Jacuzzi
0    3320
1     854
dtype: int64
Processing column: Independent_Entrance
Independent_Entrance
0    2592
dtype: int64
Processing column: Service_Room
Service_Room
0    1381
1    4889
dtype: int64
Processing column: Internet_Room
Internet_Room
0    4227
1      23
dtype: int64
Processing column: Daycare
Daycare
0    4230
1      20
dtype: int64
Processing column: Daily_Dining_Room
Daily_Dining_Room
0    2096
1    4176
dtype: int64
Processing column: Heating
Heating
0    4126
1      48
dtype: int64
Processing column: Match
Match
0.0    6461
dtyp

  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change i

In [146]:
def clean_categorical_columns(dataset):
    categorical_cols = ['Garage_Type', 'Province', 'Type', 'Location', 'Natural_Light', 'Advertiser', 'Bedrooms', 'Pets', 'Garden', 'Beach_Resort', 'District', 'Property_Condition']
    for column in categorical_cols:
        print(f'Processing categorical column: {column}')
        dataset[column] = dataset[column].astype('category')
        #dataset[column].cat.add_categories(None)
        dataset[column] = dataset[column].replace('NA', None)
        dataset[column] = dataset[column].replace('Not specified', None)

        #dataset[column] = dataset[column].fillna(dataset[column].mode()[0])
        #dataset[column] = dataset[column].fillna(None)

    return dataset


In [147]:
data = clean_categorical_columns(data)

  dataset[column] = dataset[column].replace('Not specified', None)
  dataset[column] = dataset[column].replace('NA', None)


Processing categorical column: Garage_Type
Processing categorical column: Province
Processing categorical column: Type
Processing categorical column: Location
Processing categorical column: Natural_Light
Processing categorical column: Advertiser
Processing categorical column: Bedrooms
Processing categorical column: Pets
Processing categorical column: Garden
Processing categorical column: Beach_Resort
Processing categorical column: District
Processing categorical column: Property_Condition


  dataset[column] = dataset[column].replace('Not specified', None)
  dataset[column] = dataset[column].replace('NA', None)
  dataset[column] = dataset[column].replace('Not specified', None)


In [148]:
# Drop unique columns
for column in data.columns:
    if len(data[column].unique()) == len(data):
        print(f'Dropping unique column: {column}')
        data.drop(columns=[column], inplace=True)
    elif len(data[column].unique()) == 1:
        print(f'Dropping column with one value: {column}')
        data.drop(columns=[column], inplace=True)

Dropping unique column: Id


In [149]:
columns_delete = [ 'Advertiser', 'Cleaning_Service', 'Telephone', 'Internet_Room', 'Service_Bathroom', 'Service_Room',
                    'Handicap_Access', 'Office', 'Club_House', 'Kitchen', 'Air_Conditioning', 'Location', 'Match']

In [150]:
# Identify categorical and numerical columns
columns_object =data.select_dtypes(include=['object']).columns.tolist()
print(columns_object)
data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
#data.drop(columns=columns_object, inplace=True)

[]


['Garages',
 'Number_Bathrooms',
 'Construction_Area_m2',
 'Age',
 'Number_Floors',
 'Total_Area_m2',
 'Price']

In [151]:
data.select_dtypes(include=['int64', 'float64']).columns.tolist()

['Garages',
 'Number_Bathrooms',
 'Construction_Area_m2',
 'Age',
 'Number_Floors',
 'Total_Area_m2',
 'Price']

In [152]:
import numpy as np
def replace_not_specified_with_mediam(column, dataset):
    dataset[column] = dataset[column].apply(lambda x: np.nan if (x == 'Not specified' or x == 'Nulo') else x)
    dataset[column].fillna(dataset[column].median(), inplace=True)
    return dataset

In [153]:
data = replace_not_specified_with_mediam('Total_Area_m2', data)
#data.drop(columns='Beach_Resort', inplace=True)
data.drop_duplicates(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].median(), inplace=True)


In [154]:
data.drop(columns=columns_delete, inplace=True)

In [155]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Select features and target
X = data.drop(columns=['Price'])
y = data['Price']
y = np.log(y)  # Log-transform the target variable for better performance

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['category']).columns.tolist()
numerical_cols = ['Garages', 'Number_Bathrooms', 'Age', 'Number_Floors']
log_cols = ['Construction_Area_m2', 'Total_Area_m2']
boolean_cols = X.select_dtypes(include=['bool']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('bool', OneHotEncoder() ,boolean_cols),
        ('num', StandardScaler(), numerical_cols),
        ('log', FunctionTransformer(np.log), log_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit and transform data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Build neural network
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train_processed.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)


optimizer = Adam(clipvalue=0.001)  # Clip gradients to a value between -1 and 1
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# Train model
history = model.fit(X_train_processed, y_train, epochs=150, batch_size=64, validation_split=0.1, callbacks=[early_stop], verbose=1)

# Evaluate model
loss, mae = model.evaluate(X_test_processed, y_test, verbose=0)
print(f"Test MAE: {mae:.2f}")



Epoch 1/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - loss: 20.6322 - mae: 2.6679 - val_loss: 0.4707 - val_mae: 0.5645
Epoch 2/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 0.2534 - mae: 0.3838 - val_loss: 0.2303 - val_mae: 0.3581
Epoch 3/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.1949 - mae: 0.3352 - val_loss: 0.2309 - val_mae: 0.3666
Epoch 4/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 0.1936 - mae: 0.3314 - val_loss: 0.1821 - val_mae: 0.3101
Epoch 5/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.1731 - mae: 0.3124 - val_loss: 0.4050 - val_mae: 0.5415
Epoch 6/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2368 - mae: 0.3786 - val_loss: 0.2691 - val_mae: 0.3888
Epoch 7/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss

In [156]:
'''
import tensorflow as tf
from tensorflow.keras import layers, models
from xgboost import XGBRegressor

input_dim = X_train_processed.shape[1]
encoding_dim = 20  # compress to 4 features

# Build autoencoder model
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(8, activation='relu')(input_layer)
encoded = layers.Dense(encoding_dim, activation='relu')(encoded)  # bottleneck

decoded = layers.Dense(8, activation='relu')(encoded)
decoded = layers.Dense(input_dim, activation='linear')(decoded)

autoencoder = models.Model(inputs=input_layer, outputs=decoded)

# Compile and train
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_processed, X_train_processed, epochs=100, batch_size=32, shuffle=True, validation_data=(X_test_processed, X_test_processed), verbose=0)
'''

"\nimport tensorflow as tf\nfrom tensorflow.keras import layers, models\nfrom xgboost import XGBRegressor\n\ninput_dim = X_train_processed.shape[1]\nencoding_dim = 20  # compress to 4 features\n\n# Build autoencoder model\ninput_layer = layers.Input(shape=(input_dim,))\nencoded = layers.Dense(8, activation='relu')(input_layer)\nencoded = layers.Dense(encoding_dim, activation='relu')(encoded)  # bottleneck\n\ndecoded = layers.Dense(8, activation='relu')(encoded)\ndecoded = layers.Dense(input_dim, activation='linear')(decoded)\n\nautoencoder = models.Model(inputs=input_layer, outputs=decoded)\n\n# Compile and train\nautoencoder.compile(optimizer='adam', loss='mse')\nautoencoder.fit(X_train_processed, X_train_processed, epochs=100, batch_size=32, shuffle=True, validation_data=(X_test_processed, X_test_processed), verbose=0)\n"

In [157]:
'''
encoder = models.Model(inputs=input_layer, outputs=encoded)

# Encode training and test data
X_train_encoded = encoder.predict(X_train_processed)
X_test_encoded = encoder.predict(X_test_processed)
'''

'\nencoder = models.Model(inputs=input_layer, outputs=encoded)\n\n# Encode training and test data\nX_train_encoded = encoder.predict(X_train_processed)\nX_test_encoded = encoder.predict(X_test_processed)\n'

In [158]:

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
# Create the model
mlp = MLPRegressor(hidden_layer_sizes=(50, 30), max_iter=10000, random_state=1)

# Train the model
mlp.fit(X_train_processed, y_train)
y_pred = mlp.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.31784626381552444
R2 score on test set: 0.7282


In [159]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# 7. Train Random Forest Regressor
randomForest = RandomForestRegressor(n_estimators=500, random_state=42, max_depth=10,)
randomForest.fit(X_train_processed, y_train)

# 8. Predict and evaluate
y_pred = randomForest.predict(X_test_processed)
# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.2881817018281523
R2 score on test set: 0.7893


In [160]:
import numpy as np
from sklearn.linear_model import Lasso

# Create Lasso model with regularization strength (alpha)
lasso = Lasso(alpha=0.1)

# Fit to training data
lasso.fit(X_train_processed, y_train)

y_pred = lasso.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.4154630389227033
R2 score on test set: 0.5901


In [161]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=151
)

# Train the model
xgb_model.fit(X_train_processed, y_train)

y_pred = xgb_model.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.25557489190396715
R2 score on test set: 0.8265


In [None]:

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Crear el clasificador XGBoost
xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'min_child_weight': [1, 5, 10]
}
# Realizar GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)

# Ajustar el modelo
grid_search.fit(X_train_processed, y_train)

# Obtener los mejores parámetros
best_params = grid_search.best_params_
print(f"Mejores parámetros: {best_params}")
# Predecir con el mejor modelo
best_model = grid_search.best_estimator_

# Predicciones
y_pred = best_model.predict(X_test_processed)

# Evaluar el rendimiento
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")


Fitting 3 folds for each of 1458 candidates, totalling 4374 fits


In [None]:
'''
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

xgb_model_best = XGBRegressor(
    colsample_bytree=0.7,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    n_estimators=500,
    subsample=1.0
)

# Train the model
xgb_model_best.fit(X_train_processed, y_train)

y_pred = xgb_model_best.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")
'''

MAE: 0.24579752737948088
R2 score on test set: 0.8406


In [None]:
from sklearn.metrics import r2_score

# Predict on test set
y_pred = model.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
MAE: 0.27692590526046346
R2 score on test set: 0.7970


In [None]:
test_data = pd.read_csv('/content/sample_data/test.csv')
test_data.columns = test_data.columns.str.replace("'", "", regex=False)
test_data.head()



Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,0,Not specified,0,1,0,0,Paralelas,0,0,Not specified,...,Not specified,1,1,340,0,LaMolina,0,Not specified,0,1
1,1,Not specified,Not specified,1,0,Not specified,Lineales,Not specified,Not specified,Not specified,...,Not specified,1,0,1138,Not specified,LaMolina,0,Bueno,0,1
2,2,Not specified,0,0,0,0,Paralelas,0,0,Not specified,...,Not specified,0,1,1353,0,LaMolina,0,Bueno,0,1
3,3,0,1,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,230,0,SantiagoDeSurco,0,Muy bueno,0,1
4,4,0,Not specified,Not specified,0,Not specified,Not specified,Not specified,Not specified,0,...,Not specified,1,0,305,Not specified,LaMolina,Not specified,Not specified,Not specified,1


In [None]:
test_data

Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,0,Not specified,0,1,0,0,Paralelas,0,0,Not specified,...,Not specified,1,1,340,0,LaMolina,0,Not specified,0,1
1,1,Not specified,Not specified,1,0,Not specified,Lineales,Not specified,Not specified,Not specified,...,Not specified,1,0,1138,Not specified,LaMolina,0,Bueno,0,1
2,2,Not specified,0,0,0,0,Paralelas,0,0,Not specified,...,Not specified,0,1,1353,0,LaMolina,0,Bueno,0,1
3,3,0,1,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,230,0,SantiagoDeSurco,0,Muy bueno,0,1
4,4,0,Not specified,Not specified,0,Not specified,Not specified,Not specified,Not specified,0,...,Not specified,1,0,305,Not specified,LaMolina,Not specified,Not specified,Not specified,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,652,Not specified,Not specified,1,0,Not specified,Paralelas,Not specified,Not specified,Not specified,...,Not specified,1,1,600,Not specified,LaMolina,0,Not specified,0,1
653,653,1,0,1,0,1,Separadas,1,0,0,...,0,1,0,204,0,Asia,0,Excelente,0,1
654,654,0,Not specified,1,0,Not specified,Paralelas,Not specified,Not specified,0,...,Not specified,1,1,270,Not specified,LaMolina,0,Not specified,0,1
655,655,0,0,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,160,0,Asia,0,Not specified,0,1


In [None]:
# Remove Construction_Area column, its redundant with construction area
#test_data.drop(columns=['Beach:Resort'], inplace=True)
test_data = extract_date(test_data)
test_data= extract_Location(test_data)
test_data = clean_beach_resort(test_data)
test_data.drop(columns=['Construction_Area'], inplace=True)
test_data['Total_Area_m2'] = pd.to_numeric(test_data['Total_Area_m2'], errors='coerce')
test_data = replace_not_specified_with_mediam('Total_Area_m2', test_data)
test_data = preprocess_data(test_data)
for column in numeric_columns:
    test_data = fill_numeric_column(column, test_data)

test_data['Age'] = pd.to_numeric(test_data['Age'], errors='coerce')

for column in trash_columns:
    test_data = remove_trash_data(column, test_data)
for column in columns_replace_with_mode:
    print(f'Processing column: {column}')
    test_data = replace_not_specified_with_mode(column, test_data)

test_data = clean_categorical_columns(test_data)

# Drop unique columns
for column in test_data.columns:
    if len(test_data[column].unique()) == len(test_data):
        print(f'Dropping unique column: {column}')
        test_data.drop(columns=[column], inplace=True)
    elif len(test_data[column].unique()) == 1:
        print(f'Dropping column with one value: {column}')
        test_data.drop(columns=[column], inplace=True)

test_data.drop(columns=columns_object, inplace=True)
test_data.drop(columns=columns_delete, inplace=True)
#test_data.drop(columns='Beach_Resort', inplace=True)
test_data.info()

--------------------------------
Removing trash data from column: Internet
Internet
0                192
1                112
Not specified    309
dtype: int64
Internet
0                192
1                112
Not specified    309
dtype: int64
--------------------------------
Removing trash data from column: Sauna_Area
Sauna_Area
0                404
1                  8
Not specified    201
dtype: int64
Sauna_Area
0                404
1                  8
Not specified    201
dtype: int64
--------------------------------
Removing trash data from column: Air_Conditioning
Air_Conditioning
0                317
1                 72
Not specified    224
dtype: int64
Air_Conditioning
0                317
1                 72
Not specified    224
dtype: int64
--------------------------------
Removing trash data from column: Independent_Entrance
Independent_Entrance
0                249
Not specified    364
dtype: int64
Independent_Entrance
0                249
Not specified    364
dtype: in

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Location'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

Independent_Bathroom
0    599
dtype: int64
Processing column: Walk_in_Closet
Walk_in_Closet
0    454
1    145
dtype: int64
Processing column: Grill
Grill
0    189
1    200
dtype: int64
Processing column: Closet
Closet
0    560
1     39
dtype: int64
Processing column: Internet
Internet
0    192
1    112
dtype: int64
Processing column: Sauna_Area
Sauna_Area
0    404
1      8
dtype: int64
Processing column: Kitchen_with_Cabinets
Kitchen_with_Cabinets
0    245
1    144
dtype: int64
Processing column: Gym
Gym
0    383
1     29
dtype: int64
Processing column: Handicap_Access
Handicap_Access
0    206
1     43
dtype: int64
Processing column: Dining_Room
Dining_Room
0     60
1    539
dtype: int64
Processing column: Office
Office
0    386
1    213
dtype: int64
Processing column: Service_Bathroom
Service_Bathroom
0    134
1    465
dtype: int64
Processing column: Storage_Room
Storage_Room
0    470
1    129
dtype: int64
Processing column: Cable
Cable
0    175
1    129
dtype: int64
Processing column

  dataset[column] = dataset[column].replace('NA', None)
  dataset[column] = dataset[column].replace('Not specified', None)


In [None]:
test_data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Guest_Bathroom,BBQ_Area,...,Cable,Gym,Jacuzzi,Independent_Entrance,Dining_Room,Total_Area_m2,Heating,District,Daycare,Property_Condition
0,,0,1,0,0,Paralelas,0,0,0,0,...,,0,0,,1,340.0,0,LaMolina,0,
1,,,1,0,,Lineales,,,1,0,...,,0,,,1,1138.0,,LaMolina,0,Bueno
2,,0,0,0,0,Paralelas,0,0,0,0,...,,0,0,,0,1353.0,0,LaMolina,0,Bueno
3,0,1,0,0,0,Paralelas,0,0,0,0,...,0,0,0,,1,230.0,0,SantiagoDeSurco,0,Muy bueno
4,0,,,0,,,,,1,,...,1,,,,1,305.0,,LaMolina,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,,,1,0,,Paralelas,,,0,0,...,,0,,,1,600.0,,LaMolina,0,
653,1,0,1,0,1,Separadas,1,0,1,1,...,1,1,0,0,1,204.0,0,Asia,0,Excelente
654,0,,1,0,,Paralelas,,,1,0,...,0,0,,,1,270.0,,LaMolina,0,
655,0,0,0,0,0,Paralelas,0,0,0,1,...,0,0,0,,1,160.0,0,Asia,0,


In [None]:
# Identify categorical and numerical columns
#columns_object =test_data.select_dtypes(include=['object']).columns.tolist()
print(columns_object)
data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
#test_data.drop(columns=columns_object, inplace=True)

[]


['Garages',
 'Number_Bathrooms',
 'Construction_Area_m2',
 'Age',
 'Number_Floors',
 'Total_Area_m2',
 'Price']

In [None]:
data.columns

Index(['Gas_Connection', 'Fireplace', 'Entrance_Hall', 'Kitchenette',
       'Equipped', 'Garage_Type', 'Furnished', 'Drainage', 'Guest_Bathroom',
       'BBQ_Area', 'Living_Room', 'Nearby_Parks', 'Solarium', 'Commercial_Use',
       'Province', 'Internal_Garden', 'Garages', 'Electricity', 'Patio',
       'Children_Playground', 'Type', 'Number_Bathrooms', 'Green_Areas',
       'Electric_Doorman', 'Construction_Area_m2', 'Intercom', 'Near_Sea',
       'Sauna', 'Cinema_Room', 'Terrace', 'Sports_Area', 'Security_System',
       'Water_Heater', 'Professional_Use', 'Internal_Park', 'Laundry_Room',
       'Nearby_Schools', 'Balcony', 'Attic', 'Oceanfront', 'Security_Guard',
       'Natural_Light', 'Swimming_Pool', 'Electric_Fence', 'Hall',
       'Nearby_Shopping_Centers', 'Water', 'Basement', 'Independent_Bathroom',
       'Walk_in_Closet', 'Age', 'Number_Floors', 'Daily_Dining_Room', 'Grill',
       'Closet', 'Internet', 'Sauna_Area', 'Bedrooms', 'Pets',
       'Kitchen_with_Cabinets', 'Ga

In [None]:
test_data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Guest_Bathroom,BBQ_Area,...,Cable,Gym,Jacuzzi,Independent_Entrance,Dining_Room,Total_Area_m2,Heating,District,Daycare,Property_Condition
0,,0,1,0,0,Paralelas,0,0,0,0,...,,0,0,,1,340.0,0,LaMolina,0,
1,,,1,0,,Lineales,,,1,0,...,,0,,,1,1138.0,,LaMolina,0,Bueno
2,,0,0,0,0,Paralelas,0,0,0,0,...,,0,0,,0,1353.0,0,LaMolina,0,Bueno
3,0,1,0,0,0,Paralelas,0,0,0,0,...,0,0,0,,1,230.0,0,SantiagoDeSurco,0,Muy bueno
4,0,,,0,,,,,1,,...,1,,,,1,305.0,,LaMolina,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,,,1,0,,Paralelas,,,0,0,...,,0,,,1,600.0,,LaMolina,0,
653,1,0,1,0,1,Separadas,1,0,1,1,...,1,1,0,0,1,204.0,0,Asia,0,Excelente
654,0,,1,0,,Paralelas,,,1,0,...,0,0,,,1,270.0,,LaMolina,0,
655,0,0,0,0,0,Paralelas,0,0,0,1,...,0,0,0,,1,160.0,0,Asia,0,


In [None]:
test_data_processed = preprocessor.transform(test_data)
# Predict prices
predicted_prices = best_model.predict(test_data_processed)
print(predicted_prices)
predicted_prices = np.exp(predicted_prices)  # Inverse log transformation to get actual prices
# Add predictions to test_data
test_data['Predicted_Price'] = predicted_prices.flatten()
test_data[['Predicted_Price']].head()

df = pd.DataFrame(predicted_prices, columns=['Price'])
df.index.name = 'Id'
print(df)
df.to_csv('submission.csv', index=True)

[13.611223  13.981842  14.164186  12.841844  13.343918  12.180581
 13.401308  13.92139   12.116114  13.243544  13.358929  11.879754
 13.3287325 14.137474  14.461741  13.502213  13.161041  13.761162
 13.075783  11.582027  13.381571  12.906412  13.804952  13.352725
 12.862575  14.0459    12.507291  13.717907  13.24307   12.551491
 12.605106  13.465185  13.034573  12.767188  12.668474  14.113707
 14.20915   12.242954  12.969327  12.47066   13.018472  12.243499
 12.82226   11.788592  13.816583  13.534219  13.577533  13.71768
 14.2509775 12.175694  14.165533  11.653478  13.934288  14.369823
 11.90917   12.6496    14.528574  14.053922  12.845125  13.042438
 13.6411495 12.925297  11.945521  14.508862  14.272545  14.167368
 13.437114  14.031452  12.992339  13.344766  14.152919  12.680695
 14.2493105 13.001434  13.199447  11.946218  14.033474  14.545697
 12.31591   11.2815485 14.041228  12.498621  13.144303  13.360877
 12.157699  11.834061  13.941112  11.746571  13.952025  13.483288
 13.430888 