In [32]:
import pandas as pd # type: ignore

# Read the CSV file
data = pd.read_csv('train.csv')

# Display the first few rows of the data
data.head()
data.columns = data.columns.str.replace("'", "", regex=False)

In [33]:
def extract_Location(data_set: pd.DataFrame) -> pd.DataFrame:
    data_set['Location'] = data_set['Location'].str.extract(r'([^-\s]+)')
    data_set['Location'].fillna('NA', inplace=True)
    return data_set
data= extract_Location(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Location'].fillna('NA', inplace=True)


In [34]:
def clean_beach_resort(dataset):
    allowed_values = [ 'Nulo' ,'Not specified', '0' , '1']
    dataset['Beach_Resort'] = dataset['Beach_Resort'].apply(lambda x: 'NA' if pd.isna(x) or x in allowed_values else x)
    return dataset

In [35]:
data = clean_beach_resort(data)

In [36]:
def extract_date(data_set: pd.DataFrame) -> pd.DataFrame:
    #data_set['Publication_Date'] = data_set['Publication_Date'].str.extract(r'Publicado el (.+)')
    #data_set['Publication_Date'] = pd.to_datetime(data_set['Publication_Date'], errors='coerce')
    #data_set['Publication_Date'] = data_set['Publication_Date'].fillna(data_set['Publication_Date'].mode()[0])
    #data_set['Publication_Year'] = data_set['Publication_Date'].dt.year
    #data_set['Publication_Month'] = data_set['Publication_Date'].dt.month
    #data_set['Publication_Day'] = data_set['Publication_Date'].dt.day
    data_set.drop(columns='Publication_Date', inplace=True)
    return data_set

In [37]:
data = extract_date(data)

In [38]:
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

In [39]:
data.groupby('Total_Area_m2').size()

Total_Area_m2
0                 43
1                  2
100               31
1000             143
1002               5
                ... 
99                 3
990                4
997                1
999                2
Not specified      1
Length: 1121, dtype: int64

In [40]:
for column in data.columns:
    print('--------------------------------')
    print(data.groupby(column).size())

--------------------------------
Id
0       1
1       1
2       1
3       1
4       1
       ..
6995    1
6996    1
6997    1
6998    1
6999    1
Length: 7000, dtype: int64
--------------------------------
Gas_Connection
0                3175
1                 484
Not specified    3341
dtype: int64
--------------------------------
Fireplace
0                3270
1                1319
Not specified    2411
dtype: int64
--------------------------------
Entrance_Hall
0                1864
1                2695
Not specified    2441
dtype: int64
--------------------------------
Kitchenette
0                6791
Not specified     209
dtype: int64
--------------------------------
Equipped
0                4161
1                 428
Not specified    2411
dtype: int64
--------------------------------
Garage_Type
Lineales          604
NoTiene           835
Not specified    2892
Paralelas        2505
Separadas         164
dtype: int64
--------------------------------
Furnished
0                3

Terrace
0                3120
1                3671
Not specified     209
dtype: int64
--------------------------------
Construction_Area
1.00 m2         4
100.00 m2      50
100.36 m2       1
1000.00 m2     25
10000.00 m2     2
               ..
975.00 m2       1
98.00 m2        7
99.00 m2        1
996.00 m2       1
997.00 m2       2
Length: 1003, dtype: int64
--------------------------------
Sports_Area
0                4003
1                 556
Not specified    2441
dtype: int64
--------------------------------
Security_System
0                2689
1                 970
Not specified    3341
dtype: int64
--------------------------------
Location
Abancay             2
AltoLaran           2
AltoSelvaAlegre     1
Amarilis            1
Ancon              14
                   ..
VillaRica           1
Yanahuara           2
Yarinacocha         2
Yurimaguas          1
Zorritos            5
Length: 155, dtype: int64
--------------------------------
Water_Heater
0                3193
1      

In [41]:
# Remove Construction_Area column, its redundant with construction area
data.drop(columns=['Construction_Area'], inplace=True)


In [42]:
data['Total_Area_m2'] = pd.to_numeric(data['Total_Area_m2'], errors='coerce')

In [43]:
dict_bedrooms = {
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5+': 5
}
def preprocess_data(data_set):
    data_set['Bedrooms'] = data_set['Bedrooms'].apply(lambda x: dict_bedrooms.get(x, x))
    data_set['Bedrooms'].fillna(data_set['Bedrooms'].mode()[0], inplace=True)
    return data_set

In [44]:
data = preprocess_data(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Bedrooms'].fillna(data_set['Bedrooms'].mode()[0], inplace=True)


In [45]:
numeric_columns = ['Number_Floors', 'Age']
def fill_numeric_column(column_name, dataset):
    dataset[column_name] = pd.to_numeric(dataset[column_name], errors='coerce')
    dataset[column_name].fillna(dataset[column_name].mode()[0], inplace=True)
    return dataset

In [46]:

for column in numeric_columns:
    data = fill_numeric_column(column, data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column_name].fillna(dataset[column_name].mode()[0], inplace=True)


In [47]:
data['Beach_Resort']

0            NA
1            NA
2            NA
3            NA
4            NA
         ...   
6995         NA
6996    Del Sol
6997         NA
6998         NA
6999         NA
Name: Beach_Resort, Length: 7000, dtype: object

In [48]:
allowed_values = ['0', '1', 'Nulo', 'Not specified', 'Si', 'No']
def remove_trash_data(column,dataset):
    print('--------------------------------')
    print('Removing trash data from column:', column)
    print(dataset.groupby(column).size())
    dataset = dataset[(dataset[column].isin(allowed_values)) | (dataset[column].isnull())]
    print(dataset.groupby(column).size())
    return dataset

In [49]:
trash_columns = ['Internet', 'Sauna_Area', 'Air_Conditioning', 'Independent_Entrance', 'Jacuzzi', 'Service_Bathroom'
                 ,'Cable', 'Service_Room', 'Internet_Room', 'Daycare', 'Daily_Dining_Room', 'Heating', 'Pets']
for column in trash_columns:
    data = remove_trash_data(column, data)

--------------------------------
Removing trash data from column: Internet
Internet
0                                                                2105
1                                                                1194
Not specified                                                    3160
Panamericana Sur Km 94.5 Asia                                       1
Panamericana Sur Km 94.5 Club Playa Las Arenas Asia                 1
Panamericana Sur Km 94.5 Club Playa Las Arenas Casa B_22 Asia       1
Publicado el 05.09.19                                               1
Publicado el 10.09.19                                               1
Publicado el 12.08.19                                               2
Publicado el 13.04.19                                               5
Publicado el 13.09.19                                               1
Publicado el 14.09.19                                               1
Publicado el 17.09.19                                               6
Public

In [50]:
data = data[data['Age']!= 'Not specified']
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

In [51]:
import numpy as np
def replace_not_specified_with_mode(column, dataset):
    dataset[column] = dataset[column].apply(lambda x: np.nan if (x == 'Not specified' or x == 'Nulo') else 
                                      '1' if (x == '1' or x == 'Si') 
                                      else '0' if (x == '0' or x == 'No') else x)
    #dataset[column+'_missing'] = dataset[column].isna().astype(int)
    #dataset[column].fillna(None, inplace=True)
    dataset[column].fillna('NA', inplace=True)
    dataset[column].replace('NA', None, inplace=True)
    dataset[column] = dataset[column].astype('category')
    print(dataset.groupby(column).size())
    return dataset


In [52]:
# Posibles columnas repetidas en el dataset
# Sauna, Sauna_area,
# Oceanfront, Near_Sea
# Kitchenette, Kitchen_with_Cabinets


columns_replace_with_mode = ['Gas_Connection', 'Fireplace', 'Entrance_Hall', 'Kitchenette',
           'Equipped', 'Furnished', 'Drainage', 'Telephone', 'Guest_Bathroom', 'BBQ_Area',
           'Living_Room', 'Nearby_Parks', 'Solarium', 'Commercial_Use', 'Internal_Garden', 
           'Electricity', 'Patio', 'Children_Playground', 'Green_Areas', 'Electric_Doorman',
           'Intercom', 'Near_Sea', 'Sauna', 'Cinema_Room', 'Cleaning_Service', 'Terrace',
           'Sports_Area', 'Security_System', 'Water_Heater', 'Professional_Use', 'Club_House',
           'Internal_Park', 'Laundry_Room', 'Nearby_Schools', 'Balcony', 'Attic', 'Oceanfront', 'Security_Guard',
           'Swimming_Pool', 'Electric_Fence', 'Air_Conditioning', 'Hall', 'Nearby_Shopping_Centers', 'Kitchen',
           'Water', 'Basement', 'Independent_Bathroom', 'Walk_in_Closet', 'Grill', 'Closet',
           'Internet', 'Sauna_Area', 'Kitchen_with_Cabinets', 'Gym', 'Handicap_Access', 'Dining_Room',
           'Office', 'Service_Bathroom', 'Storage_Room', 'Cable', 'Jacuzzi', 'Independent_Entrance', 'Service_Room',
           'Internet_Room', 'Daycare', 'Daily_Dining_Room', 'Heating', 'Match', 'Pets', 'Garden']
for column in columns_replace_with_mode:
    print(f'Processing column: {column}')
    data = replace_not_specified_with_mode(column, data)

Processing column: Gas_Connection
Gas_Connection
0    3139
1     477
dtype: int64
Processing column: Fireplace
Fireplace
0    3240
1    1311
dtype: int64
Processing column: Entrance_Hall
Entrance_Hall
0    1856
1    2690
dtype: int64
Processing column: Kitchenette
Kitchenette
0    6748
dtype: int64
Processing column: Equipped
Equipped
0    4134
1     417
dtype: int64
Processing column: Furnished
Furnished
0    3738
1     813
dtype: int64
Processing column: Drainage
Drainage
0    3277
1    1274
dtype: int64
Processing column: Telephone
Telephone
0    2352
1    1264
dtype: int64
Processing column: Guest_Bathroom
Guest_Bathroom
0    3167
1    3581
dtype: int64
Processing column: BBQ_Area


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will n

BBQ_Area
0    3745
1     801
dtype: int64
Processing column: Living_Room
Living_Room
0    3834
1    2914
dtype: int64
Processing column: Nearby_Parks
Nearby_Parks
0     672
1    2184
dtype: int64
Processing column: Solarium
Solarium
0    4538
1       8
dtype: int64
Processing column: Commercial_Use
Commercial_Use
0    383
1    384
dtype: int64
Processing column: Internal_Garden
Internal_Garden
0    4551
1    2197
dtype: int64
Processing column: Electricity
Electricity
0     865
1    2751
dtype: int64
Processing column: Patio


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will n

Patio
0    4441
1    2307
dtype: int64
Processing column: Children_Playground
Children_Playground
0    4242
1     304
dtype: int64
Processing column: Green_Areas
Green_Areas
0    3595
1     951
dtype: int64
Processing column: Electric_Doorman
Electric_Doorman
0    3850
1     701
dtype: int64
Processing column: Intercom
Intercom
0    3583
1     968
dtype: int64
Processing column: Near_Sea


  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change i

Near_Sea
0    2464
1     392
dtype: int64
Processing column: Sauna
Sauna
0    4388
1     163
dtype: int64
Processing column: Cinema_Room
Cinema_Room
0    4528
1      18
dtype: int64
Processing column: Cleaning_Service
Cleaning_Service
0    3225
1     391
dtype: int64
Processing column: Terrace
Terrace
0    3081
1    3667
dtype: int64
Processing column: Sports_Area
Sports_Area
0    3994
1     552
dtype: int64
Processing column: Security_System
Security_System
0    2669
1     947
dtype: int64
Processing column: Water_Heater
Water_Heater
0    3182
1     992
dtype: int64
Processing column: Professional_Use
Professional_Use
0    273
1    386
dtype: int64
Processing column: Club_House
Club_House
0    4218
1      32
dtype: int64
Processing column: Internal_Park
Internal_Park
0    4134
1     116
dtype: int64
Processing column: Laundry_Room
Laundry_Room
0    2458
1    3814
dtype: int64
Processing column: Nearby_Schools
Nearby_Schools
0     718
1    1874
dtype: int64
Processing column: Balcony
B

  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change i

Electric_Fence
0    3829
1     345
dtype: int64
Processing column: Air_Conditioning
Air_Conditioning
0    3388
1     786
dtype: int64
Processing column: Hall
Hall
0    2453
1    3819
dtype: int64
Processing column: Nearby_Shopping_Centers
Nearby_Shopping_Centers
0     672
1    1920
dtype: int64
Processing column: Kitchen
Kitchen
0     364
1    5908
dtype: int64
Processing column: Water
Water
0     748
1    2552
dtype: int64
Processing column: Basement
Basement
0    5962
1     310
dtype: int64
Processing column: Independent_Bathroom
Independent_Bathroom
0    6272
dtype: int64
Processing column: Walk_in_Closet
Walk_in_Closet
0    4746
1    1526
dtype: int64
Processing column: Grill
Grill
0    1987
1    2187
dtype: int64
Processing column: Closet
Closet
0    5946
1     326
dtype: int64
Processing column: Internet
Internet
0    2105
1    1194
dtype: int64
Processing column: Sauna_Area
Sauna_Area
0    4183
1      67
dtype: int64
Processing column: Kitchen_with_Cabinets
Kitchen_with_Cabinets

  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change i

In [53]:
data

Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom,Price
0,0,1,,0,0,,Paralelas,,,0,...,,,,,,,,,,210000
1,1,0,1,,0,0,Paralelas,0,0,0,...,1,0,311.0,0,LaMolina,,Bueno,,1,660000
2,2,,,1,0,,Lineales,,,,...,1,1,1400.0,,LaMolina,0,Bueno,0,1,1500000
3,3,0,1,,0,0,Separadas,0,0,0,...,1,0,317.0,0,SanIsidro,,Remodelado,,1,1100000
4,4,0,,,0,,Lineales,,,0,...,0,0,246.0,,VillaMariaDelTriunfo,,Regular,,0,110000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,6995,,,,0,,Not specified,,,,...,1,0,193.0,,LaVictoria,,Bueno,,0,450000
6996,6996,0,0,1,0,0,Not specified,1,0,0,...,1,0,144.0,0,Asia,0,Not specified,0,1,450000
6997,6997,0,0,,0,0,Not specified,0,1,1,...,1,0,396.0,0,SantiagoDeSurco,,Not specified,,1,450000
6998,6998,,,1,0,,Not specified,,,,...,1,0,188.0,,Bellavista,0,Muy bueno,0,1,240000


In [54]:
def clean_categorical_columns(dataset):
    categorical_cols = ['Garage_Type', 'Province', 'Type', 'Location', 'Natural_Light', 'Advertiser', 'Bedrooms', 'Pets', 'Garden', 'Beach_Resort', 'District', 'Property_Condition']
    for column in categorical_cols:
        print(f'Processing categorical column: {column}')
        dataset[column] = dataset[column].astype('category')
        #dataset[column].cat.add_categories(None)
        dataset[column] = dataset[column].replace('NA', None)
        dataset[column] = dataset[column].replace('Not specified', None)
        
        #dataset[column] = dataset[column].fillna(dataset[column].mode()[0])
        #dataset[column] = dataset[column].fillna(None)

    return dataset


In [55]:
data = clean_categorical_columns(data)

Processing categorical column: Garage_Type
Processing categorical column: Province
Processing categorical column: Type
Processing categorical column: Location
Processing categorical column: Natural_Light
Processing categorical column: Advertiser
Processing categorical column: Bedrooms
Processing categorical column: Pets
Processing categorical column: Garden
Processing categorical column: Beach_Resort
Processing categorical column: District
Processing categorical column: Property_Condition


  dataset[column] = dataset[column].replace('Not specified', None)
  dataset[column] = dataset[column].replace('NA', None)
  dataset[column] = dataset[column].replace('Not specified', None)
  dataset[column] = dataset[column].replace('NA', None)
  dataset[column] = dataset[column].replace('Not specified', None)


In [56]:
# Drop unique columns
for column in data.columns:
    if len(data[column].unique()) == len(data):
        print(f'Dropping unique column: {column}')
        data.drop(columns=[column], inplace=True)
    elif len(data[column].unique()) == 1:
        print(f'Dropping column with one value: {column}')
        data.drop(columns=[column], inplace=True)

Dropping unique column: Id


In [57]:
columns_delete = ['Advertiser', 'Cleaning_Service', 'Telephone', 'Internet_Room', 'Service_Bathroom', 'Service_Room', 'Handicap_Access', 'Office', 'Club_House', 'Kitchen', 'Air_Conditioning']

In [58]:
# Identify categorical and numerical columns
columns_object =data.select_dtypes(include=['object']).columns.tolist()
print(columns_object)
data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
#data.drop(columns=columns_object, inplace=True)

[]


['Garages',
 'Number_Bathrooms',
 'Construction_Area_m2',
 'Age',
 'Number_Floors',
 'Total_Area_m2',
 'Price']

In [59]:
data.select_dtypes(include=['int64', 'float64']).columns.tolist()

['Garages',
 'Number_Bathrooms',
 'Construction_Area_m2',
 'Age',
 'Number_Floors',
 'Total_Area_m2',
 'Price']

In [60]:
import numpy as np
def replace_not_specified_with_mediam(column, dataset):
    dataset[column] = dataset[column].apply(lambda x: np.nan if (x == 'Not specified' or x == 'Nulo') else x)
    dataset[column].fillna(dataset[column].median(), inplace=True)
    return dataset

In [61]:
data = replace_not_specified_with_mediam('Total_Area_m2', data)
#data.drop(columns='Beach_Resort', inplace=True)
data.drop_duplicates(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].median(), inplace=True)


In [62]:
data.drop(columns=columns_delete, inplace=True)

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Select features and target
X = data.drop(columns=['Price'])
y = data['Price']
y = np.log(y)  # Log-transform the target variable for better performance

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['category']).columns.tolist()
numerical_cols = ['Garages', 'Number_Bathrooms', 'Age', 'Number_Floors']
log_cols = ['Construction_Area_m2', 'Total_Area_m2']
boolean_cols = X.select_dtypes(include=['bool']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('bool', OneHotEncoder() ,boolean_cols),
        ('num', StandardScaler(), numerical_cols),
        ('log', FunctionTransformer(np.log), log_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit and transform data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Build neural network
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train_processed.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)


optimizer = Adam(clipvalue=0.001)  # Clip gradients to a value between -1 and 1
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# Train model
history = model.fit(X_train_processed, y_train, epochs=150, batch_size=64, validation_split=0.1, callbacks=[early_stop], verbose=1)

# Evaluate model
loss, mae = model.evaluate(X_test_processed, y_test, verbose=0)
print(f"Test MAE: {mae:.2f}")

2025-07-13 01:58:49.918444: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-07-13 01:58:53.243068: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - loss: 17.2842 - mae: 2.5198 - val_loss: 0.3985 - val_mae: 0.4797
Epoch 2/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.4489 - mae: 0.5446 - val_loss: 0.2026 - val_mae: 0.3351
Epoch 3/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.1983 - mae: 0.3373 - val_loss: 0.1997 - val_mae: 0.3235
Epoch 4/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.2104 - mae: 0.3524 - val_loss: 0.2364 - val_mae: 0.3749
Epoch 5/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.2415 - mae: 0.3888 - val_loss: 0.4811 - val_mae: 0.5899
Epoch 6/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.3791 - mae: 0.5133 - val_loss: 0.2913 - val_mae: 0.4347
Epoch 7/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss

In [64]:
'''
import tensorflow as tf
from tensorflow.keras import layers, models
from xgboost import XGBRegressor

input_dim = X_train_processed.shape[1]
encoding_dim = 20  # compress to 4 features

# Build autoencoder model
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(8, activation='relu')(input_layer)
encoded = layers.Dense(encoding_dim, activation='relu')(encoded)  # bottleneck

decoded = layers.Dense(8, activation='relu')(encoded)
decoded = layers.Dense(input_dim, activation='linear')(decoded)

autoencoder = models.Model(inputs=input_layer, outputs=decoded)

# Compile and train
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_processed, X_train_processed, epochs=100, batch_size=32, shuffle=True, validation_data=(X_test_processed, X_test_processed), verbose=0)
'''

"\nimport tensorflow as tf\nfrom tensorflow.keras import layers, models\nfrom xgboost import XGBRegressor\n\ninput_dim = X_train_processed.shape[1]\nencoding_dim = 20  # compress to 4 features\n\n# Build autoencoder model\ninput_layer = layers.Input(shape=(input_dim,))\nencoded = layers.Dense(8, activation='relu')(input_layer)\nencoded = layers.Dense(encoding_dim, activation='relu')(encoded)  # bottleneck\n\ndecoded = layers.Dense(8, activation='relu')(encoded)\ndecoded = layers.Dense(input_dim, activation='linear')(decoded)\n\nautoencoder = models.Model(inputs=input_layer, outputs=decoded)\n\n# Compile and train\nautoencoder.compile(optimizer='adam', loss='mse')\nautoencoder.fit(X_train_processed, X_train_processed, epochs=100, batch_size=32, shuffle=True, validation_data=(X_test_processed, X_test_processed), verbose=0)\n"

In [65]:
'''
encoder = models.Model(inputs=input_layer, outputs=encoded)

# Encode training and test data
X_train_encoded = encoder.predict(X_train_processed)
X_test_encoded = encoder.predict(X_test_processed)
'''

'\nencoder = models.Model(inputs=input_layer, outputs=encoded)\n\n# Encode training and test data\nX_train_encoded = encoder.predict(X_train_processed)\nX_test_encoded = encoder.predict(X_test_processed)\n'

In [66]:

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
# Create the model
mlp = MLPRegressor(hidden_layer_sizes=(50, 30), max_iter=10000, random_state=1)

# Train the model
mlp.fit(X_train_processed, y_train)
y_pred = mlp.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.3039309747903107
R2 score on test set: 0.7437


In [67]:
import numpy as np
from sklearn.linear_model import Lasso

# Create Lasso model with regularization strength (alpha)
lasso = Lasso(alpha=0.1)

# Fit to training data
lasso.fit(X_train_processed, y_train)

y_pred = lasso.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.4154630389227033
R2 score on test set: 0.5901


In [68]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=151
)

# Train the model
xgb_model.fit(X_train_processed, y_train)

y_pred = xgb_model.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.25619137055947133
R2 score on test set: 0.8263


In [69]:
'''
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Crear el clasificador XGBoost
xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'min_child_weight': [1, 5, 10]
}
# Realizar GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)

# Ajustar el modelo
grid_search.fit(X_train_processed, y_train)

# Obtener los mejores parámetros
best_params = grid_search.best_params_
print(f"Mejores parámetros: {best_params}")
# Predecir con el mejor modelo
best_model = grid_search.best_estimator_

# Predicciones
y_pred = best_model.predict(X_test_processed)

# Evaluar el rendimiento
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
'''

'\nimport xgboost as xgb\nfrom sklearn.model_selection import GridSearchCV\nfrom sklearn.metrics import mean_squared_error\n\n# Crear el clasificador XGBoost\nxgb_model = xgb.XGBRegressor(objective="reg:squarederror")\nparam_grid = {\n    \'n_estimators\': [50, 100, 200, 300, 400, 500],\n    \'max_depth\': [3, 6, 10],\n    \'learning_rate\': [0.01, 0.05, 0.1],\n    \'subsample\': [0.7, 0.8, 1.0],\n    \'colsample_bytree\': [0.7, 0.8, 1.0],\n    \'min_child_weight\': [1, 5, 10]\n}\n# Realizar GridSearchCV\ngrid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring=\'neg_mean_squared_error\', verbose=1)\n\n# Ajustar el modelo\ngrid_search.fit(X_train_processed, y_train)\n\n# Obtener los mejores parámetros\nbest_params = grid_search.best_params_\nprint(f"Mejores parámetros: {best_params}")\n# Predecir con el mejor modelo\nbest_model = grid_search.best_estimator_\n\n# Predicciones\ny_pred = best_model.predict(X_test_processed)\n\n# Evaluar el rendimient

In [70]:
'''import numpy as np
import pandas as pd
from xgboost import XGBRegressor

xgb_model_best = XGBRegressor(
    colsample_bytree=0.7,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1, 
    n_estimators=500,
    subsample=0.7
)

# Train the model
xgb_model_best.fit(X_train_processed, y_train)

y_pred = xgb_model_best.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")
'''

'import numpy as np\nimport pandas as pd\nfrom xgboost import XGBRegressor\n\nxgb_model_best = XGBRegressor(\n    colsample_bytree=0.7,\n    learning_rate=0.05,\n    max_depth=6,\n    min_child_weight=1, \n    n_estimators=500,\n    subsample=0.7\n)\n\n# Train the model\nxgb_model_best.fit(X_train_processed, y_train)\n\ny_pred = xgb_model_best.predict(X_test_processed).flatten()\n\n# Calculate R2 score\nr2 = r2_score(y_test, y_pred)\nmae = mean_absolute_error(y_test, y_pred)\nprint(f"MAE: {mae}")\nprint(f"R2 score on test set: {r2:.4f}")\n'

In [71]:
from sklearn.metrics import r2_score

# Predict on test set
y_pred = model.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
MAE: 0.2850818142201014
R2 score on test set: 0.7847


In [72]:
test_data = pd.read_csv('test.csv')
test_data.columns = test_data.columns.str.replace("'", "", regex=False)
test_data.head()



Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,0,Not specified,0,1,0,0,Paralelas,0,0,Not specified,...,Not specified,1,1,340,0,LaMolina,0,Not specified,0,1
1,1,Not specified,Not specified,1,0,Not specified,Lineales,Not specified,Not specified,Not specified,...,Not specified,1,0,1138,Not specified,LaMolina,0,Bueno,0,1
2,2,Not specified,0,0,0,0,Paralelas,0,0,Not specified,...,Not specified,0,1,1353,0,LaMolina,0,Bueno,0,1
3,3,0,1,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,230,0,SantiagoDeSurco,0,Muy bueno,0,1
4,4,0,Not specified,Not specified,0,Not specified,Not specified,Not specified,Not specified,0,...,Not specified,1,0,305,Not specified,LaMolina,Not specified,Not specified,Not specified,1


In [73]:
test_data

Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,0,Not specified,0,1,0,0,Paralelas,0,0,Not specified,...,Not specified,1,1,340,0,LaMolina,0,Not specified,0,1
1,1,Not specified,Not specified,1,0,Not specified,Lineales,Not specified,Not specified,Not specified,...,Not specified,1,0,1138,Not specified,LaMolina,0,Bueno,0,1
2,2,Not specified,0,0,0,0,Paralelas,0,0,Not specified,...,Not specified,0,1,1353,0,LaMolina,0,Bueno,0,1
3,3,0,1,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,230,0,SantiagoDeSurco,0,Muy bueno,0,1
4,4,0,Not specified,Not specified,0,Not specified,Not specified,Not specified,Not specified,0,...,Not specified,1,0,305,Not specified,LaMolina,Not specified,Not specified,Not specified,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,652,Not specified,Not specified,1,0,Not specified,Paralelas,Not specified,Not specified,Not specified,...,Not specified,1,1,600,Not specified,LaMolina,0,Not specified,0,1
653,653,1,0,1,0,1,Separadas,1,0,0,...,0,1,0,204,0,Asia,0,Excelente,0,1
654,654,0,Not specified,1,0,Not specified,Paralelas,Not specified,Not specified,0,...,Not specified,1,1,270,Not specified,LaMolina,0,Not specified,0,1
655,655,0,0,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,160,0,Asia,0,Not specified,0,1


In [74]:
# Remove Construction_Area column, its redundant with construction area
#test_data.drop(columns=['Beach:Resort'], inplace=True)
test_data = extract_date(test_data)
test_data= extract_Location(test_data)
test_data = clean_beach_resort(test_data)
test_data.drop(columns=['Construction_Area'], inplace=True)
test_data['Total_Area_m2'] = pd.to_numeric(test_data['Total_Area_m2'], errors='coerce')
test_data = replace_not_specified_with_mediam('Total_Area_m2', test_data)
test_data = preprocess_data(test_data)
for column in numeric_columns:
    test_data = fill_numeric_column(column, test_data)

test_data['Age'] = pd.to_numeric(test_data['Age'], errors='coerce')

for column in trash_columns:
    test_data = remove_trash_data(column, test_data)
for column in columns_replace_with_mode:
    print(f'Processing column: {column}')
    test_data = replace_not_specified_with_mode(column, test_data)

test_data = clean_categorical_columns(test_data)
    
# Drop unique columns
for column in test_data.columns:
    if len(test_data[column].unique()) == len(test_data):
        print(f'Dropping unique column: {column}')
        test_data.drop(columns=[column], inplace=True)
    elif len(test_data[column].unique()) == 1:
        print(f'Dropping column with one value: {column}')
        test_data.drop(columns=[column], inplace=True)

test_data.drop(columns=columns_object, inplace=True)
test_data.drop(columns=columns_delete, inplace=True)
#test_data.drop(columns='Beach_Resort', inplace=True)
test_data.info()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Location'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

--------------------------------
Removing trash data from column: Internet
Internet
0                192
1                112
Not specified    309
dtype: int64
Internet
0                192
1                112
Not specified    309
dtype: int64
--------------------------------
Removing trash data from column: Sauna_Area
Sauna_Area
0                404
1                  8
Not specified    201
dtype: int64
Sauna_Area
0                404
1                  8
Not specified    201
dtype: int64
--------------------------------
Removing trash data from column: Air_Conditioning
Air_Conditioning
0                317
1                 72
Not specified    224
dtype: int64
Air_Conditioning
0                317
1                 72
Not specified    224
dtype: int64
--------------------------------
Removing trash data from column: Independent_Entrance
Independent_Entrance
0                249
Not specified    364
dtype: int64
Independent_Entrance
0                249
Not specified    364
dtype: in

  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change i

Near_Sea
0    239
1     36
dtype: int64
Processing column: Sauna
Sauna
0    409
1     15
dtype: int64
Processing column: Cinema_Room
Cinema_Room
0    434
1      4
dtype: int64
Processing column: Cleaning_Service
Cleaning_Service
0    299
1     37
dtype: int64
Processing column: Terrace
Terrace
0    304
1    339
dtype: int64
Processing column: Sports_Area
Sports_Area
0    383
1     55
dtype: int64
Processing column: Security_System
Security_System
0    251
1     85
dtype: int64
Processing column: Water_Heater
Water_Heater
0    289
1    100
dtype: int64
Processing column: Professional_Use
Professional_Use
0    21
1    37
dtype: int64
Processing column: Club_House
Club_House
0    410
1      2
dtype: int64
Processing column: Internal_Park
Internal_Park
0    396
1     16
dtype: int64
Processing column: Laundry_Room
Laundry_Room
0    225
1    374
dtype: int64
Processing column: Nearby_Schools
Nearby_Schools
0     65
1    184
dtype: int64
Processing column: Balcony
Balcony
0    520
1     79
d

  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna('NA', inplace=True)
  print(dataset.groupby(column).size())
The behavior will change i

In [75]:
test_data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Guest_Bathroom,BBQ_Area,...,Cable,Gym,Jacuzzi,Independent_Entrance,Dining_Room,Total_Area_m2,Heating,District,Daycare,Property_Condition
0,,0,1,0,0,Paralelas,0,0,0,0,...,,0,0,,1,340.0,0,LaMolina,0,
1,,,1,0,,Lineales,,,1,0,...,,0,,,1,1138.0,,LaMolina,0,Bueno
2,,0,0,0,0,Paralelas,0,0,0,0,...,,0,0,,0,1353.0,0,LaMolina,0,Bueno
3,0,1,0,0,0,Paralelas,0,0,0,0,...,0,0,0,,1,230.0,0,SantiagoDeSurco,0,Muy bueno
4,0,,,0,,,,,1,,...,1,,,,1,305.0,,LaMolina,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,,,1,0,,Paralelas,,,0,0,...,,0,,,1,600.0,,LaMolina,0,
653,1,0,1,0,1,Separadas,1,0,1,1,...,1,1,0,0,1,204.0,0,Asia,0,Excelente
654,0,,1,0,,Paralelas,,,1,0,...,0,0,,,1,270.0,,LaMolina,0,
655,0,0,0,0,0,Paralelas,0,0,0,1,...,0,0,0,,1,160.0,0,Asia,0,


In [76]:
# Identify categorical and numerical columns
#columns_object =test_data.select_dtypes(include=['object']).columns.tolist()
print(columns_object)
data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
#test_data.drop(columns=columns_object, inplace=True)

[]


['Garages',
 'Number_Bathrooms',
 'Construction_Area_m2',
 'Age',
 'Number_Floors',
 'Total_Area_m2',
 'Price']

In [77]:
data.columns

Index(['Gas_Connection', 'Fireplace', 'Entrance_Hall', 'Kitchenette',
       'Equipped', 'Garage_Type', 'Furnished', 'Drainage', 'Guest_Bathroom',
       'BBQ_Area', 'Living_Room', 'Nearby_Parks', 'Solarium', 'Commercial_Use',
       'Province', 'Internal_Garden', 'Garages', 'Electricity', 'Patio',
       'Children_Playground', 'Type', 'Number_Bathrooms', 'Green_Areas',
       'Electric_Doorman', 'Construction_Area_m2', 'Intercom', 'Near_Sea',
       'Sauna', 'Cinema_Room', 'Terrace', 'Sports_Area', 'Security_System',
       'Location', 'Water_Heater', 'Professional_Use', 'Match',
       'Internal_Park', 'Laundry_Room', 'Nearby_Schools', 'Balcony', 'Attic',
       'Oceanfront', 'Security_Guard', 'Natural_Light', 'Swimming_Pool',
       'Electric_Fence', 'Hall', 'Nearby_Shopping_Centers', 'Water',
       'Basement', 'Independent_Bathroom', 'Walk_in_Closet', 'Age',
       'Number_Floors', 'Daily_Dining_Room', 'Grill', 'Closet', 'Internet',
       'Sauna_Area', 'Bedrooms', 'Pets', 'Kitche

In [78]:
test_data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Guest_Bathroom,BBQ_Area,...,Cable,Gym,Jacuzzi,Independent_Entrance,Dining_Room,Total_Area_m2,Heating,District,Daycare,Property_Condition
0,,0,1,0,0,Paralelas,0,0,0,0,...,,0,0,,1,340.0,0,LaMolina,0,
1,,,1,0,,Lineales,,,1,0,...,,0,,,1,1138.0,,LaMolina,0,Bueno
2,,0,0,0,0,Paralelas,0,0,0,0,...,,0,0,,0,1353.0,0,LaMolina,0,Bueno
3,0,1,0,0,0,Paralelas,0,0,0,0,...,0,0,0,,1,230.0,0,SantiagoDeSurco,0,Muy bueno
4,0,,,0,,,,,1,,...,1,,,,1,305.0,,LaMolina,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,,,1,0,,Paralelas,,,0,0,...,,0,,,1,600.0,,LaMolina,0,
653,1,0,1,0,1,Separadas,1,0,1,1,...,1,1,0,0,1,204.0,0,Asia,0,Excelente
654,0,,1,0,,Paralelas,,,1,0,...,0,0,,,1,270.0,,LaMolina,0,
655,0,0,0,0,0,Paralelas,0,0,0,1,...,0,0,0,,1,160.0,0,Asia,0,


In [79]:
test_data_processed = preprocessor.transform(test_data)
# Predict prices
predicted_prices = xgb_model.predict(test_data_processed)
print(predicted_prices)
predicted_prices = np.exp(predicted_prices)  # Inverse log transformation to get actual prices
# Add predictions to test_data
test_data['Predicted_Price'] = predicted_prices.flatten()
test_data[['Predicted_Price']].head()

df = pd.DataFrame(predicted_prices, columns=['Price'])
df.index.name = 'Id'
print(df)
df.to_csv('submission.csv', index=True)

[13.6233635 14.023572  14.111133  12.874019  13.282793  12.123303
 13.584569  13.788716  12.117397  13.420574  13.27017   12.0725155
 13.496417  14.220894  14.451902  13.583208  12.998554  13.739983
 13.093013  11.555389  13.325868  12.820644  13.792844  13.23438
 13.013732  13.991399  12.619367  13.836592  13.151523  12.77654
 12.355902  13.471997  13.143517  12.771487  12.576541  13.985699
 14.265073  12.335247  13.461382  12.340057  12.867732  12.1821375
 12.812329  11.590152  13.92595   13.424034  13.458667  13.833288
 14.203641  12.215102  14.094231  11.690682  14.014963  14.656616
 12.2600355 12.631184  14.533741  14.051443  12.667941  12.975903
 13.6344185 12.832557  12.343364  14.537442  14.193785  13.919424
 13.314518  13.889707  12.997515  13.187395  14.226854  12.841742
 14.214844  12.8710375 13.292558  11.906313  14.009942  14.598205
 12.210101  11.926743  14.068445  12.558566  13.142048  13.331405
 11.977971  11.656195  13.949084  11.808423  13.9032545 13.337985
 13.242193