In [1]:
import pandas as pd # type: ignore

# Read the CSV file
data = pd.read_csv('train.csv')

# Display the first few rows of the data
data.head()
data.columns = data.columns.str.replace("'", "", regex=False)

In [2]:
def extract_Location(data_set: pd.DataFrame) -> pd.DataFrame:
    data_set['Location'] = data_set['Location'].str.extract(r'([^-\s]+)')
    data_set['Location'].fillna('NA', inplace=True)
    return data_set
data= extract_Location(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Location'].fillna('NA', inplace=True)


In [3]:
def clean_beach_resort(dataset):
    allowed_values = [ 'Nulo' ,'Not specified', '0' , '1']
    dataset['Beach_Resort'] = dataset['Beach_Resort'].apply(lambda x: 'NA' if pd.isna(x) or x in allowed_values else x)
    return dataset

In [4]:
data = clean_beach_resort(data)

In [5]:
def extract_date(data_set: pd.DataFrame) -> pd.DataFrame:
    #data_set['Publication_Date'] = data_set['Publication_Date'].str.extract(r'Publicado el (.+)')
    #data_set['Publication_Date'] = pd.to_datetime(data_set['Publication_Date'], errors='coerce')
    #data_set['Publication_Date'] = data_set['Publication_Date'].fillna(data_set['Publication_Date'].mode()[0])
    #data_set['Publication_Year'] = data_set['Publication_Date'].dt.year
    #data_set['Publication_Month'] = data_set['Publication_Date'].dt.month
    #data_set['Publication_Day'] = data_set['Publication_Date'].dt.day
    data_set.drop(columns='Publication_Date', inplace=True)
    return data_set

In [6]:
data = extract_date(data)

In [7]:
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

In [8]:
data.groupby('Total_Area_m2').size()

Total_Area_m2
0                 43
1                  2
100               31
1000             143
1002               5
                ... 
99                 3
990                4
997                1
999                2
Not specified      1
Length: 1121, dtype: int64

In [9]:
for column in data.columns:
    print('--------------------------------')
    print(data.groupby(column).size())

--------------------------------
Id
0       1
1       1
2       1
3       1
4       1
       ..
6995    1
6996    1
6997    1
6998    1
6999    1
Length: 7000, dtype: int64
--------------------------------
Gas_Connection
0                3175
1                 484
Not specified    3341
dtype: int64
--------------------------------
Fireplace
0                3270
1                1319
Not specified    2411
dtype: int64
--------------------------------
Entrance_Hall
0                1864
1                2695
Not specified    2441
dtype: int64
--------------------------------
Kitchenette
0                6791
Not specified     209
dtype: int64
--------------------------------
Equipped
0                4161
1                 428
Not specified    2411
dtype: int64
--------------------------------
Garage_Type
Lineales          604
NoTiene           835
Not specified    2892
Paralelas        2505
Separadas         164
dtype: int64
--------------------------------
Furnished
0                3

In [10]:
# Remove Construction_Area column, its redundant with construction area
data.drop(columns=['Construction_Area'], inplace=True)


In [11]:
data['Total_Area_m2'] = pd.to_numeric(data['Total_Area_m2'], errors='coerce')

In [12]:
dict_bedrooms = {
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5+': 5
}
def preprocess_data(data_set):
    data_set['Bedrooms'] = data_set['Bedrooms'].apply(lambda x: dict_bedrooms.get(x, x))
    data_set['Bedrooms'].fillna(data_set['Bedrooms'].mode()[0], inplace=True)
    return data_set

In [13]:
data = preprocess_data(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Bedrooms'].fillna(data_set['Bedrooms'].mode()[0], inplace=True)


In [14]:
numeric_columns = ['Number_Floors', 'Age']
def fill_numeric_column(column_name, dataset):
    dataset[column_name] = pd.to_numeric(dataset[column_name], errors='coerce')
    dataset[column_name].fillna(dataset[column_name].mode()[0], inplace=True)
    return dataset

In [15]:

for column in numeric_columns:
    data = fill_numeric_column(column, data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column_name].fillna(dataset[column_name].mode()[0], inplace=True)


In [16]:
data['Beach_Resort']

0            NA
1            NA
2            NA
3            NA
4            NA
         ...   
6995         NA
6996    Del Sol
6997         NA
6998         NA
6999         NA
Name: Beach_Resort, Length: 7000, dtype: object

In [17]:
allowed_values = ['0', '1', 'Nulo', 'Not specified', 'Si', 'No']
def remove_trash_data(column,dataset):
    print('--------------------------------')
    print('Removing trash data from column:', column)
    print(dataset.groupby(column).size())
    dataset = dataset[(dataset[column].isin(allowed_values)) | (dataset[column].isnull())]
    print(dataset.groupby(column).size())
    return dataset

In [18]:
trash_columns = ['Internet', 'Sauna_Area', 'Air_Conditioning', 'Independent_Entrance', 'Jacuzzi', 'Service_Bathroom'
                 ,'Cable', 'Service_Room', 'Internet_Room', 'Daycare', 'Daily_Dining_Room', 'Heating', 'Pets']
for column in trash_columns:
    data = remove_trash_data(column, data)

--------------------------------
Removing trash data from column: Internet
Internet
0                                                                2105
1                                                                1194
Not specified                                                    3160
Panamericana Sur Km 94.5 Asia                                       1
Panamericana Sur Km 94.5 Club Playa Las Arenas Asia                 1
Panamericana Sur Km 94.5 Club Playa Las Arenas Casa B_22 Asia       1
Publicado el 05.09.19                                               1
Publicado el 10.09.19                                               1
Publicado el 12.08.19                                               2
Publicado el 13.04.19                                               5
Publicado el 13.09.19                                               1
Publicado el 14.09.19                                               1
Publicado el 17.09.19                                               6
Public

In [19]:
data = data[data['Age']!= 'Not specified']
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

In [20]:
import numpy as np
def replace_not_specified_with_mode(column, dataset):
    dataset[column] = dataset[column].apply(lambda x: np.nan if (x == 'Not specified' or x == 'Nulo') else 
                                      True if (x == '1' or x == 'Si') 
                                      else False if (x == '0' or x == 'No') else x)
    #dataset[column+'_missing'] = dataset[column].isna().astype(int)
    dataset[column].fillna(dataset[column].mode()[0], inplace=True)
    #dataset[column].fillna(False, inplace=True)

    print(dataset.groupby(column).size())
    return dataset


In [21]:
# Posibles columnas repetidas en el dataset
# Sauna, Sauna_area,
# Oceanfront, Near_Sea
# Kitchenette, Kitchen_with_Cabinets


columns_replace_with_mode = ['Gas_Connection', 'Fireplace', 'Entrance_Hall', 'Kitchenette',
           'Equipped', 'Furnished', 'Drainage', 'Telephone', 'Guest_Bathroom', 'BBQ_Area',
           'Living_Room', 'Nearby_Parks', 'Solarium', 'Commercial_Use', 'Internal_Garden', 
           'Electricity', 'Patio', 'Children_Playground', 'Green_Areas', 'Electric_Doorman',
           'Intercom', 'Near_Sea', 'Sauna', 'Cinema_Room', 'Cleaning_Service', 'Terrace',
           'Sports_Area', 'Security_System', 'Water_Heater', 'Professional_Use', 'Club_House',
           'Internal_Park', 'Laundry_Room', 'Nearby_Schools', 'Balcony', 'Attic', 'Oceanfront', 'Security_Guard',
           'Swimming_Pool', 'Electric_Fence', 'Air_Conditioning', 'Hall', 'Nearby_Shopping_Centers', 'Kitchen',
           'Water', 'Basement', 'Independent_Bathroom', 'Walk_in_Closet', 'Grill', 'Closet',
           'Internet', 'Sauna_Area', 'Kitchen_with_Cabinets', 'Gym', 'Handicap_Access', 'Dining_Room',
           'Office', 'Service_Bathroom', 'Storage_Room', 'Cable', 'Jacuzzi', 'Independent_Entrance', 'Service_Room',
           'Internet_Room', 'Daycare', 'Daily_Dining_Room', 'Heating', 'Match', 'Pets', 'Garden']
for column in columns_replace_with_mode:
    print(f'Processing column: {column}')
    data = replace_not_specified_with_mode(column, data)

Processing column: Gas_Connection
Gas_Connection
False    6479
True      477
dtype: int64
Processing column: Fireplace
Fireplace
False    5645
True     1311
dtype: int64
Processing column: Entrance_Hall
Entrance_Hall
False    1856
True     5100
dtype: int64
Processing column: Kitchenette
Kitchenette
False    6956
dtype: int64
Processing column: Equipped
Equipped
False    6539
True      417
dtype: int64
Processing column: Furnished
Furnished
False    6143
True      813
dtype: int64
Processing column: Drainage
Drainage
False    5682
True     1274
dtype: int64
Processing column: Telephone
Telephone
False    5692
True     1264
dtype: int64
Processing column: Guest_Bathroom
Guest_Bathroom
False    3167
True     3789
dtype: int64
Processing column: BBQ_Area
BBQ_Area
False    6155
True      801
dtype: int64
Processing column: Living_Room
Living_Room
False    4042
True     2914
dtype: int64
Processing column: Nearby_Parks
Nearby_Parks
False     672
True     6284
dtype: int64
Processing column:

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

Commercial_Use
False     383
True     6573
dtype: int64
Processing column: Internal_Garden
Internal_Garden
False    4759
True     2197
dtype: int64
Processing column: Electricity
Electricity
False     865
True     6091
dtype: int64
Processing column: Patio
Patio
False    4649
True     2307
dtype: int64
Processing column: Children_Playground
Children_Playground
False    6652
True      304
dtype: int64
Processing column: Green_Areas
Green_Areas
False    6005
True      951
dtype: int64
Processing column: Electric_Doorman
Electric_Doorman
False    6255
True      701
dtype: int64
Processing column: Intercom
Intercom
False    5988
True      968
dtype: int64
Processing column: Near_Sea
Near_Sea
False    6564
True      392
dtype: int64
Processing column: Sauna
Sauna
False    6793
True      163
dtype: int64
Processing column: Cinema_Room
Cinema_Room
False    6938
True       18
dtype: int64
Processing column: Cleaning_Service
Cleaning_Service
False    6565
True      391
dtype: int64
Processing c

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

Water
False     748
True     6208
dtype: int64
Processing column: Basement
Basement
False    6646
True      310
dtype: int64
Processing column: Independent_Bathroom
Independent_Bathroom
False    6956
dtype: int64
Processing column: Walk_in_Closet
Walk_in_Closet
False    5430
True     1526
dtype: int64
Processing column: Grill
Grill
False    1987
True     4969
dtype: int64
Processing column: Closet
Closet
False    6630
True      326
dtype: int64
Processing column: Internet
Internet
False    5762
True     1194
dtype: int64
Processing column: Sauna_Area
Sauna_Area
False    6889
True       67
dtype: int64
Processing column: Kitchen_with_Cabinets
Kitchen_with_Cabinets
False    5494
True     1462
dtype: int64
Processing column: Gym
Gym
False    6617
True      339
dtype: int64
Processing column: Handicap_Access
Handicap_Access
False    6551
True      405
dtype: int64
Processing column: Dining_Room
Dining_Room
False     531
True     6425
dtype: int64
Processing column: Office
Office
False    4

  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[colum

Independent_Entrance
False    6956
dtype: int64
Processing column: Service_Room
Service_Room
False    1381
True     5575
dtype: int64
Processing column: Internet_Room
Internet_Room
False    6933
True       23
dtype: int64
Processing column: Daycare
Daycare
False    6936
True       20
dtype: int64
Processing column: Daily_Dining_Room
Daily_Dining_Room
False    2096
True     4860
dtype: int64
Processing column: Heating
Heating
False    6908
True       48
dtype: int64
Processing column: Match
Match
0.0    6956
dtype: int64
Processing column: Pets
Pets
False     223
True     6733
dtype: int64
Processing column: Garden
Garden
False     896
True     6060
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

In [22]:
data

Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom,Price
0,0,True,False,False,False,False,Paralelas,False,False,False,...,True,False,,False,,False,,False,True,210000
1,1,False,True,True,False,False,Paralelas,False,False,False,...,True,False,311.0,False,LaMolina,False,Bueno,False,True,660000
2,2,False,False,True,False,False,Lineales,False,False,False,...,True,True,1400.0,False,LaMolina,False,Bueno,False,True,1500000
3,3,False,True,True,False,False,Separadas,False,False,False,...,True,False,317.0,False,SanIsidro,False,Remodelado,False,True,1100000
4,4,False,False,True,False,False,Lineales,False,False,False,...,False,False,246.0,False,VillaMariaDelTriunfo,False,Regular,False,False,110000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,6995,False,False,True,False,False,Not specified,False,False,False,...,True,False,193.0,False,LaVictoria,False,Bueno,False,False,450000
6996,6996,False,False,True,False,False,Not specified,True,False,False,...,True,False,144.0,False,Asia,False,Not specified,False,True,450000
6997,6997,False,False,True,False,False,Not specified,False,True,True,...,True,False,396.0,False,SantiagoDeSurco,False,Not specified,False,True,450000
6998,6998,False,False,True,False,False,Not specified,False,False,False,...,True,False,188.0,False,Bellavista,False,Muy bueno,False,True,240000


In [23]:
def clean_categorical_columns(dataset):
    categorical_cols = ['Garage_Type', 'Province', 'Type', 'Location', 'Natural_Light', 'Advertiser', 'Bedrooms', 'Pets', 'Garden', 'Beach_Resort', 'District', 'Property_Condition']
    for column in categorical_cols:
        print(f'Processing categorical column: {column}')
        dataset[column] = dataset[column].astype('category')
        #dataset[column].cat.add_categories(None)
        dataset[column] = dataset[column].replace('NA', None)
        dataset[column] = dataset[column].replace('Not specified', None)
        
        #dataset[column] = dataset[column].fillna(dataset[column].mode()[0])
        #dataset[column] = dataset[column].fillna(None)

    return dataset


In [24]:
data = clean_categorical_columns(data)

Processing categorical column: Garage_Type
Processing categorical column: Province
Processing categorical column: Type
Processing categorical column: Location
Processing categorical column: Natural_Light
Processing categorical column: Advertiser
Processing categorical column: Bedrooms
Processing categorical column: Pets
Processing categorical column: Garden
Processing categorical column: Beach_Resort
Processing categorical column: District
Processing categorical column: Property_Condition


  dataset[column] = dataset[column].replace('Not specified', None)
  dataset[column] = dataset[column].replace('NA', None)
  dataset[column] = dataset[column].replace('Not specified', None)
  dataset[column] = dataset[column].replace('NA', None)
  dataset[column] = dataset[column].replace('Not specified', None)


In [25]:
# Drop unique columns
for column in data.columns:
    if len(data[column].unique()) == len(data):
        print(f'Dropping unique column: {column}')
        data.drop(columns=[column], inplace=True)
    elif len(data[column].unique()) == 1:
        print(f'Dropping column with one value: {column}')
        data.drop(columns=[column], inplace=True)

Dropping unique column: Id
Dropping column with one value: Kitchenette
Dropping column with one value: Match
Dropping column with one value: Independent_Bathroom
Dropping column with one value: Independent_Entrance


In [26]:
columns_delete = ['Advertiser', 'Cleaning_Service', 'Telephone', 'Internet_Room', 'Service_Bathroom', 'Service_Room', 'Handicap_Access', 'Office', 'Club_House', 'Kitchen', 'Air_Conditioning']

In [27]:
# Identify categorical and numerical columns
columns_object =data.select_dtypes(include=['object']).columns.tolist()
print(columns_object)
data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
data.drop(columns=columns_object, inplace=True)

[]


In [28]:
data.select_dtypes(include=['int64', 'float64']).columns.tolist()

['Garages',
 'Number_Bathrooms',
 'Construction_Area_m2',
 'Age',
 'Number_Floors',
 'Total_Area_m2',
 'Price']

In [29]:
import numpy as np
def replace_not_specified_with_mediam(column, dataset):
    dataset[column] = dataset[column].apply(lambda x: np.nan if (x == 'Not specified' or x == 'Nulo') else x)
    dataset[column].fillna(dataset[column].median(), inplace=True)
    return dataset

In [30]:
data = replace_not_specified_with_mediam('Total_Area_m2', data)
#data.drop(columns='Beach_Resort', inplace=True)
data.drop_duplicates(inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].median(), inplace=True)


In [31]:
data.drop(columns=columns_delete, inplace=True)

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Select features and target
X = data.drop(columns=['Price'])
y = data['Price']
y = np.log(y)  # Log-transform the target variable for better performance

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['category']).columns.tolist()
numerical_cols = ['Garages', 'Number_Bathrooms', 'Age', 'Number_Floors']
log_cols = ['Construction_Area_m2', 'Total_Area_m2']
boolean_cols = X.select_dtypes(include=['bool']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('bool', OneHotEncoder() ,boolean_cols),
        ('num', StandardScaler(), numerical_cols),
        ('log', FunctionTransformer(np.log), log_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit and transform data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Build neural network
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train_processed.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)


optimizer = Adam(clipvalue=0.001)  # Clip gradients to a value between -1 and 1
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# Train model
history = model.fit(X_train_processed, y_train, epochs=150, batch_size=64, validation_split=0.1, callbacks=[early_stop], verbose=1)

# Evaluate model
loss, mae = model.evaluate(X_test_processed, y_test, verbose=0)
print(f"Test MAE: {mae:.2f}")

2025-07-13 01:21:41.672704: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-07-13 01:21:44.516884: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 16.1967 - mae: 2.5464 - val_loss: 0.2398 - val_mae: 0.3692
Epoch 2/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.3251 - mae: 0.4368 - val_loss: 0.1651 - val_mae: 0.3076
Epoch 3/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2200 - mae: 0.3589 - val_loss: 0.1755 - val_mae: 0.3154
Epoch 4/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.2662 - mae: 0.4006 - val_loss: 0.1467 - val_mae: 0.2829
Epoch 5/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.1632 - mae: 0.3038 - val_loss: 0.2883 - val_mae: 0.4441
Epoch 6/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.2127 - mae: 0.3451 - val_loss: 0.1828 - val_mae: 0.3296
Epoch 7/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss:

KeyboardInterrupt: 

In [None]:
'''
import tensorflow as tf
from tensorflow.keras import layers, models
from xgboost import XGBRegressor

input_dim = X_train_processed.shape[1]
encoding_dim = 20  # compress to 4 features

# Build autoencoder model
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(8, activation='relu')(input_layer)
encoded = layers.Dense(encoding_dim, activation='relu')(encoded)  # bottleneck

decoded = layers.Dense(8, activation='relu')(encoded)
decoded = layers.Dense(input_dim, activation='linear')(decoded)

autoencoder = models.Model(inputs=input_layer, outputs=decoded)

# Compile and train
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_processed, X_train_processed, epochs=100, batch_size=32, shuffle=True, validation_data=(X_test_processed, X_test_processed), verbose=0)
'''

"\nimport tensorflow as tf\nfrom tensorflow.keras import layers, models\nfrom xgboost import XGBRegressor\n\ninput_dim = X_train_processed.shape[1]\nencoding_dim = 20  # compress to 4 features\n\n# Build autoencoder model\ninput_layer = layers.Input(shape=(input_dim,))\nencoded = layers.Dense(8, activation='relu')(input_layer)\nencoded = layers.Dense(encoding_dim, activation='relu')(encoded)  # bottleneck\n\ndecoded = layers.Dense(8, activation='relu')(encoded)\ndecoded = layers.Dense(input_dim, activation='linear')(decoded)\n\nautoencoder = models.Model(inputs=input_layer, outputs=decoded)\n\n# Compile and train\nautoencoder.compile(optimizer='adam', loss='mse')\nautoencoder.fit(X_train_processed, X_train_processed, epochs=100, batch_size=32, shuffle=True, validation_data=(X_test_processed, X_test_processed), verbose=0)\n"

In [None]:
'''
encoder = models.Model(inputs=input_layer, outputs=encoded)

# Encode training and test data
X_train_encoded = encoder.predict(X_train_processed)
X_test_encoded = encoder.predict(X_test_processed)
'''

'\nencoder = models.Model(inputs=input_layer, outputs=encoded)\n\n# Encode training and test data\nX_train_encoded = encoder.predict(X_train_processed)\nX_test_encoded = encoder.predict(X_test_processed)\n'

In [None]:

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
# Create the model
mlp = MLPRegressor(hidden_layer_sizes=(50, 30), max_iter=10000, random_state=1)

# Train the model
mlp.fit(X_train_processed, y_train)
y_pred = mlp.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.28787039726418107
R2 score on test set: 0.7537


In [None]:
import numpy as np
from sklearn.linear_model import Lasso

# Create Lasso model with regularization strength (alpha)
lasso = Lasso(alpha=0.1)

# Fit to training data
lasso.fit(X_train_processed, y_train)

y_pred = lasso.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.4158112104458211
R2 score on test set: 0.5764


In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=10,
    random_state=151
)

# Train the model
xgb_model.fit(X_train_processed, y_train)

y_pred = xgb_model.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

MAE: 0.26125180302305206
R2 score on test set: 0.8115


In [None]:

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Crear el clasificador XGBoost
xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'min_child_weight': [1, 5, 10]
}
# Realizar GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)

# Ajustar el modelo
grid_search.fit(X_train_processed, y_train)

# Obtener los mejores parámetros
best_params = grid_search.best_params_
print(f"Mejores parámetros: {best_params}")
# Predecir con el mejor modelo
best_model = grid_search.best_estimator_

# Predicciones
y_pred = best_model.predict(X_test_processed)

# Evaluar el rendimiento
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")


Fitting 3 folds for each of 1458 candidates, totalling 4374 fits


KeyboardInterrupt: 

In [None]:
'''import numpy as np
import pandas as pd
from xgboost import XGBRegressor

xgb_model_best = XGBRegressor(
    colsample_bytree=0.7,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1, 
    n_estimators=500,
    subsample=0.7
)

# Train the model
xgb_model_best.fit(X_train_processed, y_train)

y_pred = xgb_model_best.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")
'''

MAE: 0.2542879258249558
R2 score on test set: 0.8239


In [None]:
from sklearn.metrics import r2_score

# Predict on test set
y_pred = model.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")
print(f"R2 score on test set: {r2:.4f}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
MAE: 0.2950491282003542
R2 score on test set: 0.7568


In [None]:
test_data = pd.read_csv('test.csv')
test_data.columns = test_data.columns.str.replace("'", "", regex=False)
test_data.head()



Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,0,Not specified,0,1,0,0,Paralelas,0,0,Not specified,...,Not specified,1,1,340,0,LaMolina,0,Not specified,0,1
1,1,Not specified,Not specified,1,0,Not specified,Lineales,Not specified,Not specified,Not specified,...,Not specified,1,0,1138,Not specified,LaMolina,0,Bueno,0,1
2,2,Not specified,0,0,0,0,Paralelas,0,0,Not specified,...,Not specified,0,1,1353,0,LaMolina,0,Bueno,0,1
3,3,0,1,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,230,0,SantiagoDeSurco,0,Muy bueno,0,1
4,4,0,Not specified,Not specified,0,Not specified,Not specified,Not specified,Not specified,0,...,Not specified,1,0,305,Not specified,LaMolina,Not specified,Not specified,Not specified,1


In [None]:
test_data

Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,0,Not specified,0,1,0,0,Paralelas,0,0,Not specified,...,Not specified,1,1,340,0,LaMolina,0,Not specified,0,1
1,1,Not specified,Not specified,1,0,Not specified,Lineales,Not specified,Not specified,Not specified,...,Not specified,1,0,1138,Not specified,LaMolina,0,Bueno,0,1
2,2,Not specified,0,0,0,0,Paralelas,0,0,Not specified,...,Not specified,0,1,1353,0,LaMolina,0,Bueno,0,1
3,3,0,1,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,230,0,SantiagoDeSurco,0,Muy bueno,0,1
4,4,0,Not specified,Not specified,0,Not specified,Not specified,Not specified,Not specified,0,...,Not specified,1,0,305,Not specified,LaMolina,Not specified,Not specified,Not specified,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,652,Not specified,Not specified,1,0,Not specified,Paralelas,Not specified,Not specified,Not specified,...,Not specified,1,1,600,Not specified,LaMolina,0,Not specified,0,1
653,653,1,0,1,0,1,Separadas,1,0,0,...,0,1,0,204,0,Asia,0,Excelente,0,1
654,654,0,Not specified,1,0,Not specified,Paralelas,Not specified,Not specified,0,...,Not specified,1,1,270,Not specified,LaMolina,0,Not specified,0,1
655,655,0,0,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,160,0,Asia,0,Not specified,0,1


In [None]:
# Remove Construction_Area column, its redundant with construction area
#test_data.drop(columns=['Beach:Resort'], inplace=True)
test_data = extract_date(test_data)
test_data= extract_Location(test_data)
test_data = clean_beach_resort(test_data)
test_data.drop(columns=['Construction_Area'], inplace=True)
test_data['Total_Area_m2'] = pd.to_numeric(test_data['Total_Area_m2'], errors='coerce')
test_data = replace_not_specified_with_mediam('Total_Area_m2', test_data)
test_data = preprocess_data(test_data)
for column in numeric_columns:
    test_data = fill_numeric_column(column, test_data)

test_data['Age'] = pd.to_numeric(test_data['Age'], errors='coerce')

for column in trash_columns:
    test_data = remove_trash_data(column, test_data)
for column in columns_replace_with_mode:
    print(f'Processing column: {column}')
    test_data = replace_not_specified_with_mode(column, test_data)

test_data = clean_categorical_columns(test_data)
    
# Drop unique columns
for column in test_data.columns:
    if len(test_data[column].unique()) == len(test_data):
        print(f'Dropping unique column: {column}')
        test_data.drop(columns=[column], inplace=True)
    elif len(test_data[column].unique()) == 1:
        print(f'Dropping column with one value: {column}')
        test_data.drop(columns=[column], inplace=True)

test_data.drop(columns=columns_object, inplace=True)
test_data.drop(columns=columns_delete, inplace=True)
#test_data.drop(columns='Beach_Resort', inplace=True)
test_data.info()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Location'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

--------------------------------
Removing trash data from column: Internet
Internet
0                192
1                112
Not specified    309
dtype: int64
Internet
0                192
1                112
Not specified    309
dtype: int64
--------------------------------
Removing trash data from column: Sauna_Area
Sauna_Area
0                404
1                  8
Not specified    201
dtype: int64
Sauna_Area
0                404
1                  8
Not specified    201
dtype: int64
--------------------------------
Removing trash data from column: Air_Conditioning
Air_Conditioning
0                317
1                 72
Not specified    224
dtype: int64
Air_Conditioning
0                317
1                 72
Not specified    224
dtype: int64
--------------------------------
Removing trash data from column: Independent_Entrance
Independent_Entrance
0                249
Not specified    364
dtype: int64
Independent_Entrance
0                249
Not specified    364
dtype: in

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

Furnished
False    574
True      83
dtype: int64
Processing column: Drainage
Drainage
False    555
True     102
dtype: int64
Processing column: Telephone
Telephone
False    532
True     125
dtype: int64
Processing column: Guest_Bathroom
Guest_Bathroom
False    294
True     363
dtype: int64
Processing column: BBQ_Area
BBQ_Area
False    580
True      77
dtype: int64
Processing column: Living_Room
Living_Room
False    374
True     283
dtype: int64
Processing column: Nearby_Parks
Nearby_Parks
False     57
True     600
dtype: int64
Processing column: Solarium
Solarium
False    655
True       2
dtype: int64
Processing column: Commercial_Use
Commercial_Use
False    627
True      30
dtype: int64
Processing column: Internal_Garden
Internal_Garden
False    451
True     206
dtype: int64
Processing column: Electricity
Electricity
False     79
True     578
dtype: int64
Processing column: Patio
Patio
False    438
True     219
dtype: int64
Processing column: Children_Playground
Children_Playground
Fa

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

Kitchen
False     37
True     620
dtype: int64
Processing column: Water
Water
False     65
True     592
dtype: int64
Processing column: Basement
Basement
False    636
True      21
dtype: int64
Processing column: Independent_Bathroom
Independent_Bathroom
False    657
dtype: int64
Processing column: Walk_in_Closet
Walk_in_Closet
False    512
True     145
dtype: int64
Processing column: Grill
Grill
False    189
True     468
dtype: int64
Processing column: Closet
Closet
False    618
True      39
dtype: int64
Processing column: Internet
Internet
False    545
True     112
dtype: int64
Processing column: Sauna_Area
Sauna_Area
False    649
True       8
dtype: int64
Processing column: Kitchen_with_Cabinets
Kitchen_with_Cabinets
False    513
True     144
dtype: int64
Processing column: Gym
Gym
False    628
True      29
dtype: int64
Processing column: Handicap_Access
Handicap_Access
False    614
True      43
dtype: int64
Processing column: Dining_Room
Dining_Room
False     60
True     597
dtype: 

In [None]:
test_data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Equipped,Garage_Type,Furnished,Drainage,Guest_Bathroom,BBQ_Area,Living_Room,...,Beach_Resort,Cable,Gym,Jacuzzi,Dining_Room,Total_Area_m2,Heating,District,Daycare,Property_Condition
0,False,False,True,False,Paralelas,False,False,False,False,True,...,,False,False,False,True,340.0,False,LaMolina,False,
1,False,False,True,False,Lineales,False,False,True,False,True,...,,False,False,False,True,1138.0,False,LaMolina,False,Bueno
2,False,False,False,False,Paralelas,False,False,False,False,False,...,,False,False,False,False,1353.0,False,LaMolina,False,Bueno
3,False,True,False,False,Paralelas,False,False,False,False,False,...,,False,False,False,True,230.0,False,SantiagoDeSurco,False,Muy bueno
4,False,False,True,False,,False,False,True,False,False,...,,True,False,False,True,305.0,False,LaMolina,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,False,False,True,False,Paralelas,False,False,False,False,True,...,,False,False,False,True,600.0,False,LaMolina,False,
653,True,False,True,True,Separadas,True,False,True,True,False,...,Coral,True,True,False,True,204.0,False,Asia,False,Excelente
654,False,False,True,False,Paralelas,False,False,True,False,True,...,,False,False,False,True,270.0,False,LaMolina,False,
655,False,False,False,False,Paralelas,False,False,False,True,True,...,Sarapampa,False,False,False,True,160.0,False,Asia,False,


In [None]:
# Identify categorical and numerical columns
#columns_object =test_data.select_dtypes(include=['object']).columns.tolist()
print(columns_object)
data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
#test_data.drop(columns=columns_object, inplace=True)

[]


['Gas_Connection',
 'Fireplace',
 'Entrance_Hall',
 'Equipped',
 'Furnished',
 'Drainage',
 'Guest_Bathroom',
 'BBQ_Area',
 'Living_Room',
 'Nearby_Parks',
 'Solarium',
 'Commercial_Use',
 'Internal_Garden',
 'Garages',
 'Electricity',
 'Patio',
 'Children_Playground',
 'Number_Bathrooms',
 'Green_Areas',
 'Electric_Doorman',
 'Construction_Area_m2',
 'Intercom',
 'Near_Sea',
 'Sauna',
 'Cinema_Room',
 'Terrace',
 'Sports_Area',
 'Security_System',
 'Water_Heater',
 'Professional_Use',
 'Internal_Park',
 'Laundry_Room',
 'Nearby_Schools',
 'Balcony',
 'Attic',
 'Oceanfront',
 'Security_Guard',
 'Swimming_Pool',
 'Electric_Fence',
 'Air_Conditioning',
 'Hall',
 'Nearby_Shopping_Centers',
 'Water',
 'Basement',
 'Walk_in_Closet',
 'Age',
 'Number_Floors',
 'Daily_Dining_Room',
 'Grill',
 'Closet',
 'Internet',
 'Sauna_Area',
 'Kitchen_with_Cabinets',
 'Storage_Room',
 'Cable',
 'Gym',
 'Jacuzzi',
 'Dining_Room',
 'Total_Area_m2',
 'Heating',
 'Daycare',
 'Price']

In [None]:
data.columns

Index(['Gas_Connection', 'Fireplace', 'Entrance_Hall', 'Equipped',
       'Garage_Type', 'Furnished', 'Drainage', 'Guest_Bathroom', 'BBQ_Area',
       'Living_Room', 'Nearby_Parks', 'Solarium', 'Commercial_Use', 'Province',
       'Internal_Garden', 'Garages', 'Electricity', 'Patio',
       'Children_Playground', 'Type', 'Number_Bathrooms', 'Green_Areas',
       'Electric_Doorman', 'Construction_Area_m2', 'Intercom', 'Near_Sea',
       'Sauna', 'Cinema_Room', 'Terrace', 'Sports_Area', 'Security_System',
       'Location', 'Water_Heater', 'Professional_Use', 'Internal_Park',
       'Laundry_Room', 'Nearby_Schools', 'Balcony', 'Attic', 'Oceanfront',
       'Security_Guard', 'Natural_Light', 'Swimming_Pool', 'Electric_Fence',
       'Air_Conditioning', 'Hall', 'Nearby_Shopping_Centers', 'Water',
       'Basement', 'Walk_in_Closet', 'Age', 'Number_Floors',
       'Daily_Dining_Room', 'Grill', 'Closet', 'Internet', 'Sauna_Area',
       'Bedrooms', 'Pets', 'Kitchen_with_Cabinets', 'Garden', 

In [None]:
test_data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Equipped,Garage_Type,Furnished,Drainage,Guest_Bathroom,BBQ_Area,Living_Room,...,Beach_Resort,Cable,Gym,Jacuzzi,Dining_Room,Total_Area_m2,Heating,District,Daycare,Property_Condition
0,False,False,True,False,Paralelas,False,False,False,False,True,...,,False,False,False,True,340.0,False,LaMolina,False,
1,False,False,True,False,Lineales,False,False,True,False,True,...,,False,False,False,True,1138.0,False,LaMolina,False,Bueno
2,False,False,False,False,Paralelas,False,False,False,False,False,...,,False,False,False,False,1353.0,False,LaMolina,False,Bueno
3,False,True,False,False,Paralelas,False,False,False,False,False,...,,False,False,False,True,230.0,False,SantiagoDeSurco,False,Muy bueno
4,False,False,True,False,,False,False,True,False,False,...,,True,False,False,True,305.0,False,LaMolina,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,False,False,True,False,Paralelas,False,False,False,False,True,...,,False,False,False,True,600.0,False,LaMolina,False,
653,True,False,True,True,Separadas,True,False,True,True,False,...,Coral,True,True,False,True,204.0,False,Asia,False,Excelente
654,False,False,True,False,Paralelas,False,False,True,False,True,...,,False,False,False,True,270.0,False,LaMolina,False,
655,False,False,False,False,Paralelas,False,False,False,True,True,...,Sarapampa,False,False,False,True,160.0,False,Asia,False,


In [None]:
test_data_processed = preprocessor.transform(test_data)
# Predict prices
predicted_prices = xgb_model.predict(test_data_processed)
print(predicted_prices)
predicted_prices = np.exp(predicted_prices)  # Inverse log transformation to get actual prices
# Add predictions to test_data
test_data['Predicted_Price'] = predicted_prices.flatten()
test_data[['Predicted_Price']].head()

df = pd.DataFrame(predicted_prices, columns=['Price'])
df.index.name = 'Id'
print(df)
df.to_csv('submission.csv', index=True)

[13.586368  14.027118  14.199769  12.957675  13.384352  12.114314
 13.630277  13.931029  11.998643  13.322478  13.022799  11.389049
 13.580704  14.140612  14.37553   13.819758  13.089595  13.76548
 13.115482  11.716016  13.315254  12.413759  13.89205   13.215159
 12.8684635 14.026849  12.492701  13.771371  13.190808  13.189117
 12.621523  13.433009  13.010748  13.053949  12.341791  14.101355
 14.415073  12.634887  13.159055  12.31419   12.67359   12.151887
 12.670358  11.583256  13.727984  13.252597  13.4409485 13.740129
 14.2082405 12.130499  14.024499  12.0753565 14.009837  14.606055
 11.873842  12.978539  14.524744  14.12035   12.637439  13.07649
 13.60733   12.76063   11.913258  14.441583  14.249505  14.111645
 13.356501  13.830561  12.8468485 13.208856  14.20646   12.855027
 14.16648   12.606851  13.284903  11.464807  13.967026  14.558995
 12.333013  11.638801  14.10149   12.452961  13.1667795 13.341306
 12.040335  12.062016  14.597533  11.668131  14.03669   13.411894
 13.233899  