In [393]:
import pandas as pd # type: ignore

# Read the CSV file
data = pd.read_csv('train.csv')

# Display the first few rows of the data
data.head()
data.columns = data.columns.str.replace("'", "", regex=False)

In [394]:
data

Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom,Price
0,0,1,Not specified,0,0,Not specified,Paralelas,Not specified,Not specified,0,...,,,,,,,,,,210000
1,1,0,1,Not specified,0,0,Paralelas,0,0,0,...,1,0,311,0,LaMolina,Not specified,Bueno,Not specified,1,660000
2,2,Not specified,Not specified,1,0,Not specified,Lineales,Not specified,Not specified,Not specified,...,1,1,1400,Not specified,LaMolina,0,Bueno,0,1,1500000
3,3,0,1,Not specified,0,0,Separadas,0,0,0,...,1,0,317,0,SanIsidro,Not specified,Remodelado,Not specified,1,1100000
4,4,0,Not specified,Not specified,0,Not specified,Lineales,Not specified,Not specified,0,...,0,0,246,Not specified,VillaMariaDelTriunfo,Not specified,Regular,Not specified,0,110000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,6995,Not specified,Not specified,Not specified,0,Not specified,Not specified,Not specified,Not specified,Not specified,...,1,0,193,Not specified,LaVictoria,Not specified,Bueno,Not specified,0,450000
6996,6996,0,0,1,0,0,Not specified,1,0,0,...,1,0,144,0,Asia,0,Not specified,0,1,450000
6997,6997,0,0,Not specified,0,0,Not specified,0,1,1,...,1,0,396,0,SantiagoDeSurco,Not specified,Not specified,Not specified,1,450000
6998,6998,Not specified,Not specified,1,0,Not specified,Not specified,Not specified,Not specified,Not specified,...,1,0,188,Not specified,Bellavista,0,Muy bueno,0,1,240000


In [395]:
for column in data.columns:
    print('--------------------------------')
    print(data.groupby(column).size())

--------------------------------
Id
0       1
1       1
2       1
3       1
4       1
       ..
6995    1
6996    1
6997    1
6998    1
6999    1
Length: 7000, dtype: int64
--------------------------------
Gas_Connection
0                3175
1                 484
Not specified    3341
dtype: int64
--------------------------------
Fireplace
0                3270
1                1319
Not specified    2411
dtype: int64
--------------------------------
Entrance_Hall
0                1864
1                2695
Not specified    2441
dtype: int64
--------------------------------
Kitchenette
0                6791
Not specified     209
dtype: int64
--------------------------------
Equipped
0                4161
1                 428
Not specified    2411
dtype: int64
--------------------------------
Garage_Type
Lineales          604
NoTiene           835
Not specified    2892
Paralelas        2505
Separadas         164
dtype: int64
--------------------------------
Furnished
0                3

In [396]:
# Remove Construction_Area column, its redundant with construction area
data.drop(columns=['Construction_Area'], inplace=True)


In [397]:
data['Total_Area_m2'] = pd.to_numeric(data['Total_Area_m2'], errors='coerce')

In [398]:
dict_bedrooms = {
    '0': 0,
    '1': 1,
    '2': 2,
    '3': 3,
    '4': 4,
    '5+': 5
}
def preprocess_data(data_set):
    data_set['Bedrooms'] = data_set['Bedrooms'].apply(lambda x: dict_bedrooms.get(x, x))
    data_set['Bedrooms'].fillna(data_set['Bedrooms'].mode()[0], inplace=True)
    return data_set

In [399]:
data = preprocess_data(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_set['Bedrooms'].fillna(data_set['Bedrooms'].mode()[0], inplace=True)


In [400]:
numeric_columns = ['Number_Floors', 'Age']
def fill_numeric_column(column_name, dataset):
    dataset[column_name] = pd.to_numeric(data[column_name], errors='coerce')
    dataset[column_name].fillna(dataset[column_name].mode()[0], inplace=True)
    return dataset

In [401]:

for column in numeric_columns:
    data = fill_numeric_column(column, data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column_name].fillna(dataset[column_name].mode()[0], inplace=True)


In [402]:
allowed_values = ['0', '1', 'Nulo', 'Not specified', 'Si', 'No']
def remove_trash_data(column,dataset):
    print('--------------------------------')
    print('Removing trash data from column:', column)
    print(dataset.groupby(column).size())
    dataset = dataset[(dataset[column].isin(allowed_values)) | (dataset[column].isnull())]
    print(dataset.groupby(column).size())
    return dataset

In [403]:
trash_columns = ['Internet', 'Sauna_Area', 'Air_Conditioning', 'Independent_Entrance', 'Jacuzzi', 'Service_Bathroom'
                 ,'Cable', 'Service_Room', 'Internet_Room', 'Daycare', 'Daily_Dining_Room', 'Heating', 'Pets']
for column in trash_columns:
    data = remove_trash_data(column, data)

--------------------------------
Removing trash data from column: Internet
Internet
0                                                                2105
1                                                                1194
Not specified                                                    3160
Panamericana Sur Km 94.5 Asia                                       1
Panamericana Sur Km 94.5 Club Playa Las Arenas Asia                 1
Panamericana Sur Km 94.5 Club Playa Las Arenas Casa B_22 Asia       1
Publicado el 05.09.19                                               1
Publicado el 10.09.19                                               1
Publicado el 12.08.19                                               2
Publicado el 13.04.19                                               5
Publicado el 13.09.19                                               1
Publicado el 14.09.19                                               1
Publicado el 17.09.19                                               6
Public

--------------------------------
Removing trash data from column: Daycare
Daycare
0                4230
1                  20
Not specified    2209
dtype: int64
Daycare
0                4230
1                  20
Not specified    2209
dtype: int64
--------------------------------
Removing trash data from column: Daily_Dining_Room
Daily_Dining_Room
0                2096
1                4176
Not specified     189
dtype: int64
Daily_Dining_Room
0                2096
1                4176
Not specified     189
dtype: int64
--------------------------------
Removing trash data from column: Heating
Heating
0                4126
1                  48
Not specified    2285
dtype: int64
Heating
0                4126
1                  48
Not specified    2285
dtype: int64
--------------------------------
Removing trash data from column: Pets
Pets
No     223
Si    6236
dtype: int64
Pets
No     223
Si    6236
dtype: int64


In [404]:
data = data[data['Age']!= 'Not specified']
data['Age'] = pd.to_numeric(data['Age'], errors='coerce')

In [405]:
import numpy as np
def replace_not_specified_with_false(column, dataset):
    dataset[column] = dataset[column].apply(lambda x: np.nan if (x == 'Not specified' or x == 'Nulo') else 
                                      True if (x == '1' or x == 'Si') 
                                      else False if (x == '0' or x == 'No') else x)
    #dataset[column+'_missing'] = dataset[column].isna().astype(int)
    dataset[column].fillna(dataset[column].mode()[0], inplace=True)
    #dataset[column].fillna(False, inplace=True)

    print(dataset.groupby(column).size())
    return dataset


In [406]:
# Posibles columnas repetidas en el dataset
# Sauna, Sauna_area,
# Oceanfront, Near_Sea
# Kitchenette, Kitchen_with_Cabinets


columns = ['Gas_Connection', 'Fireplace', 'Entrance_Hall', 'Kitchenette',
           'Equipped', 'Furnished', 'Drainage', 'Telephone', 'Guest_Bathroom', 'BBQ_Area',
           'Living_Room', 'Nearby_Parks', 'Solarium', 'Commercial_Use', 'Internal_Garden', 
           'Electricity', 'Patio', 'Children_Playground', 'Green_Areas', 'Electric_Doorman',
           'Intercom', 'Near_Sea', 'Sauna', 'Cinema_Room', 'Cleaning_Service', 'Terrace',
           'Sports_Area', 'Security_System', 'Water_Heater', 'Professional_Use', 'Club_House',
           'Internal_Park', 'Laundry_Room', 'Nearby_Schools', 'Balcony', 'Attic', 'Oceanfront', 'Security_Guard',
           'Swimming_Pool', 'Electric_Fence', 'Air_Conditioning', 'Hall', 'Nearby_Shopping_Centers', 'Kitchen',
           'Water', 'Basement', 'Independent_Bathroom', 'Walk_in_Closet', 'Grill', 'Closet',
           'Internet', 'Sauna_Area', 'Kitchen_with_Cabinets', 'Gym', 'Handicap_Access', 'Dining_Room',
           'Office', 'Service_Bathroom', 'Storage_Room', 'Cable', 'Jacuzzi', 'Independent_Entrance', 'Service_Room',
           'Internet_Room', 'Daycare', 'Daily_Dining_Room', 'Heating', 'Match', 'Pets', 'Garden']
for column in columns:
    print(f'Processing column: {column}')
    data = replace_not_specified_with_false(column, data)

Processing column: Gas_Connection
Gas_Connection
False    6479
True      477
dtype: int64
Processing column: Fireplace
Fireplace
False    5645
True     1311
dtype: int64
Processing column: Entrance_Hall
Entrance_Hall
False    1856
True     5100
dtype: int64
Processing column: Kitchenette
Kitchenette
False    6956
dtype: int64
Processing column: Equipped
Equipped
False    6539
True      417
dtype: int64
Processing column: Furnished
Furnished
False    6143
True      813
dtype: int64
Processing column: Drainage
Drainage
False    5682
True     1274
dtype: int64
Processing column: Telephone
Telephone
False    5692
True     1264
dtype: int64
Processing column: Guest_Bathroom
Guest_Bathroom
False    3167
True     3789
dtype: int64
Processing column: BBQ_Area
BBQ_Area
False    6155
True      801
dtype: int64
Processing column: Living_Room
Living_Room
False    4042
True     2914
dtype: int64
Processing column: Nearby_Parks
Nearby_Parks
False     672
True     6284
dtype: int64
Processing column:

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

Professional_Use
False     273
True     6683
dtype: int64
Processing column: Club_House
Club_House
False    6924
True       32
dtype: int64
Processing column: Internal_Park
Internal_Park
False    6840
True      116
dtype: int64
Processing column: Laundry_Room
Laundry_Room
False    2458
True     4498
dtype: int64
Processing column: Nearby_Schools
Nearby_Schools
False     718
True     6238
dtype: int64
Processing column: Balcony
Balcony
False    6097
True      859
dtype: int64
Processing column: Attic
Attic
False    6768
True      188
dtype: int64
Processing column: Oceanfront
Oceanfront
False    6701
True      255
dtype: int64
Processing column: Security_Guard
Security_Guard
False    5451
True     1505
dtype: int64
Processing column: Swimming_Pool
Swimming_Pool
False    2013
True     4943
dtype: int64
Processing column: Electric_Fence
Electric_Fence
False    6611
True      345
dtype: int64
Processing column: Air_Conditioning
Air_Conditioning
False    6170
True      786
dtype: int64
Proc

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

Pets
False     223
True     6733
dtype: int64
Processing column: Garden
Garden
False     896
True     6060
dtype: int64


In [407]:
categorical_cols = ['Garage_Type', 'Province', 'Type', 'Location', 'Natural_Light', 'Advertiser', 'Bedrooms', 'Pets', 'Garden', 'Beach_Resort', 'District', 'Property_Condition']
for column in categorical_cols:
    print(f'Processing categorical column: {column}')
    data[column] = data[column].astype('category')
# Convert 'Publication_Date' to datetime
#data['Publication_Date'] = pd.to_datetime(data['Publication_Date'], errors='coerce')
# Convert 'Bedrooms' to numeric


Processing categorical column: Garage_Type
Processing categorical column: Province
Processing categorical column: Type
Processing categorical column: Location
Processing categorical column: Natural_Light
Processing categorical column: Advertiser
Processing categorical column: Bedrooms
Processing categorical column: Pets
Processing categorical column: Garden
Processing categorical column: Beach_Resort
Processing categorical column: District
Processing categorical column: Property_Condition


In [408]:
# Drop unique columns
for column in data.columns:
    if len(data[column].unique()) == len(data):
        print(f'Dropping unique column: {column}')
        data.drop(columns=[column], inplace=True)
    elif len(data[column].unique()) == 1:
        print(f'Dropping column with one value: {column}')
        data.drop(columns=[column], inplace=True)

Dropping unique column: Id
Dropping column with one value: Kitchenette


Dropping column with one value: Match
Dropping column with one value: Independent_Bathroom
Dropping column with one value: Independent_Entrance


In [409]:
data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Equipped,Garage_Type,Furnished,Drainage,Telephone,Guest_Bathroom,BBQ_Area,...,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom,Price
0,True,False,False,False,Paralelas,False,False,False,False,False,...,True,False,,False,,False,,False,True,210000
1,False,True,True,False,Paralelas,False,False,False,True,False,...,True,False,311.0,False,LaMolina,False,Bueno,False,True,660000
2,False,False,True,False,Lineales,False,False,False,False,False,...,True,True,1400.0,False,LaMolina,False,Bueno,False,True,1500000
3,False,True,True,False,Separadas,False,False,False,True,False,...,True,False,317.0,False,SanIsidro,False,Remodelado,False,True,1100000
4,False,False,True,False,Lineales,False,False,False,False,False,...,False,False,246.0,False,VillaMariaDelTriunfo,False,Regular,False,False,110000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,False,False,True,False,Not specified,False,False,False,False,False,...,True,False,193.0,False,LaVictoria,False,Bueno,False,False,450000
6996,False,False,True,False,Not specified,True,False,False,True,False,...,True,False,144.0,False,Asia,False,Not specified,False,True,450000
6997,False,False,True,False,Not specified,False,True,True,True,False,...,True,False,396.0,False,SantiagoDeSurco,False,Not specified,False,True,450000
6998,False,False,True,False,Not specified,False,False,False,True,False,...,True,False,188.0,False,Bellavista,False,Muy bueno,False,True,240000


In [410]:
# Identify categorical and numerical columns
columns_object =data.select_dtypes(include=['object']).columns.tolist()
print(columns_object)
data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
data.drop(columns=columns_object, inplace=True)

['Publication_Date']


In [411]:
data.select_dtypes(include=['int64', 'float64']).columns.tolist()

['Garages',
 'Number_Bathrooms',
 'Construction_Area_m2',
 'Age',
 'Number_Floors',
 'Total_Area_m2',
 'Price']

In [412]:
data.drop_duplicates(inplace=True)
data.drop(columns=['Total_Area_m2'], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Select features and target
X = data.drop(columns=['Price'])
y = data['Price']
#y = np.log(y)  # Log-transform the target variable for better performance

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
boolean_cols = X.select_dtypes(include=['bool']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('bool', OneHotEncoder() ,boolean_cols),
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit and transform data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Build neural network
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train_processed.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)


optimizer = Adam(clipvalue=0.01)  # Clip gradients to a value between -1 and 1
model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

# Train model
history = model.fit(X_train_processed, y_train, epochs=150, batch_size=64, validation_split=0.1, callbacks=[early_stop], verbose=1)

# Evaluate model
loss, mae = model.evaluate(X_test_processed, y_test, verbose=0)
print(f"Test MAE: {mae:.2f}")

Epoch 1/150




[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1431676256256.0000 - mae: 804622.1875 - val_loss: 1235325288448.0000 - val_mae: 798655.8750
Epoch 2/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 1430927572992.0000 - mae: 804162.3750 - val_loss: 1233987698688.0000 - val_mae: 797830.8750
Epoch 3/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1428759642112.0000 - mae: 802855.1250 - val_loss: 1230926774272.0000 - val_mae: 795942.0625
Epoch 4/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 1424594436096.0000 - mae: 800277.8750 - val_loss: 1225479815168.0000 - val_mae: 792570.4375
Epoch 5/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1417558097920.0000 - mae: 796015.2500 - val_loss: 1217029210112.0000 - val_mae: 787313.3125
Epoch 6/150
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - 

In [414]:
from sklearn.metrics import r2_score

# Predict on test set
y_pred = model.predict(X_test_processed).flatten()

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R2 score on test set: {r2:.4f}")

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
R2 score on test set: 0.4079


In [415]:
test_data = pd.read_csv('test.csv')
test_data.columns = test_data.columns.str.replace("'", "", regex=False)
test_data.head()



Unnamed: 0,Id,Gas_Connection,Fireplace,Entrance_Hall,Kitchenette,Equipped,Garage_Type,Furnished,Drainage,Telephone,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,0,Not specified,0,1,0,0,Paralelas,0,0,Not specified,...,Not specified,1,1,340,0,LaMolina,0,Not specified,0,1
1,1,Not specified,Not specified,1,0,Not specified,Lineales,Not specified,Not specified,Not specified,...,Not specified,1,0,1138,Not specified,LaMolina,0,Bueno,0,1
2,2,Not specified,0,0,0,0,Paralelas,0,0,Not specified,...,Not specified,0,1,1353,0,LaMolina,0,Bueno,0,1
3,3,0,1,0,0,0,Paralelas,0,0,0,...,Not specified,1,0,230,0,SantiagoDeSurco,0,Muy bueno,0,1
4,4,0,Not specified,Not specified,0,Not specified,Not specified,Not specified,Not specified,0,...,Not specified,1,0,305,Not specified,LaMolina,Not specified,Not specified,Not specified,1


In [416]:
# Remove Construction_Area column, its redundant with construction area
test_data['Total_Area_m2'] = pd.to_numeric(test_data['Total_Area_m2'], errors='coerce')

In [417]:
# Remove Construction_Area column, its redundant with construction area
test_data.drop(columns=['Construction_Area'], inplace=True)
test_data['Total_Area_m2'] = pd.to_numeric(test_data['Total_Area_m2'], errors='coerce')
test_data['Age'] = pd.to_numeric(test_data['Age'], errors='coerce')

for column in trash_columns:
    test_data = remove_trash_data(column, test_data)
for column in columns:
    print(f'Processing column: {column}')
    test_data = replace_not_specified_with_false(column, test_data)
# Drop unique columns
for column in test_data.columns:
    if len(test_data[column].unique()) == len(test_data):
        print(f'Dropping unique column: {column}')
        test_data.drop(columns=[column], inplace=True)
    elif len(test_data[column].unique()) == 1:
        print(f'Dropping column with one value: {column}')
        test_data.drop(columns=[column], inplace=True)

for column in numeric_columns:
    test_data = fill_numeric_column(column, test_data)

for column in categorical_cols:
    print(f'Processing categorical column: {column}')
    test_data[column] = test_data[column].astype('category')

test_data = preprocess_data(test_data)
test_data.drop(columns=columns_object, inplace=True)
test_data.info()

--------------------------------
Removing trash data from column: Internet
Internet
0                192
1                112
Not specified    309
dtype: int64
Internet
0                192
1                112
Not specified    309
dtype: int64
--------------------------------
Removing trash data from column: Sauna_Area
Sauna_Area
0                404
1                  8
Not specified    201
dtype: int64
Sauna_Area
0                404
1                  8
Not specified    201
dtype: int64
--------------------------------
Removing trash data from column: Air_Conditioning
Air_Conditioning
0                317
1                 72
Not specified    224
dtype: int64
Air_Conditioning
0                317
1                 72
Not specified    224
dtype: int64
--------------------------------
Removing trash data from column: Independent_Entrance


Independent_Entrance
0                249
Not specified    364
dtype: int64
Independent_Entrance
0                249
Not specified    364
dtype: int64
--------------------------------
Removing trash data from column: Jacuzzi
Jacuzzi
0                309
1                 80
Not specified    224
dtype: int64
Jacuzzi
0                309
1                 80
Not specified    224
dtype: int64
--------------------------------
Removing trash data from column: Service_Bathroom
Service_Bathroom
0                134
1                465
Not specified     14
dtype: int64
Service_Bathroom
0                134
1                465
Not specified     14
dtype: int64
--------------------------------
Removing trash data from column: Cable
Cable
0                175
1                129
Not specified    309
dtype: int64
Cable
0                175
1                129
Not specified    309
dtype: int64
--------------------------------
Removing trash data from column: Service_Room
Service_Room
0        

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

Balcony
False    578
True      79
dtype: int64
Processing column: Attic
Attic
False    637
True      20
dtype: int64
Processing column: Oceanfront
Oceanfront
False    625
True      32
dtype: int64
Processing column: Security_Guard
Security_Guard
False    523
True     134
dtype: int64
Processing column: Swimming_Pool
Swimming_Pool
False    452
True     205
dtype: int64
Processing column: Electric_Fence
Electric_Fence
False    623
True      34
dtype: int64
Processing column: Air_Conditioning
Air_Conditioning
False    585
True      72
dtype: int64
Processing column: Hall
Hall
False    236
True     421
dtype: int64
Processing column: Nearby_Shopping_Centers
Nearby_Shopping_Centers
False     57
True     600
dtype: int64
Processing column: Kitchen
Kitchen
False     37
True     620
dtype: int64
Processing column: Water
Water
False     65
True     592
dtype: int64
Processing column: Basement
Basement
False    636
True      21
dtype: int64
Processing column: Independent_Bathroom
Independent_Bat

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)
  dataset[column].fillna(dataset[colum

In [418]:
test_data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Equipped,Garage_Type,Furnished,Drainage,Telephone,Guest_Bathroom,BBQ_Area,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,False,False,True,False,Paralelas,False,False,False,False,False,...,False,True,True,340.0,False,LaMolina,False,Not specified,False,True
1,False,False,True,False,Lineales,False,False,False,True,False,...,False,True,False,1138.0,False,LaMolina,False,Bueno,False,True
2,False,False,False,False,Paralelas,False,False,False,False,False,...,False,False,True,1353.0,False,LaMolina,False,Bueno,False,True
3,False,True,False,False,Paralelas,False,False,False,False,False,...,False,True,False,230.0,False,SantiagoDeSurco,False,Muy bueno,False,True
4,False,False,True,False,Not specified,False,False,False,True,False,...,False,True,False,305.0,False,LaMolina,False,Not specified,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,False,False,True,False,Paralelas,False,False,False,False,False,...,False,True,True,600.0,False,LaMolina,False,Not specified,False,True
653,True,False,True,True,Separadas,True,False,False,True,True,...,False,True,False,204.0,False,Asia,False,Excelente,False,True
654,False,False,True,False,Paralelas,False,False,False,True,False,...,False,True,True,270.0,False,LaMolina,False,Not specified,False,True
655,False,False,False,False,Paralelas,False,False,False,False,True,...,False,True,False,160.0,False,Asia,False,Not specified,False,True


In [419]:
# Identify categorical and numerical columns
#columns_object =test_data.select_dtypes(include=['object']).columns.tolist()
print(columns_object)
data.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()
#test_data.drop(columns=columns_object, inplace=True)

['Publication_Date']


['Gas_Connection',
 'Fireplace',
 'Entrance_Hall',
 'Equipped',
 'Furnished',
 'Drainage',
 'Telephone',
 'Guest_Bathroom',
 'BBQ_Area',
 'Living_Room',
 'Nearby_Parks',
 'Solarium',
 'Commercial_Use',
 'Internal_Garden',
 'Garages',
 'Electricity',
 'Patio',
 'Children_Playground',
 'Number_Bathrooms',
 'Green_Areas',
 'Electric_Doorman',
 'Construction_Area_m2',
 'Intercom',
 'Near_Sea',
 'Sauna',
 'Cinema_Room',
 'Cleaning_Service',
 'Terrace',
 'Sports_Area',
 'Security_System',
 'Water_Heater',
 'Professional_Use',
 'Club_House',
 'Internal_Park',
 'Laundry_Room',
 'Nearby_Schools',
 'Balcony',
 'Attic',
 'Oceanfront',
 'Security_Guard',
 'Swimming_Pool',
 'Electric_Fence',
 'Air_Conditioning',
 'Hall',
 'Nearby_Shopping_Centers',
 'Kitchen',
 'Water',
 'Basement',
 'Walk_in_Closet',
 'Age',
 'Number_Floors',
 'Daily_Dining_Room',
 'Grill',
 'Closet',
 'Internet',
 'Sauna_Area',
 'Kitchen_with_Cabinets',
 'Service_Room',
 'Storage_Room',
 'Cable',
 'Gym',
 'Jacuzzi',
 'Handicap_Ac

In [420]:
test_data

Unnamed: 0,Gas_Connection,Fireplace,Entrance_Hall,Equipped,Garage_Type,Furnished,Drainage,Telephone,Guest_Bathroom,BBQ_Area,...,Handicap_Access,Dining_Room,Office,Total_Area_m2,Heating,District,Daycare,Property_Condition,Internet_Room,Service_Bathroom
0,False,False,True,False,Paralelas,False,False,False,False,False,...,False,True,True,340.0,False,LaMolina,False,Not specified,False,True
1,False,False,True,False,Lineales,False,False,False,True,False,...,False,True,False,1138.0,False,LaMolina,False,Bueno,False,True
2,False,False,False,False,Paralelas,False,False,False,False,False,...,False,False,True,1353.0,False,LaMolina,False,Bueno,False,True
3,False,True,False,False,Paralelas,False,False,False,False,False,...,False,True,False,230.0,False,SantiagoDeSurco,False,Muy bueno,False,True
4,False,False,True,False,Not specified,False,False,False,True,False,...,False,True,False,305.0,False,LaMolina,False,Not specified,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,False,False,True,False,Paralelas,False,False,False,False,False,...,False,True,True,600.0,False,LaMolina,False,Not specified,False,True
653,True,False,True,True,Separadas,True,False,False,True,True,...,False,True,False,204.0,False,Asia,False,Excelente,False,True
654,False,False,True,False,Paralelas,False,False,False,True,False,...,False,True,True,270.0,False,LaMolina,False,Not specified,False,True
655,False,False,False,False,Paralelas,False,False,False,False,True,...,False,True,False,160.0,False,Asia,False,Not specified,False,True


In [421]:
test_data_processed = preprocessor.transform(test_data)
# Predict prices
predicted_prices = model.predict(test_data_processed)
print(predicted_prices)
predicted_prices = np.exp(predicted_prices)  # Inverse log transformation to get actual prices
# Add predictions to test_data
test_data['Predicted_Price'] = predicted_prices.flatten()
test_data[['Predicted_Price']].head()

df = pd.DataFrame(predicted_prices, columns=['Price'])
df.index.name = 'Id'
print(df)
df.to_csv('submission.csv', index=True)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[[1.19971762e+06]
 [1.32064638e+06]
 [1.40827188e+06]
 [3.61860250e+05]
 [8.88572750e+05]
 [9.54495234e+04]
 [8.12600188e+05]
 [9.16550125e+05]
 [9.20563984e+04]
 [9.62787062e+05]
 [9.04623000e+05]
 [2.39648297e+05]
 [3.51197500e+05]
 [1.16014938e+06]
 [1.61707438e+06]
 [2.60686625e+05]
 [6.48792500e+05]
 [1.29445862e+06]
 [5.99622562e+05]
 [7.73603672e+04]
 [6.18409812e+05]
 [1.36346812e+05]
 [1.71299562e+06]
 [1.42128438e+06]
 [6.38486250e+05]
 [1.36926975e+06]
 [3.64047938e+05]
 [1.18039088e+06]
 [3.68471125e+05]
 [4.12337531e+05]
 [1.53314781e+05]
 [9.65869000e+05]
 [2.36120938e+05]
 [7.56966375e+05]
 [2.47305453e+05]
 [1.38375825e+06]
 [1.15496675e+06]
 [1.71431812e+05]
 [5.56557812e+05]
 [5.02386500e+05]
 [6.88617312e+05]
 [2.24303750e+05]
 [5.04853438e+05]
 [1.29814392e+03]
 [1.53455588e+06]
 [1.13586588e+06]
 [1.35563012e+06]
 [6.42428000e+05]
 [2.02374038e+06]
 [4.56408242e+04]
 [1.45296562e+06]
 [3.37189

  predicted_prices = np.exp(predicted_prices)  # Inverse log transformation to get actual prices
