## 1. Data Preprocessing

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle


In [2]:
# Load the raw train / test records from their pickle files and wrap each in a DataFrame.
# NOTE: pickle.load can execute arbitrary code, so only read files from a trusted source.
with open('data/train.pickle', 'rb') as train_file:
    train = pd.DataFrame(pickle.load(train_file))

with open('data/test_kaggle.pickle', 'rb') as test_file:
    test = pd.DataFrame(pickle.load(test_file))

In [3]:
def _feature_number(features, marker, suffixes, cast):
    """Parse the number from the first string in ``features`` that contains ``marker``.

    Each text in ``suffixes`` is removed (in order) from the matching entry before it is
    converted with ``cast``. ``cast(0)`` is returned when ``features`` is not iterable or
    no string entry contains ``marker``.
    """
    try:
        entries = iter(features)
    except TypeError:
        # Missing feature list (e.g. None / NaN): treat as "feature not present".
        return cast(0)

    for entry in entries:
        # Non-string entries cannot carry a marker; skipping them avoids a TypeError on `in`.
        if isinstance(entry, str) and marker in entry:
            for suffix in suffixes:
                entry = entry.replace(suffix, '')
            return cast(entry)
    return cast(0)


def data_preprocess(data):
    """Return a copy of ``data`` with numeric columns parsed out of the ``features`` lists.

    Added columns:
      * ``house_size`` (float): from the first feature containing ``'m2'``.
      * ``Number_Bedrooms`` (int): from the first feature containing ``'hab'``.
      * ``Number_Bathrooms`` (int): from the first feature containing ``'baño'``.
    A listing without the corresponding feature gets 0.

    The columns ``features``, ``selltype``, ``subtype`` and ``loc`` are dropped from the
    result (a KeyError is raised if any of them is missing). ``data`` itself is not modified.

    Parsing is done per value in plain Python rather than through the ``.str`` accessor, so
    it also works when no listing at all carries a given feature (an all-zero integer column
    has no ``.str`` accessor) and does not depend on the pandas version's regex default
    for ``str.replace``.
    """
    df = data.copy()

    df['house_size'] = df['features'].apply(
        lambda feats: _feature_number(feats, 'm2', (' m2',), float)
    ).astype(float)

    df['Number_Bedrooms'] = df['features'].apply(
        lambda feats: _feature_number(feats, 'hab', (' hab.',), int)
    ).astype(int)

    # ' baño' is stripped first, then any remaining 's' (plural "baños").
    df['Number_Bathrooms'] = df['features'].apply(
        lambda feats: _feature_number(feats, 'baño', (' baño', 's'), int)
    ).astype(int)

    return df.drop(columns=['features', 'selltype', 'subtype', 'loc'])

In [4]:
# Extract the numeric listing features, then strip the euro sign so the target becomes a float
train = data_preprocess(train).assign(
    price=lambda frame: frame['price'].str.replace('€', '').astype(float)
)

In [5]:
# Build the same numeric features for the test set and discard the columns that are not used afterwards
test = data_preprocess(test).drop(columns=['description', 'id'])
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             132 non-null    object 
 1   loc_string        132 non-null    object 
 2   type              132 non-null    object 
 3   desc              132 non-null    object 
 4   house_size        132 non-null    float64
 5   Number_Bedrooms   132 non-null    int64  
 6   Number_Bathrooms  132 non-null    int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 7.3+ KB


In [3]:
# Merge title and description into a single free-text column, for both data sets
for listings in (train, test):
    listings['combined'] = listings['title'] + ' ' + listings['desc']

## 2. Detect and Correct a Mistyped Value

In [7]:
# Show the training listing(s) whose bedroom count is 22
filtered_rows = train.loc[train['Number_Bedrooms'].eq(22)]
filtered_rows

Unnamed: 0,price,title,loc_string,type,desc,house_size,Number_Bedrooms,Number_Bathrooms,combined
729,350.0,Piso Poblenou. Piso a reformar en rambla del p...,Barcelona - Poblenou,FLAT,¡Presentamos esta vivienda a REFORMAR en la Ra...,82.0,22,1,Piso Poblenou. Piso a reformar en rambla del p...


In [8]:
# Replace the bedroom count 22 with 2, then re-check the column's distribution
train['Number_Bedrooms'] = train['Number_Bedrooms'].replace({22: 2})
train['Number_Bedrooms'].describe()

count    866.000000
mean       2.697460
std        1.020447
min        0.000000
25%        2.000000
50%        3.000000
75%        3.000000
max        6.000000
Name: Number_Bedrooms, dtype: float64

In [9]:
# 'title' and 'desc' are already merged into 'combined', so drop the originals from both frames
test = test.drop(columns=['desc', 'title'])
train = train.drop(columns=['desc', 'title'])

In [10]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,price,loc_string,type,house_size,Number_Bedrooms,Number_Bathrooms,combined
0,320.0,Barcelona - Sant Antoni,FLAT,85.0,2,1,Piso Tallers. Piso con 2 habitaciones con asce...
1,335.0,Barcelona - Dreta de l´Eixample,FLAT,65.0,2,1,Piso C/ de valència. Piso reformado en venta d...
2,330.0,Barcelona - Dreta de l´Eixample,FLAT,77.0,2,1,Piso en Dreta de l´Eixample. Acogedor piso al ...
3,435.0,Barcelona - Sant Antoni,FLAT,96.0,3,2,"Piso Barcelona - corts catalanes. Soleado, cén..."
4,410.0,Barcelona - Sagrada Família,FLAT,84.0,2,1,"Piso en Carrer de sardenya 271. Alto, reformad..."


In [11]:
# Preview the first rows of the test frame
test.head()

Unnamed: 0,loc_string,type,house_size,Number_Bedrooms,Number_Bathrooms,combined
0,Barcelona - El Parc i la Llacuna del Poblenou,FLAT,87.0,4,1,Piso Carrer de llull. Piso con 4 habitaciones ...
1,Barcelona - Poblenou,FLAT,78.0,4,1,Piso Diagonal. Luminoso piso de 4 habitaciones...
2,Barcelona - L´Antiga Esquerra de l´Eixample,FLAT,65.0,1,1,Piso Carrer del consell de cent. Piso amueblad...
3,Barcelona - Poblenou,FLAT,88.0,3,1,Piso Castanys. Carrer castanys Piso en pleno c...
4,Barcelona - Sant Antoni,FLAT,82.0,2,1,Piso Carrer de casanova. Piso con 2 habitacion...


## 3. Text Preprocessing with spaCy and TF-IDF

In [12]:
import time

# Record the wall-clock start; the end of this cell reports the elapsed time
start_time = time.time()


# Load the Spanish spaCy pipeline; preprocess_text reads it through the global `nlp`
nlp = spacy.load("es_dep_news_trf")

# Example text preprocessing function
def preprocess_text(text):
    """Lower-case ``text``, run it through the global spaCy pipeline ``nlp`` and return its lemmas.

    Only tokens that are purely alphabetic and are not stop words are kept; their lemmas
    are joined with single spaces into one string.
    """
    lemmas = [
        token.lemma_
        for token in nlp(text.lower())
        if token.is_alpha and not token.is_stop
    ]
    return ' '.join(lemmas)


# Lemmatise each text column of both frames (train first, then test, column by column)
for raw_column, processed_column in (
    ('combined', 'desc_processed'),
    ('loc_string', 'loc_string_processed'),
    ('type', 'type_processed'),
):
    train[processed_column] = train[raw_column].apply(preprocess_text)
    test[processed_column] = test[raw_column].apply(preprocess_text)


# Stop the timer and report how long the whole cell took
end_time = time.time()
execution_time = end_time - start_time

print(f"Execution time: {execution_time} seconds")

Execution time: 262.0185730457306 seconds


In [13]:
# Numeric model inputs. 'number' selects every numeric dtype, so the integer columns are kept
# even where astype(int) produces int32 instead of int64 (e.g. Windows with NumPy < 2);
# a hard-coded ['float64', 'int64'] list would silently drop them there.
numeric_features_test = test.select_dtypes(include='number')
numeric_features = train.select_dtypes(include='number').drop(columns=['price'])

# Both blocks are stacked column-wise with the TF-IDF matrices, so their columns must line up.
assert list(numeric_features.columns) == list(numeric_features_test.columns), (
    "train and test numeric feature columns differ"
)

In [14]:
# One TF-IDF vectorizer per text feature.
# tfidf_title is created but never fitted in this cell (titles are part of the 'combined' text).
tfidf_desc, tfidf_title, tfidf_loc, tfidf_type = (TfidfVectorizer() for _ in range(4))

# Learn each vocabulary on the training text only, then reuse it to transform the test text
X_text_tfidf_desc_train = tfidf_desc.fit_transform(train['desc_processed']).toarray()
X_text_tfidf_desc_test = tfidf_desc.transform(test['desc_processed']).toarray()

X_text_tfidf_loc_train = tfidf_loc.fit_transform(train['loc_string_processed']).toarray()
X_text_tfidf_loc_test = tfidf_loc.transform(test['loc_string_processed']).toarray()

X_text_tfidf_type_train = tfidf_type.fit_transform(train['type_processed']).toarray()
X_text_tfidf_type_test = tfidf_type.transform(test['type_processed']).toarray()

# Concatenate column-wise: numeric features first, then description, location and type TF-IDF
train_blocks = [
    numeric_features,
    X_text_tfidf_desc_train,
    X_text_tfidf_loc_train,
    X_text_tfidf_type_train,
]
test_blocks = [
    numeric_features_test,
    X_text_tfidf_desc_test,
    X_text_tfidf_loc_test,
    X_text_tfidf_type_test,
]
X_combined_train = np.hstack(train_blocks)
X_combined_test = np.hstack(test_blocks)

In [15]:
# Sanity check: train and test matrices must have the same number of columns
X_combined_train.shape, X_combined_test.shape

((866, 4802), (132, 4802))

## 4. Load the processed features from pkl files (cached for future use)

In [16]:
import os
import pickle

# Round-trip the feature matrices through pickle files so a later session can reuse them.
# If a file does not exist yet, it is first written from the matrix currently in memory;
# without this the cell fails with FileNotFoundError on a fresh checkout, because no
# other cell creates the files. An existing file is loaded as-is, so delete the .pkl
# files to force a refresh after changing the preprocessing.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
if not os.path.exists('train_processed.pkl'):
    with open('train_processed.pkl', 'wb') as file:
        pickle.dump(X_combined_train, file)

with open('train_processed.pkl', 'rb') as file:
    # Deserialize and read the training matrix
    X_combined_train = pickle.load(file)

if not os.path.exists('test_processed.pkl'):
    with open('test_processed.pkl', 'wb') as file:
        pickle.dump(X_combined_test, file)

with open('test_processed.pkl', 'rb') as file:
    # Deserialize and read the test matrix
    X_combined_test = pickle.load(file)

print(X_combined_train)


[[85.  2.  1. ...  0.  0.  0.]
 [65.  2.  1. ...  0.  0.  0.]
 [77.  2.  1. ...  0.  0.  0.]
 ...
 [79.  4.  2. ...  0.  0.  0.]
 [63.  1.  1. ...  0.  0.  0.]
 [80.  2.  1. ...  0.  0.  0.]]


In [17]:
# Hold out 20% of the training rows; the target is the 'price' column
y = train['price'].values
X_train, X_test, y_train, y_test = train_test_split(
    X_combined_train,
    y,
    test_size=0.2,
    random_state=100,
)

## 5. Modeling - Stacking

In [38]:
from itertools import combinations
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge
from xgboost import XGBRegressor
import numpy as np
from catboost import CatBoostRegressor

# Candidate first-level regressors for the stacking search, keyed by a short label
base_model_candidates = dict(
    extra_trees=ExtraTreesRegressor(n_estimators=100, random_state=42),
    gradient_boosting=GradientBoostingRegressor(n_estimators=100, random_state=42),
    # verbose=0 silences CatBoost's training output
    catboost=CatBoostRegressor(n_estimators=100, random_state=42, verbose=0),
    ridge=Ridge(random_state=42),
)

# Function to evaluate a model combination
def evaluate_model_combination(model_combination):
    """Score one set of base models as a stacking ensemble.

    ``model_combination`` is an iterable of ``(name, estimator)`` pairs. The pairs become
    the base estimators of a ``StackingRegressor`` whose final estimator is
    ``LinearRegression`` (internal ``cv=5``). The ensemble is scored with 5-fold
    cross-validated R² on the global ``X_train`` / ``y_train``.

    Returns the mean of the fold scores.
    """
    estimators = [(label, estimator) for label, estimator in model_combination]
    ensemble = StackingRegressor(
        estimators=estimators,
        final_estimator=LinearRegression(),
        cv=5,
    )
    fold_scores = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='r2')
    return np.mean(fold_scores)

# Score every subset of at least two base models and remember the best-scoring one
best_score = -np.inf
best_combination = None

candidate_items = list(base_model_candidates.items())
for subset_size in range(2, len(candidate_items) + 1):  # single-model "ensembles" are skipped
    for subset in combinations(candidate_items, subset_size):
        score = evaluate_model_combination(subset)
        # Strictly greater: on a tie the earlier combination is kept
        if score > best_score:
            best_score, best_combination = score, subset
        print(f"Combination: {[name for name, _ in subset]}, Score: {score}")

print(f"Best combination: {[name for name, _ in best_combination]}, Best Score: {best_score}")


Combination: ['extra_trees', 'gradient_boosting'], Score: 0.6307514622154037
Combination: ['extra_trees', 'catboost'], Score: 0.6411786383051952
Combination: ['extra_trees', 'ridge'], Score: 0.6526035396344566
Combination: ['gradient_boosting', 'catboost'], Score: 0.6217245601941288
Combination: ['gradient_boosting', 'ridge'], Score: 0.6468414201768733
Combination: ['catboost', 'ridge'], Score: 0.6485071167691157
Combination: ['extra_trees', 'gradient_boosting', 'catboost'], Score: 0.6406804999922384
Combination: ['extra_trees', 'gradient_boosting', 'ridge'], Score: 0.6537939093904985
Combination: ['extra_trees', 'catboost', 'ridge'], Score: 0.6571799856503115
Combination: ['gradient_boosting', 'catboost', 'ridge'], Score: 0.6512336256188747
Combination: ['extra_trees', 'gradient_boosting', 'catboost', 'ridge'], Score: 0.6564765010126504
Best combination: ['extra_trees', 'catboost', 'ridge'], Best Score: 0.6571799856503115


In [39]:
# Display the (name, estimator) pairs of the best-scoring combination
best_combination

(('extra_trees', ExtraTreesRegressor(random_state=42)),
 ('catboost', <catboost.core.CatBoostRegressor at 0x2afca2b90>),
 ('ridge', Ridge(random_state=42)))

In [23]:
from sklearn.ensemble import StackingRegressor, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge

# Base models of the best-scoring combination reported by the search (same hyper-parameters)
extra_trees_model = ExtraTreesRegressor(n_estimators=100, random_state=42)
catboost_model = CatBoostRegressor(n_estimators=100, random_state=42, verbose=0)
ridge_model = Ridge(random_state=42)

best_base_models = [
    ('extra_trees', extra_trees_model),
    ('catboost', catboost_model),
    ('ridge', ridge_model),
]

# Stack them under a LinearRegression final estimator, using 5-fold internal CV
final_stacking_ensemble = StackingRegressor(estimators=best_base_models, final_estimator=LinearRegression(), cv=5)


In [24]:
# Train the final ensemble on the training part of the split.
# NOTE(review): the held-out X_test / y_test are never scored in the visible cells, and the final
# model does not see those rows. Consider evaluating on them and/or refitting on all of
# X_combined_train before predicting.
final_stacking_ensemble.fit(X_train, y_train)

In [25]:
# Predict prices for the Kaggle test matrix and check how many predictions were produced
predictions = final_stacking_ensemble.predict(X_combined_test)
len(predictions)

132

In [26]:
# Create a DataFrame with an ID column and a Price column
submission = pd.DataFrame({
    'id': range(0, len(predictions)),
    'price': predictions
})
submission.to_csv('solution.csv', index=False)

In [27]:
# Display the submission table
submission

Unnamed: 0,id,price
0,0,346.095305
1,1,345.848873
2,2,297.301436
3,3,340.390469
4,4,342.737770
...,...,...
127,127,374.099425
128,128,300.324632
129,129,301.596978
130,130,354.542687
