In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from textblob import TextBlob  # For sentiment analysis
from sklearn.neighbors import NearestNeighbors
import joblib

# # Generate fake property data
# fake = Faker()
# property_types = ["Apartment", "House", "Villa", "Cottage"]
# # ... (rest of your data generation code)

# data = []  # Your generated data goes here

# # Create DataFrame
# property_df = pd.DataFrame(data)

# Load the raw property listings; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer="property_data.csv")

In [2]:
# Per-column count of values that repeat an earlier value in the same column
# (diagnostic only; this is not a count of fully duplicated rows).
duplicates = df.apply(lambda x: x.duplicated()).sum()
# print (duplicates)

# Drop fully duplicated rows, then renumber the index 0..n-1. drop_duplicates
# keeps the surviving rows' original labels, which leaves gaps; resetting keeps
# index labels equal to row positions for any later positional lookups.
data = df.drop_duplicates().reset_index(drop=True)

# Drop the 'images' and 'title' columns
property_df = data.drop(columns=['images', 'title'])

In [3]:
# Sentiment analysis function for description
def analyze_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        Free-text description to score.

    Returns
    -------
    float
        ``TextBlob(text).sentiment.polarity`` for string input. For any
        non-string input (for example ``None`` or the float NaN that pandas
        uses for a missing cell) NaN is returned instead of raising, so a
        missing description stays missing and can be imputed afterwards.
    """
    # TextBlob rejects non-string input with a TypeError, which would abort a
    # whole-column .apply() on the first missing description.
    if not isinstance(text, str):
        return float("nan")
    return TextBlob(text).sentiment.polarity

# Score every listing description and store the polarity in a new column
property_df["description_sentiment"] = property_df["description"].map(analyze_sentiment)
print(property_df["description_sentiment"])

0       0.107143
1      -0.009877
2      -0.077778
3       0.175000
4       0.222222
          ...   
4995    0.180556
4996   -0.075000
4997    0.000000
4998    0.191667
4999    0.425000
Name: description_sentiment, Length: 5000, dtype: float64


In [4]:
# Cast the two coordinate columns to float
property_df = property_df.astype({"latitude": float, "longitude": float})

In [5]:
# # Nearest Neighbors for suggesting prices
# locations = property_df[['latitude', 'longitude']].values
# k = 5  # Number of neighbors to consider
# nn_model = NearestNeighbors(n_neighbors=k)
# nn_model.fit(locations)
# neighbors_indices = nn_model.kneighbors(locations, n_neighbors=k, return_distance=False)
# property_df["avg_neighbor_price"] = property_df.iloc[neighbors_indices].groupby("property_id")["base_price"].transform("mean")

# print(property_df["avg_neighbor_price"])

# Nearest Neighbors for suggesting prices
# Fit a k-NN index on the (latitude, longitude) pairs and query it with the
# same points: row i of `neighbors_indices` holds the row *positions* of the
# k closest listings to listing i.
# NOTE(review): because the query set is the fit set, a listing is normally
# returned as one of its own neighbours, so its own base_price takes part in
# the average -- confirm this is intended before using the column as a feature.
locations = property_df[['latitude', 'longitude']].values
k = 5  # Number of neighbors to consider
nn_model = NearestNeighbors(n_neighbors=k)
nn_model.fit(locations)
neighbors_indices = nn_model.kneighbors(locations, n_neighbors=k, return_distance=False)

# Calculate average neighbor price
# kneighbors returns 0-based positions, not index labels, so prices are looked
# up positionally from the underlying array. A label lookup (.loc) would pick
# the wrong rows or raise KeyError whenever the index is not exactly 0..n-1.
base_price_by_position = property_df["base_price"].to_numpy()
avg_neighbor_prices = pd.DataFrame(
    base_price_by_position[neighbors_indices],  # shape (n_rows, k)
    index=property_df.index,                    # align the result with property_df
).mean(axis=1)                                  # row-wise mean, NaN prices skipped

property_df["avg_neighbor_price"] = avg_neighbor_prices

print(property_df["avg_neighbor_price"])


0       3006.874731
1       2416.693221
2       2451.611493
3       3247.925859
4       1853.944600
           ...     
4995    1962.837250
4996    1695.240780
4997    3133.501139
4998    1886.310132
4999    2473.171552
Name: avg_neighbor_price, Length: 5000, dtype: float64


In [6]:
# Preprocessing and modeling pipeline
# Numeric columns: replace missing values with the column mean
numeric_features = [
    "number_of_bedrooms",
    "base_price",
    "bathrooms",
    "beds",
    "guests",
    "description_sentiment",
    "avg_neighbor_price",
]
numeric_fill_values = property_df[numeric_features].mean()
property_df[numeric_features] = property_df[numeric_features].fillna(numeric_fill_values)

# Categorical columns: replace missing values with the first modal value of each column
categorical_features = [
    "location",
    "property_type",
    "option",
    "amenities",
    "seasonality",
    "bed_type",
    "neighborhood",
    "guest_type",
]
categorical_fill_values = property_df[categorical_features].mode().iloc[0]
property_df[categorical_features] = property_df[categorical_features].fillna(categorical_fill_values)

# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('scaler', StandardScaler())
# ])

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features)
#     ])

# model_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', RandomForestRegressor(n_estimators=100, random_state=42))
# ])

In [7]:
# Integer-encode each categorical column with its own LabelEncoder.
# A single encoder refitted inside the loop only remembers the classes of the
# last column it saw, so the fitted encoder of every column is kept in
# `label_encoders` (column name -> encoder) to allow inverse_transform or
# encoding of new rows later. After the loop `label_encoder` still refers to
# the encoder fitted on the last column, as before.
label_encoders = {}
for feature in categorical_features:
    label_encoder = LabelEncoder()
    property_df[feature] = label_encoder.fit_transform(property_df[feature])
    label_encoders[feature] = label_encoder

# Calculate correlation matrix for all features
# all_features = numerical_features + list(data_encoded.columns)
# correlation_matrix = data.corr()

# # Create a heatmap
# plt.figure(figsize=(20, 10))
# sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
# plt.title("Correlation Matrix")
# plt.show()

In [8]:
# # Create interaction terms
# property_df['bedrooms_bathrooms_interaction'] = property_df['number_of_bedrooms'] * property_df['bathrooms']

# # Create polynomial features
# property_df['beds_squared'] = property_df['beds'] ** 2

# # Create derived ratios
# property_df['price_per_bedroom'] = property_df['base_price'] / property_df['number_of_bedrooms']
# property_df.head(2)

In [9]:
# Keep only listings whose base_price is strictly below 5000; rows at or above
# that value (and rows with a missing price) are dropped.
property_df = property_df.loc[property_df['base_price'].lt(5000)]

In [10]:
# Add the natural log of base_price as a new column
property_df['log_base_price'] = property_df['base_price'].pipe(np.log)

In [11]:
# Split data into features and target
# X = property_df.drop(columns=["base_price", "log_base_price", "description", "description_sentiment", "avg_neighbor_price", "bedrooms_bathrooms_interaction", "beds_squared", "price_per_bedroom"])  # Features excluding base_price
# y = property_df["log_base_price"]  # Target variable

# X = property_df.drop(columns=["base_price", "description", "description_sentiment", "avg_neighbor_price", "bedrooms_bathrooms_interaction", "beds_squared", "price_per_bedroom"])  # Features excluding base_price
# y = property_df["base_price"]  # Target variable

# Columns kept out of the feature matrix: the raw and log price, the raw
# description text, and the two engineered columns.
non_feature_columns = [
    "base_price",
    "log_base_price",
    "description",
    "description_sentiment",
    "avg_neighbor_price",
]
X = property_df.drop(columns=non_feature_columns)
y = property_df["log_base_price"]  # Target: log-transformed price

# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Preview the first two rows of the full frame (all columns, including the engineered ones)
property_df.head(2)

Unnamed: 0,location,latitude,longitude,property_type,option,guests,number_of_bedrooms,amenities,seasonality,base_price,bathrooms,bed_type,beds,neighborhood,guest_type,description,description_sentiment,avg_neighbor_price,log_base_price
0,3883,-30.07168,-156.619822,5,2,4,5,3830,2,3339.355493,9,2,7,3,1,Audience discover theory himself his. Federal ...,0.107143,3006.874731,8.113533
1,2527,-65.997038,-36.710449,7,0,1,2,4386,0,4435.880429,3,2,4,3,0,Another tough young if health. Energy fund off...,-0.009877,2416.693221,8.397481


In [13]:
# Preview the first two rows of the feature matrix
X.head(2)

Unnamed: 0,location,latitude,longitude,property_type,option,guests,number_of_bedrooms,amenities,seasonality,bathrooms,bed_type,beds,neighborhood,guest_type
0,3883,-30.07168,-156.619822,5,2,4,5,3830,2,9,2,7,3,1
1,2527,-65.997038,-36.710449,7,0,1,2,4386,0,3,2,4,3,0


In [14]:
# Preview the first two values of the target (log_base_price)
y.head(2)

0    8.113533
1    8.397481
Name: log_base_price, dtype: float64

In [15]:
# Fit the scaler on the training split only, so that no information from the
# test split enters the scaling parameters.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Candidate regressors, keyed by the label used in the report line
models = {}
models["Linear Regression"] = LinearRegression()
models["Ridge Regression"] = Ridge()
models["Random Forest Regressor"] = RandomForestRegressor(random_state=42)
models["Gradient Boosting Regressor"] = GradientBoostingRegressor(random_state=42)

# Train each model on the scaled training split and score it on the scaled
# test split. The target is log_base_price, so the MSE is in log-price units.
for model_name, model in models.items():
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name} - MSE: {mse:.4f}, R-squared: {r2:.4f}")

Linear Regression - MSE: 0.8082, R-squared: 0.0001
Ridge Regression - MSE: 0.8082, R-squared: 0.0001
Random Forest Regressor - MSE: 0.8363, R-squared: -0.0347
Gradient Boosting Regressor - MSE: 0.8266, R-squared: -0.0228


In [17]:
# # Fit the model pipeline
# model_pipeline.fit(X_train, y_train)

# # Predict base prices
# y_pred = model_pipeline.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error:", mse)