In [76]:
# Run this in a notebook cell to install a stable version of pyarrow
# import sys
# import subprocess

# # Uninstall the current broken version
# subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "pyarrow", "-y"])

# # Install a stable version (16.1.0)
# subprocess.check_call([sys.executable, "-m", "pip", "install", "pyarrow==16.1.0"])

In [77]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [78]:
# Load the raw listings into `df`. The path is relative, so the CSV has to be
# in the notebook's working directory; the recorded run of this cell failed
# with FileNotFoundError because the file was not found there.
df = pd.read_csv('property_listing_data_in_Bangladesh.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'property_listing_data_in_Bangladesh.csv'

In [None]:
# Tally the null entries per column and show the totals.
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# -------------------------------------------------
# 1. PREMIUM AREA
# -------------------------------------------------
# Neighbourhood names treated as premium. They are matched case-sensitively
# as literal substrings of the listing title.
premium_areas = [
    'Gulshan', 'Banani', 'Baridhara', 'Uttara Sector 13',
    'Dhanmondi', 'Mirpur DOHS', 'Mohakhali DOHS',
    'Nasirabad Properties', 'Rajuk Uttara Apartment Project'
]

# Escape every name so it is matched literally, then OR them together.
premium_pattern = '|'.join(re.escape(area) for area in premium_areas)

# 1 when the title mentions any premium area, else 0. `na=False` maps a
# missing title to 0 instead of raising TypeError, matching how the
# 'Fully Furnished' flag treats missing titles.
df['is_premium_area'] = (
    df['title']
    .str.contains(premium_pattern, case=True, regex=True, na=False)
    .astype(int)
)

# -------------------------------------------------
# 2. CONDITION (medium‑value predictor)
# -------------------------------------------------
# Marketing phrases taken as a sign of a well-kept property. They are matched
# case-sensitively as literal substrings of the listing title.
condition_keywords = [
    'Strongly Structured', 'Tastefully Designed', 'Strongly Constructed',
    'Well-Constructed', 'Elegant', 'Spacious', 'Excellent',
    'Marvelous', 'Perfect', 'Smartly Priced'
]

# Escape every phrase so it is matched literally, then OR them together.
condition_pattern = '|'.join(re.escape(word) for word in condition_keywords)

# 1 when the title contains any condition phrase, else 0. `na=False` maps a
# missing title to 0 instead of raising TypeError, matching how the
# 'Fully Furnished' flag treats missing titles.
df['is_high_condition'] = (
    df['title']
    .str.contains(condition_pattern, case=True, regex=True, na=False)
    .astype(int)
)

# -------------------------------------------------
# 3. FURNISHED (high‑value indicator)
# -------------------------------------------------
# Case-insensitive search for the phrase; missing titles count as "not furnished".
furnished_mask = df['title'].str.contains('Fully Furnished', case=False, na=False)
df['is_fully_furnished'] = furnished_mask.astype(int)

# -------------------------------------------------
# 5. LUXURY & VIEW (luxury indicator)
# -------------------------------------------------
def is_luxury(title: str) -> int:
    """Return 1 if the title contains a luxury-related keyword, else 0.

    Matching is a case-sensitive substring test. Any value that is not a
    string (for example the NaN pandas uses for a missing title, or None)
    yields 0 instead of raising TypeError.
    """
    if not isinstance(title, str):
        return 0

    # 'Open Sky View' is not listed separately: every title containing it
    # also contains 'View', so the result is unchanged.
    luxury_keywords = ('Fully Furnished', 'View', 'Gulshan', 'Banani')
    return int(any(keyword in title for keyword in luxury_keywords))

# Flag every listing whose title hits a luxury keyword.
luxury_flags = df['title'].apply(is_luxury)
df['is_luxury'] = luxury_flags

# -------------------------------------------------
# Sample output
# -------------------------------------------------
# Show the title-derived flags next to the raw title (plus beds) for the
# first ten listings.
preview_columns = [
    'title',
    'is_premium_area',
    'is_high_condition',
    'is_fully_furnished',
    'is_luxury',
    'beds',
]
print("Sample of extracted features:")
print(df[preview_columns].head(10))

In [None]:
# # assume df is your DataFrame
# unique_addresses = df['address'].dropna().unique()   # remove NaNs if any

# # ── Print to console ─────────────────────────────────────
# for addr in unique_addresses:
#     print(addr)

# # ── Save to a text file (one address per line) ─────────────────────
# with open('unique_addresses.txt', 'w', encoding='utf-8') as f:
#     for addr in unique_addresses:
#         f.write(f"{addr}\n")

In [None]:
# Show the dtypes of the two room-count columns before they are coerced to
# numeric in the next cell.
print(df[['beds', 'bath']].dtypes)

In [None]:
# Coerce both room-count columns to numbers; unparseable entries become NaN.
df = df.assign(
    beds=pd.to_numeric(df['beds'], errors='coerce'),
    bath=pd.to_numeric(df['bath'], errors='coerce'),
)

In [None]:
# Medians of the room counts (NaNs are skipped), used to fill the gaps.
median_beds = df.loc[:, 'beds'].median()
median_bath = df.loc[:, 'bath'].median()

In [None]:
# Fill missing room counts with the column medians.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# which raises a FutureWarning on pandas 2.x and leaves `df` unchanged when
# Copy-on-Write is enabled.
df['beds'] = df['beds'].fillna(median_beds)
df['bath'] = df['bath'].fillna(median_bath)

In [None]:
beds_col = df['beds']
bath_col = df['bath']

# Ratio, difference, sum and product of the two counts. The 1e-6 keeps the
# ratio finite when a listing has zero beds.
df['baths_per_bed'] = bath_col / (beds_col + 1e-6)
df['beds_minus_baths'] = beds_col - bath_col
df['total_rooms'] = beds_col + bath_col
df['beds * bath'] = beds_col * bath_col

# Polynomial terms of degree 2-4, added in beds/baths pairs per degree.
for suffix, power in (('squared', 2), ('cube', 3), ('four', 4)):
    df[f'beds_{suffix}'] = beds_col ** power
    df[f'baths_{suffix}'] = bath_col ** power

In [None]:
# Re-check the per-column null counts after the fill and feature steps.
null_counts = df.isna().sum()
print(null_counts)

In [None]:
# Bare expression: displays the current column index as the cell output.
df.columns

In [None]:
# 4. Clean Price & Area
def convert_price_to_number(price):
    """Convert a listing price such as '25 Thousand' or '1.2 Lakh' to a float.

    Parameters
    ----------
    price : str, number, or None
        A price string, optionally with thousands separators and one of the
        unit words 'Thousand' (x1,000), 'Lakh' (x100,000) or 'Crore'
        (x10,000,000). Values that are already numeric (including NaN) are
        returned as floats unchanged, so applying the function to an
        already-converted column is harmless.

    Returns
    -------
    float
        The numeric price; NaN for None or NaN input.

    Raises
    ------
    ValueError
        If the text left after removing the unit word is not a number.
    """
    if price is None:
        return float('nan')
    if not isinstance(price, str):
        # Already numeric (int, float, NumPy scalar, NaN): nothing to parse.
        return float(price)

    text = price.replace(',', '').strip()

    # Checked in this order; the first unit word found decides the multiplier.
    unit_multipliers = (
        ('Thousand', 1_000),
        ('Lakh', 100_000),
        ('Crore', 10_000_000),
    )
    for unit, multiplier in unit_multipliers:
        if unit in text:
            return float(text.replace(unit, '')) * multiplier
    return float(text)

# Prices: parse the unit-suffixed strings into plain numbers.
df['price'] = df['price'].apply(convert_price_to_number)

# Areas: strip the thousands separator and the ' sqft' suffix, then cast.
area_text = (
    df['area']
    .str.replace(',', '', regex=False)
    .str.replace(' sqft', '', regex=False)
)
df['area'] = area_text.astype(float)

In [None]:
# Exclusive upper bounds; listings at or above either bound are dropped.
MAX_AREA = 4000
MAX_PRICE = 200000

within_bounds = (df['area'] < MAX_AREA) & (df['price'] < MAX_PRICE)
df = df[within_bounds]

In [None]:
# Row count reported as the "original" dataset size.
# NOTE(review): this is hard-coded. Capturing len(df) right after loading
# would keep the report correct if the CSV changes; confirm the value.
ORIGINAL_DATASET_SIZE = 7557
rows_removed = ORIGINAL_DATASET_SIZE - len(df)

print(f"Original Dataset Size: {ORIGINAL_DATASET_SIZE}")
print(f"Filtered Dataset Size: {len(df)}")
print(f"Removed outliers: {rows_removed} rows")

In [None]:
# Columns removed before modelling: the raw title and room counts, plus
# listing metadata.
cols_to_drop = [
    'title', 'beds', 'bath',
    'purpose', 'flooPlan', 'url', 'lastUpdated',
]
df = df.drop(columns=cols_to_drop)

In [None]:
# One-hot encode the 'type' column. drop_first=True omits the indicator for
# the first category, so k categories produce k-1 indicator columns.
df = pd.get_dummies(df, columns=['type'], drop_first=True)

In [None]:
# Target is the price; everything else is a feature.
y = df['price']
X = df.drop(columns='price')

# The model is trained on log(1 + price).
y_log = np.log1p(y)

In [None]:
# 7. Train/Val/Test Split (60/20/20)
# Stage 1: keep 60% of the rows for training; the rest is held out.
X_train, X_rem, y_train_log, y_rem_log = train_test_split(
    X, y_log, train_size=0.6, random_state=42
)

# Stage 2: halve the held-out rows into validation and test sets.
X_val, X_test, y_val_log, y_test_log = train_test_split(
    X_rem, y_rem_log, test_size=0.5, random_state=42
)

In [None]:
#Target Encoding Address (Strict Train-Only Logic)
# Attach the actual (non-log) training prices to the training features so the
# address statistics come from training rows only.
train_combined = X_train.assign(price=np.expm1(y_train_log))

# Fallback for addresses absent from the map, then the per-address mean price.
global_mean = train_combined['price'].mean()
mean_map = train_combined.groupby('address')['price'].mean()

def encode_address(data, mean_map, global_mean):
    """Replace the 'address' column with a target-encoded numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an 'address' column.
    mean_map : mapping or pandas.Series
        Maps an address to the value that encodes it.
    global_mean : float
        Value used for addresses that are missing from ``mean_map``.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'address' and with 'address_encoded' appended as
        the last column. ``data`` itself is not modified.
    """
    encoded = data.drop(columns='address')
    # Assign the filled Series directly rather than calling
    # `frame[col].fillna(..., inplace=True)`: that chained form leaves the
    # NaNs of unseen addresses in place when pandas Copy-on-Write is enabled.
    encoded['address_encoded'] = data['address'].map(mean_map).fillna(global_mean)
    return encoded

# Encode all three splits with the statistics learned from the training rows.
X_train_enc, X_val_enc, X_test_enc = (
    encode_address(split, mean_map, global_mean)
    for split in (X_train, X_val, X_test)
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to every split.
scaler = StandardScaler().fit(X_train_enc)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split)
    for split in (X_train_enc, X_val_enc, X_test_enc)
)

In [None]:
# Ordinary least squares on the scaled features, with log-price as target.
model = LinearRegression().fit(X_train_scaled, y_train_log)

In [None]:
# Predict in log space for the training and validation splits.
y_train_pred_log, y_val_pred_log = (
    model.predict(features) for features in (X_train_scaled, X_val_scaled)
)

# --- Training split: map back to actual prices and score ---
y_train_pred = np.expm1(y_train_pred_log)
y_train_actual = np.expm1(y_train_log)
train_mse = mean_squared_error(y_train_actual, y_train_pred)
train_rmse = np.sqrt(train_mse)

# --- Validation split: same steps ---
y_val_pred = np.expm1(y_val_pred_log)
y_val_actual = np.expm1(y_val_log)
val_mse = mean_squared_error(y_val_actual, y_val_pred)
val_rmse = np.sqrt(val_mse)

# Both RMSEs are expressed as a percentage of the mean *training* price.
avg_price = np.mean(y_train_actual)
train_error_pct = 100 * (train_rmse / avg_price)
val_error_pct = 100 * (val_rmse / avg_price)

print(f"Average House Price: {avg_price:,.2f}")
print(f"Training RMSE: {train_rmse:,.2f} ({train_error_pct:.2f}%)")
print(f"Validation RMSE: {val_rmse:,.2f} ({val_error_pct:.2f}%)")

In [None]:
print("\n--- Model Diagnosis ---")

# Training error above 50% of the mean price is reported as underfitting;
# validation error more than twice the training error as overfitting.
if train_error_pct > 50:
    diagnosis_lines = [
        "Status: SEVERE Underfitting",
        "Reason: The model is too simple.",
    ]
elif val_error_pct > train_error_pct * 2:
    diagnosis_lines = ["Status: Overfitting"]
else:
    diagnosis_lines = ["Status: Good Fit / Balanced"]

print(*diagnosis_lines, sep="\n")