In [51]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import joblib

# Load the data
file_path = './mihiresh/goal_amount.csv'
data = pd.read_csv(file_path)

# Filter out data with zero values for critical features.
# The explicit .copy() detaches the filtered frame from the one read from disk,
# so the column assignments below operate on an independent frame instead of a
# slice (which is what triggers pandas' SettingWithCopyWarning).
data = data[(data['investment_amount'] != 0) &
            (data['years_to_retire'] != 0) &
            (data['salary'] != 0)].copy()

# Select relevant features and target variables
features = ['years_to_retire', 'investment_amount']
target_low = 'goal_low'
target_mid = 'goal_mid'
target_high = 'goal_high'

# Apply log transformation to the target variables (log1p; predictions are
# mapped back with expm1 after training)
data[target_low] = np.log1p(data[target_low])
data[target_mid] = np.log1p(data[target_mid])
data[target_high] = np.log1p(data[target_high])

# Prepare the data
X = data[features]
y_low = data[target_low]
y_mid = data[target_mid]
y_high = data[target_high]

# Split the data into training and testing sets.
# A single call splits the features and all three targets with the same row
# partition, replacing three separate calls that relied on a shared random_state.
(X_train_raw, X_test_raw,
 y_low_train, y_low_test,
 y_mid_train, y_mid_test,
 y_high_train, y_high_test) = train_test_split(
    X, y_low, y_mid, y_high, test_size=0.2, random_state=42)

# Normalize the features.
# The scaler is fitted on the training rows only, so the mean/variance it learns
# contain no information from the held-out test rows (avoids data leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept under its original name for any
# later cell that still refers to it.
X_scaled = scaler.transform(X)

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# Hyperparameter-tuned XGBoost training helper
def train_xgboost(X_train, y_train, X_test, y_test):
    """Grid-search an XGBoost regressor on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and targets handed to ``GridSearchCV.fit``.
    X_test, y_test
        Held-out features and targets used for the final evaluation.

    Returns
    -------
    tuple
        ``(best_model, y_pred, mse)``: the best estimator found by the grid
        search, its predictions for ``X_test``, and the mean squared error of
        those predictions against ``y_test`` (in whatever scale ``y_test`` uses).
    """
    # Candidate values explored exhaustively by the grid search
    search_space = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 3, 5]
    }

    searcher = GridSearchCV(
        estimator=xgb.XGBRegressor(objective='reg:squarederror'),
        param_grid=search_space,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    # Evaluate the winning configuration on the held-out split
    tuned_model = searcher.best_estimator_
    test_predictions = tuned_model.predict(X_test)
    test_mse = mean_squared_error(y_test, test_predictions)

    return tuned_model, test_predictions, test_mse

# Fit one tuned regressor per goal tier; all three share the same feature split.

# goal_low
model_low, y_low_pred, mse_low = train_xgboost(X_train, y_low_train, X_test, y_low_test)
# NOTE(review): unlike model_mid and model_high, model_low is not persisted to
# disk — confirm this is intended.

# goal_mid (trained, then persisted)
model_mid, y_mid_pred, mse_mid = train_xgboost(X_train, y_mid_train, X_test, y_mid_test)
joblib.dump(model_mid, 'model_mid.pkl')

# goal_high (trained, then persisted)
model_high, y_high_pred, mse_high = train_xgboost(X_train, y_high_train, X_test, y_high_test)
joblib.dump(model_high, 'model_high.pkl')

# Undo the log1p transform so predictions and actual values are back on the
# original goal scale.
y_low_pred, y_mid_pred, y_high_pred = (
    np.expm1(log_pred) for log_pred in (y_low_pred, y_mid_pred, y_high_pred)
)
y_low_test, y_mid_test, y_high_test = (
    np.expm1(log_actual) for log_actual in (y_low_test, y_mid_test, y_high_test)
)

# The MSE values were computed inside train_xgboost, before the inverse
# transform above, so they are errors in log1p space.
print(f'Mean Squared Error for goal_low (XGBoost with Log Transform): {mse_low}')
print(f'Mean Squared Error for goal_mid (XGBoost with Log Transform): {mse_mid}')
print(f'Mean Squared Error for goal_high (XGBoost with Log Transform): {mse_high}')


Mean Squared Error for goal_low (XGBoost with Log Transform): 0.010945650060628283
Mean Squared Error for goal_mid (XGBoost with Log Transform): 0.009124486520039542
Mean Squared Error for goal_high (XGBoost with Log Transform): 0.008180176245165352


In [45]:
# Preview the first five test rows: actual vs. predicted value for each goal tier
predictions = pd.DataFrame(
    data={
        'goal_low_actual': y_low_test,
        'goal_low_predicted': y_low_pred.flatten(),
        'goal_mid_actual': y_mid_test,
        'goal_mid_predicted': y_mid_pred.flatten(),
        'goal_high_actual': y_high_test,
        'goal_high_predicted': y_high_pred.flatten()
    }
).head(n=5)

predictions

Unnamed: 0,goal_low_actual,goal_low_predicted,goal_mid_actual,goal_mid_predicted,goal_high_actual,goal_high_predicted
36337,10287570.0,9900247.0,10870600.0,11153420.0,13471660.0,13557330.0
48054,236223.0,243524.5,263566.3,264896.6,382411.1,398340.6
1599,28989310.0,26138710.0,31924940.0,32938600.0,61793750.0,63875620.0
26048,32718760.0,34920870.0,50586090.0,59452380.0,168314900.0,168301100.0
16051,9089949.0,9243036.0,10811200.0,10371630.0,16607050.0,15613110.0


In [46]:
import joblib
import numpy as np

# Reload the persisted scaler and the 'goal_mid' model from disk.
# joblib.load unpickles these files, so only load artifacts from a trusted source.
scaler, model_mid = joblib.load('scaler.pkl'), joblib.load('model_mid.pkl')


In [47]:
# Two hand-written inference inputs, one row each.
# Column order follows the training feature list: [years_to_retire, investment_amount].
inp_a = np.array([12, 25000]).reshape(1, -1)
inp_b = np.array([33, 22500]).reshape(1, -1)

In [48]:
# Standardise both inputs with the reloaded scaler
a, b = (scaler.transform(raw_input) for raw_input in (inp_a, inp_b))



In [49]:
# The model predicts in log1p space; expm1 maps each prediction back to the
# original goal scale.
answer_a = np.expm1(model_mid.predict(a))
answer_b = np.expm1(model_mid.predict(b))

In [50]:
# Show both goal_mid predictions (already converted back from log1p space)
print(f"Answer a: {answer_a}\nanswer b: {answer_b}")

Answer a: [5380771.5]
answer b: [39596800.]


In [1]:
import json

In [2]:
# Read the property listings. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform default, which could
# garble non-ASCII characters (the addresses contain non-breaking spaces).
with open('properties.json', 'r', encoding='utf-8') as f:
    property_data = json.load(f)

In [3]:
# Display the loaded structure: a dict keyed by city name (e.g. 'Mumbai'),
# each value a list of property records
property_data

{'Mumbai': [{'title': 'Bank Auction Property - Avishkar Empress  Dadar West',
   'price': 55200000,
   'rate': 29253,
   'address': 'Bank Auction Property - Avishkar Empress\xa0 Near Nabar Guruji School, Krushnaji Waman Chitale Rd, Dadar West,Mumbai',
   'area': 1887,
   'estimated_emi': 316000,
   'bhk': 3.0,
   'location': 'Dadar',
   'goal_price': 60445244.81,
   'profit': 5245244.810000002,
   'profit_percentage': 9.502255090579714},
  {'title': 'Lodha Bellavista  Manpada',
   'price': 39909090,
   'rate': 17589,
   'address': 'Lodha Bellavista\xa0 Near DMart (Manpada),Pokhroad road no 2,Manpada, Mumbai',
   'area': 2269,
   'estimated_emi': 229000,
   'bhk': 4.0,
   'location': 'Manpada',
   'goal_price': 43701353.54,
   'profit': 3792263.539999999,
   'profit_percentage': 9.502255100279157},
  {'title': 'Lodha Bellavista  Manpada',
   'price': 39909090,
   'rate': 17589,
   'address': 'Lodha Bellavista\xa0 Near DMart (Manpada),Pokhroad road no 2,Manpada, Mumbai',
   'area': 2269,