In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

In [2]:
df = pd.read_csv('/kaggle/input/dataaa/Transactions (1).csv')

In [3]:
# Selecting relevant columns (removing Arabic columns)
df = df[['trans_group_en', 'procedure_name_en', 'property_type_en', 'property_sub_type_en',
         'property_usage_en', 'reg_type_en', 'area_name_en', 'building_name_en', 'project_name_en',
         'master_project_en', 'nearest_landmark_en', 'nearest_metro_en', 'nearest_mall_en',
         'rooms_en', 'has_parking', 'procedure_area', 'actual_worth', 'meter_sale_price', 'rent_value',
         'meter_rent_price', 'no_of_parties_role_1', 'no_of_parties_role_2', 'no_of_parties_role_3']]

# Handle missing values
df = df.dropna()

In [4]:
# Handling outliers by capping
for col in ['procedure_area', 'actual_worth', 'meter_sale_price', 'rent_value', 'meter_rent_price']:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df[col] = np.where(df[col] > upper_bound, upper_bound, np.where(df[col] < lower_bound, lower_bound, df[col]))


In [5]:
# Define categorical and numerical columns
categorical_columns = ['trans_group_en', 'procedure_name_en', 'property_type_en', 'property_sub_type_en',
                       'property_usage_en', 'reg_type_en', 'area_name_en', 'building_name_en', 'project_name_en',
                       'master_project_en', 'nearest_landmark_en', 'nearest_metro_en', 'nearest_mall_en',
                       'rooms_en', 'has_parking']

numerical_columns = ['procedure_area', 'meter_sale_price', 'rent_value',
                     'meter_rent_price', 'no_of_parties_role_1', 'no_of_parties_role_2', 'no_of_parties_role_3']


In [6]:

# Encode categorical data
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_categorical_data = encoder.fit_transform(df[categorical_columns])

# Combine encoded categorical data with numerical data
numerical_data = df[numerical_columns].values
combined_data = np.hstack([encoded_categorical_data, numerical_data])

# Scale the combined data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(combined_data)

# Define the target variable
X = scaled_data
y = df['actual_worth'].values

# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)




In [7]:
# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [4, 6, 8],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 324 candidates, totalling 972 fits


In [8]:
# Best model from grid search
best_xgb_model = grid_search.best_estimator_

# Evaluate on validation set
y_val_pred = best_xgb_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)

print(f"Validation Mean Squared Error: {val_mse}")
print(f"Validation Mean Absolute Error: {val_mae}")
print(f"Validation R-squared: {val_r2}")

# Evaluate on test set
y_test_pred = best_xgb_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Test Mean Squared Error: {test_mse}")
print(f"Test Mean Absolute Error: {test_mae}")
print(f"Test R-squared: {test_r2}")

Validation Mean Squared Error: 21309468.02626158
Validation Mean Absolute Error: 2588.4588815789475
Validation R-squared: 0.9999518971662502
Test Mean Squared Error: 17749935.47957812
Test Mean Absolute Error: 2519.3257863898025
Test R-squared: 0.999962167071641


In [9]:
# Function to predict price range and confidence given new input data
def predict_price_range_with_confidence(input_data, n_models=100):
    input_data_encoded = encoder.transform(input_data[categorical_columns])
    input_data_numeric = input_data[numerical_columns].values
    input_data_combined = np.hstack([input_data_encoded, input_data_numeric])
    input_data_scaled = scaler.transform(input_data_combined)
    
    # Generate predictions with multiple models to calculate confidence
    predictions = []
    for _ in range(n_models):
        bootstrap_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=np.random.randint(0, 10000))
        bootstrap_model.set_params(**grid_search.best_params_)
        bootstrap_model.fit(X_train, y_train)
        prediction = bootstrap_model.predict(input_data_scaled)
        predictions.append(prediction)
    
    predictions = np.array(predictions).flatten()
    mean_prediction = np.mean(predictions)
    std_prediction = np.std(predictions)
    
    confidence_interval = 1.96 * std_prediction  # 95% confidence interval
    return mean_prediction, confidence_interval

In [10]:

# Example function to take custom input and get prediction with confidence
def evaluate_custom_input(trans_group, procedure_name, property_type, property_sub_type, 
                          property_usage, reg_type, area_name, building_name, project_name, 
                          master_project, nearest_landmark, nearest_metro, nearest_mall, 
                          rooms, has_parking, procedure_area, meter_sale_price, rent_value, 
                          meter_rent_price, no_of_parties_role_1, no_of_parties_role_2, no_of_parties_role_3):
    custom_input = pd.DataFrame({
        'trans_group_en': [trans_group], 
        'procedure_name_en': [procedure_name], 
        'property_type_en': [property_type],
        'property_sub_type_en': [property_sub_type],
        'property_usage_en': [property_usage],
        'reg_type_en': [reg_type],
        'area_name_en': [area_name],
        'building_name_en': [building_name],
        'project_name_en': [project_name],
        'master_project_en': [master_project],
        'nearest_landmark_en': [nearest_landmark],
        'nearest_metro_en': [nearest_metro],
        'nearest_mall_en': [nearest_mall],
        'rooms_en': [rooms],
        'has_parking': [has_parking],
        'procedure_area': [procedure_area],
        'meter_sale_price': [meter_sale_price],
        'rent_value': [rent_value],
        'meter_rent_price': [meter_rent_price],
        'no_of_parties_role_1': [no_of_parties_role_1],
        'no_of_parties_role_2': [no_of_parties_role_2],
        'no_of_parties_role_3': [no_of_parties_role_3]
    })

    predicted_price, confidence = predict_price_range_with_confidence(custom_input)
    return predicted_price, confidence

# Example usage
predicted_price, confidence = evaluate_custom_input('group1', 'procedure1', 'type1', 'subtype1', 'usage1', 
                                                    'reg1', 'area1', 'building1', 'project1', 'master1', 
                                                    'landmark1', 'metro1', 'mall1', '3', 'yes', 
                                                    1000, 5000000, 20000, 500, 1, 2, 3)

print(f"Predicted Price: {predicted_price}")
print(f"95% Confidence Interval: ±{confidence}")

Predicted Price: 199688.9375
95% Confidence Interval: ±0.06125
