In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [None]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.3


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [3]:

def model_pipeline(data):
    
    data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].median())
    price_quantiles = data['median_house_value'].quantile([0.25, 0.5, 0.75])

    def classify_house(value):
        if value <= price_quantiles[0.25]:
            return 0 
        elif price_quantiles[0.25] < value <= price_quantiles[0.5]:
            return 1 
        elif price_quantiles[0.5] < value <= price_quantiles[0.75]:
            return 2  
        else:
            return 3  

    data['house_category'] = data['median_house_value'].apply(classify_house)
    data['rooms_per_household'] = data['total_rooms'] / data['households']
    data['population_per_household'] = data['population'] / data['households']
    data['bedrooms_per_room'] = data['total_bedrooms'] / data['total_rooms']
    data['income_per_population'] = data['median_income'] / data['population']

    data['distance_from_center'] = np.sqrt(data['longitude']**2 + data['latitude']**2)
    data_encoded = pd.get_dummies(data, columns=['ocean_proximity'], drop_first=True)
    features = [
        'longitude', 'latitude', 'housing_median_age',
        'total_rooms', 'total_bedrooms', 'population',
        'households', 'median_income',
        'rooms_per_household', 'population_per_household',
        'bedrooms_per_room', 'income_per_population',
        'distance_from_center'
    ]

    features.extend([col for col in data_encoded.columns if col.startswith('ocean_proximity_')])

    X = data_encoded[features]
    y = data_encoded['house_category']

    return X, y

X, y = model_pipeline(data)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

xgb_model = XGBClassifier(
    n_estimators=300,  
    max_depth=7,  
    learning_rate=0.05,  
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_model, X_scaled, y, cv=cv, scoring='accuracy')

print("Cross-validation scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())

xgb_model.fit(X_scaled, y)


# feature_importance = pd.DataFrame({
#     'feature': X.columns,
#     'importance': xgb_model.feature_importances_
# }).sort_values('importance', ascending=False)

# print("\nTop 10 Most Important Features:")
# print(feature_importance.head(10))

Cross-validation scores: [0.88225886 0.88471949 0.88508858 0.88533465 0.88324311]
Mean CV Score: 0.8841289370078741
Standard Deviation: 0.0011836627431895334


In [4]:
from joblib import dump, load
dump(xgb_model, 'best_classification_model.joblib')
print("\n Best model saved as 'best_classification_model.joblib'")


 Best model saved as 'best_classification_model.joblib'


In [2]:
df1 = pd.read_csv('housing.csv')
df2 = pd.read_csv('boston_housing.csv')
data = pd.concat([df1, df2], ignore_index=True)
# data = pd.read_csv("housing.csv")
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,Longitude,Latitude,Housing Median Age,Total Rooms,Total Bedrooms,Population,Households,Median Income,Median House Value,Ocean Proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,,,,,,,,,,
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,,,,,,,,,,
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,,,,,,,,,,
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,,,,,,,,,,
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,,,,,,,,,,


In [None]:
## Load the saved model
model = load('best_classification_model.joblib')
## NEW DATA TO TEST THE MODEL
new_data = pd.DataFrame({
    'longitude': [-122.23],
    'latitude': [37.88],
    'housing_median_age': [41.0],
    'total_rooms': [880.0],
    'total_bedrooms': [129.0],
    'population': [322.0],
    'households': [126.0],
    'median_income': [8.3252],
    'rooms_per_household': [880.0 / 126.0],
    'population_per_household': [322.0 / 126.0],
    'bedrooms_per_room': [129.0 / 880.0],
    'income_per_population': [8.3252 / 322.0],
    'distance_from_center': [np.sqrt((-122.23)**2 + (37.88)**2)],
    'ocean_proximity_INLAND': [0],
    'ocean_proximity_ISLAND': [0],
    'ocean_proximity_NEAR BAY': [1],
    'ocean_proximity_NEAR OCEAN': [0]
})

new_data_scaled = scaler.transform(new_data)

predictions = xgb_model.predict(new_data_scaled)

category_mapping = {
    0: "Least Expensive",
    1: "Affordable",
    2: "Expensive",
    3: "Luxury"
}

mapped_predictions = [category_mapping[pred] for pred in predictions]

print("Predicted House Category for new data:", mapped_predictions[0])


Predicted House Category for new data: Luxury
