1. Apply feature selection methods (e.g., correlation analysis, LASSO, recursive feature elimination) to reduce irrelevant or redundant features.
2. Handle missing values, outliers, and data normalization/standardization.
3. Generate new features through transformations, interactions, or domain-specific insights.
4. Build and train a baseline machine learning model without any advanced tuning or feature engineering.
5. Optimize model hyperparameters using grid search, random search, or other optimization techniques.
6. Build advanced models using ensemble techniques (e.g., Random Forest, Gradient Boosting) or deep learning if applicable.
7. Evaluate the final model using appropriate metrics (e.g., accuracy, precision, recall, F1-score, ROC-AUC) and test set performance.
8. Apply cross-validation, hold-out validation, or bootstrapping to test model robustness and prevent overfitting.
9. Prepare a comprehensive report or presentation summarizing the project’s approach, model performance, and results.



In [34]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("kalodatahousing.csv", header = 0)
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [36]:
# Check for missing values in total 
print(df.isnull().sum())

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64


In [74]:
#One hot encoding to change yes and no to 0 and 1
ohed = pd.get_dummies(df, columns = ['mainroad', 
                                                     'guestroom', 
                                                     'basement', 
                                                     'hotwaterheating', 
                                                     'airconditioning', 
                                                     'prefarea', 
                                                     'furnishingstatus'], dtype=int)
ohed

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_no,mainroad_yes,guestroom_no,guestroom_yes,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
8,9870000,8100,4,1,2,2,0,1,0,1,...,1,1,0,0,1,0,1,1,0,0
9,9800000,5750,3,2,4,1,0,1,0,1,...,0,1,0,0,1,0,1,0,0,1
12,9310000,6550,4,2,2,1,0,1,1,0,...,0,1,0,0,1,0,1,0,1,0
13,9240000,3500,4,2,2,2,0,1,1,0,...,0,0,1,1,0,1,0,1,0,0
14,9240000,7800,3,2,2,0,0,1,1,0,...,0,1,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,2,0,1,1,0,...,1,1,0,1,0,1,0,0,0,1
541,1767150,2400,3,1,1,0,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
542,1750000,3620,2,1,1,0,0,1,1,0,...,0,1,0,1,0,1,0,0,0,1
543,1750000,2910,3,1,1,0,1,0,1,0,...,0,1,0,1,0,1,0,1,0,0


In [86]:
#outliers using Z score
def detech_outliers(ohed, threshold =3):

    ohed = np.array(ohed, dtype=np.float64)
    mean=np.mean(ohed)
    std=np.std(ohed)

    
    z_scores=(ohed-mean)/std
    outliers = ohed[np.abs(z_scores) > threshold]    
    return outliers

In [88]:
outliers_pt=detech_outliers(ohed)

In [90]:
outliers_pt

array([9870000., 9800000., 9310000., 9240000., 9240000., 9100000.,
       9100000., 8960000., 8890000., 8855000., 8750000., 8680000.,
       8645000., 8645000., 8575000., 8540000., 8463000., 8400000.,
       8400000., 8400000., 8400000., 8400000., 8295000., 8120000.,
       8080940., 8043000., 7980000., 7962500., 7910000., 7875000.,
       7840000., 7700000., 7700000., 7560000., 7560000., 7525000.,
       7490000., 7455000., 7420000., 7420000., 7420000., 7350000.,
       7350000., 7350000., 7350000., 7343000., 7245000., 7210000.,
       7210000., 7140000., 7070000., 7070000., 7035000., 7000000.,
       6930000., 6895000., 6860000., 6790000., 6755000., 6720000.,
       6685000., 6650000., 6650000., 6650000., 6650000., 6650000.,
       6650000., 6629000., 6615000., 6615000., 6580000., 6510000.,
       6510000., 6510000., 6475000., 6475000., 6440000., 6419000.,
       6405000., 6300000., 6300000., 6300000., 6300000., 6300000.,
       6293000., 6265000., 6230000., 6230000., 6195000., 61950

In [94]:
# Data Normalization/Standardization
scaler = StandardScaler()
numerical_features = ohed.select_dtypes(include=[np.number]).columns
ohed[numerical_features] = scaler.fit_transform(ohed[numerical_features])

In [None]:
#LASSO
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

X = data.drop('target_column', axis=1)  # Replace with your target column
y = data['target_column']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LASSO regression for feature selection
lasso = Lasso(alpha=0.01)  # Adjust alpha as needed
lasso.fit(X_train, y_train)

# Get feature importances
importance = np.abs(lasso.coef_)
selected_features = X.columns[importance > 0]
print("Selected features:", selected_features)