## End-to-end ML pipeline for house price prediction

#### Problem Statement: Given a housing dataset containing features such as latitude, longitude, and number of households, along with the price of each house, your task is to develop a machine learning model that predicts the price of a house from its other features.

In [None]:
pip install -r ../requirements.txt

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

### Data Loading

In [None]:
# Load and extract data
def load_data(file_path: str) -> pd.DataFrame:
    """
    Load a dataset from a CSV file or a zip-compressed CSV into a DataFrame.

    The file type is chosen from the file extension, which is compared
    case-insensitively so that names such as ``HOUSING.CSV`` are accepted.

    Args:
        file_path: Path to a ``.csv`` file or to a ``.zip`` archive that is
            read as a zip-compressed CSV.

    Returns:
        The parsed data as a pandas DataFrame.

    Raises:
        ValueError: If the path does not end in ``.csv`` or ``.zip``.
    """
    # Lower-case only the copy used for the extension check; the original path
    # is handed to pandas untouched so case-sensitive filesystems still work.
    normalized = str(file_path).lower()

    if normalized.endswith(".csv"):
        return pd.read_csv(file_path)
    if normalized.endswith(".zip"):
        return pd.read_csv(file_path, compression='zip')

    raise ValueError("The provided file is not a .csv or .zip file.")

### Data Preprocessing

In [None]:
# Preprocess data
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Impute, scale and one-hot encode a housing DataFrame.

    Steps, applied to a copy so the caller's frame is left untouched:

    1. Missing values in numeric columns are replaced by the column median.
    2. Every numeric column is standardised with ``StandardScaler``.
    3. Every ``object`` column is replaced by one-hot indicator columns named
       by the encoder (``<column>_<category>``).

    The scaler and encoder are fitted on ``df`` itself and are not returned,
    so this function cannot re-apply the same transformation to new data.

    Args:
        df: Raw data with any mix of numeric and ``object`` columns.

    Returns:
        A new DataFrame with the same index as ``df`` containing the scaled
        numeric columns followed by the one-hot encoded columns.
    """
    df = df.copy()

    # Impute and scale the numeric columns in place on the copy, so they stay
    # in the frame rather than being dropped before scaling.
    numeric_features = df.select_dtypes(include=[np.number]).columns
    if len(numeric_features) > 0:
        numeric_part = df[numeric_features]
        numeric_part = numeric_part.fillna(numeric_part.median())
        scaler = StandardScaler()
        df[numeric_features] = scaler.fit_transform(numeric_part)

    # One-hot encoding for categorical features
    categorical_features = df.select_dtypes(include=['object']).columns
    if len(categorical_features) > 0:
        ohe = OneHotEncoder()
        encoded = ohe.fit_transform(df[categorical_features])
        ohe_df = pd.DataFrame(
            # The encoder returns a sparse matrix by default; densify it so
            # each indicator becomes a regular column.
            encoded.toarray(),
            columns=ohe.get_feature_names_out(categorical_features),
            # Reuse the source index so concat aligns row-for-row.
            index=df.index,
        )
        df = pd.concat([df.drop(columns=categorical_features), ohe_df], axis=1)

    return df

### Feature Engineering

In [None]:
# Feature engineering
def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add three ratio features to the housing frame.

    The new columns are written directly onto ``df`` (the input is modified
    in place) and the same object is returned:

    * ``total_rooms_per_household``  = ``total_rooms`` / ``households``
    * ``total_bedrooms_per_room``    = ``total_bedrooms`` / ``total_rooms``
    * ``population_per_household``   = ``population`` / ``households``
    """
    # (new column, numerator column, denominator column)
    ratio_specs = (
        ('total_rooms_per_household', 'total_rooms', 'households'),
        ('total_bedrooms_per_room', 'total_bedrooms', 'total_rooms'),
        ('population_per_household', 'population', 'households'),
    )

    for new_column, numerator, denominator in ratio_specs:
        df[new_column] = df[numerator] / df[denominator]

    return df

### Data Splitting

In [None]:
# Split data into training and testing sets
def split_data(df: pd.DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 42):
    """
    Split the dataset into training and testing sets.

    Args:
        df: Full dataset containing both the features and the target.
        target_column: Name of the column to predict; it is removed from the
            feature matrix and returned separately.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.2, the value that was previously hard-coded.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42, the previously hard-coded value.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

### Model Training

In [None]:
# Train model
def train_model_LinearRegression(X_train: pd.DataFrame, y_train: pd.Series) -> LinearRegression:
    """
    Fit an ordinary ``LinearRegression`` with default settings.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # scikit-learn's fit() returns the estimator itself, so build and fit in
    # a single expression.
    return LinearRegression().fit(X_train, y_train)

def train_model_RandomForest(X_train: pd.DataFrame, y_train: pd.Series) -> RandomForestRegressor:
    """
    Fit a ``RandomForestRegressor`` with 100 trees and a fixed seed (42).

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``RandomForestRegressor`` estimator.
    """
    forest = RandomForestRegressor(random_state=42, n_estimators=100)
    # fit() returns the estimator itself.
    return forest.fit(X_train, y_train)

def train_model_DecisionTree(X_train: pd.DataFrame, y_train: pd.Series) -> DecisionTreeRegressor:
    """
    Fit a ``DecisionTreeRegressor`` with a fixed seed (42).

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``DecisionTreeRegressor`` estimator.
    """
    tree = DecisionTreeRegressor(random_state=42)
    # fit() returns the estimator itself.
    return tree.fit(X_train, y_train)

def train_model_SupportVectorMachine(X_train: pd.DataFrame, y_train: pd.Series) -> SVR:
    """
    Fit a support vector regressor (``SVR``) with default hyperparameters.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``SVR`` estimator.
    """
    # fit() returns the estimator itself, so build and fit in one expression.
    return SVR().fit(X_train, y_train)

def train_model_XGBoost(X_train: pd.DataFrame, y_train: pd.Series) -> XGBRegressor:
    """
    Fit an XGBoost regressor (``XGBRegressor``) with default hyperparameters.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``XGBRegressor`` estimator.
    """
    booster = XGBRegressor()
    # fit() returns the estimator itself.
    return booster.fit(X_train, y_train)

### Model Evaluation

In [None]:
# Evaluate model
def evaluate_model(model, X_test: pd.DataFrame, y_test: pd.Series) -> float:
    """
    Score a fitted model on held-out data using Mean Squared Error (MSE).

    Args:
        model: Any fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True target values matching ``X_test``.

    Returns:
        The MSE between ``y_test`` and the model's predictions; lower is
        better.
    """
    predictions = model.predict(X_test)
    return mean_squared_error(y_test, predictions)

### Selecting Best Model

In [None]:
# Select best model
def select_best_model(models: dict, X_test: pd.DataFrame, y_test: pd.Series):
    """
    Pick the model with the lowest test-set MSE.

    Each model is scored with ``evaluate_model`` and its score is printed.
    When several models tie, the one that comes first in ``models`` wins.

    Args:
        models: Mapping of display name to fitted model.
        X_test: Feature matrix used for scoring.
        y_test: True target values matching ``X_test``.

    Returns:
        The model object with the smallest MSE, or ``None`` when ``models``
        is empty or no score compares below infinity.
    """
    winner = None
    lowest_mse = float('inf')

    for label, candidate in models.items():
        score = evaluate_model(candidate, X_test, y_test)
        print(f"Model: {label}, MSE: {score}")
        # Only a strictly smaller score replaces the current best; a NaN
        # score never compares as smaller and is therefore skipped.
        if not (score < lowest_mse):
            continue
        lowest_mse, winner = score, candidate

    return winner

### Main Function

In [None]:
# Load the housing dataset from a CSV path relative to this notebook and
# preview the first rows.
data_path = "../datasets/housing.csv"
df = load_data(data_path)
df.head()

In [None]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
df.info()

In [None]:
# Count how many rows fall into each ocean_proximity category.
df["ocean_proximity"].value_counts()

In [None]:
# Descriptive statistics of the raw data.
df.describe()

In [None]:
# Preprocess data
# df = preprocess_data(df)

In [None]:
# Handle missing values by dropping, in place, every row that contains one.
df.dropna(inplace=True)

In [None]:
# Re-check dtypes and non-null counts after dropping incomplete rows.
df.info()

In [None]:
# Descriptive statistics after dropping incomplete rows.
df.describe()

In [None]:
# One-hot encoding for categorical features: every object-dtype column is
# replaced by one indicator column per category.
categorical_features = df.select_dtypes('object').columns

# drop_first=False keeps an indicator for every category (none is dropped).
df = pd.get_dummies(df, columns=categorical_features, drop_first=False)

In [None]:
# Preview the frame with the new indicator columns.
df.head()

In [None]:
# Feature engineering: add the per-household / per-room ratio columns and
# preview the result.
df = feature_engineering(df)
df.head()

In [None]:
# Split data into train and test sets, with median_house_value as the
# prediction target, then show the shape of the training features.
X_train, X_test, y_train, y_test = split_data(df, target_column='median_house_value')
X_train.shape

In [None]:
# Preview the training features before scaling.
X_train.head()

In [None]:
# Check which container type holds the training target.
type(y_train)

In [None]:
# Collect the names of the numeric feature columns; these are the columns
# that get standardised in the scaling cells.
numeric_features = X_train.select_dtypes(include=[np.number]).columns
numeric_features

In [None]:
# Scaling numerical features: fit one scaler on the training features and a
# separate one on the training target, so the test set can later be
# transformed with training statistics only.
ss_x = StandardScaler()
ss_y = StandardScaler()
# Work on an explicit copy so the column assignment below never writes into a
# slice of the original frame (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_train[numeric_features] = ss_x.fit_transform(X_train[numeric_features])
# StandardScaler expects 2-D input, so present the target as a single column.
y_train_reshaped = y_train.values.reshape(-1, 1)
# Flatten the scaled column back to shape (n_samples,) so the estimators
# receive a 1-D target rather than a column vector.
y_train = ss_y.fit_transform(y_train_reshaped).ravel()
# NOTE(review): the '<' is presumably removed from this column name because a
# downstream estimator (likely XGBoost) rejects it in feature names - confirm.
X_train.rename(columns={'ocean_proximity_<1H OCEAN': 'ocean_proximity_less_than_1H_OCEAN'}, inplace=True)

In [None]:
# Preview the training features after scaling and the column rename.
X_train.head()

In [None]:
# Show the scaled training target.
y_train

In [None]:
# Train models: fit every candidate regressor on the scaled training data and
# key each one by a display name for the comparison step.
models = {
    'RandomForest': train_model_RandomForest(X_train, y_train),
    'DecisionTree': train_model_DecisionTree(X_train, y_train),
    'LinearRegression': train_model_LinearRegression(X_train, y_train),
    'SupportVectorMachine': train_model_SupportVectorMachine(X_train, y_train),
    'XGBoost': train_model_XGBoost(X_train, y_train)
}

In [None]:
# Apply scaling on test set using the scalers fitted on the training data
# (transform only, no refit), so no test-set statistics leak into the model.
# Work on an explicit copy so the column assignment below never writes into a
# slice of the original frame (avoids pandas' SettingWithCopyWarning).
X_test = X_test.copy()
X_test[numeric_features] = ss_x.transform(X_test[numeric_features])
# StandardScaler expects 2-D input, so present the target as a single column.
y_test_reshaped = y_test.values.reshape(-1,1)
# Flatten back to shape (n_samples,) so the target is 1-D for scoring.
y_test = ss_y.transform(y_test_reshaped).ravel()

In [None]:
# Select best model: apply the same column rename that was applied to the
# training features so the test columns match, then compare the models.
# The MSE values printed are computed on the scaled target.
X_test.rename(columns={'ocean_proximity_<1H OCEAN': 'ocean_proximity_less_than_1H_OCEAN'}, inplace=True)
best_model = select_best_model(models, X_test, y_test)
print("Best model selected:", best_model)