## Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import joblib
import json
import os

# Model building

In [2]:
# function for calucalting Root Mean Squared Logarithmic Error
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

#function for calculating the other metrices
def evaluate_performance(y_test, y_pred):
    
    rmsle_score = compute_rmsle(np.log(y_test), np.log(y_pred))
    mse = np.mean((y_test - y_pred) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_test - y_pred))
    r2 = r2_score(y_test, y_pred)
    evaluation_results = {
        "rmsle_score": rmsle_score,
        "MSE": mse,
        "RMSE": rmse,
        "MAE": mae,
        "R^2": r2
                }

    
    return evaluation_results

def split_data(data: pd.DataFrame, target_column, test_size: float = 0.2, random_state: int = 42) -> tuple:
    X = data.drop(columns=[target_column])
    y = data[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test


def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    columns_to_drop = ['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu', 'Id',
                       'GarageYrBlt', '1stFlrSF', 'TotRmsAbvGrd', 'GarageArea']
    data.drop(columns=columns_to_drop, axis=1, inplace=True)
    return data
    


def fill_missing_values(data: pd.DataFrame) -> pd.DataFrame:
    numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = data.select_dtypes(exclude=[np.number]).columns.tolist()

    for col in numerical_cols:
        data[col] = data[col].fillna(data[col].mean())
    for col in categorical_cols:
        data[col] = data[col].fillna(data[col].mode()[0])
    return data
    
def feature_selection(X_train, y_train):
    """
    Performs feature selection based on correlation and chi-square test.
    """
    features_dictionary = {}
    train_data = pd.concat([X_train, y_train], axis=1)
    numerical_correlations = train_data.select_dtypes(include=[np.number]).corr()['SalePrice'].abs().sort_values(ascending=False)
    top_10_numerical_features = numerical_correlations[1:11].index.tolist()
    
    categorical_features = train_data.select_dtypes(exclude=[np.number]).columns.tolist()
    chi2_results = {feature: chi2_contingency(pd.crosstab(train_data[feature], train_data['SalePrice']))[0] for feature in categorical_features}
    top_5_categorical_features = sorted(chi2_results, key=chi2_results.get, reverse=True)[:5]
    

    print('top_5_categorical_features is ',top_5_categorical_features)

    file_path = '/home/sachin/DSP/dsp-anandhu-krishna/models/features_dictionary.json'

    features_dictionary['top_10_numerical_features'] =top_10_numerical_features
    features_dictionary['top_5_categorical_features'] =top_5_categorical_features

    # Dumping the dictionary into a JSON file
    with open(file_path, 'w') as file:
        json.dump(features_dictionary, file)
    
    print(f" features Dictionary saved to {file_path}")
    
    return top_10_numerical_features, top_5_categorical_features

def preprocess_features_train(X: pd.DataFrame, numeric_features: list, categorical_features: list, model_dir: str) -> np.ndarray:
    """
    Preprocesses the given DataFrame by encoding categorical features and scaling numerical features.
    
    Args:
    - X: DataFrame containing the training or test data.
    - numeric_features: List of names of numeric features to scale.
    - categorical_features: List of names of categorical features to encode.
    - model_dir: Directory path where the preprocessing objects (encoder and scaler) will be saved.
    
    Returns:
    - X_processed: A numpy array of processed features ready for training or prediction.
    """
    # Initialize the encoder and scaler
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X[categorical_features])
    scaler = StandardScaler().fit(X[numeric_features])
    
    # Transform the data
    X_encoded = encoder.transform(X[categorical_features]).toarray()
    X_scaled = scaler.transform(X[numeric_features])
    
    # Combine encoded and scaled features
    X_processed = np.hstack((X_scaled, X_encoded))
    
    # Save the preprocessing objects for later use
    save_preprocessing_objects(encoder, scaler, model_dir)
    
    return X_processed
    
def preprocess_features_test(test_data: pd.DataFrame,top_10_numerical_features,top_5_categorical_features, model_dir: str) -> np.ndarray:
    """
    Preprocesses the given DataFrame by encoding categorical features and scaling numerical features.
    
    Args:
    - test_data: DataFrame containing the training or test data.
    - model_dir: Directory path where the preprocessing objects (encoder and scaler) already saved.
    
    Returns:
    - test_data_processed: A numpy array of processed features ready for training or prediction.
    """
   # location of  the encoder and scaler objects
    encoder_path = os.path.join(model_dir, 'encoder.joblib')
    scaler_path = os.path.join(model_dir, 'scaler.joblib')
    
    # Load the encoder objects
    loaded_encoder = joblib.load(encoder_path)
    test_data_encoded = loaded_encoder.transform(test_data[top_5_categorical_features])
    
    
    # Load the scaler objects
    loaded_scaler = joblib.load(scaler_path)
    test_data_scaled = loaded_scaler.transform(test_data[top_10_numerical_features])

    # Combine scaled numeric features and encoded categorical features
    test_data_processed = np.hstack((test_data_scaled, test_data_encoded.toarray()))

    return test_data_processed


def train_model(X_train: np.array, y_train: pd.Series, model_dir: str) -> XGBRegressor:
    param = {
        'max_depth': 4,            
        'objective': 'reg:squarederror',  
        'learning_rate': 0.1,   
        'n_estimators': 200,       
        'subsample': 0.7,          
        'colsample_bytree': 0.8,    
        'eval_metric': 'rmse'       
    }
    
    model = XGBRegressor(**param)
    model.fit(X_train, y_train)
    joblib.dump(model, os.path.join(model_dir, 'model.joblib'))
    print(f'Model saved to {os.path.join(model_dir, "model.joblib")}')
    
def save_preprocessing_objects(encoder, scaler, model_dir: str) -> None:
    joblib.dump(encoder, os.path.join(model_dir, 'encoder.joblib'))
    joblib.dump(scaler, os.path.join(model_dir, 'scaler.joblib'))
    print(f'Encoder and Scaler saved to {model_dir}')

def load_features(file_path):
    """
    Load the top 10 numerical features and top 5 categorical features from a JSON file.

    Parameters:
    file_path (str): The path to the JSON file.

    Returns:
    tuple: A tuple containing two lists: top_10_numerical_features and top_5_categorical_features.
    """
    # Loading the JSON data back into a Python dictionary
    with open(file_path, 'r') as file:
        loaded_features_dictionary = json.load(file)
    
    # Extracting the features from the dictionary
    top_10_numerical_features = loaded_features_dictionary['top_10_numerical_features']
    top_5_categorical_features = loaded_features_dictionary['top_5_categorical_features']
    
    return top_10_numerical_features, top_5_categorical_features


In [3]:
def model_training(X_train,y_train,model_dir) :
    
    X_train = clean_data(X_train) 
    X_train = fill_missing_values(X_train)
    top_10_numerical_features, top_5_categorical_features = feature_selection(X_train,y_train)
    X_train_processed  = preprocess_features_train(X_train,top_10_numerical_features,top_5_categorical_features,model_dir)
    
    train_model(X_train_processed, y_train ,model_dir)
def model_evaluation(X_test,y_test,model_dir) :

    X_test = clean_data(X_test) 
    X_test = fill_missing_values(X_test)
    # Specify the file path from where you want to load the JSON
    file_path = '/home/sachin/DSP/dsp-anandhu-krishna/models/features_dictionary.json'
    top_10_numerical_features, top_5_categorical_features = load_features(file_path)
    X_test_processed  = preprocess_features_test(X_test,top_10_numerical_features,top_5_categorical_features,model_dir)
    model_path = os.path.join(model_dir, 'model.joblib')
    #model loading
    model = joblib.load(model_path)
    #model predicitng
    y_pred = model.predict(X_test_processed)

    evaluation_results =evaluate_performance(y_test, y_pred)

    return evaluation_results
    
def build_model(data: pd.DataFrame) -> dict[str, str]:

    model_dir = '/home/sachin/DSP/dsp-anandhu-krishna/models'
    X_train, X_test, y_train, y_test = split_data(data,"SalePrice")
    
    model_training(X_train,y_train,model_dir)

    performances = model_evaluation(X_test,y_test,model_dir)

    return performances
    
    

### Dataset Loading

In [4]:
train_data = pd.read_csv('/home/sachin/DSP/dsp-anandhu-krishna/data/train.csv')
performances = build_model(train_data)
print("the performances metrices are " , performances)

top_5_categorical_features is  ['Neighborhood', 'Exterior2nd', 'Exterior1st', 'SaleType', 'HouseStyle']
 features Dictionary saved to /home/sachin/DSP/dsp-anandhu-krishna/models/features_dictionary.json
Encoder and Scaler saved to /home/sachin/DSP/dsp-anandhu-krishna/models
Model saved to /home/sachin/DSP/dsp-anandhu-krishna/models/model.joblib
the performances metrices are  {'rmsle_score': 0.01, 'MSE': 706354207.2513262, 'RMSE': 26577.32505823952, 'MAE': 17273.834853916953, 'R^2': 0.9079107995968321}


# Model inference

In [5]:
def make_predictions(test_data: pd.DataFrame) -> np.ndarray:
    #storing test_data_ids into a list
    test_data_id= list(test_data['Id'])
    model_dir = '/home/sachin/DSP/dsp-anandhu-krishna/models'
    test_data_cleand = clean_data(test_data) 
    test_data_cleand = fill_missing_values(test_data_cleand)
    # Specify the file path from where you want to load the JSON
    file_path = '/home/sachin/DSP/dsp-anandhu-krishna/models/features_dictionary.json'
    top_10_numerical_features, top_5_categorical_features = load_features(file_path)
    test_data_processed  = preprocess_features_test(test_data_cleand,top_10_numerical_features,top_5_categorical_features,model_dir)
    model_path = os.path.join(model_dir, 'model.joblib')
    #model loading
    model = joblib.load(model_path)
    #model predicitng
    test_data_pred = model.predict(test_data_processed)
    #storing the predicted slaes price into a df
    test_data_pred_df = pd.DataFrame(test_data_pred, columns=['pred_sales_price'])
    #converting the Ids into a dataframe
    test_data_id_df = pd.DataFrame(test_data_id, columns=['id'])
    # Concatenate the new DataFrame with 'test_data_id_df' 
    df_combined = pd.concat([test_data_id_df, test_data_pred_df], axis=1)
    
    return df_combined



In [6]:
test_data = pd.read_csv('/home/sachin/DSP/dsp-anandhu-krishna/data/test.csv')


test_data_pred = make_predictions(test_data)
test_data_pred

Unnamed: 0,id,pred_sales_price
0,1461,119319.164062
1,1462,142304.906250
2,1463,181756.421875
3,1464,186675.765625
4,1465,201736.968750
...,...,...
1454,2915,69565.773438
1455,2916,82225.914062
1456,2917,175976.859375
1457,2918,115504.906250
