In [1]:
import os
from datetime import datetime,timedelta
import sys
from importlib import reload 
from sklearn.model_selection import train_test_split 
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from steps.data_ingestion import *
from steps.data_inspection import *
from steps.data_visualization import *
from steps.data_analysis import *
from steps.missing_values_handling import *
from steps.data_encoding import *


from feast import FeatureStore

import mlflow
from mlflow.models import infer_signature
from mlflow.sklearn import log_model, load_model

import bentoml
from bentoml import HTTPServer




  from bentoml import HTTPServer
  return _bootstrap._gcd_import(name[level:], package, level)


In [2]:
# Data ingest 

file_path = "C:\\Users\\mkrym\\Downloads\\archive.zip"

Ingestor = DataIngestorFactory.get_data_ingestor(file_path)   
df = Ingestor.ingest(file_path)
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
# Initialize inspector and execute
inspector = DataInspector()
results = inspector.execute_all_strategies(df)

# Generate and print report
report = inspector.generate_report("inspection_report.txt")
print(report)

2025-08-24 11:07:46,291 - INFO - Executing all inspection strategies
2025-08-24 11:07:46,293 - INFO - Performing data type inspection
2025-08-24 11:07:46,305 - INFO - Generating summary statistics
2025-08-24 11:07:46,349 - INFO - Analyzing missing values
2025-08-24 11:07:46,355 - INFO - Detecting outliers
2025-08-24 11:07:46,386 - INFO - Checking for duplicates
2025-08-24 11:07:46,414 - INFO - Analyzing data distributions
2025-08-24 11:07:46,462 - INFO - Generating inspection report
2025-08-24 11:07:46,469 - INFO - Report saved to inspection_report.txt


Data Inspection Report - 2025-08-24 11:07:46

DataType:
--------------------------------------------------
Data Types:
  price: int64
  area: int64
  bedrooms: int64
  bathrooms: int64
  stories: int64
  mainroad: object
  guestroom: object
  basement: object
  hotwaterheating: object
  airconditioning: object
  parking: int64
  prefarea: object
  furnishingstatus: object
Type Summary:
  object: 7
  int64: 6

SummaryStatistics:
--------------------------------------------------
Numerical Summary:
  count: {'price': 545.0, 'area': 545.0, 'bedrooms': 545.0, 'bathrooms': 545.0, 'stories': 545.0, 'parking': 545.0}
  mean: {'price': 4766729.247706422, 'area': 5150.54128440367, 'bedrooms': 2.9651376146788992, 'bathrooms': 1.2862385321100918, 'stories': 1.8055045871559634, 'parking': 0.6935779816513762}
  std: {'price': 1870439.6156573922, 'area': 2170.141022508803, 'bedrooms': 0.7380638605685743, 'bathrooms': 0.5024696160532146, 'stories': 0.8674924629255264, 'parking': 0.8615857504605412}
 

In [4]:
# Sample inspection results (in practice, these would come from data_inspection.py)
inspection_results = {
    'MissingValues': {
        'missing_counts': df.isnull().sum().to_dict(),
        'missing_percentage': (df.isnull().sum() / len(df) * 100).to_dict()
    },
    'CorrelationAnalysis': {
        'correlations': df.select_dtypes(include=['float64', 'int64']).corr().to_dict()
    }
}

# Initialize visualizer and execute
visualizer = DataVisualizer()
visualizer.execute_all_visualizations(df, inspection_results)

2025-08-24 11:07:46,514 - INFO - Executing all visualization strategies
2025-08-24 11:07:46,517 - INFO - Running visualization: MissingValuesVisualization
2025-08-24 11:07:46,519 - INFO - Visualizing missing values
2025-08-24 11:07:46,987 - INFO - Missing values plot saved as 'missing_values.png'
2025-08-24 11:07:46,988 - INFO - Running visualization: NumericalDistributionVisualization
2025-08-24 11:07:46,989 - INFO - Visualizing data distributions
2025-08-24 11:07:47,306 - INFO - Distribution plot for price saved as 'distribution_price.png'
2025-08-24 11:07:47,594 - INFO - Distribution plot for area saved as 'distribution_area.png'
2025-08-24 11:07:47,799 - INFO - Distribution plot for bedrooms saved as 'distribution_bedrooms.png'
2025-08-24 11:07:48,098 - INFO - Distribution plot for bathrooms saved as 'distribution_bathrooms.png'
2025-08-24 11:07:48,309 - INFO - Distribution plot for stories saved as 'distribution_stories.png'
2025-08-24 11:07:48,522 - INFO - Distribution plot for p

In [5]:
DataAnalysis = DataAnalyzer()  
results = DataAnalysis.execute_all_strategies(df)
results

2025-08-24 11:07:50,215 - INFO - Using strategy: CorrelationAnalysis
2025-08-24 11:07:50,218 - INFO - Performing correlation analysis
2025-08-24 11:07:50,225 - INFO - Using strategy: CardinalityAnalysis
2025-08-24 11:07:50,229 - INFO - Analyzing cardinality of categorical columns


{'CorrelationAnalysis': {'correlations': {'price': {'price': 1.0,
    'area': 0.5359973457780797,
    'bedrooms': 0.36649402577386964,
    'bathrooms': 0.517545339455012,
    'stories': 0.42071236618861724,
    'parking': 0.38439364863572645},
   'area': {'price': 0.5359973457780797,
    'area': 1.0,
    'bedrooms': 0.1518584855745371,
    'bathrooms': 0.1938195310520531,
    'stories': 0.08399605092891993,
    'parking': 0.35298048121168235},
   'bedrooms': {'price': 0.36649402577386964,
    'area': 0.1518584855745371,
    'bedrooms': 1.0,
    'bathrooms': 0.37393023597215413,
    'stories': 0.4085642375381521,
    'parking': 0.139269896865613},
   'bathrooms': {'price': 0.517545339455012,
    'area': 0.1938195310520531,
    'bedrooms': 0.37393023597215413,
    'bathrooms': 1.0,
    'stories': 0.32616470613294235,
    'parking': 0.17749582102283437},
   'stories': {'price': 0.42071236618861724,
    'area': 0.08399605092891993,
    'bedrooms': 0.4085642375381521,
    'bathrooms': 0.326

Since our data doesnt have any missing values so we just skip this part and move on 

In [6]:
# DataMissingValues = MissingValuesProcessor(df)
# DataMissingValues.set_strategy(DropMissingValues())
# DataMissingValues.execute()

=======================================

Data Preprocessing

=======================================

In [7]:
# Select categorical binary and nominal columns for encoding
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
binary_cols = [col for col in cat_cols if df[col].nunique() == 2]
categorical_cols = [col for col in cat_cols if df[col].nunique() > 2]

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [8]:
DataEncoder = DataEncoderFactory()

binary_encoder = DataEncoder.get_encoder("binary_custom")
categorical_encoder = DataEncoder.get_encoder("one_hot")
numerical_encoder = DataEncoder.get_encoder("numerical")

binary_encoder.encode(df, binary_cols)
categorical_encoder.encode(df, categorical_cols)
numerical_encoder.encode(df, numerical_cols)


2025-08-24 11:07:50,371 - INFO - Applying Binary Custom Encoding.
2025-08-24 11:07:50,386 - INFO - Applying One Hot Encoding.
2025-08-24 11:07:50,399 - INFO - Applying Numerical Scaling.


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,4.566365,1.046726,1.403419,1.421812,1.378217,1,0,0,0,1,1.517692,1,furnished
1,4.004484,1.757010,1.403419,5.405809,2.532024,1,0,0,0,1,2.679409,0,furnished
2,4.004484,2.218232,0.047278,1.421812,0.224410,1,0,1,0,0,1.517692,1,semi-furnished
3,3.985755,1.083624,1.403419,1.421812,0.224410,1,0,1,0,1,2.679409,1,furnished
4,3.554979,1.046726,1.403419,-0.570187,0.224410,1,1,1,0,1,1.517692,0,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,-1.576868,-0.991879,-1.308863,-0.570187,-0.929397,1,0,1,0,0,1.517692,0,unfurnished
541,-1.605149,-1.268613,0.047278,-0.570187,-0.929397,0,0,0,0,0,-0.805741,0,semi-furnished
542,-1.614327,-0.705921,-1.308863,-0.570187,-0.929397,1,0,0,0,0,-0.805741,0,unfurnished
543,-1.614327,-1.033389,0.047278,-0.570187,-0.929397,0,0,0,0,0,-0.805741,0,furnished


<!-- Prepare the dataset for the feature store (Feast) -->

In [12]:
# Create timestamps 
timestamps = pd.date_range(
    start=pd.Timestamp.now(), 
    end=pd.Timestamp.now() + pd.DateOffset(days=30), 
    periods=len(df)
).to_frame(name="event_timestamp", index=False)

df["event_timestamp"] = timestamps.event_timestamp
df["house_id"] = range(1, len(df) + 1)  # Assign unique IDs to each house


In [13]:
# Splitting the dataset into features (X) and target (y)
X = df.drop(columns=['price'])
y = pd.DataFrame(df[['house_id', 'event_timestamp', 'price']])

# Display the first few rows of the preprocessed features
X.head(), y.head()

(       area  bedrooms  bathrooms   stories  mainroad  guestroom  basement  \
 0  1.046726  1.403419   1.421812  1.378217         1          0         0   
 1  1.757010  1.403419   5.405809  2.532024         1          0         0   
 2  2.218232  0.047278   1.421812  0.224410         1          0         1   
 3  1.083624  1.403419   1.421812  0.224410         1          0         1   
 4  1.046726  1.403419  -0.570187  0.224410         1          1         1   
 
    hotwaterheating  airconditioning   parking  prefarea furnishingstatus  \
 0                0                1  1.517692         1        furnished   
 1                0                1  2.679409         0        furnished   
 2                0                0  1.517692         1   semi-furnished   
 3                0                1  2.679409         1        furnished   
 4                0                1  1.517692         0        furnished   
 
                 event_timestamp  house_id  
 0 2025-08-24 11:26:5