In [2]:
# Importación de bibliotecas
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Lift pandas' truncation limits so wide/long frames are printed in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)
del _display_option

In [3]:
class DataExplorer:
    """Stateless helpers for a quick textual and visual look at a DataFrame."""

    @staticmethod
    def explore_data(data):
        """Print the transposed head, summary statistics and schema of ``data``.

        Args:
            data: pandas DataFrame to inspect. It is not modified.
        """
        print('Explorando dataset:')
        print(data.head().T)
        print(data.describe())
        # DataFrame.info() writes its report to stdout itself and returns
        # None, so wrapping it in print() would emit a stray "None" line.
        data.info()

    @staticmethod
    def plot_histograms(data):
        """Draw one 15-bin histogram per column of ``data`` and show the figure.

        Args:
            data: pandas DataFrame whose columns are plotted.
        """
        data.hist(bins=15, figsize=(15, 10))
        plt.show()

    @staticmethod
    def plot_correlation_matrix(data):
        """Show an annotated heatmap of the pairwise correlations in ``data``.

        Values that cannot be parsed as numbers are treated as NaN for the
        purpose of the plot only.

        Args:
            data: pandas DataFrame to analyse. It is not modified.
        """
        plt.figure(figsize=(12, 8))
        # Coerce into a new frame: assigning the coerced columns back onto
        # `data` would silently turn the caller's non-numeric values into NaN.
        numeric = data.apply(pd.to_numeric, errors='coerce')
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap='coolwarm')
        plt.show()

In [7]:
class Model:
    """Regression workflow: load an Excel sheet, clean it, split, fit, score.

    The last ``N_TARGETS`` columns of the sheet are used as regression
    targets and every other column as a feature. Each stage method returns
    ``self`` so the stages can be chained.
    """

    # Percentiles anchoring the outlier fences, and the fence multiplier.
    LOWER_QUANTILE = 0.10
    UPPER_QUANTILE = 0.90
    FENCE_FACTOR = 1.5
    # A feature is dropped when its absolute correlation with an earlier
    # feature exceeds this value.
    CORRELATION_THRESHOLD = 0.97
    # Number of trailing columns treated as targets.
    N_TARGETS = 2
    # Fraction of rows held out for evaluation.
    TEST_SIZE = 0.2

    def __init__(self, filepath, nc, random_state=42):
        """Build the (unfitted) preprocessing + random-forest pipeline.

        Args:
            filepath: Path of the Excel file read by ``load_data``.
            nc: Number of PCA components.
            random_state: Seed shared by PCA, the random forest and the
                train/test split so that runs are reproducible.
        """
        self.filepath = filepath
        self.random_state = random_state
        self.data = None

        # ColumnTransformer fits every listed transformer on the selected
        # columns independently and concatenates the outputs: the forest
        # therefore receives the min-max scaled columns, the standardized
        # columns and the PCA components side by side, not a
        # scaler -> scaler -> PCA chain.
        # NOTE(review): confirm this feature union is intended; wrap the
        # three steps in a single Pipeline if they were meant to be chained.
        self.preprocessing = ColumnTransformer([
            ('minmax', MinMaxScaler(), make_column_selector(dtype_include=np.number)),
            ('stan', StandardScaler(), make_column_selector(dtype_include=np.number)),
            ('pca', PCA(n_components=nc, random_state=random_state),
             make_column_selector(dtype_include=np.number)),
        ], remainder='passthrough')

        self.model = make_pipeline(
            self.preprocessing,
            RandomForestRegressor(random_state=random_state),
        )

        self.X_train, self.X_test, self.y_train, self.y_test = [None] * 4

    def load_data(self):
        """Read the Excel file and promote its first data row to the header.

        The loaded frame is stored in ``self.data`` and summarised through
        ``DataExplorer.explore_data``.

        Returns:
            self
        """
        raw = pd.read_excel(self.filepath)
        # The first row carries the real column names: use it as the header
        # and remove it from the data.
        self.data = raw.rename(columns=raw.iloc[0]).drop(raw.index[0])
        DataExplorer.explore_data(self.data)
        return self

    def preprocess_data(self):
        """Clean ``self.data`` and create the train/test split.

        Steps, in order:
          1. Suffix every column name with ``_<position>`` so names are unique.
          2. Coerce every column to numeric (unparseable values become NaN).
          3. For each feature in turn, drop rows lying outside the
             percentile-based fences (computed on the rows still remaining).
          4. Drop features highly correlated with an earlier feature.
          5. Split features/targets into train and test sets.

        Returns:
            self

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        if getattr(self, 'data', None) is None:
            raise RuntimeError(
                'No data loaded: call load_data() before preprocess_data().'
            )

        data = self.data.copy()
        # Repeated header labels would make column selection ambiguous, so
        # every label gets its position appended.
        data.columns = [
            f'{name}_{position}' for position, name in enumerate(data.columns)
        ]
        data = data.apply(pd.to_numeric, errors='coerce')

        feature_names = list(data.columns[:-self.N_TARGETS])
        for variable in feature_names:
            low = data[variable].quantile(self.LOWER_QUANTILE)
            high = data[variable].quantile(self.UPPER_QUANTILE)
            # Spread between the two percentiles (not the quartile-based IQR).
            spread = high - low
            upper_limit = high + self.FENCE_FACTOR * spread
            lower_limit = low - self.FENCE_FACTOR * spread
            is_outlier = (data[variable] < lower_limit) | (data[variable] > upper_limit)
            # NaN compares False on both sides, so rows with missing values
            # are kept here.
            data = data[~is_outlier]

        # Only features take part in the correlation filter: including the
        # target columns could drop a target and silently shift which
        # columns end up being predicted.
        redundant = self._correlated_features(
            data[feature_names], self.CORRELATION_THRESHOLD
        )
        data = data.drop(columns=redundant)
        self.data = data

        X = data.iloc[:, :-self.N_TARGETS]
        Y = data.iloc[:, -self.N_TARGETS:]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, Y, test_size=self.TEST_SIZE, random_state=self.random_state
        )
        return self

    @staticmethod
    def _correlated_features(features, threshold):
        """Return the columns whose |correlation| with an earlier column exceeds ``threshold``."""
        corr_matrix = features.corr().abs()
        # Keep only the strict upper triangle so each pair is inspected once
        # and the later column of a correlated pair is the one reported.
        upper = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )
        return [column for column in upper.columns if (upper[column] > threshold).any()]

    def train_model(self):
        """Fit the pipeline on the training split.

        Returns:
            self
        """
        self.model.fit(self.X_train, self.y_train)
        return self

    def evaluate_model(self):
        """Print R^2 and mean squared error of the pipeline on the test split.

        Returns:
            self
        """
        print("Model Evaluation:")
        y_pred = self.model.predict(self.X_test)
        r2_score_ = r2_score(self.y_test, y_pred)
        mean_squared_error_ = mean_squared_error(self.y_test, y_pred)
        print(f'r2_score: {r2_score_}')
        print(f'mean_squared_error: {mean_squared_error_}')
        return self

In [9]:
# Project root. Set RESIDENTIAL_BUILD_ROOT to run this notebook from another
# checkout; the fallback is the location the notebook was originally run from.
project_root = Path(os.environ.get(
    'RESIDENTIAL_BUILD_ROOT',
    '/Users/juancarloscamperovilla/Documents/GitHub/MLOps/Residencial_build/Fase_1',
))
filepath = str(project_root / 'data' / 'raw' / 'Residential-Building-Data-Set.xlsx')
model_path = project_root / 'models' / 'regression_model_3.joblib'
nc = 29  # number of PCA components

model = Model(filepath, nc)
# Every stage returns the Model instance, so the stages can be chained.
model.load_data().preprocess_data().train_model().evaluate_model()

# NOTE(review): this pickles the whole Model wrapper (including its data
# frames), so loading the file requires the Model class to be importable.
# Consider persisting only `model.model` if consumers just need the pipeline.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, str(model_path))

Explorando dataset:
                           1          2           3           4          5
START YEAR                81         84          78          72         87
START QUARTER              1          1           1           2          1
COMPLETION YEAR           85         89          81          73         90
COMPLETION QUARTER         1          4           4           2          2
V-1                        1          1           1           1          1
V-2                     3150       7600        4800         685       3000
V-3                      920       1140         840         202        800
V-4                    598.5       3040         480        13.7       1230
V-5                      190        400         100          20        410
V-6                  1010.84     963.81      689.84      459.54     631.91
V-7                       16         23          15           4         13
V-8                     1200       2900         630         140       5000
V-11 

['/Users/juancarloscamperovilla/Documents/GitHub/MLOps/Residencial_build/Fase_1/models/regression_model_3.joblib']