In [46]:
# Importación de bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns  
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [43]:
def data_load(path):
    try:
        df = pd.read_excel(path)
        df = df.rename(columns=df.iloc[0]).drop(df.index[0])
        return df
    except Exception as e:
        return e

def preprocessing_data(df):
    try:
        column_names = []
        for i in range (0, len(df.columns)):
            column_names.append(df.columns[i])
        for i in range (0, len(df.columns)):
            if df.columns[i] in column_names:
                column_names[i] = column_names[i] + '_' + str(i)
        df.columns = column_names
        
        for col in  df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        
        for variable in df.iloc[:,:-2].columns:
            percentile_10 = df[variable].quantile(0.10)
            percentile_90 = df[variable].quantile(0.90)
            iqr = percentile_90 - percentile_10
            upper_limit = percentile_90 + 1.5 * iqr
            lower_limit = percentile_10 - 1.5 * iqr
            outliers = df[(df[variable] < lower_limit) | (df[variable] > upper_limit)]
            df = df[~df[variable].isin(outliers[variable])]
        
        corr_matrix = df.corr().abs()
        # get upper triangle of correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # find features with correlation greater than 0.98
        to_drop = [column for column in upper.columns if any(upper[column] > 0.98)]

        # drop highly correlated features
        df.drop(to_drop, axis=1, inplace=True)

        return df

    except Exception as e:
        return e

def X_and_Y(df):
    try:
        X = df.iloc[:,:-2]
        Y = df.iloc[:,[-2,-1]]

        return [X, Y]
    except Exception as e:
        return e
    
def normalice_PCA(X, Y):
    standar = StandardScaler().fit(X)
    standar_df = standar.transform(X)
    standar_df = pd.DataFrame(standar_df, columns=X.columns)
    total = pd.concat([standar_df, Y], axis=1)
    total_components = standar_df.shape[1]

    pca = PCA()
    pca.fit(standar_df)
    x_pca = pca.transform(standar_df)
    x_pca_df = pd.DataFrame(x_pca, columns=pca.get_feature_names_out())

    exp_var = pca.explained_variance_ratio_ * 100
    cum_exp_var = np.cumsum(exp_var)
    for i in range(0,len(cum_exp_var)):
        if cum_exp_var[i] >= 99:
            n_components_ = i + 1
            break
    standar = StandardScaler()
    minmax = MinMaxScaler()
    pca = PCA(n_components=n_components_)

    preprocessing = ColumnTransformer([
        #('dropper', 'drop', mycols),
        #('sqrt', FunctionTransformer(np.sqrt), skew_cols),
        ('minmax', minmax, make_column_selector(dtype_include=np.number)),
        ('stan', standar, make_column_selector(dtype_include=np.number)),
        ('pca', pca, make_column_selector(dtype_include=np.number))
        ], remainder = 'passthrough')
    return preprocessing

def data_split(X, Y):
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=42)
    return [Xtrain, Xtest, ytrain, ytest]

def train_model(Xtrain, ytrain, preprocessing):
    randf_model = make_pipeline(preprocessing, RandomForestRegressor())
    randf_model.fit(Xtrain, ytrain)
    return randf_model

def test_evaluate(randf_model, Xtest, ytest):
    y_predicted = randf_model.predict(Xtest)
    r2_score_ = r2_score(ytest, y_predicted)
    mean_squared_error_ = mean_squared_error(ytest, y_predicted)
    print('r2_score: ' + str(r2_score_))
    print('mean_squared_error: ' + str(mean_squared_error_))
    return [r2_score_, mean_squared_error_]

In [47]:
def main(filepath):
    df = data_load(filepath)
    df_processed = preprocessing_data(df)
    X, Y = X_and_Y(df_processed)
    preprocessing = normalice_PCA(X, Y)
    Xtrain, Xtest, ytrain, ytest = data_split(X, Y)
    randf_model = train_model(Xtrain, ytrain, preprocessing)
    r2_score_, mean_squared_error_ = test_evaluate(randf_model, Xtest, ytest)
    joblib.dump(randf_model, '/Users/juancarloscamperovilla/Documents/GitHub/MLOps/Residencial_build/Fase_1/models/regression_model_1.joblib')
    

In [48]:
main('/Users/juancarloscamperovilla/Documents/GitHub/MLOps/Residencial_build/Fase_1/data/raw/Residential-Building-Data-Set.xlsx')

r2_score: 0.9562005201338709
mean_squared_error: 703.8379788451102
