# 5.1 Código ML (INDIVIDUAL) - Avance Evidencia 2
Equipo 1
Hiram Maximiliano Muñoz Ramirez A01197991

In [1]:
import pandas as pd
from tempfile import mkdtemp
from sklearn import pipeline, preprocessing, feature_selection, compose, linear_model, model_selection, svm, ensemble

# Load the dataset used by every later cell from a Feather file; the path is
# relative to the notebook's working directory.
# NOTE(review): the file name suggests coating/paint records from Jul 2020 to
# Aug 2023 -- confirm against the data source.
coating_df = pd.read_feather('data/pinturas_revestidos_jul20_ago23.feather')

In [2]:
class NullColumnRemover:
    """Transformer that drops DataFrame columns with too many missing values.

    ``fit`` measures, for every column of ``X``, the fraction of rows that are
    null and remembers the columns whose fraction is strictly below
    ``null_threshold``. ``transform`` then selects exactly those columns.

    The class does not derive from a scikit-learn base class, so it provides
    the estimator methods itself (``get_params``/``set_params``, ``fit``,
    ``transform``, ``fit_transform``, ``get_feature_names_out``).

    Args:
        null_threshold: Exclusive upper bound on the fraction of null rows a
            column may have and still be kept (``0.05`` keeps columns with
            fewer than 5 % nulls).
    """

    # Class-level defaults; ``columns_to_keep`` stays ``None`` until ``fit``
    # stores the pandas Index of retained column labels on the instance.
    null_threshold = 0.0
    columns_to_keep = None

    def __init__(self, null_threshold):
        self.null_threshold = null_threshold

    def __repr__(self):
        return f"{type(self).__name__}(null_threshold={self.null_threshold!r})"

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict.

        Args:
            deep: Accepted for compatibility with the scikit-learn estimator
                API; this transformer has no nested estimators, so it is
                ignored. Defaults to ``True`` so that ``get_params()`` works
                without arguments.
        """
        return {
            'null_threshold': self.null_threshold
        }

    def set_params(self, **params):
        """Update constructor parameters in place and return ``self``.

        Raises:
            ValueError: If a name is not a parameter of this transformer.
        """
        valid_names = self.get_params(deep=False)
        for name, value in params.items():
            if name not in valid_names:
                raise ValueError(
                    f"Invalid parameter {name!r} for {type(self).__name__}. "
                    f"Valid parameters are: {sorted(valid_names)!r}."
                )
            setattr(self, name, value)
        return self

    def _check_is_fitted(self):
        """Raise a clear error when the transformer is used before ``fit``."""
        if self.columns_to_keep is None:
            raise ValueError(
                f"This {type(self).__name__} instance is not fitted yet; "
                "call 'fit' before using it."
            )

    def fit(self, X, y=None):
        """Learn which columns of ``X`` have a null fraction below the threshold.

        Args:
            X: pandas DataFrame whose columns are screened.
            y: Ignored; present for pipeline compatibility.

        Returns:
            ``self``.
        """
        # Mean of the boolean null mask == fraction of null rows per column.
        null_fractions = pd.isna(X).mean(axis=0)
        self.columns_to_keep = null_fractions[null_fractions < self.null_threshold].index
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Raises:
            ValueError: If called before ``fit``.
        """
        self._check_is_fitted()
        return X[self.columns_to_keep]

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the filtered ``X`` in one call."""
        return self.fit(X, y).transform(X)

    def get_feature_names_out(self, input_features=None):
        """Return the retained column labels as a NumPy array.

        Args:
            input_features: Ignored; the names recorded by ``fit`` are used.

        Raises:
            ValueError: If called before ``fit``.
        """
        self._check_is_fitted()
        return self.columns_to_keep.values


import atexit
import shutil

# Scratch directory used as the on-disk cache for fitted pipeline steps.
cachedir = mkdtemp()
# mkdtemp() never deletes the directory it creates, so remove the cache when
# the interpreter (notebook kernel) exits instead of leaking it on disk.
atexit.register(shutil.rmtree, cachedir, ignore_errors=True)

# Shared preprocessing pipeline:
#   1. drop columns whose null fraction is not below 5 %;
#   2. one-hot encode 'category' columns and standardize 'float64' columns;
#   3. apply VarianceThreshold with its default settings.
# NOTE(review): no `remainder` is passed to ColumnTransformer, so columns
# matched by neither selector (e.g. integer columns) are not forwarded under
# the documented default -- confirm that excluding them is intended.
cleaner = pipeline.Pipeline([
    ('null_threshold', NullColumnRemover(0.05)),
    ('scaling_and_one_hot', compose.ColumnTransformer([
        ('one_hot', preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'),
         compose.make_column_selector(dtype_include='category')),
        ('scaler', preprocessing.StandardScaler(), compose.make_column_selector(dtype_include=['float64']))
    ])),
    ('variance_threshold', feature_selection.VarianceThreshold()),
], memory=cachedir)

# Scorer names passed to cross_validate for every model; the error metrics use
# scikit-learn's negated ("neg_") form.
scoring = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'r2',
    'neg_mean_absolute_percentage_error',
]




In [3]:
# Fit the preprocessing pipeline on the predictors only (target column removed)
# and list, one per line, the feature names it produces.
predictors = coating_df.drop(columns='total_liters_used')
cleaner.fit(predictors)
output_feature_names = cleaner.get_feature_names_out(input_features=predictors.columns.values)
for feature in output_feature_names:
    print(feature)



KeyboardInterrupt



In [None]:
# Lasso regression on top of the shared preprocessing pipeline; fitted steps
# are cached in `cachedir`.
lasso_pipeline = pipeline.Pipeline(
    steps=[
        ('cleaner', cleaner),
        ('lasso', linear_model.Lasso()),
    ],
    memory=cachedir,
)

In [None]:
# Bare expression as the cell's last statement: the notebook shows the
# pipeline's representation as the cell output.
lasso_pipeline

In [None]:

# Inner search: pick the Lasso `alpha` by grid search on negative MSE, then
# refit the best configuration.
lasso_grid = model_selection.GridSearchCV(
    estimator=lasso_pipeline,
    param_grid={'lasso__alpha': [0.1, 0.5, 1, 2]},
    scoring='neg_mean_squared_error',
    refit=True,
)

# Outer evaluation: cross-validate the whole grid search against every metric
# listed in `scoring`.
lasso_scores = model_selection.cross_validate(
    estimator=lasso_grid,
    X=coating_df.drop(columns='total_liters_used'),
    y=coating_df['total_liters_used'],
    scoring=scoring,
)

In [None]:
# Linear support-vector regression on top of the shared preprocessing
# pipeline; fitted steps are cached in `cachedir`.
svm_pipeline = pipeline.Pipeline(
    steps=[
        ('cleaner', cleaner),
        ('svr', svm.LinearSVR()),
    ],
    memory=cachedir,
)

In [None]:
# Bare expression as the cell's last statement: the notebook shows the
# pipeline's representation as the cell output.
svm_pipeline

In [None]:

# Inner search: pick the LinearSVR regularization parameter `C` by grid search
# on negative MSE, then refit the best configuration.
svm_grid = model_selection.GridSearchCV(
    estimator=svm_pipeline,
    param_grid={'svr__C': [2.5, 5, 10, 20]},
    scoring='neg_mean_squared_error',
    refit=True,
)

# Outer evaluation: cross-validate the whole grid search against every metric
# listed in `scoring`.
svm_scores = model_selection.cross_validate(
    estimator=svm_grid,
    X=coating_df.drop(columns='total_liters_used'),
    y=coating_df['total_liters_used'],
    scoring=scoring,
)

In [None]:
# Histogram-based gradient boosting on top of the shared preprocessing
# pipeline. `memory=cachedir` is passed here as well so this pipeline caches
# its fitted preprocessing step exactly like the Lasso and SVR pipelines do.
gradient_booster_pipeline = pipeline.Pipeline([
    ('cleaner', cleaner),
    ('gradient_booster', ensemble.HistGradientBoostingRegressor(max_iter=100))
], memory=cachedir)

In [None]:
# Bare expression as the cell's last statement: the notebook shows the
# pipeline's representation as the cell output.
gradient_booster_pipeline

In [None]:

# Inner search: pick the boosting `learning_rate` by grid search on negative
# MSE, then refit the best configuration.
gradient_booster_grid = model_selection.GridSearchCV(
    estimator=gradient_booster_pipeline,
    param_grid={'gradient_booster__learning_rate': [0.05, 0.1, 0.25, 0.5]},
    scoring='neg_mean_squared_error',
    refit=True,
)

# Outer evaluation: cross-validate the whole grid search against every metric
# listed in `scoring`.
gradient_booster_scores = model_selection.cross_validate(
    estimator=gradient_booster_grid,
    X=coating_df.drop(columns='total_liters_used'),
    y=coating_df['total_liters_used'],
    scoring=scoring,
)

In [None]:
# Tabulate the Lasso cross-validation results and show the table as the cell
# output.
lasso_scores_df = pd.DataFrame(data=lasso_scores)
lasso_scores_df

In [None]:
# Tabulate the LinearSVR cross-validation results and show the table as the
# cell output.
svm_scores_df = pd.DataFrame(data=svm_scores)
svm_scores_df

In [None]:
# Tabulate the gradient-boosting cross-validation results and show the table
# as the cell output.
gradient_booster_scores_df = pd.DataFrame(data=gradient_booster_scores)
gradient_booster_scores_df

El mejor modelo es claramente el de **Gradient Boost**. Este modelo presenta el menor error en todas las pruebas, por lo cual resulta la mejor elección para la predicción del consumo de pintura. Los otros dos modelos comparten la característica de ser modelos lineales, por lo que no son ideales para realizar esta regresión.

In [None]:
# Show the numeric columns of the dataset as the cell output.
# NOTE(review): `_get_numeric_data` is an underscore-prefixed (non-public)
# pandas method; a public alternative such as `DataFrame.select_dtypes` may be
# preferable -- confirm it selects the same columns before switching.
coating_df._get_numeric_data()

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

# Pairwise correlation matrix of the numeric columns, leaving out 'pos' and
# 'nom_1', rounded to two decimals; shown as the cell output.
corr = (
    coating_df._get_numeric_data()
    .drop(columns=['pos', 'nom_1'])
    .corr()
    .round(2)
)
corr

In [None]:
# Draw the correlation matrix as an annotated heatmap on an 8x8-inch figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr, annot=True, ax=heatmap_ax)
plt.show()
