<h3>This notebook fits a Lasso regression. Our dataset consists of weather and other variables corresponding to sugar cane crops; the goal is to predict sucrose production values.</h3>

# Loading libraries and dataset

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names, so use whichever name this install provides.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use([_seaborn_style])
sns.set_theme(style="whitegrid", palette=sns.color_palette("tab10"))
sns.set_style('ticks')

from sklearn.linear_model import Lasso
# import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_absolute_error
from scipy.stats import normaltest
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, quantile_transform
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import multiprocessing

np.random.seed(42)

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the Excel workbook that is read into the working DataFrame.
path='/data.xlsx'
weather_data_final=pd.read_excel(path)
# Suffix appended to the file names of the figures saved later in the notebook.
file = ' no suelo'

#  Normalization
---

Perform a normality test on our target variables (*Sac, Sac Campo* and *Sac % Caña*)

In [None]:
# One histogram panel per raw target column.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(25, 10))
for axis, column, colour in zip(ax[:2], ['Sac', 'Sac % Caña'], ['r', 'g']):
    axis.hist(weather_data_final[column], color=colour)
    axis.set_title('Histogram of ' + column)
# The last panel keeps matplotlib's default colour.
ax[2].hist(weather_data_final['Sac Campo'])
ax[2].set_title('Histogram of Sac Campo')

In [None]:
def normal_test(target, data=None, alpha=0.05):
    """Run scipy's ``normaltest`` on one column and print the verdict.

    Parameters
    ----------
    target : str
        Name of the column to test.
    data : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level
        ``weather_data_final``.
    alpha : float, optional
        Significance level. The column is reported as Gaussian when the
        p-value is strictly greater than this threshold.

    Returns
    -------
    None
        The result is only printed.
    """
    if data is None:
        # Resolved at call time so the function follows later changes to the frame.
        data = weather_data_final
    _, p_value = normaltest(data[target])
    if p_value > alpha:
        print(target + ' looks Gaussian')
    else:
        print(target + ' does not look Gaussian')
# Test each raw (untransformed) target column for normality.
normal_test('Sac')
normal_test('Sac % Caña')
normal_test('Sac Campo')

There is no conclusion about the normality.

Using preprocessing techniques such as *Quantile transformation* we can normalize our targets.

In [None]:
def normalization(target):
    """Map one column of ``weather_data_final`` onto a normal distribution.

    The column named ``target`` is passed, as a single-column frame, through
    scikit-learn's ``quantile_transform`` with ``output_distribution="normal"``.
    The transformed values are returned exactly as ``quantile_transform``
    produces them; ``copy=True`` is passed so the transform works on a copy.

    NOTE(review): the transform is computed on the whole column, i.e. before
    the train/test split performed later in the notebook.
    """
    column_frame = weather_data_final[target].to_frame()
    return quantile_transform(column_frame, output_distribution="normal", copy=True)

# Add one quantile-normalized column per raw target, in the same order as before.
for new_column, raw_column in [
    ('sac_trans', 'Sac'),
    ('sac_caña_trans', 'Sac % Caña'),
    ('sac_campo_trans', 'Sac Campo'),
]:
    weather_data_final[new_column] = normalization(raw_column)

In [None]:
# One histogram panel per transformed target column.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(25, 10))
coloured_panels = [
    (ax[0], 'sac_trans', 'r', 'Histogram of Sac after transformation'),
    (ax[1], 'sac_caña_trans', 'g', 'Histogram of Sac % Caña after transformation'),
]
for axis, column, colour, title in coloured_panels:
    axis.hist(weather_data_final[column], color=colour)
    axis.set_title(title)
# The last panel keeps matplotlib's default colour.
ax[2].hist(weather_data_final['sac_campo_trans'])
ax[2].set_title('Histogram of Sac Campo after transformation')

Info about target variables after the transformation

In [None]:
# Summary statistics of the three transformed target columns.
weather_data_final[['sac_trans', 'sac_campo_trans', 'sac_caña_trans']].describe()

Testing normality

In [None]:
# Repeat the normality test on the quantile-transformed target columns.
normal_test('sac_trans')
normal_test('sac_caña_trans')
normal_test('sac_campo_trans')

In [None]:
# Remove the raw targets so that only the quantile-normalized targets remain.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
weather_data_final = weather_data_final.drop(
    columns=['Sac', 'Sac % Caña', 'Sac Campo'], errors='ignore'
)

# Train-Test split

Train-test split 80/20

In [None]:
# Features are every column except the three transformed targets.
target_columns = ['sac_trans', 'sac_caña_trans', 'sac_campo_trans']
X = weather_data_final.drop(columns=target_columns)
y = weather_data_final[target_columns]
# Fixed random_state: the 80/20 split no longer depends on how much of the
# global NumPy random stream earlier cells have consumed, so re-running this
# cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocess and training pipeline



We are going to use one-hot encoding for the categorical variables and standardization for the numerical ones.

In [None]:
# Columns that are one-hot encoded.
cat_vars = ['tmprda', 'TIPO COS','Con Sin Mad', 'nm_cndcion', 'PRODUCTO', 'VAR']
# Numeric features are detected by dtype. Any column already declared
# categorical is skipped, so it is one-hot encoded only and never also scaled.
num_vars = [
    col
    for col in X_train.select_dtypes(include=['float64', 'int']).columns
    if col not in cat_vars
]

In [None]:
# Transformations for the numeric variables
numeric_transformer = Pipeline(
                        steps=[('scaler', StandardScaler())]
                      )

# Transformations for the categorical variables
categorical_transformer = Pipeline(
                            steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
                          )

# Columns not listed in num_vars / cat_vars are passed through untouched.
preprocessor = ColumnTransformer(
                    transformers=[
                        ('numeric', numeric_transformer, num_vars),
                        ('cat', categorical_transformer, cat_vars)
                    ],
                    remainder='passthrough'
                )

# Preprocessing steps and the model are combined into a single pipeline.
pipe = Pipeline([('preprocessing', preprocessor),
                 ('modelo', Lasso(max_iter=2000))])

# Search space for Lasso's alpha. The first grid point (alpha = 0) is dropped:
# scikit-learn advises against alpha = 0 with Lasso (it is plain least squares
# and the coordinate-descent solver is not meant for it).
param_distributions = {'modelo__alpha': np.linspace(0, 1, 100)[1:]}

grid = RandomizedSearchCV(
        estimator  = pipe,
        param_distributions = param_distributions,
        n_iter     = 10,
        scoring    = 'neg_mean_absolute_error',
        # Leave one core free, but never ask for 0 workers on a single-CPU host.
        n_jobs     = max(1, multiprocessing.cpu_count() - 1),
        # Seeded folds: every fit of this search uses the same CV partitions.
        cv         = RepeatedKFold(n_splits = 5, n_repeats = 3, random_state = 123),
        refit      = True,
        verbose    = 0,
        random_state = 123,
        return_train_score = True
       )

# Sac

In [None]:
# Run the randomized search on the training split for the transformed Sac target.
target = 'sac_trans'
grid.fit(X = X_train, y = y_train[target])

In [None]:
# Cross-validation results of the search; show the three best candidates
# by mean test score.
resultados_sac = pd.DataFrame(grid.cv_results_)
(
    resultados_sac
    .filter(regex='(param.*|mean_t|std_t)')
    .drop(columns='params')
    .sort_values('mean_test_score', ascending=False)
    .head(3)
)

In [None]:
# Plot of cross-validation results for each hyperparameter value
# ==============================================================================
def plot_error(resultados):
    """Plot mean train/test CV score against the Lasso ``alpha`` values tried.

    Parameters
    ----------
    resultados : pandas.DataFrame
        ``cv_results_`` of a fitted search converted to a DataFrame. It must
        contain ``param_modelo__alpha`` plus the ``mean_``/``std_`` train and
        test score columns.

    A shaded band of one standard deviation is drawn around each curve. The
    y-axis label is read from the notebook-level ``grid.scoring``.
    """
    fig, ax = plt.subplots(figsize=(6, 3.84))
    hiperparametro = 'param_modelo__alpha'
    # alpha is a float in [0, 1] and must stay a float: casting it to int would
    # truncate nearly every value to 0 and collapse the bands onto x = 0.
    # Converting the column once lets the lines and the bands share one x-axis.
    resultados = (
        resultados
        .astype({hiperparametro: float})
        .sort_values(hiperparametro, ascending=False)
    )
    metrica = grid.scoring

    resultados.plot(hiperparametro, 'mean_train_score', ax=ax)
    resultados.plot(hiperparametro, 'mean_test_score', ax=ax)
    ax.fill_between(resultados[hiperparametro],
                    resultados['mean_train_score'] + resultados['std_train_score'],
                    resultados['mean_train_score'] - resultados['std_train_score'],
                    alpha=0.2)
    ax.fill_between(resultados[hiperparametro],
                    resultados['mean_test_score'] + resultados['std_test_score'],
                    resultados['mean_test_score'] - resultados['std_test_score'],
                    alpha=0.2)
    ax.legend()
    ax.set_title('Evolución del error CV')
    ax.set_ylabel(metrica);
plot_error(resultados_sac)

In [None]:
# Test-set error of the final model (the refitted best estimator of the search).
# Both metrics are computed on the transformed target column named by `target`.
# ==============================================================================
modelo_final = grid.best_estimator_
predicciones = modelo_final.predict(X=X_test)
mae_err = mean_absolute_error(y_true=y_test[target], y_pred=predicciones)
r2_err = r2_score(y_true=y_test[target], y_pred=predicciones)
print(f"El error (MAE) de test es: {mae_err}")
print(f"El r2 de test es: {r2_err}")

In [None]:
def plot_series(time, series, i, format="-", start=0, end=None):
    """Draw ``series`` against ``time`` on the current pyplot axes.

    Both inputs are cut to ``[start:end]``. ``format`` is the matplotlib
    format string and ``i`` becomes the legend label of the line. The axis
    labels are set to fixed texts on every call, and no legend is drawn here.
    """
    window = slice(start, end)
    plt.plot(time[window], series[window], format, label=i)
    plt.xlabel("Unseen Samples")
    plt.ylabel("Sucrose Field")

In [None]:
# True vs predicted values on the test split, plotted against sample position.
plt.figure(figsize=(22,5))
plot_series(y_test.reset_index().index, y_test[target], "True")
plot_series(y_test.reset_index().index, predicciones,'Predicted')
plt.legend()
plt.grid(False)
plt.title('Sac true vs predicted')
# Keep a handle on the figure before plt.show() so it can still be saved afterwards.
fig = plt.gcf()
plt.show()
# bbox_inches='tight' is the savefig option that trims the surrounding whitespace.
fig.savefig('true vs predic sac' + file + '.png', bbox_inches='tight')

# Sac Campo

In [None]:
# Re-run the same randomized search, now for the transformed Sac Campo target.
target = 'sac_campo_trans'
grid.fit(X = X_train, y = y_train[target])

In [None]:
# Cross-validation results of the latest search; show the three best candidates
# by mean test score. The variable name `resultados_sac` is reused here.
resultados_sac = pd.DataFrame(grid.cv_results_)
(
    resultados_sac
    .filter(regex='(param.*|mean_t|std_t)')
    .drop(columns='params')
    .sort_values('mean_test_score', ascending=False)
    .head(3)
)

In [None]:
# Plot of cross-validation results for each hyperparameter value
# ==============================================================================
# NOTE: when cells run in order, `resultados_sac` holds the results of the
# Sac Campo search at this point (the name is reused).

plot_error(resultados_sac)

In [None]:
# Test-set error of the final model (the refitted best estimator of the search).
# Both metrics are computed on the transformed target column named by `target`.
# ==============================================================================
modelo_final = grid.best_estimator_
predicciones = modelo_final.predict(X=X_test)
mae_err = mean_absolute_error(y_true=y_test[target], y_pred=predicciones)
r2_err = r2_score(y_true=y_test[target], y_pred=predicciones)
print(f"El error (MAE) de test es: {mae_err}")
print(f"El r2 de test es: {r2_err}")

In [None]:
# True vs predicted values on the test split, plotted against sample position.
plt.figure(figsize=(22,5))
plot_series(y_test.reset_index().index, y_test[target], "True")
plot_series(y_test.reset_index().index, predicciones,'Predicted')
plt.legend()
plt.grid(False)
plt.title('Sac_Campo true vs predicted')
# Keep a handle on the figure before plt.show() so it can still be saved afterwards.
fig = plt.gcf()

plt.show()

# bbox_inches='tight' is the savefig option that trims the surrounding whitespace.
fig.savefig('true vs predic sac campo' + file + '.png', bbox_inches='tight')

# Sac % Caña

In [None]:
# Re-run the same randomized search, now for the transformed Sac % Caña target.
target = 'sac_caña_trans'
grid.fit(X = X_train, y = y_train[target])

In [None]:
# Cross-validation results of the latest search; show the three best candidates
# by mean test score. The variable name `resultados_sac` is reused here.
resultados_sac = pd.DataFrame(grid.cv_results_)
(
    resultados_sac
    .filter(regex='(param.*|mean_t|std_t)')
    .drop(columns='params')
    .sort_values('mean_test_score', ascending=False)
    .head(3)
)

In [None]:
# Plot of cross-validation results for each hyperparameter value
# ==============================================================================
# NOTE: when cells run in order, `resultados_sac` holds the results of the
# Sac % Caña search at this point (the name is reused).

plot_error(resultados_sac)

In [None]:
# Test-set error of the final model (the refitted best estimator of the search).
# Both metrics are computed on the transformed target column named by `target`.
# ==============================================================================
modelo_final = grid.best_estimator_
predicciones = modelo_final.predict(X=X_test)
mae_err = mean_absolute_error(y_true=y_test[target], y_pred=predicciones)
r2_err = r2_score(y_true=y_test[target], y_pred=predicciones)
print(f"El error (MAE) de test es: {mae_err}")
print(f"El r2 de test es: {r2_err}")

In [None]:
# True vs predicted values on the test split, plotted against sample position.
plt.figure(figsize=(22,5))
plot_series(y_test.reset_index().index, y_test[target], "True")
plot_series(y_test.reset_index().index, predicciones,'Predicted')
plt.legend()
plt.grid(False)
plt.title('Sac_%_Caña true vs predicted')
# Keep a handle on the figure before plt.show() so it can still be saved afterwards.
fig = plt.gcf()
plt.show()
# bbox_inches='tight' is the savefig option that trims the surrounding whitespace.
fig.savefig('true vs predic sac caña' + file + '.png', bbox_inches='tight')