In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import numpy as np
import locale
# Switch time formatting/parsing (LC_TIME) to Spanish.
# NOTE(review): presumably needed so the '%b' month-name parsing of `mes_pal` further down
# understands Spanish abbreviations — confirm. setlocale raises locale.Error when the
# 'es_ES.UTF-8' locale is not installed on the machine.
locale.setlocale(locale.LC_TIME, 'es_ES.UTF-8')

import ds_utilidades as ds

# Silence pandas' SettingWithCopyWarning (the pandas default is 'warn').
pd.options.mode.chained_assignment = None
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed
# the old names, so use whichever spelling this matplotlib installation provides.
_estilo_notebook = ('seaborn-notebook' if 'seaborn-notebook' in plt.style.available
                    else 'seaborn-v0_8-notebook')
plt.style.use(_estilo_notebook)

## Data loading and cleaning

In this section, the applicant loads and cleans raw data from the following files:
- `precipitaciones.csv` has the monthly mean of rainfall registered from January 1979 to April 2020.
- `banco_central.csv` has macroeconomic variables.

### Rainfall _(precipitaciones)_

In [None]:
# Load the rainfall table (the original note gives the unit as mm), parse the
# 'date' column and order the rows chronologically with a fresh 0..n-1 index.
precipitaciones = (
    pd.read_csv('./precipitaciones.csv')
    .assign(date=lambda tabla: pd.to_datetime(tabla['date'], format='%Y-%m-%d'))
    .sort_values('date')
    .reset_index(drop=True)
)
precipitaciones

In [None]:
# Show every row that has at least one missing value.
precipitaciones[precipitaciones.isna().any(axis=1)] # author's note: there are no NaNs

In [None]:
# Show every row whose 'date' appears more than once (keep=False lists all copies).
precipitaciones[precipitaciones.duplicated(subset = 'date', keep = False)] # author's note: no duplicated dates

In [None]:
# Rainfall columns used for the summary below, one per region.
regiones = [
    'Coquimbo',
    'Valparaiso',
    'Metropolitana_de_Santiago',
    'Libertador_Gral__Bernardo_O_Higgins',
    'Maule',
    'Biobio',
    'La_Araucania',
    'Los_Rios',
]
# Descriptive statistics (count, mean, std, quartiles, ...) for each region.
precipitaciones[regiones].describe()

In [None]:
# Inspect the column types after parsing 'date'.
precipitaciones.dtypes

### Macroeconomic variables _(banco central)_

In [None]:
# Load the raw macroeconomic table; every column is parsed/cleaned in the cells below.
banco_central = pd.read_csv('./banco_central.csv')
banco_central

In [None]:
# Keep only the leading 'YYYY-MM-DD' part of each period string. The vectorised `.str`
# accessor passes missing values through as NaN, whereas slicing inside a Python
# lambda (x[0:10]) raises TypeError on the first missing value.
banco_central['Periodo'] = banco_central['Periodo'].str[:10]

# errors='coerce' turns anything that is not a valid date into NaT instead of raising;
# rows with a NaT period are filtered out in a later cell.
banco_central['Periodo'] = pd.to_datetime(banco_central['Periodo'], format = '%Y-%m-%d', errors = 'coerce')
display(banco_central.head())
print(banco_central.columns)


In [None]:
# List every row whose 'Periodo' is repeated (keep=False shows all copies).
banco_central[banco_central.duplicated(subset = 'Periodo', keep = False)] # duplicated periods get dropped

In [None]:
# Keep the first row of each period, then discard rows whose period is missing (NaT).
banco_central = banco_central.drop_duplicates(subset='Periodo')
banco_central = banco_central[banco_central['Periodo'].notna()]

In [None]:
def convert_int(x):
    """Parse an integer written with '.' separators, e.g. '1.234.567' -> 1234567.

    Every '.' in the string is removed before the remaining text is handed to int().
    """
    sin_puntos = ''.join(x.split('.'))
    return int(sin_puntos)

# Select every GDP ('PIB') column plus the period key.
cols_pib = [col for col in banco_central.columns if 'PIB' in col]
cols_pib.append('Periodo')
# Drop rows with any missing GDP value; .copy() makes the frame independent of
# banco_central so the column assignments below never write into a slice.
banco_central_pib = banco_central[cols_pib].dropna(how = 'any', axis = 0).copy()

# Convert each GDP column from '.'-separated text to integers; the period key is left as is.
for col in cols_pib:
    if col == 'Periodo':
        continue
    banco_central_pib[col] = banco_central_pib[col].apply(convert_int)

# sort_values returns a new frame: without the assignment the ordering was silently lost.
banco_central_pib = banco_central_pib.sort_values(by = 'Periodo', ascending = True)
banco_central_pib

In [None]:
def to_100(x):
    """Rebuild an index value of roughly 100 from text whose '.' separators are misplaced.

    Original author's note (translated): looking at the central-bank data, the
    series lies between ~85 and ~120 (the unit was left as an open question).

    The string is split on '.', and only the first two groups are used:
    - first group starts with '1'  -> the value is read as >= 100 (three integer digits);
    - otherwise                    -> the value is read as < 100 (two integer digits).

    Parameters
    ----------
    x : str
        Raw value such as '102.796.218' or '95.123.456'.

    Returns
    -------
    float
        The rescaled value, e.g. 102.796 or 95.123.

    Raises
    ------
    ValueError
        If the selected digits do not form a number (e.g. an empty string).
    """
    grupos = x.split('.')
    primero = grupos[0]
    # A value without any '.' has no second group; use '' instead of raising IndexError.
    segundo = grupos[1] if len(grupos) > 1 else ''

    if primero.startswith('1'):  # 100 or more
        if len(primero) > 2:
            # First group already holds the integer part, e.g. '102' + '.' + '796'.
            return float(primero + '.' + segundo)
        digitos = primero + segundo
        return float(digitos[0:3] + '.' + digitos[3:])

    if len(primero) > 2:
        # Two integer digits, then every remaining digit of the first group
        # (the previous x[0][-1] skipped the middle digits of groups longer than three).
        return float(primero[0:2] + '.' + primero[2:])
    digitos = primero + segundo
    return float(digitos[0:2] + '.' + digitos[2:])

        
# Select every Imacec column plus the period key.
cols_imacec = [col for col in banco_central.columns if 'Imacec' in col]
cols_imacec.append('Periodo')
# Drop rows with any missing Imacec value; .copy() makes the frame independent of
# banco_central so the column assignments below never write into a slice.
banco_central_imacec = banco_central[cols_imacec].dropna(how = 'any', axis = 0).copy()

for col in cols_imacec:
    if col == 'Periodo':
        continue
    banco_central_imacec[col] = banco_central_imacec[col].apply(to_100)
    # Sanity checks on the rescaled column: it must exceed 100 somewhere and never fall to 30 or below.
    assert banco_central_imacec[col].max() > 100
    assert banco_central_imacec[col].min() > 30

# sort_values returns a new frame: without the assignment the ordering was silently lost.
banco_central_imacec = banco_central_imacec.sort_values(by = 'Periodo', ascending = True)
banco_central_imacec

In [None]:
# Retail sales index (IVCM) with its period: drop incomplete rows and order by date.
# TODO(review): the original author left the units of this index as an open question.
banco_central_iv = (
    banco_central[['Indice_de_ventas_comercio_real_no_durables_IVCM', 'Periodo']]
    .dropna()
    .sort_values('Periodo')
)

In [None]:
# Peek at the first rows of the sales index.
banco_central_iv.head() # units? https://si3.bcentral.cl/siete/ES/Siete/Canasta?idCanasta=M57TP1161519 percentages?

In [None]:
# Numeric version of the sales index, rescaled with to_100 like the Imacec columns.
ivcm_texto = banco_central_iv['Indice_de_ventas_comercio_real_no_durables_IVCM']
banco_central_iv['num'] = ivcm_texto.map(to_100)
banco_central_iv

In [None]:
# Earliest period with a sales-index value.
banco_central_iv.Periodo.min()

In [None]:
# Latest period with a sales-index value.
banco_central_iv.Periodo.max()

In [None]:
# Combine GDP, Imacec and sales index; the inner joins keep only periods present in all three.
banco_central_num = (
    banco_central_pib
    .merge(banco_central_imacec, on='Periodo', how='inner')
    .merge(banco_central_iv, on='Periodo', how='inner')
)

# Supply data and feature engineering.

In this section, the applicant loads the file `precio_leche.csv`, which contains the monthly price of milk from 1979 to 2021. They merge this dataset with the previously processed data and create new variables to help prediction (feature engineering). Additionally, some variables are plotted for exploratory analysis.

In [None]:
# Load the milk price table. Original note (translated): the price is nominal,
# excluding VAT, in CLP per litre.
precio_leche = pd.read_csv('./precio_leche.csv').rename(columns={'Anio': 'ano', 'Mes': 'mes_pal'})
# Parse the abbreviated month name ('%b') and keep only its month number.
mes_como_fecha = pd.to_datetime(precio_leche['mes_pal'], format='%b')
precio_leche['mes'] = mes_como_fecha.apply(lambda fecha: fecha.month)
# 'month-year' text label, used as the x axis of the plots.
precio_leche['mes-ano'] = precio_leche.apply(lambda fila: f'{fila.mes}-{fila.ano}', axis=1)
precio_leche.head()

In [None]:
# Full milk-price series over time.
precio_leche.plot(x = 'mes-ano', y = 'Precio_leche') # price rise in 2010-2011?

In [None]:
# Same plot restricted to 2013 onwards.
precio_leche[precio_leche.ano>=2013].plot(x = 'mes-ano', y = 'Precio_leche')

In [None]:
# Add month/year keys to the rainfall table and join it with the milk prices.
precipitaciones['mes'] = precipitaciones['date'].map(lambda fecha: fecha.month)
precipitaciones['ano'] = precipitaciones['date'].map(lambda fecha: fecha.year)
precio_leche_pp = (
    precio_leche
    .merge(precipitaciones, on=['mes', 'ano'], how='inner')
    .drop(columns='date')
)
precio_leche_pp  # original note: the latest rainfall date is 2020-04-01

In [None]:
# Add month/year keys to the macroeconomic table, join it with prices + rainfall,
# and drop the helper/text columns that must not reach the model.
banco_central_num['mes'] = banco_central_num['Periodo'].map(lambda periodo: periodo.month)
banco_central_num['ano'] = banco_central_num['Periodo'].map(lambda periodo: periodo.year)
precio_leche_pp_pib = (
    precio_leche_pp
    .merge(banco_central_num, on=['mes', 'ano'], how='inner')
    .drop(columns=['Periodo', 'Indice_de_ventas_comercio_real_no_durables_IVCM', 'mes-ano', 'mes_pal'])
)
precio_leche_pp_pib

# Model
In this section, the applicant builds a regression model to predict the price of milk. The model uses the macroeconomic and climatological variables loaded at the beginning of this notebook. It also uses features created in the previous section.

In [None]:
# Target: the milk price. Features: every other column of the merged table.
y = precio_leche_pp_pib['Precio_leche']
X = precio_leche_pp_pib.drop(columns=['Precio_leche'])

In [None]:
# Mean of the target, as a scale reference for the error metrics below.
y.mean()

In [None]:
# Standard deviation of the target, as a scale reference for the error metrics below.
y.std()

In [None]:
# imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

# Fix NumPy's global seed so steps that draw from it are repeatable between runs.
np.random.seed(0)

# Random 80/20 split of the rows into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline: standardise -> keep the k best features by mutual information ->
# polynomial expansion -> ridge regression.
pipe = Pipeline([('scale', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_regression)),
                 ('poly', PolynomialFeatures()),
                 ('model', Ridge())])

# Hyper-parameter grid explored with 3-fold cross-validation, scored by R^2.
k = [3, 4, 5, 6, 7, 10]
alpha = [1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]
poly = [1, 2, 3, 5, 7]
grid = GridSearchCV(estimator = pipe,
                    param_grid = dict(selector__k=k,
                                      poly__degree=poly,
                                      model__alpha=alpha),
                    cv = 3,
                    scoring = 'r2')
grid.fit(X_train, y_train)
y_predicted = grid.predict(X_test)

# Evaluate on the held-out test set. mean_squared_error returns the MSE, so take its
# square root to obtain the RMSE that is reported (same units as the target).
rmse = np.sqrt(mean_squared_error(y_test, y_predicted))
r2 = r2_score(y_test, y_predicted)

# Report the metrics.
print('RMSE: ', rmse)
print('R2: ', r2)

In [None]:
# Hyper-parameter combination with the best cross-validated score.
grid.best_params_

In [None]:
# Names of the training columns kept by the 'selector' step of the best pipeline.
X_train.columns[grid.best_estimator_.named_steps['selector'].get_support()]

In [None]:
# Real vs predicted price for every test row, plotted against the row position.
predicted = (
    y_test.to_frame()
    .reset_index(drop=True)
    .assign(predicc=y_predicted)
    .reset_index()
)
posiciones = predicted.index
plt.scatter(posiciones, predicted['Precio_leche'], label='real')
plt.scatter(posiciones, predicted['predicc'], color='red', alpha=0.5, label='prediccion')
plt.grid(axis='x')
plt.legend()

In [None]:
# Residuals against predictions; the dashed horizontal line marks a zero residual.
predicted['residual'] = predicted['Precio_leche'] - predicted['predicc']
limite_izq = predicted['predicc'].min() - 10
limite_der = predicted['predicc'].max() + 10
plt.hlines(0, xmin=limite_izq, xmax=limite_der, linestyle='--', color='black', linewidth=0.7)
plt.scatter(predicted['predicc'], predicted['residual'])
plt.xlabel('Predicción')
plt.ylabel('Residuo (y_real - y_pred)')

### Regression using only macroeconomic and climatological variables

In [None]:
# Fix NumPy's global seed so steps that draw from it are repeatable between runs.
np.random.seed(0)

# Restrict the existing train/test split to the columns whose name does not contain 'leche'.
cols_no_leche = [col for col in X.columns if 'leche' not in col]
X_train = X_train[cols_no_leche]
X_test = X_test[cols_no_leche]

# Pipeline: standardise -> keep the k best features by mutual information ->
# polynomial expansion -> ridge regression.
pipe = Pipeline([('scale', StandardScaler()),
                 ('selector', SelectKBest(mutual_info_regression)),
                 ('poly', PolynomialFeatures()),
                 ('model', Ridge())])

# Hyper-parameter grid explored with 3-fold cross-validation, scored by R^2.
k = [3, 4, 5, 6, 7, 10]
alpha = [1, 0.5, 0.2, 0.1, 0.05, 0.02, 0.01]
poly = [1, 2, 3, 5, 7]
grid = GridSearchCV(estimator = pipe,
                    param_grid = dict(selector__k=k,
                                      poly__degree=poly,
                                      model__alpha=alpha),
                    cv = 3,
                    scoring = 'r2')
grid.fit(X_train, y_train)
y_predicted_noleche = grid.predict(X_test)

# Evaluate on the held-out test set. mean_squared_error returns the MSE, so take its
# square root to obtain the RMSE that is reported (same units as the target).
rmse = np.sqrt(mean_squared_error(y_test, y_predicted_noleche))
r2 = r2_score(y_test, y_predicted_noleche)

# Report the metrics.
print('RMSE: ', rmse)
print('R2: ', r2)

In [None]:
# Hyper-parameter combination with the best cross-validated score for this second search.
grid.best_params_

In [None]:
# Names of the (filtered) training columns kept by the 'selector' step of the best pipeline.
X_train.columns[grid.best_estimator_.named_steps['selector'].get_support()]

In [None]:
# Real vs predicted price for every test row, using the predictions of the second model.
predicted = (
    y_test.to_frame()
    .reset_index(drop=True)
    .assign(predicc=y_predicted_noleche)
    .reset_index()
)
posiciones = predicted.index
plt.scatter(posiciones, predicted['Precio_leche'], label='real')
plt.scatter(posiciones, predicted['predicc'], color='red', alpha=0.5, label='prediccion')
plt.grid(axis='x')
plt.legend()

In [None]:
# Residuals against predictions; the dashed horizontal line marks a zero residual.
predicted['residual'] = predicted['Precio_leche'] - predicted['predicc']
limite_izq = predicted['predicc'].min() - 10
limite_der = predicted['predicc'].max() + 10
plt.hlines(0, xmin=limite_izq, xmax=limite_der, linestyle='--', color='black', linewidth=0.7)
plt.scatter(predicted['predicc'], predicted['residual'])
plt.xlabel('Predicción')
plt.ylabel('Residuo (y_real - y_pred)')