In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

In [2]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are converted to NumPy arrays; the error of each element is
    taken relative to the corresponding true value, so a true value of zero
    yields a non-finite result.

    Parameters
    ----------
    y_true : array-like
        Observed values (used as the denominator).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = (actual - predicted) / actual
    return np.mean(np.abs(relative_errors)) * 100

In [3]:
class CategoricalEncoder:
    """Encode categorical DataFrame columns as numbers.

    Two strategies are available through ``method``:

    * ``'classic'``: each distinct value seen during ``fit`` receives an
      integer code in order of first appearance (0, 1, 2, ...); transformed
      columns are cast to ``int``.
    * ``'mean'``: each distinct value is replaced by the mean of the target
      column over the training rows that hold that value.

    Values that were not seen during ``fit`` are mapped to a fallback stored
    under ``encoder['NaN']``: ``-1`` for ``'classic'`` and the overall target
    mean for ``'mean'``.

    Parameters
    ----------
    cols : list of str
        Names of the columns to encode.
    method : {'classic', 'mean'}, default 'classic'
        Encoding strategy.
    target : str, default 'P80'
        Column of ``y`` used to compute the means when ``method='mean'``
        and ``y`` is a DataFrame.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies.
    """

    # Key of ``self.encoder`` holding the fallback for unseen categories.
    _UNSEEN_KEY = 'NaN'
    _METHODS = ('classic', 'mean')

    def __init__(self, cols, method='classic', target='P80'):
        if method not in self._METHODS:
            raise ValueError(
                "method must be one of {}, got {!r}".format(self._METHODS, method)
            )
        self.COLUMNS = cols
        self.METHOD = method
        self.TARGET = target
        self.encoder = {}
        self.inverse_encoder = {}

    def fit(self, X, y=None):
        """Learn the value -> code mapping of every configured column.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features containing the configured columns.
        y : pandas.DataFrame or pandas.Series, optional
            Training targets, aligned with ``X`` by index. Required when
            ``method='mean'``; a DataFrame must contain the ``target`` column.

        Returns
        -------
        CategoricalEncoder
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``method='mean'`` and ``y`` is not provided.
        """
        use_mean = self.METHOD == 'mean'
        if use_mean:
            if y is None:
                raise ValueError("method='mean' requires y to be provided to fit()")
            # Accept the full target frame or the target series itself.
            target_values = y[self.TARGET] if hasattr(y, 'columns') else y

        self.encoder = {}
        self.inverse_encoder = {}
        for col in self.COLUMNS:
            if use_mean:
                # sort=False keeps the groups in order of first appearance.
                mapping = target_values.groupby(X[col], sort=False).mean().to_dict()
            else:
                mapping = {val: code for code, val in enumerate(X[col].unique())}
            self.encoder[col] = mapping
            self.inverse_encoder[col] = {code: val for val, code in mapping.items()}

        # Set once, for both methods: the original only defined this for
        # 'mean', which made transform() fail with KeyError for 'classic'.
        self.encoder[self._UNSEEN_KEY] = target_values.mean() if use_mean else -1
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns encoded.

        Values missing from the fitted mapping (including NaN) are replaced
        by the fallback stored under ``encoder['NaN']``.
        """
        X = X.copy()
        for col in self.COLUMNS:
            encoded = X[col].map(self.encoder[col]).fillna(self.encoder[self._UNSEEN_KEY])
            if self.METHOD == 'classic':
                # map() yields floats as soon as one value is unseen; restore ints.
                encoded = encoded.astype(int)
            X[col] = encoded
        return X

    def inverse_transform(self, X):
        """Return a copy of ``X`` with codes mapped back to original values.

        Codes without a known original value (e.g. the unseen-value fallback)
        are left untouched. With ``method='mean'``, categories that share
        exactly the same mean cannot be told apart.
        """
        X = X.copy()
        for col in self.COLUMNS:
            X[col] = X[col].replace(self.inverse_encoder[col])
        return X
    

In [4]:
"""
Note: the P80 distribution is bimodal.
"""
# Load the raw table, discard rows whose 'Tipo Explosivo' is 'M', then drop
# every row that still has a missing value.
df = (
    pd.read_csv('data/data_fixed.csv', sep=';')
    .loc[lambda frame: frame['Tipo Explosivo'] != 'M']
    .dropna()
)

In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['Fase', 'Banco', 'Tipo de tronadura', 'Tipo Material', 'M',
       'Dominio Estructural', 'Diámetro', 'BxS', 'Tiempo entre Pozos Filas ms',
       'Fc', 'Tipo Explosivo', 'P10', 'P20', 'P30', 'P40', 'P50', 'P60', 'P70',
       'P80', 'P90', 'P100', 'Este', 'Norte', 'Cota', 'B', 'S', 'tiempo_1',
       'tiempo_2'],
      dtype='object')

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Fase,Banco,Tipo de tronadura,Tipo Material,M,Dominio Estructural,Diámetro,BxS,Tiempo entre Pozos Filas ms,Fc,...,P80,P90,P100,Este,Norte,Cota,B,S,tiempo_1,tiempo_2
0,Stage 5,2930.0,Tronadura 1,Roca 1,L4,Noreste,10.625,6.5x6.5,11-105,498.0,...,6.5,9.37,15.08,59095.2,90292.2,2930.0,6.5,6.5,11.0,105.0
1,Stage 4,2990.0,Tronadura 1,Roca 1,L4,Noreste,10.625,6.5x6.5,11-105,424.0,...,6.34,8.46,13.54,59276.7,90607.7,2990.0,6.5,6.5,11.0,105.0
2,Stage 4,2930.0,Tronadura 1,Roca 1,L4,Noreste,10.625,6.5x6.5,11-105,424.0,...,6.38,9.25,14.61,59067.4,90335.2,2930.0,6.5,6.5,11.0,105.0
3,Stage 4,2990.0,Tronadura 1,Roca 1,L4,Noreste,10.625,6.5x8,5-87,385.0,...,6.22,8.15,16.46,59278.6,90567.5,2990.0,6.5,8.0,5.0,87.0
4,Stage 2,3575.0,Tronadura 2,Roca 1,L8,Norte,10.625,7x7,5-87,507.0,...,4.88,6.38,11.46,59238.4,91671.4,3575.0,7.0,7.0,5.0,87.0


In [7]:
# Feature columns that hold labels and need encoding before modelling.
CATEGORICAL_COLS = [
    'Fase',
    'Tipo de tronadura',
    'Tipo Material',
    'M',
    'Dominio Estructural',
    'Tipo Explosivo',
]

# Feature columns that are already numeric.
NUMERICAL_COLS = [
    'Banco',
    'Diámetro',
    'Fc',
    'Cota',
    'B',
    'S',
    'tiempo_1',
    'tiempo_2',
]

# Columns to predict, one model run per column.
TARGET_COLS = [
    'P10', 'P20', 'P30', 'P40', 'P50',
    'P60', 'P70', 'P80', 'P90', 'P100',
]

# Fraction of the rows reserved for the test split.
TEST_SIZE = 0.33

In [8]:
# Disabled: train/test split of df with a fixed seed. Presumably run once to
# produce the CSV splits that are read back further down — TODO confirm.
#y = df[TARGET_COLS]
#X = df[CATEGORICAL_COLS + NUMERICAL_COLS]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)

In [9]:
# Disabled: export of the four splits to CSV (without the index, so rows are
# renumbered from 0 when the files are read back).
#X_train.to_csv('data/X_train.csv', sep=';', index=False)
#X_test.to_csv('data/X_test.csv', sep=';', index=False)
#y_train.to_csv('data/y_train.csv', sep=';', index=False)
#y_test.to_csv('data/y_test.csv', sep=';', index=False)

In [10]:
# Read the persisted train/test splits (semicolon-separated CSV files).
X_train, X_test = (
    pd.read_csv(path, sep=';')
    for path in ('data/X_train.csv', 'data/X_test.csv')
)
y_train, y_test = (
    pd.read_csv(path, sep=';')
    for path in ('data/y_train.csv', 'data/y_test.csv')
)

In [None]:
# Learn the mean encoding on the training split only, then apply the same
# mapping to both splits.
encoder = CategoricalEncoder(CATEGORICAL_COLS, method='mean').fit(X_train, y_train)
X_train, X_test = (encoder.transform(split) for split in (X_train, X_test))

In [None]:
# Preview the first rows of the test features after encoding.
X_test.head()

In [None]:
# Preview the first rows of the training features after encoding.
X_train.head()

In [None]:
def evalute_classifier(classifier):
    """Search, per target, the feature count that gives the lowest test MAPE.

    For every column in ``TARGET_COLS`` the estimator is wrapped in recursive
    feature elimination (RFE) and fitted on the module-level ``X_train`` /
    ``y_train`` once for each candidate number of features, from 1 up to and
    including all of them. Each fit is scored on ``X_test`` / ``y_test`` and
    the best configuration is printed (nothing is returned).

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn regressor exposing ``coef_`` or
        ``feature_importances_`` after fitting, as RFE requires. RFE fits a
        clone, so the same instance can be reused across iterations.
    """
    n_features_total = X_train.shape[1]

    for target in TARGET_COLS:
        # Reset per target so a result can never leak from the previous one.
        best_mape = np.inf
        best_rmse = None
        best_n_features = None

        # Upper bound is inclusive: keeping every feature is a valid candidate.
        for n in range(1, n_features_total + 1):
            # n_features_to_select is keyword-only in current scikit-learn.
            selector = RFE(classifier, n_features_to_select=n, step=1)
            selector = selector.fit(X_train, y_train[target])

            y_pred = selector.predict(X_test)
            mape = mean_absolute_percentage_error(y_test[target], y_pred)
            # mean_squared_error returns the MSE; take the root to report RMSE.
            rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))

            # Strict comparison keeps the smallest feature count on ties; the
            # first candidate is always accepted so the best_* values are set.
            if best_n_features is None or mape < best_mape:
                best_mape = mape
                best_rmse = rmse
                best_n_features = n

        print('Numero optimo de features para {}: {}, MAPE: {}, RMSE: {}'.format(target, 
                                                                                 best_n_features,
                                                                                 round(best_mape,2),
                                                                                 round(best_rmse,2)
                                                                                ))

In [None]:
# Run the feature-count search with a single decision tree (fixed seed).
print('Decision Tree')
evalute_classifier(DecisionTreeRegressor(random_state=43))

In [None]:
# Forest sizes to compare; each one gets its own feature-count search.
N_ESTIMATORS = [2, 5, 10, 15, 20, 30, 50]

for n_estimator in N_ESTIMATORS:
    forest = RandomForestRegressor(n_estimators=n_estimator, random_state=43)
    print('Random Forest - {} Estimadores'.format(n_estimator))
    evalute_classifier(forest)
    print('_________________________________________________________________________-')

In [None]:
# Run the feature-count search with an ordinary least-squares model.
print('Linear Regression')
evalute_classifier(LinearRegression())

In [None]:
# Fit one neural network per target on all features (no feature elimination)
# and report its test error.
for target in TARGET_COLS:
    # Fixed seed, as for the other models, so the scores are reproducible.
    nn = MLPRegressor(random_state=43)
    nn.fit(X_train, y_train[target])
    y_pred = nn.predict(X_test)
    mape = mean_absolute_percentage_error(y_test[target], y_pred)
    # mean_squared_error returns the MSE; take the root to report RMSE.
    rmse = np.sqrt(mean_squared_error(y_test[target], y_pred))
    print('{}, MAPE: {}, RMSE: {}'.format(target, mape, rmse))