In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from joblib import dump, load
import pickle
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE

In [2]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Both inputs are converted to NumPy arrays. Each absolute error is divided
    by the matching true value, the ratios are averaged and the result is
    multiplied by 100. Zeros in ``y_true`` are not guarded against.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

In [3]:
class CategoricalEncoder:
    """Encode categorical DataFrame columns as numbers.

    Two strategies are available through ``method``:

    * ``'classic'`` - every distinct value seen during ``fit`` receives an
      integer code (0, 1, 2, ... in order of first appearance). Transformed
      columns are cast to int; unseen or missing values become ``-1``.
    * ``'mean'`` - every distinct value is replaced by the mean of the target
      column (``target``, default ``'P80'``) over the rows holding that value.
      Unseen or missing values fall back to the overall target mean.

    The fallback value is stored under the reserved key ``'NaN'`` of
    ``self.encoder``, so a column literally named ``'NaN'`` is not supported.
    """

    # Code given by the 'classic' method to values that were not seen in fit.
    UNKNOWN_CODE = -1

    def __init__(self, cols, method = 'classic', target = 'P80'):
        """Store the configuration; nothing is learned until ``fit``.

        Parameters
        ----------
        cols : list of str
            Names of the columns to encode.
        method : {'classic', 'mean'}
            Encoding strategy (see class docstring).
        target : str
            Column of ``y`` averaged by the ``'mean'`` method.
        """
        self.COLUMNS = cols
        self.METHOD = method
        self.TARGET = target
        self.encoder = {}
        self.inverse_encoder = {}

    def fit(self, X, y=None):
        """Learn the value -> number mapping of every configured column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns to encode.
        y : pandas.DataFrame, optional
            Required by ``method='mean'``; must contain the target column and
            share its index labels with ``X``.

        Returns
        -------
        CategoricalEncoder
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If the method is unknown, or ``'mean'`` is used without ``y``.
        """
        if self.METHOD == 'classic':
            # Previously no fallback was stored for this method, so transform()
            # always failed with KeyError('NaN').
            self.encoder['NaN'] = self.UNKNOWN_CODE
            for col in self.COLUMNS:
                codes = {val: i for i, val in enumerate(X[col].unique())}
                self.encoder[col] = codes
                self.inverse_encoder[col] = {i: val for val, i in codes.items()}

        elif self.METHOD == 'mean':
            if y is None:
                raise ValueError(
                    "method='mean' requires y with a '{}' column".format(self.TARGET)
                )
            target_values = y[self.TARGET]
            # The fallback does not depend on the column: compute it once.
            self.encoder['NaN'] = target_values.mean()
            for col in self.COLUMNS:
                self.encoder[col] = {}
                self.inverse_encoder[col] = {}
                for val in X[col].unique():
                    rows = X.index[(X[col] == val).values]
                    mean = target_values.loc[rows].mean()
                    self.encoder[col][val] = mean
                    # Categories sharing the same mean overwrite each other
                    # here, so the inverse mapping is not guaranteed lossless.
                    self.inverse_encoder[col][mean] = val

        else:
            raise ValueError(
                "Unknown method {!r}; expected 'classic' or 'mean'".format(self.METHOD)
            )
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns encoded.

        Values absent from the learned mapping (including missing values) are
        replaced by the fallback stored during ``fit``.
        """
        X = X.copy()
        fallback = self.encoder['NaN']
        for col in self.COLUMNS:
            X[col] = X[col].map(self.encoder[col]).fillna(fallback)
            if self.METHOD == 'classic':
                # map() yields floats as soon as one value is unknown; restore
                # the integer dtype of the codes.
                X[col] = X[col].astype(int)
        return X

    def inverse_transform(self, X):
        """Return a copy of ``X`` with encoded values replaced by their labels.

        Values that are not keys of the inverse mapping (e.g. the fallback)
        are left untouched.
        """
        X = X.copy()
        for col in self.COLUMNS:
            X[col] = X[col].replace(self.inverse_encoder[col])
        return X
    

In [4]:
"""
P80 es bi modal
"""
# Load the dataset, discard the rows whose 'Tipo Explosivo' is 'M', then drop
# every row that still contains a missing value.
df = (
    pd.read_csv('data/data_fixed.csv', sep=';')
    .loc[lambda frame: frame['Tipo Explosivo'] != 'M']
    .dropna()
)

In [5]:
# Preview the first rows of the filtered dataset.
df.head()

Unnamed: 0,Fase,Banco,Tipo de tronadura,Tipo Material,M,Dominio Estructural,Diámetro,BxS,Tiempo entre Pozos Filas ms,Fc,...,P80,P90,P100,Este,Norte,Cota,B,S,tiempo_1,tiempo_2
0,Stage 5,2930.0,Tronadura 1,Roca 1,L4,Noreste,10.625,6.5x6.5,11-105,498.0,...,6.5,9.37,15.08,59095.2,90292.2,2930.0,6.5,6.5,11.0,105.0
1,Stage 4,2990.0,Tronadura 1,Roca 1,L4,Noreste,10.625,6.5x6.5,11-105,424.0,...,6.34,8.46,13.54,59276.7,90607.7,2990.0,6.5,6.5,11.0,105.0
2,Stage 4,2930.0,Tronadura 1,Roca 1,L4,Noreste,10.625,6.5x6.5,11-105,424.0,...,6.38,9.25,14.61,59067.4,90335.2,2930.0,6.5,6.5,11.0,105.0
3,Stage 4,2990.0,Tronadura 1,Roca 1,L4,Noreste,10.625,6.5x8,5-87,385.0,...,6.22,8.15,16.46,59278.6,90567.5,2990.0,6.5,8.0,5.0,87.0
4,Stage 2,3575.0,Tronadura 2,Roca 1,L8,Norte,10.625,7x7,5-87,507.0,...,4.88,6.38,11.46,59238.4,91671.4,3575.0,7.0,7.0,5.0,87.0


In [6]:
# Columns holding labels; these are the ones passed to the categorical encoder.
CATEGORICAL_COLS = [
    'Fase',
    'Tipo de tronadura',
    'Tipo Material',
    'M',
    'Dominio Estructural',
    'Tipo Explosivo',
]

# Columns that are already numeric and are used as-is.
NUMERICAL_COLS = [
    'Banco',
    'Diámetro',
    'Fc',
    'Cota',
    'B',
    'S',
    'tiempo_1',
    'tiempo_2',
]

# One model is trained per target column.
TARGET_COLS = [
    'P10', 'P20', 'P30', 'P40', 'P50',
    'P60', 'P70', 'P80', 'P90', 'P100',
]

# Fraction of the rows reserved for the test partition.
TEST_SIZE = 0.33

In [7]:
#y = df[TARGET_COLS]
#X = df[CATEGORICAL_COLS + NUMERICAL_COLS]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)

In [8]:
#X_train.to_csv('data/X_train.csv', sep=';', index=False)
#X_test.to_csv('data/X_test.csv', sep=';', index=False)
#y_train.to_csv('data/y_train.csv', sep=';', index=False)
#y_test.to_csv('data/y_test.csv', sep=';', index=False)

In [9]:
# Read the persisted train/test partitions (semicolon-separated CSV files).
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        'data/X_train.csv',
        'data/X_test.csv',
        'data/y_train.csv',
        'data/y_test.csv',
    )
]

In [10]:
# Learn the mean-target encoding from the training partition only, then apply
# the learned mapping to both partitions.
encoder = CategoricalEncoder(CATEGORICAL_COLS, method='mean').fit(X_train, y_train)
X_train, X_test = (encoder.transform(partition) for partition in (X_train, X_test))

In [11]:
# Preview the encoded test features.
X_test.head()

Unnamed: 0,Fase,Tipo de tronadura,Tipo Material,M,Dominio Estructural,Tipo Explosivo,Banco,Diámetro,Fc,Cota,B,S,tiempo_1,tiempo_2
0,5.771911,4.930173,5.800176,5.803466,4.677983,4.588825,2825.0,12.25,495.0,2825.0,7.0,7.0,5.0,72.0
1,4.034303,4.930173,4.069519,4.104362,4.677983,4.588825,3440.0,12.25,140.0,3440.0,13.0,14.0,7.0,61.0
2,4.645672,4.930173,4.893772,4.080759,4.677983,4.588825,3845.0,9.875,244.0,3845.0,7.0,8.0,7.0,85.0
3,5.771911,4.930173,5.800176,5.803466,4.677983,4.588825,2840.0,12.25,523.0,2840.0,7.0,7.0,5.0,36.0
4,4.034303,4.930173,4.069519,4.080759,4.677983,4.442981,3530.0,10.625,217.0,3530.0,8.5,10.0,17.0,120.0


In [12]:
# Preview the encoded training features.
X_train.head()

Unnamed: 0,Fase,Tipo de tronadura,Tipo Material,M,Dominio Estructural,Tipo Explosivo,Banco,Diámetro,Fc,Cota,B,S,tiempo_1,tiempo_2
0,5.771911,4.930173,5.800176,5.803466,5.688099,4.588825,2855.0,12.25,523.0,2855.0,7.0,7.0,4.0,30.0
1,4.034303,4.930173,4.069519,4.080759,4.677983,4.588825,3425.0,12.25,149.0,3425.0,13.0,14.0,7.0,84.0
2,4.034303,4.930173,4.069519,4.104362,4.677983,4.284439,3515.0,12.25,230.0,3515.0,10.5,10.5,9.0,94.0
3,5.771911,4.930173,5.800176,5.803466,5.688099,5.290478,2885.0,12.25,523.0,2885.0,7.0,7.0,2.0,25.0
4,4.034303,4.930173,4.069519,4.104362,4.677983,4.588825,3425.0,12.25,149.0,3425.0,13.0,14.0,7.0,84.0


In [None]:
#CONF = {}

In [13]:
def evalute_classifier(classifier):
    """Print, for every target, the RFE feature count with the lowest MAPE.

    For each column name in ``TARGET_COLS`` the estimator is wrapped in
    ``RFE`` and fitted on ``X_train`` once per candidate number of features,
    from 1 up to and including every column of ``X_train``. Each fitted
    selector is scored on ``X_test`` / ``y_test`` with MAPE and MSE, and the
    candidate with the lowest MAPE is reported.

    Note: candidates are compared on the test partition, so the printed
    scores are optimistic estimates.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn regressor usable by ``RFE`` (it must expose
        ``coef_`` or ``feature_importances_`` once fitted).

    Returns
    -------
    None
        Results are only printed.
    """
    n_total_features = X_train.shape[1]
    for target in TARGET_COLS:
        # Reset for every target so a result can never leak from the previous
        # one; starting at infinity also accepts MAPE values of 100 or more.
        best_mape = float('inf')
        best_mse = None
        best_n_features = None

        # The upper bound is inclusive: using every feature is a valid candidate.
        for n in range(1, n_total_features + 1):
            # n_features_to_select must be passed by keyword in current
            # scikit-learn releases.
            selector = RFE(classifier, n_features_to_select=n, step=1)
            selector = selector.fit(X_train, y_train[target])

            y_pred = selector.predict(X_test)
            mape = mean_absolute_percentage_error(y_test[target], y_pred)
            mse = mean_squared_error(y_test[target], y_pred)
            if mape < best_mape:
                best_mape = mape
                best_mse = mse
                best_n_features = n

        if best_n_features is None:
            # Only reachable when no candidate produced a finite MAPE
            # (e.g. zeros in the true values).
            print('No se pudo calcular un MAPE valido para {}'.format(target))
            continue

        print('Numero optimo de features para {}: {}, MAPE: {}, MSE: {}'.format(target,
                                                                                 best_n_features,
                                                                                 round(best_mape,2),
                                                                                 round(best_mse,2)
                                                                                ))

In [None]:
# Feature-count search for a single decision tree (seeded for repeatability).
print('Decision Tree')
evalute_classifier(DecisionTreeRegressor(random_state=43))

In [None]:
# Feature-count search for an ordinary linear regression.
print('Linear Regression')
evalute_classifier(LinearRegression())

In [None]:
# Sweep over forest sizes, running the full feature-count search for each one.
N_ESTIMATORS = [2, 5, 10, 15, 20]
for n_estimator in N_ESTIMATORS:
    forest = RandomForestRegressor(n_estimators=n_estimator, random_state=43)
    header = 'Random Forest - {} Estimadores'.format(n_estimator)
    print(header)
    evalute_classifier(forest)
    print('_________________________________________________________________________')

In [None]:
# Neural-network baseline: one default MLPRegressor per target, trained on all
# features and scored on the test partition.
for target in TARGET_COLS:
    # Fixed seed (same value as the tree-based models) so scores are repeatable.
    nn = MLPRegressor(random_state=43)
    nn.fit(X_train, y_train[target])
    y_pred = nn.predict(X_test)
    mape = mean_absolute_percentage_error(y_test[target], y_pred)
    # mean_squared_error returns the MSE (no square root is taken), so the
    # value is labelled MSE rather than RMSE.
    mse = mean_squared_error(y_test[target], y_pred)
    print('{}, MAPE: {}, MSE: {}'.format(target, mape, mse))

In [None]:
"""
Random Forest - 10 Estimadores
Numero optimo de features para P10: 12, MAPE: 66.52, RMSE: 0.07
Numero optimo de features para P20: 12, MAPE: 50.86, RMSE: 0.18
Numero optimo de features para P30: 4, MAPE: 42.77, RMSE: 0.32
Numero optimo de features para P40: 5, MAPE: 34.59, RMSE: 0.43
Numero optimo de features para P50: 3, MAPE: 25.0, RMSE: 0.45
Numero optimo de features para P60: 4, MAPE: 15.44, RMSE: 0.37
Numero optimo de features para P70: 12, MAPE: 9.19, RMSE: 0.25
Numero optimo de features para P80: 13, MAPE: 5.67, RMSE: 0.18
Numero optimo de features para P90: 3, MAPE: 9.7, RMSE: 0.75
Numero optimo de features para P100: 7, MAPE: 18.63, RMSE: 6.22
"""

In [14]:
# Per-target configuration: how many features the final model keeps.
CONF = {
    target_name: {'n_features': n_features}
    for target_name, n_features in [
        ('P10', 12),
        ('P20', 12),
        ('P30', 4),
        ('P40', 5),
        ('P50', 3),
        ('P60', 4),
        ('P70', 12),
        ('P80', 13),
        ('P90', 3),
        ('P100', 7),
    ]
}

In [15]:
"""
MODELO SELECCIONADO:
RANDOM FOREST 10 ESTIMATORS
"""

# Train, persist and score the final model of every target.
for target in TARGET_COLS:
    # Step 1: let RFE pick the configured number of features for this target.
    # n_features_to_select must be passed by keyword in current scikit-learn
    # releases.
    clf = RandomForestRegressor(n_estimators=10, random_state=43)
    selector = RFE(clf, n_features_to_select=CONF[target]['n_features'], step=1)
    selector = selector.fit(X_train, y_train[target])

    # Features ranked 1 are the ones RFE kept.
    CONF[target]['columns'] = X_train.columns[selector.ranking_ == 1]

    # Step 2: fit a fresh forest on the selected columns only; this is the
    # model that gets stored and used for prediction.
    clf = RandomForestRegressor(n_estimators=10, random_state=43)
    clf.fit(X_train[CONF[target]['columns']], y_train[target])
    CONF[target]['model'] = clf

    dump(clf, 'models/{}_model.joblib'.format(target))

    # Step 3: report the test scores of the stored model.
    y_pred = clf.predict(X_test[CONF[target]['columns']])
    mape = mean_absolute_percentage_error(y_test[target], y_pred)
    mse = mean_squared_error(y_test[target], y_pred)

    print('Numero optimo de features para {}: {}, MAPE: {}, MSE: {}'.format(target,
                                                                         CONF[target]['n_features'],
                                                                         round(mape,2),
                                                                         round(mse,2)
                                                                        ))

Numero optimo de features para P10: 12, MAPE: 66.52, MSE: 0.07
Numero optimo de features para P20: 12, MAPE: 50.86, MSE: 0.18
Numero optimo de features para P30: 4, MAPE: 42.77, MSE: 0.32
Numero optimo de features para P40: 5, MAPE: 34.59, MSE: 0.43
Numero optimo de features para P50: 3, MAPE: 25.0, MSE: 0.45
Numero optimo de features para P60: 4, MAPE: 15.44, MSE: 0.37
Numero optimo de features para P70: 12, MAPE: 9.19, MSE: 0.25
Numero optimo de features para P80: 13, MAPE: 5.67, MSE: 0.18
Numero optimo de features para P90: 3, MAPE: 9.7, MSE: 0.75
Numero optimo de features para P100: 7, MAPE: 18.63, MSE: 6.22


In [17]:
# Persist CONF to disk in one pickle file, using the highest pickle protocol
# available.
with open('models/CONF.pickle', 'wb') as conf_file:
    pickle.dump(CONF, conf_file, protocol=pickle.HIGHEST_PROTOCOL)

### PREDICCIÓN

In [None]:
# Load the spreadsheet to predict on; header=2 takes the column names from the
# row at 0-based position 2 of the sheet.
df_test = pd.read_excel('data/Datos_Entregable2_Hackathon.xlsx', header=2)

In [None]:
# Preview the raw rows that were just loaded.
df_test.head()

In [None]:
def split_by_x(x):
    """Split ``x`` on lowercase 'x' when it contains one, else on uppercase 'X'.

    Returns the list produced by ``str.split``; a string containing neither
    separator comes back as a single-element list.
    """
    separator = 'x' if 'x' in x else 'X'
    return x.split(separator)

def split_by_dash(x):
    """Split ``x`` on '-', treating the sentinel '-1' as a special case.

    The sentinel is returned twice (``['-1', '-1']``) instead of being split,
    so callers can always index positions 0 and 1.
    """
    if x != '-1':
        return x.split('-')
    return [x, x]
    
def take_in_out(x):
    """Return ``str(x)`` with '..' collapsed to '.' and ' in' removed.

    Parameters
    ----------
    x : object
        Any value; it is converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    x = str(x)
    # Both clean-ups are applied independently: a value may carry both defects
    # at once (e.g. '10..5 in'), and the former if/elif chain then left the
    # ' in' suffix in place.
    return x.replace('..', '.').replace(' in', '')

def take_double_dots_out(x):
    """Return ``str(x)`` with every '..' replaced by a single '.'."""
    return str(x).replace('..', '.')

In [None]:
# 'BxS' packs two numbers joined by 'x'/'X'; expose them as numeric B and S.
for position, column in enumerate(['B', 'S']):
    pieces = df_test['BxS'].apply(lambda raw: split_by_x(raw)[position])
    df_test[column] = pd.to_numeric(pieces)

# 'Tiempo entre Pozos Filas ms' packs two numbers joined by '-'. Missing
# entries travel through the split as the '-1' sentinel and are turned back
# into missing values before the numeric conversion.
delays = df_test['Tiempo entre Pozos Filas ms'].fillna('-1')
for position, column in enumerate(['tiempo_1', 'tiempo_2']):
    pieces = delays.apply(lambda raw: split_by_dash(raw)[position])
    df_test[column] = pd.to_numeric(pieces.replace({'-1': None}))

# Coordinates: collapse doubled dots, then convert to numbers. The text 'nan'
# that str() produces for missing values is mapped back to a missing value.
for column in ['Norte', 'Este']:
    cleaned = df_test[column].apply(take_double_dots_out)
    df_test[column] = pd.to_numeric(cleaned.replace({'nan': None}))

In [None]:
# Apply the encoder fitted on the training data; df_test must therefore
# contain every column listed in CATEGORICAL_COLS.
df_test = encoder.transform(df_test)

In [None]:
# Preview the frame after encoding.
df_test.head()

In [None]:
# Predict every target with its own stored model, feeding it only the columns
# that model was trained on, and append the result as a new column.
for target in TARGET_COLS:
    target_conf = CONF[target]
    model_inputs = df_test[target_conf['columns']]
    df_test[target] = target_conf['model'].predict(model_inputs)

In [None]:
# Map the encoded categorical columns back to their original labels.
# NOTE(review): with method='mean' the inverse mapping is keyed by the encoded
# mean, so categories that share a mean collapse to one label, and values that
# were filled with the global fallback generally stay numeric - verify the
# exported labels before relying on them.
df_test = encoder.inverse_transform(df_test)

In [None]:
# Export the frame, including the predicted target columns, as a
# semicolon-separated CSV without the index.
df_test.to_csv('data/predicciones_stage_2.csv', sep=';', index=False)