In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd

# Create a figure with 2 subplots side by side
fig, axs = plt.subplots(1, 2, figsize=(20, 8))

img1 = mpimg.imread('imgs/nbr16697.png')
axs[0].imshow(img1)
axs[0].axis('off')  # Hide axes
axs[0].set_title('NBR16697:2018 - Cimento Portland — Requisitos')

img2 = mpimg.imread('imgs/nbr12655.png')
axs[1].imshow(img2)
axs[1].axis('off')  # Hide axes
axs[1].set_title('NBR12655:2022 - Concreto de cimento Portland — Procedimento')

plt.show()


# Create a figure with 2 subplots side by side
fig, axs = plt.subplots(1, 3, figsize=(20, 8))

img3 = mpimg.imread('imgs/nbr12655_definicoes.png')
axs[0].imshow(img3)
axs[0].axis('off')  # Hide axes
axs[0].set_title('NBR16697:2018 - Cimento Portland — Requisitos')

img4 = mpimg.imread('imgs/nbr12655_resistencia.png')
axs[1].imshow(img4)
axs[1].axis('off')  # Hide axes
axs[1].set_title('NBR12655:2022 - Concreto de cimento Portland — Procedimento')

img5 = mpimg.imread('imgs/nbr12655_sd.png')
axs[2].imshow(img5)
axs[2].axis('off')  # Hide axes
axs[2].set_title('NBR12655:2022 - Concreto de cimento Portland — Procedimento')


def NBR_compressive_strength(concrete_class, age_days):
    # Dictionary to hold compressive strength data at 3, 7, and 28 days for each class
    compressive_strength = {
        'CP I': {'3_days': 8.0, '7_days': 15.0, '28_days': 25.0},
        'CP I-S': {'3_days': 10.0, '7_days': 20.0, '28_days': 32.0},
        'CP I-E': {'3_days': 10.0, '7_days': 20.0, '28_days': 32.0},
        'CP II-F': {'3_days': 10.0, '7_days': 20.0, '28_days': 32.0},
        'CP II-Z': {'3_days': 10.0, '7_days': 20.0, '28_days': 32.0},
        'CP III': {'3_days': 10.0, '7_days': 18.0, '28_days': 25.0},
        'CP IV': {'3_days': 14.0, '7_days': 23.0, '28_days': 40.0},
        'CP V': {'3_days': 14.0, '7_days': 24.0, '28_days': 34.0},
    }

    # Get the compressive strength for the specified class
    if concrete_class in compressive_strength:
        return compressive_strength[concrete_class][age_days]
    else:
        return "Concrete class/age not found"

# Example usage:
concrete_class = 'CP V'
age = '7_days'
strength = NBR_compressive_strength(concrete_class, age)
# print(f"Compressive strength for {concrete_class}:\n{age}: {strength}")

In [2]:
from Tbx_Regression import *
from Tbx_Classification import *
from Tbx_Optimizing import *

# Pré-processamento

## Filtro "empírico"

In [3]:
df_concreto_traco = pd.read_pickle('data/data_clean_Concreto_traco.pkl')

df_concreto_traco.head()

Unnamed: 0,Empresa,Planta,Nota Fiscal,Data,Horário,Cidade,cimento Tipo,cimento Classe de resistência,concreto Cimento,concreto Polifuncional,...,CT Plastificante,CT Polifuncional,CT Superplastificante,CT Incorporador de ar,CT Brita 0,CT Brita 1,CT Brita 2,CT Areia natural,CT Areia artificial,CT Água
12080,303,AA,49654,07/10/2021,14:11:37,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,212.875,1.99625,...,0.0,0.0,0.0,0.0,1074.0,0.0,0.0,889.0,0.0,193.0
12081,303,AA,50407,22/11/2021,09:35:14,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,228.0,2.204998,...,0.0,0.0,0.0,0.0,283.0,851.0,0.0,841.0,0.0,186.0
12082,303,AA,49613,06/10/2021,11:18:42,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,203.125,1.904999,...,0.0,0.0,0.0,0.0,339.0,794.0,0.0,866.0,0.0,185.0
12083,303,AA,49657,07/10/2021,15:37:02,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,213.272727,2.021817,...,0.0,0.0,0.0,0.0,1074.0,0.0,0.0,889.0,0.0,193.0
12084,303,AA,49604,06/10/2021,07:31:10,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,203.25,1.901249,...,0.0,0.0,0.0,0.0,339.0,794.0,0.0,866.0,0.0,185.0


In [4]:
# Cria variável 'A/C', converte tipos, aplica filtros que estão nos parâmetros (valores < que)
df_traco_processed = preprocess_data(data = df_concreto_traco,
                                     params_filter = {},
                                     verbose = False)

df_traco_processed.head()

Unnamed: 0,Empresa,Planta,Nota Fiscal,Data,Horário,Cidade,cimento Tipo,cimento Classe de resistência,concreto Cimento,concreto Polifuncional,...,Brita_total,Areia_total,Agregados,CT Brita 0_Cimento,CT Brita 1_Cimento,CT Areia natural_Cimento,CT Areia artificial_Cimento,Brita_total_Cimento,Areia_total_Cimento,Agregados_Cimento
12080,303.0,AA,49654.0,07/10/2021,14:11:37,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,212.875,1.99625,...,1074.0,889.0,1963.0,5.042254,0.0,4.173709,0.0,5.042254,4.173709,9.215962
12081,303.0,AA,50407.0,22/11/2021,09:35:14,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,228.0,2.204998,...,1134.0,841.0,1975.0,1.235808,3.716157,3.672489,0.0,4.951965,3.672489,8.624454
12082,303.0,AA,49613.0,06/10/2021,11:18:42,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,203.125,1.904999,...,1133.0,866.0,1999.0,1.661765,3.892157,4.245098,0.0,5.553922,4.245098,9.79902
12083,303.0,AA,49657.0,07/10/2021,15:37:02,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,213.272727,2.021817,...,1074.0,889.0,1963.0,5.042254,0.0,4.173709,0.0,5.042254,4.173709,9.215962
12084,303.0,AA,49604.0,06/10/2021,07:31:10,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,203.25,1.901249,...,1133.0,866.0,1999.0,1.661765,3.892157,4.245098,0.0,5.553922,4.245098,9.79902


## Filtros NBR

In [5]:
# Remove de acordo com parâmetros, aPlica filtros da norma (fator a/c, resistencia esperada,etc)
df_traco_processed_NBR = apply_standardization_filter(data = df_traco_processed,
                                                      remove_params = {},
                                                      regression = False)

df_traco_processed_NBR.head()

Instâncias eliminadas devido não conformidade NBR6118:2023 (relação água/cimento):         15845


Unnamed: 0,Empresa,Planta,Nota Fiscal,Data,Horário,Cidade,cimento Tipo,cimento Classe de resistência,concreto Cimento,concreto Polifuncional,...,Brita_total,Areia_total,Agregados,CT Brita 0_Cimento,CT Brita 1_Cimento,CT Areia natural_Cimento,CT Areia artificial_Cimento,Brita_total_Cimento,Areia_total_Cimento,Agregados_Cimento
12553,303.0,AA,50427.0,23/11/2021,09:17:47,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,290.666667,2.802219,...,1119.0,792.0,1911.0,1.143345,2.675768,2.703072,0.0,3.819113,2.703072,6.522184
12806,303.0,AA,50335.0,17/11/2021,06:52:20,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,291.333333,2.808334,...,1119.0,792.0,1911.0,1.143345,2.675768,2.703072,0.0,3.819113,2.703072,6.522184
12842,303.0,AA,57427.0,30/09/2022,15:14:54,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,290.714286,2.509997,...,1160.0,750.0,1910.0,1.188356,2.784247,2.568493,0.0,3.972603,2.568493,6.541096
12919,303.0,AA,49430.0,28/09/2021,12:59:05,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,320.375,3.102497,...,1133.0,789.0,1922.0,1.153061,2.70068,2.683673,0.0,3.853741,2.683673,6.537415
12976,303.0,AA,57608.0,13/10/2022,10:41:31,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,296.875,2.51125,...,1146.0,748.0,1894.0,1.151007,2.694631,2.510067,0.0,3.845638,2.510067,6.355705


# Remoção casos com pouca representatividade

In [6]:
print(f"Quantidade de casos removidos: sem informação sobre Fc aos 7 dias: {len(df_traco_processed_NBR[df_traco_processed_NBR['Fc 7d'] == 0])}")
df_traco_processed_fc = df_traco_processed_NBR[df_traco_processed_NBR['Fc 7d'] != 0]

instances_per_class = df_traco_processed_fc.groupby(['cimento Tipo', 'cimento Classe de resistência', 'Fck']).count()['Fc 28d'].reset_index()

print('Casos eliminados - Pouca representatividade')
display(instances_per_class[instances_per_class['Fc 28d'] < 30].sort_values(by = ['cimento Tipo', 'Fck']))

keep_instances = instances_per_class[instances_per_class['Fc 28d'] > 30].reset_index()

df_traco_processed_fc = df_traco_processed_fc.merge(
                                keep_instances[['cimento Tipo', 'cimento Classe de resistência', 'Fck']],
                                on=['cimento Tipo', 'cimento Classe de resistência', 'Fck'],
                                how='inner')


df_traco_processed_fc.head()

Quantidade de casos removidos: sem informação sobre Fc aos 7 dias: 1424
Casos eliminados - Pouca representatividade


Unnamed: 0,cimento Tipo,cimento Classe de resistência,Fck,Fc 28d
0,CP-II-E,40,25.0,1
4,CP-II-F,40,20.0,17
9,CP-II-Z,40,30.0,3
10,CP-II-Z,40,35.0,1
11,CP-II-Z,40,40.0,2
12,CP-III,40,30.0,1
14,CP-III,40,40.0,7
15,CP-III-E,40,40.0,2
16,CP-III-RS,40,20.0,6
17,CP-III-RS,40,25.0,13


Unnamed: 0,Empresa,Planta,Nota Fiscal,Data,Horário,Cidade,cimento Tipo,cimento Classe de resistência,concreto Cimento,concreto Polifuncional,...,Brita_total,Areia_total,Agregados,CT Brita 0_Cimento,CT Brita 1_Cimento,CT Areia natural_Cimento,CT Areia artificial_Cimento,Brita_total_Cimento,Areia_total_Cimento,Agregados_Cimento
0,303.0,AA,50427.0,23/11/2021,09:17:47,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,290.666667,2.802219,...,1119.0,792.0,1911.0,1.143345,2.675768,2.703072,0.0,3.819113,2.703072,6.522184
1,303.0,AA,50335.0,17/11/2021,06:52:20,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,291.333333,2.808334,...,1119.0,792.0,1911.0,1.143345,2.675768,2.703072,0.0,3.819113,2.703072,6.522184
2,303.0,AA,60004.0,28/03/2023,08:15:50,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,205.428571,1.708566,...,1179.0,770.0,1949.0,1.234266,2.888112,2.692308,0.0,4.122378,2.692308,6.814685
3,303.0,AA,59842.0,16/03/2023,08:24:51,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,291.125,2.809998,...,1163.0,768.0,1931.0,0.993151,2.989726,2.630137,0.0,3.982877,2.630137,6.613014
4,303.0,AA,59849.0,16/03/2023,11:14:41,SÃO JOSÉ DO RIO PRETO,CP-II-F,40,291.625,2.797501,...,1163.0,768.0,1931.0,1.191781,2.791096,2.630137,0.0,3.982877,2.630137,6.613014


# Criação feature target para classificação:

The formula for calculating the dosage strength is defined in NBR12655:

$$
f_{cmj} = f_{ckj} + 1.65 \times s_d
$$

Therefore, it follows that:

$$
f_{ckj} = f_{cmj} - 1.65 \times s_d
$$

where:
- $ f_{cmj} $ is the mean compressive strength of the concrete, predicted at age $ j $ days, in megapascals (MPa),
- $ f_{ckj} $ is the characteristic compressive strength of the concrete at age $ j $ days, in megapascals (MPa),
- $ s_d $ is the standard deviation of the dosage, expressed in megapascals (MPa), with a standard value of 4 MPa according to NBR12655.




In [7]:
df_traco_processed_fc['Fck28'] = df_traco_processed_fc['Fc 28d'] - 1.65 * 4

df_traco_processed_fc['Status'] = np.where(df_traco_processed_fc['Fck28'] >= (df_traco_processed_fc['Fck']), 1, 0)

print(f"Proporção de casos em que a resistência à compressão aos 28 dias (Fc 28) alcançou a resistência especificada em projeto (Fck):\n{df_traco_processed_fc['Status'].sum()} / {len(df_traco_processed_fc)}")

Proporção de casos em que a resistência à compressão aos 28 dias (Fc 28) alcançou a resistência especificada em projeto (Fck):
13592 / 25001


In [30]:
from sklearn.model_selection import train_test_split

# Identification columns and FCK columns
ID_columns = ['Empresa', 'Planta', 'Nota Fiscal', 'Data', 'Horário', 'Cidade', 'cimento Tipo', 'cimento Classe de resistência']
FCK_columns = ['Fc 3d', 'Fc 7d', 'Fc 28d', 'Fck', 'Fck28', 'Status']

group_columns = ['cimento Tipo', 'cimento Classe de resistência', 'Fck']
drop_columns = [col for col in ID_columns + FCK_columns if col not in group_columns and col in df_traco_processed_fc.columns]

# Define the target variables
target_regr = 'Fc 7d'
target_clas = 'Status'

# X contains all features except those to drop
X = df_traco_processed_fc.drop(drop_columns, axis=1)

# Define the regression and classification targets
y_regr = df_traco_processed_fc[target_regr]
y_clas = df_traco_processed_fc[target_clas]

# Combine the columns 'cimento Tipo' and 'cimento Classe de resistência' for stratification
stratify_col = X[group_columns].apply(lambda x: '_'.join(x.astype(str)), axis=1)

# Split the data for both regression and classification tasks, stratifying by the chosen column
X_train, X_test, y_train_regr, y_test_regr, y_train_clas, y_test_clas = train_test_split(
    X, y_regr, y_clas, 
    test_size=0.3, 
    stratify=stratify_col, 
    random_state=42
)

# Now, we split the training data into training and validation sets
# Combine the columns 'cimento Tipo' and 'cimento Classe de resistência' for stratification again
stratify_col_train = X_train[group_columns].apply(lambda x: '_'.join(x.astype(str)), axis=1)

# Split the training data into training and validation sets
X_tr, X_ts, y_tr_regr, y_ts_regr, y_tr_clas, y_ts_clas = train_test_split(
    X_train, y_train_regr, y_train_clas, 
    test_size=0.3, 
    stratify=stratify_col_train, 
    random_state=42
)

# Drop the group columns for model input
X_tr_clean = X_tr.drop(columns=group_columns)
X_ts_clean = X_ts.drop(columns=group_columns)
X_test_clean = X_test.drop(columns=group_columns)

# Output the shapes of the resulting splits
print('Training-Test set:')
for v in [y_train_regr, y_test_regr]:
    print(v.shape)

print('\nTraining-Validation set:')
for v in [y_tr_regr, y_ts_regr]:
    print(v.shape)

Training-Test set:
(17500,)
(7501,)

Training-Validation set:
(12250,)
(5250,)


# Focando no tipo de concreto/cimento

In [32]:
X_tr

Unnamed: 0,cimento Tipo,cimento Classe de resistência,concreto Cimento,concreto Polifuncional,concreto Brita 0,concreto Brita 1,concreto Areia natural,concreto Areia artificial,concreto Umidade brita 0,concreto Umidade brita 1,...,Brita_total,Areia_total,Agregados,CT Brita 0_Cimento,CT Brita 1_Cimento,CT Areia natural_Cimento,CT Areia artificial_Cimento,Brita_total_Cimento,Areia_total_Cimento,Agregados_Cimento
1360,CP-II-F,40,237.4,1.908002,160.200,867.0,447.60,480.40,0.0,0.0,...,1065.0,832.0,1897.0,0.551724,3.120690,1.344828,1.524138,3.672414,2.868966,6.541379
11724,CP-V,ARI-RS,0.0,0.000000,0.000,0.0,0.00,0.00,0.0,0.0,...,843.0,810.0,1653.0,2.107500,0.000000,0.605000,1.420000,2.107500,2.025000,4.132500
13495,CP-V,ARI-RS,0.0,0.000000,0.000,0.0,0.00,0.00,0.0,0.0,...,900.0,781.0,1681.0,2.250000,0.000000,0.582500,1.370000,2.250000,1.952500,4.202500
5935,CP-II-F,40,317.0,3.206249,998.125,0.0,448.75,445.25,0.0,0.0,...,1046.0,806.0,1852.0,0.478659,2.710366,1.152439,1.304878,3.189024,2.457317,5.646341
15806,CP-V,ARI-RS,0.0,0.000000,0.000,0.0,0.00,0.00,0.0,0.0,...,1100.0,734.0,1834.0,0.380403,2.789625,1.265130,0.850144,3.170029,2.115274,5.285303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9619,CP-II-E,40,352.4,1.409998,943.200,0.0,410.80,491.20,0.0,0.0,...,1067.0,872.0,1939.0,0.696246,2.945392,1.153584,1.822526,3.641638,2.976109,6.617747
18616,CP-V,ARI-RS,0.0,0.000000,0.000,0.0,0.00,0.00,0.0,0.0,...,1024.0,767.0,1791.0,0.323684,2.371053,1.207895,0.810526,2.694737,2.018421,4.713158
11834,CP-V,ARI-RS,0.0,0.000000,0.000,0.0,0.00,0.00,0.0,0.0,...,1128.0,755.0,1883.0,0.599291,3.400709,1.333333,1.343972,4.000000,2.677305,6.677305
738,CP-II-F,40,0.0,0.000000,0.000,0.0,0.00,0.00,0.0,0.0,...,1133.0,835.0,1968.0,0.518293,2.935976,0.951220,1.594512,3.454268,2.545732,6.000000


In [34]:
ciment_types = ['CP-II', ['CP-III', 'CP-IV'], 'CP-V']

for ciment in ciment_types:
    if isinstance(ciment, list):
        # If ciment is a list, join the list elements with a regex 'OR' (|) for contains
        pattern = '|'.join(ciment)
    else:
        # If ciment is a string, use it directly
        pattern = ciment


    tr_CP = pd.concat([X_tr, y_tr_regr], axis = 1)
    ts_CP = pd.concat([X_ts, y_ts_regr], axis = 1)

    tr_CP = tr_CP[tr_CP['cimento Tipo'].str.contains(pattern)]
    ts_CP = ts_CP[ts_CP['cimento Tipo'].str.contains(pattern)]
    
    print(f'\n\nCiment: {pattern}\t "Treino": {tr_CP.shape}\t "Teste": {ts_CP.shape}')



Ciment: CP-II	 "Treino": (5702, 41)	 "Teste": (2444, 41)


Ciment: CP-III|CP-IV	 "Treino": (1721, 41)	 "Teste": (738, 41)


Ciment: CP-V	 "Treino": (5724, 41)	 "Teste": (2453, 41)


In [9]:
CP = 'cimento Tipo'
CP_type = 'CP-V'

tr_CP = pd.concat([X_tr, y_tr_regr], axis = 1)
tr_CP = preprocess_data(data = tr_CP[tr_CP[CP] == CP_type],
                                     params_filter = {}, verbose = False)

# Total instancias no conjunto de dados - CPV
instances_per_class[instances_per_class['cimento Tipo'] == 'CP-V'].drop(columns = 'cimento Classe de resistência')

Unnamed: 0,cimento Tipo,Fck,Fc 28d
28,CP-V,25.0,1235
29,CP-V,30.0,2889
30,CP-V,35.0,4993
31,CP-V,40.0,2289
32,CP-V,45.0,277


In [10]:
# Total instancias no conjunto de TREINO - CPV
tr_CP.groupby(['cimento Tipo', 'cimento Classe de resistência', 'Fck']).count()['Fc 7d'].reset_index().drop(columns = 'cimento Classe de resistência')

Unnamed: 0,cimento Tipo,Fck,Fc 7d
0,CP-V,25.0,605
1,CP-V,30.0,1415
2,CP-V,35.0,2442
3,CP-V,40.0,1120
4,CP-V,45.0,136


In [11]:
# Fc 7 dias médio no conjunto de TREINO - CPV
tr_CP.groupby(['cimento Tipo', 'cimento Classe de resistência', 'Fck']).mean()['Fc 7d'].reset_index().drop(columns = 'cimento Classe de resistência')

Unnamed: 0,cimento Tipo,Fck,Fc 7d
0,CP-V,25.0,30.261679
1,CP-V,30.0,31.793305
2,CP-V,35.0,36.700879
3,CP-V,40.0,40.98982
4,CP-V,45.0,43.001471


## Definição parametros grid_search

In [12]:
# param_grid_rf_FeatSelection = {
#                             'n_estimators': [100, 200],
#                             'max_depth': [2, 10],
#                             'min_samples_split': [2, 5],
#                             }

# PG_rf = {
#     'n_estimators': [50, 150, 300],
#     'max_depth': [2, 5, 10, 20],
#     'min_samples_split': [2, 10],
#     'min_samples_leaf': [1, 5],
#     'bootstrap': [True, False]
#     }


PG_rf_FeatSelection = {
    'n_estimators': [25, 100],
    'max_depth': [2, 10],
    }

PG_rf_Regressor = {
    'n_estimators': [25, 100],
    'max_depth': [2, 5],
    'bootstrap': [True, False]
    }


PG_rf_Classifier = {
    'n_estimators': [25, 100],
    'max_depth': [2, 10],
    # 'min_samples_split': [2, 5],
    }

PG_rf_Classifier = {
    'n_estimators': [50, 150, 300],
    'max_depth': [2, 5, 10, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 5],
    'bootstrap': [True, False]
    }



from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier


classifiers = {
            'Logistic Regression': LogisticRegression(max_iter=500, random_state=42),
            'Random Forest Classifier': RandomForestClassifier(random_state=42),
            'Support Vector Classifier': SVC(random_state=42),
            'K-Nearest Neighbors': KNeighborsClassifier(),
            'Decision Tree Classifier': DecisionTreeClassifier(random_state=42),
            'Gradient Boosting Classifier': GradientBoostingClassifier(random_state=42)
        }

##  Caso sem dados de Fc7

### Regressão para estimar resistência à compressão aos 7 dias:

In [13]:
tr_CP_regr = pd.concat([X_tr, y_tr_regr], axis = 1)
tr_CP_regr = preprocess_data(data = tr_CP_regr[tr_CP_regr[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_tr_CP_regr = tr_CP_regr.drop(columns = target_regr)
y_tr_CP_regr = tr_CP_regr[target_regr]

ts_CP_regr = pd.concat([X_ts, y_ts_regr], axis = 1)
ts_CP_regr = preprocess_data(data = ts_CP_regr[ts_CP_regr[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_ts_CP_regr = ts_CP_regr.drop(columns = target_regr)
y_ts_CP_regr = ts_CP_regr[target_regr]


feat_select_regr = grid_search_FS_RF(X_tr_CP_regr.drop(columns = group_columns), y_tr_CP_regr, PG_rf_FeatSelection,
                                     regression = True, refit_metric = 'aic',
                                     n_splits = 3, verbose = False)

feat_import_regr = feat_select_regr[1]
features_selected_regr = feat_import_regr['Feature'].tolist()[:10]

X_tr_CP_regr_FS = X_tr_CP_regr[features_selected_regr]
X_ts_CP_regr_FS = X_ts_CP_regr[features_selected_regr]

eval_regressor_CV(X_tr_CP_regr_FS, X_ts_CP_regr_FS, y_tr_CP_regr, y_ts_CP_regr,
                  regressors = None, CV = True, n_splits = 3)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters for RandomForestRegressor:  {'max_depth': 10, 'n_estimators': 100}


Unnamed: 0,MAE_tr,MSE_tr,R2_tr,MAE_val,MSE_val,R2_val,MAE_test,MSE_test,R2_test
Linear Regression,3.537018,20.243259,0.491501,3.547812,20.377874,0.487788,3.573733,20.836123,0.474429
Decision Tree Regressor,0.700372,2.791573,0.929885,3.953916,26.848588,0.325282,3.870246,25.644258,0.353149
Random Forest Regressor,1.536935,4.568888,0.885235,3.324037,18.663204,0.530957,3.244515,17.977233,0.546542
Gradient Boosting Regressor,3.051132,15.232495,0.617386,3.222357,17.086413,0.570564,3.224056,17.052315,0.569872


In [14]:
rf_gridsearch_CP_regr = grid_search_RF(X_tr_CP_regr_FS, y_tr_CP_regr, PG_rf_Regressor,
                                       regression = True, refit_metric = 'aic',
                                       n_splits = 3)

results_CP_regr_FS = eval_regressor_CV(X_tr_CP_regr_FS, X_ts_CP_regr_FS, y_tr_CP_regr, y_ts_CP_regr,
                  regressors = {'RF grid_search' : rf_gridsearch_CP_regr.best_estimator_},
                  CV = True, n_splits = 3)

results_CP_regr_FS

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best parameters for RandomForestRegressor:  {'bootstrap': True, 'max_depth': 5, 'n_estimators': 100}


Unnamed: 0,MAE_test,MAE_tr,MAE_val,MSE_test,MSE_tr,MSE_val,R2_test,R2_tr,R2_val
RF grid_search,3.296746,3.173625,3.287944,17.79325,16.390192,17.738755,0.551183,0.588297,0.554196


In [15]:
# "2881 CASOS REAIS" em que não tem Fc7, mas podemos fazer regressão e ver que que dá!

# rf_fc7d = rf_gridsearch_CP.best_estimator_
# 
# X_BEDTEST = df_traco_processed[df_traco_processed['Fc 7d'] == 0][features_selected]
# 
# scaler = MinMaxScaler()
# X_train_scaled_TEST = scaler.fit_transform(X_tr_CP_FS)
# X_BEDTEST_scaled = scaler.transform(X_BEDTEST)
# 
# len(rf_fc7d.predict(X_BEDTEST_scaled))

###  Classificador 28 dias (sem usar Fc7)

In [16]:
tr_CP_clas_28_SEM_F7 = pd.concat([X_tr, y_tr_clas], axis = 1)
tr_CP_clas_28_SEM_F7 = preprocess_data(data = tr_CP_clas_28_SEM_F7[tr_CP_clas_28_SEM_F7[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_tr_CP_clas_28_SEM_F7 = tr_CP_clas_28_SEM_F7.drop(columns = target_clas)
y_tr_CP_clas_28_SEM_F7 = tr_CP_clas_28_SEM_F7[target_clas]

ts_CP_clas_28_SEM_F7 = pd.concat([X_ts, y_ts_clas], axis = 1)
ts_CP_clas_28_SEM_F7 = preprocess_data(data = ts_CP_clas_28_SEM_F7[ts_CP_clas_28_SEM_F7[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_ts_CP_clas_28_SEM_F7 = ts_CP_clas_28_SEM_F7.drop(columns = target_clas)
y_ts_CP_clas_28_SEM_F7 = ts_CP_clas_28_SEM_F7[target_clas]


feat_select_clas_28_SEM_F7 = grid_search_FS_RF(X_tr_CP_clas_28_SEM_F7.drop(columns = group_columns), y_tr_CP_clas_28_SEM_F7, PG_rf_FeatSelection,
                                     n_splits = 3, verbose = False,
                                     regression = False, refit_metric = 'f1')

feat_import_clas_28_SEM_F7 = feat_select_clas_28_SEM_F7[1]
features_selected_28_SEM_F7 = feat_import_clas_28_SEM_F7['Feature'].tolist()[:10]

X_tr_CP_clas_28_SEM_F7_FS = X_tr_CP_clas_28_SEM_F7[features_selected_28_SEM_F7]
X_ts_CP_clas_28_SEM_F7_FS = X_ts_CP_clas_28_SEM_F7[features_selected_28_SEM_F7]

classifiers_28_SEM_F7_report = eval_classifier_CV(X_tr_CP_clas_28_SEM_F7_FS, X_ts_CP_clas_28_SEM_F7_FS, y_tr_CP_clas_28_SEM_F7, y_ts_CP_clas_28_SEM_F7,
                  classifiers = classifiers, CV = True, n_splits = 3)

classifiers_28_SEM_F7_report['Case'] = '28d_SEM_F7'

classifiers_28_SEM_F7_report[[c for c in classifiers_28_SEM_F7_report.columns if ('tr' in c) or ('test' in c)]]

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters for RandomForestClassifier:  {'max_depth': 10, 'n_estimators': 100}


Unnamed: 0,Accuracy_tr,Precision_tr,Recall_tr,F1 Score_tr,Accuracy_test,Precision_test,Recall_test,F1 Score_test,Class Distribution (%)
Logistic Regression,0.726333,0.660434,0.726333,0.61467,0.735897,0.617153,0.735897,0.629735,"{1.0: 72.57, 0.0: 27.43}"
Random Forest Classifier,0.956917,0.956694,0.956917,0.956578,0.739459,0.738069,0.739459,0.738709,"{1.0: 72.57, 0.0: 27.43}"
Support Vector Classifier,0.730101,0.671024,0.730101,0.626901,0.738604,0.656547,0.738604,0.637795,"{1.0: 72.57, 0.0: 27.43}"
K-Nearest Neighbors,0.816074,0.807881,0.816074,0.807652,0.753276,0.736864,0.753276,0.741882,"{1.0: 72.57, 0.0: 27.43}"
Decision Tree Classifier,0.956917,0.960848,0.956917,0.957645,0.708405,0.723785,0.708405,0.714894,"{1.0: 72.57, 0.0: 27.43}"
Gradient Boosting Classifier,0.77927,0.782269,0.77927,0.736587,0.766382,0.747528,0.766382,0.719307,"{1.0: 72.57, 0.0: 27.43}"


## Classificador, usando Fc7, para dizer se o concreto vai atingir Fck

**obs. Depois mensurar a diferença entre usar o $Fc_7$ e $\hat{Fc_7}$**

In [17]:
tr_CP_clas = pd.concat([X_tr, y_tr_regr, y_tr_clas], axis = 1)
tr_CP_clas = preprocess_data(data = tr_CP_clas[tr_CP_clas[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_tr_CP_clas = tr_CP_clas.drop(columns = target_clas)
y_tr_CP_clas = tr_CP_clas[target_clas]

ts_CP_clas = pd.concat([X_ts, y_ts_regr, y_ts_clas], axis = 1)
ts_CP_clas = preprocess_data(data = ts_CP_clas[ts_CP_clas[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_ts_CP_clas = ts_CP_clas.drop(columns = target_clas)
y_ts_CP_clas = ts_CP_clas[target_clas]



In [18]:
feat_select_clas = grid_search_FS_RF(X_tr_CP_clas.drop(columns = group_columns), y_tr_CP_clas, PG_rf_FeatSelection,
                                     n_splits = 3, verbose = False,
                                     regression = False, refit_metric = 'f1')

feat_import_clas = feat_select_clas[1]
features_selected_clas = feat_import_clas['Feature'].tolist()[:10]

X_tr_CP_clas_FS = X_tr_CP_clas[features_selected_clas]
X_ts_CP_clas_FS = X_ts_CP_clas[features_selected_clas]

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters for RandomForestClassifier:  {'max_depth': 10, 'n_estimators': 100}


In [19]:
classifiers_baseline_report = eval_classifier_CV(X_tr_CP_clas_FS, X_ts_CP_clas_FS, y_tr_CP_clas, y_ts_CP_clas,
                  classifiers = classifiers, CV = True, n_splits = 3)

classifiers_baseline_report['Case'] = '28d_COM_F7'

classifiers_baseline_report[[c for c in classifiers_baseline_report.columns if ('tr' in c) or ('test' in c)]]

Unnamed: 0,Accuracy_tr,Precision_tr,Recall_tr,F1 Score_tr,Accuracy_test,Precision_test,Recall_test,F1 Score_test,Class Distribution (%)
Logistic Regression,0.829661,0.822129,0.829661,0.818093,0.830071,0.821856,0.830071,0.818901,"{1.0: 73.87, 0.0: 26.13}"
Random Forest Classifier,1.0,1.0,1.0,1.0,0.853045,0.850195,0.853045,0.85124,"{1.0: 73.87, 0.0: 26.13}"
Support Vector Classifier,0.841116,0.834875,0.841116,0.832007,0.838499,0.831296,0.838499,0.830063,"{1.0: 73.87, 0.0: 26.13}"
K-Nearest Neighbors,0.889647,0.887246,0.889647,0.887563,0.849375,0.844574,0.849375,0.845764,"{1.0: 73.87, 0.0: 26.13}"
Decision Tree Classifier,1.0,1.0,1.0,1.0,0.817836,0.819533,0.817836,0.818638,"{1.0: 73.87, 0.0: 26.13}"
Gradient Boosting Classifier,0.884138,0.881304,0.884138,0.881058,0.859434,0.854761,0.859434,0.855283,"{1.0: 73.87, 0.0: 26.13}"


In [20]:
# OTIMIZAOÇÃO RF
# rf_gridsearch_CP_clas = grid_search_RF(X_tr_CP_clas_FS, y_tr_CP_clas, param_grid = PG_rf_Classifier,
#                                        regression = False, refit_metric = 'f1',
#                                        n_splits = 3)
# 
# results_CPV_FS = eval_classifier_CV(X_tr_CP_clas_FS, X_ts_CP_clas_FS, y_tr_CP_clas, y_ts_CP_clas,
#                   classifiers = {'RF grid_search' : rf_gridsearch_CP_clas.best_estimator_},
#                   CV = True, n_splits = 3)
# 
# results_CPV_FS[[c for c in classifiers_baseline_report.columns if ('tr' in c) or ('test' in c)]]

### Regressão + Classificação

In [21]:
tr_CP_reg_clas = pd.concat([X_tr, y_tr_regr, y_tr_clas], axis = 1) # treinar classificador com predições ou dados reais?
tr_CP_reg_clas = preprocess_data(data = tr_CP_reg_clas[tr_CP_reg_clas[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_tr_CP_reg_clas = tr_CP_reg_clas.drop(columns = target_clas)
y_tr_CP_reg_clas = tr_CP_reg_clas[target_clas]


ts_CP_reg_clas = pd.concat([X_ts, y_ts_clas], axis = 1)
ts_CP_reg_clas = preprocess_data(data = ts_CP_reg_clas[ts_CP_reg_clas[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_ts_CP_reg_clas = ts_CP_reg_clas.drop(columns = target_clas)
y_ts_CP_reg_clas = ts_CP_reg_clas[target_clas]


rf_fc7d = rf_gridsearch_CP_regr.best_estimator_
scaler = MinMaxScaler()
X_tr_CP_reg_clas_scaled = scaler.fit_transform(X_tr_CP_reg_clas[features_selected_regr])
X_ts_CP_reg_clas_scaled = scaler.transform(ts_CP_reg_clas[features_selected_regr])

# y_tr_pred_F7 = pd.Series(rf_fc7d.predict(X_tr_CP_reg_clas_scaled))
y_ts_pred_F7 = rf_fc7d.predict(X_ts_CP_reg_clas_scaled)

y_ts_pred_F7 = pd.DataFrame(y_ts_pred_F7, index=X_ts_CP_reg_clas.index, columns=['Fc 7d'])
X_ts_CP_reg_clas['Fc 7d'] = y_ts_pred_F7
X_ts_CP_reg_clas

Unnamed: 0,cimento Tipo,cimento Classe de resistência,Mesp Brita 0,Mesp Brita 1,Mesp Areia natural,Mesp Areia artificial,Tempo de transporte,Slump,Fck,CT Cimento,...,Areia_total,Agregados,CT Brita 0_Cimento,CT Brita 1_Cimento,CT Areia natural_Cimento,CT Areia artificial_Cimento,Brita_total_Cimento,Areia_total_Cimento,Agregados_Cimento,Fc 7d
12345,CP-V,ARI-RS,2.67,2.67,2.64,2.66,0.024306,160.0,30.0,318.0,...,689.0,1777.0,0.512579,2.908805,1.078616,1.088050,3.421384,2.166667,5.588050,31.564847
13790,CP-V,ARI-RS,2.69,2.69,2.64,2.66,0.009722,100.0,30.0,305.0,...,756.0,1871.0,0.439344,3.216393,1.481967,0.996721,3.655738,2.478689,6.134426,31.179771
23038,CP-V,ARI-RS,2.69,2.69,2.64,2.66,0.022222,200.0,40.0,388.0,...,704.0,1728.0,0.262887,2.376289,1.085052,0.729381,2.639175,1.814433,4.453608,47.565311
20238,CP-V,ARI-RS,2.67,2.67,2.64,2.66,0.024306,160.0,25.0,332.0,...,777.0,1801.0,0.370482,2.713855,1.400602,0.939759,3.084337,2.340361,5.424699,33.477517
21849,CP-V,ARI-RS,2.67,2.67,2.64,2.66,0.015278,140.0,40.0,420.0,...,660.0,1739.0,0.385714,2.183333,0.783333,0.788095,2.569048,1.571429,4.140476,42.306680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24731,CP-V,ARI-RS,2.67,2.67,2.64,2.66,0.023611,120.0,45.0,376.0,...,645.0,1742.0,0.351064,2.566489,1.026596,0.688830,2.917553,1.715426,4.632979,40.298850
20392,CP-V,ARI-RS,2.67,2.67,2.64,2.66,0.025000,160.0,25.0,325.0,...,795.0,1806.0,0.467692,2.643077,1.218462,1.227692,3.110769,2.446154,5.556923,31.619924
15320,CP-V,ARI-RS,2.67,2.67,2.64,2.66,0.040972,140.0,35.0,380.0,...,689.0,1758.0,0.336842,2.476316,1.084211,0.728947,2.813158,1.813158,4.626316,38.673708
14631,CP-V,ARI-RS,2.67,2.67,2.64,2.66,0.020833,140.0,35.0,332.0,...,739.0,1794.0,0.475904,2.701807,1.108434,1.117470,3.177711,2.225904,5.403614,33.367177


In [22]:
feat_select_reg_clas = grid_search_FS_RF(X_tr_CP_reg_clas.drop(columns = group_columns), y_tr_CP_reg_clas, PG_rf_FeatSelection,
                                     n_splits = 3, verbose = False,
                                     regression = False, refit_metric = 'f1')

feat_import_reg_clas = feat_select_reg_clas[1]
features_selected_reg_clas = feat_import_reg_clas['Feature'].tolist()[:10]

X_tr_CP_reg_clas_FS = X_tr_CP_reg_clas[features_selected_reg_clas]
X_ts_CP_reg_clas_FS = X_ts_CP_reg_clas[features_selected_reg_clas]

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters for RandomForestClassifier:  {'max_depth': 10, 'n_estimators': 100}


In [23]:
classifiers_RegClas_report = eval_classifier_CV(X_tr_CP_reg_clas_FS, X_ts_CP_reg_clas_FS, y_tr_CP_reg_clas, y_ts_CP_reg_clas,
                  classifiers = classifiers, CV = True, n_splits = 3)

classifiers_RegClas_report['Case'] = '28d_REG_F7'

classifiers_RegClas_report[[c for c in classifiers_RegClas_report.columns if ('tr' in c) or ('test' in c)]]

Unnamed: 0,Accuracy_tr,Precision_tr,Recall_tr,F1 Score_tr,Accuracy_test,Precision_test,Recall_test,F1 Score_test,Class Distribution (%)
Logistic Regression,0.829661,0.822129,0.829661,0.818093,0.74245,0.704462,0.74245,0.656725,"{1.0: 73.87, 0.0: 26.13}"
Random Forest Classifier,1.0,1.0,1.0,1.0,0.754558,0.725764,0.754558,0.72141,"{1.0: 73.87, 0.0: 26.13}"
Support Vector Classifier,0.841116,0.834875,0.841116,0.832007,0.739601,0.688935,0.739601,0.662425,"{1.0: 73.87, 0.0: 26.13}"
K-Nearest Neighbors,0.889647,0.887246,0.889647,0.887563,0.740456,0.707067,0.740456,0.7098,"{1.0: 73.87, 0.0: 26.13}"
Decision Tree Classifier,1.0,1.0,1.0,1.0,0.712108,0.693205,0.712108,0.700426,"{1.0: 73.87, 0.0: 26.13}"
Gradient Boosting Classifier,0.884138,0.881304,0.884138,0.881058,0.753846,0.72287,0.753846,0.704544,"{1.0: 73.87, 0.0: 26.13}"


In [24]:
tr_CP_regFul_clas = pd.concat([X_tr, y_tr_clas], axis = 1) # treinar classificador com predições ou dados reais?
tr_CP_regFul_clas = preprocess_data(data = tr_CP_regFul_clas[tr_CP_regFul_clas[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_tr_CP_regFul_clas = tr_CP_regFul_clas.drop(columns = target_clas)
y_tr_CP_regFul_clas = tr_CP_regFul_clas[target_clas]


ts_CP_regFul_clas = pd.concat([X_ts, y_ts_clas], axis = 1)
ts_CP_regFul_clas = preprocess_data(data = ts_CP_regFul_clas[ts_CP_regFul_clas[CP] == CP_type],
                                     params_filter = {}, verbose = False)

X_ts_CP_regFul_clas = ts_CP_regFul_clas.drop(columns = target_clas)
y_ts_CP_regFul_clas = ts_CP_regFul_clas[target_clas]


rf_fc7d = rf_gridsearch_CP_regr.best_estimator_
scaler = MinMaxScaler()
X_tr_CP_regFul_clas_scaled = scaler.fit_transform(X_tr_CP_regFul_clas[features_selected_regr])
X_ts_CP_regFul_clas_scaled = scaler.transform(ts_CP_regFul_clas[features_selected_regr])

y_tr_pred_F7 = rf_fc7d.predict(X_tr_CP_regFul_clas_scaled)
y_ts_pred_F7 = rf_fc7d.predict(X_ts_CP_regFul_clas_scaled)

y_tr_pred_F7 = pd.DataFrame(y_tr_pred_F7, index=X_tr_CP_regFul_clas.index, columns=['Fc 7d'])
X_tr_CP_regFul_clas['Fc 7d'] = y_tr_pred_F7

y_ts_pred_F7 = pd.DataFrame(y_ts_pred_F7, index=X_ts_CP_regFul_clas.index, columns=['Fc 7d'])
X_ts_CP_regFul_clas['Fc 7d'] = y_ts_pred_F7


feat_select_regFul_clas = grid_search_FS_RF(X_tr_CP_regFul_clas.drop(columns = group_columns), y_tr_CP_regFul_clas, PG_rf_FeatSelection,
                                     n_splits = 3, verbose = False,
                                     regression = False, refit_metric = 'f1')

feat_import_regFul_clas = feat_select_regFul_clas[1]
features_selected_regFul_clas = feat_import_regFul_clas['Feature'].tolist()[:10]

X_tr_CP_regFul_clas_FS = X_tr_CP_regFul_clas[features_selected_regFul_clas]
X_ts_CP_regFul_clas_FS = X_ts_CP_regFul_clas[features_selected_regFul_clas]

classifiers_RegFullClas_report = eval_classifier_CV(X_tr_CP_regFul_clas_FS, X_ts_CP_regFul_clas_FS, y_tr_CP_regFul_clas, y_ts_CP_regFul_clas,
                  classifiers = classifiers, CV = True, n_splits = 3)

classifiers_RegFullClas_report['Case'] = '28d_REGFULL_F7'

classifiers_RegFullClas_report[[c for c in classifiers_RegFullClas_report.columns if ('tr' in c) or ('test' in c)]]

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters for RandomForestClassifier:  {'max_depth': 10, 'n_estimators': 100}


Unnamed: 0,Accuracy_tr,Precision_tr,Recall_tr,F1 Score_tr,Accuracy_test,Precision_test,Recall_test,F1 Score_test,Class Distribution (%)
Logistic Regression,0.732322,0.697264,0.732322,0.648625,0.739886,0.69156,0.739886,0.656799,"{1.0: 72.57, 0.0: 27.43}"
Random Forest Classifier,0.95711,0.956888,0.95711,0.956783,0.740313,0.738492,0.740313,0.739331,"{1.0: 72.57, 0.0: 27.43}"
Support Vector Classifier,0.73155,0.763827,0.73155,0.629646,0.740313,0.732692,0.740313,0.640372,"{1.0: 72.57, 0.0: 27.43}"
K-Nearest Neighbors,0.818972,0.811144,0.818972,0.810965,0.753419,0.737698,0.753419,0.742707,"{1.0: 72.57, 0.0: 27.43}"
Decision Tree Classifier,0.95711,0.960997,0.95711,0.957831,0.707265,0.721387,0.707265,0.713321,"{1.0: 72.57, 0.0: 27.43}"
Gradient Boosting Classifier,0.780622,0.774878,0.780622,0.745053,0.761823,0.736444,0.761823,0.720125,"{1.0: 72.57, 0.0: 27.43}"


In [28]:
results = pd.concat([classifiers_28_SEM_F7_report,
                     # classifiers_RegFullClas_report,
                     classifiers_RegClas_report, classifiers_baseline_report]).reset_index()

results_rf = results[results['index'] == 'Random Forest Classifier']
results_rf[['Case', 'Class Distribution (%)', 'Recall_tr', 'F1 Score_tr','Recall_test', 'F1 Score_test']]

Unnamed: 0,Case,Class Distribution (%),Recall_tr,F1 Score_tr,Recall_test,F1 Score_test
1,28d_SEM_F7,"{1.0: 72.57, 0.0: 27.43}",0.956917,0.956578,0.739459,0.738709
7,28d_REG_F7,"{1.0: 73.87, 0.0: 26.13}",1.0,1.0,0.754558,0.72141
13,28d_COM_F7,"{1.0: 73.87, 0.0: 26.13}",1.0,1.0,0.853045,0.85124
