## Nasa Numeric

In [1]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from scipy.io import arff

#algoritmos ia
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm 
from sklearn.tree import DecisionTreeRegressor

from sklearn import model_selection as ms
from sklearn.model_selection import GridSearchCV

In [2]:
#Carregando dados

data, meta = arff.loadarff('datasets/nasa_numeric.arff')
df = pd.DataFrame(data)
df.head()

Unnamed: 0,recordnumber,projectname,cat2,forg,center,year,mode,rely,data,cplx,...,acap,aexp,pcap,vexp,lexp,modp,tool,sced,equivphyskloc,act_effort
0,1.0,b'de',b'avionicsmonitoring',b'g',b'2',1979.0,b'semidetached',b'h',b'l',b'h',...,b'n',b'n',b'n',b'n',b'h',b'h',b'n',b'l',25.9,117.6
1,2.0,b'de',b'avionicsmonitoring',b'g',b'2',1979.0,b'semidetached',b'h',b'l',b'h',...,b'n',b'n',b'n',b'n',b'h',b'h',b'n',b'l',24.6,117.6
2,3.0,b'de',b'avionicsmonitoring',b'g',b'2',1979.0,b'semidetached',b'h',b'l',b'h',...,b'n',b'n',b'n',b'n',b'h',b'h',b'n',b'l',7.7,31.2
3,4.0,b'de',b'avionicsmonitoring',b'g',b'2',1979.0,b'semidetached',b'h',b'l',b'h',...,b'n',b'n',b'n',b'n',b'h',b'h',b'n',b'l',8.2,36.0
4,5.0,b'de',b'avionicsmonitoring',b'g',b'2',1979.0,b'semidetached',b'h',b'l',b'h',...,b'n',b'n',b'n',b'n',b'h',b'h',b'n',b'l',9.7,25.2


## Preparando Dados

In [3]:
def one_hot_enconder(X_transform, transform_data): #X_transform = data, transform_data = colunas   
    for col in transform_data:
        cols = X_transform[col].unique() #retorna os valores da categoria que serão transformados em colunas 
        col_position = X_transform.columns.get_loc(col) #retorna posição da coluna, a partir dessa posição será adicionado as demais
        col_values = X_transform.pop(col) #retorna os valores da coluna primordial e apaga e remove 
        for c in cols: 
            X_transform.insert(col_position, str(col) + "_" + str(c).replace('b\'','').replace('\'',''), col_values.eq(c))
            col_position += 1

    return X_transform.replace({True: 1, False: 0})          

In [4]:
#removendo atributos desnecessários "recordnumber", "projectname", "year".

df = df.drop(['recordnumber','projectname', 'year'], axis=1)

#removendo linhas com atributos nulos ou vazios

df = df.dropna()

#Convertendo atributos textuais ordinais em numeric com escala de atributos (ainda sem normalização) "OrdinalEncoder"

tranform_data = {b"vl": 0,b"l":1,b"n":2,b"h":3,b"vh":4,b"xh":5}
df = df.replace(tranform_data)

#Convertendo atributos categóricos em colunas. "OneHotEncoder"

transform_data = ['cat2', 'forg', 'center', 'mode']
df =  one_hot_enconder(df, transform_data)

#normalizando dados - #min-max df=(df-df.min())/(df.max()-df.min()) - #zscore df=(df-df.mean())/df.std()

df = (df-df.mean())/df.std()

#separando atributo classe ACT_Effort para entrada nos modelos

X = df.drop(df.columns[-1], axis=1)
y = df.take([-1], axis=1).values.ravel()

## Treinando e avaliando o desempenho dos modelos

In [11]:
cv = ms.KFold(n_splits=3, shuffle=True, random_state=1)
models = []
#Instanciando os modelos #0 - nome, 1 - instância, 2 - parâmetros
models.append([' DT', DecisionTreeRegressor(), { "criterion" : ["squared_error", "friedman_mse", "absolute_error"],
                                                "splitter": ["best", "random"],
                                                "max_depth": np.arange(4,12),
                                                "random_state": [7], #np.arange(3,10)
                                                }])
models.append(['KNN', KNeighborsRegressor(), {  "n_neighbors" : np.arange(15,25),
                                                "weights": ["uniform", "distance"],
                                                "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']
                                                }])
models.append(['MLP', MLPRegressor(), {"hidden_layer_sizes": [5], #np.arange(2,8), #estabilizou no 5, fixado par reduzir tempo de processamento
                                                "activation": ['identity', 'logistic', 'tanh', 'relu'],
                                                "solver": ['lbfgs', 'sgd', 'adam'],
                                                "random_state": [0], #np.arange(0,6),  #fixado para reduzir o tempo de processamento.
                                                "max_iter": [3500]
                                                }])
models.append(['RFR', RandomForestRegressor(), {"n_estimators": np.arange(5,20,2),
                                                    "max_depth" : np.arange(5,20,2),
                                                    "random_state": np.arange(2,6),
                                                    }])
models.append(['SVR', svm.SVR(), { "kernel" : [ 'poly', 'linear', 'rbf', 'sigmoid'], 
                                        "C": np.arange(2,14,2),
                                        "epsilon": np.arange(0.0001,0.001,0.0001), #np.arange(0.0001,0.01,0.0001), # fixado para diminuir tempo de proc. 
                                    }])

In [12]:
for model in models:
    clf = GridSearchCV(model[1], model[2], scoring='neg_mean_absolute_error',  cv=cv,  n_jobs=-1)
    clf.fit(X, y)
    print(">", model[0], ">------ MAE: %.3f" % (1 + clf.best_score_), " ----- STD: %.3f" 
    % (clf.cv_results_['std_test_score'][clf.best_index_]), " ----- Best Params:", (clf.best_params_)) 

>  DT >------ MAE: 0.716  ----- STD: 0.047  ----- Best Params: {'criterion': 'absolute_error', 'max_depth': 10, 'random_state': 7, 'splitter': 'random'}
> KNN >------ MAE: 0.672  ----- STD: 0.064  ----- Best Params: {'algorithm': 'auto', 'n_neighbors': 23, 'weights': 'distance'}
> MLP >------ MAE: 0.600  ----- STD: 0.054  ----- Best Params: {'activation': 'logistic', 'hidden_layer_sizes': 5, 'max_iter': 3500, 'random_state': 0, 'solver': 'sgd'}
> RFR >------ MAE: 0.748  ----- STD: 0.041  ----- Best Params: {'max_depth': 11, 'n_estimators': 5, 'random_state': 3}
> SVR >------ MAE: 0.666  ----- STD: 0.062  ----- Best Params: {'C': 6, 'epsilon': 0.0002, 'kernel': 'poly'}
