## Cocomo Nasa V1

In [5]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from scipy.io import arff

#algoritmos ia
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm 

from sklearn import model_selection as ms
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

## Preparando Dados

In [6]:
#Carregando dados

data, meta = arff.loadarff('datasets/cocomonasa_v1.arff')
df = pd.DataFrame(data)
df.head()

Unnamed: 0,RELY,DATA,CPLX,TIME,STOR,VIRT,TURN,ACAP,AEXP,PCAP,VEXP,LEXP,MODP,TOOL,SCED,LOC,ACT_EFFORT
0,b'Nominal',b'High',b'Very_High',b'Nominal',b'Nominal',b'Low',b'Nominal',b'High',b'Nominal',b'Very_High',b'Low',b'Nominal',b'High',b'Nominal',b'Low',70.0,278.0
1,b'Very_High',b'High',b'High',b'Very_High',b'Very_High',b'Nominal',b'Nominal',b'Very_High',b'Very_High',b'Very_High',b'Nominal',b'High',b'High',b'High',b'Low',227.0,1181.0
2,b'Nominal',b'High',b'High',b'Very_High',b'High',b'Low',b'High',b'High',b'Nominal',b'High',b'Low',b'High',b'High',b'Nominal',b'Low',177.9,1248.0
3,b'High',b'Low',b'High',b'Nominal',b'Nominal',b'Low',b'Low',b'Nominal',b'Nominal',b'Nominal',b'Nominal',b'High',b'High',b'Nominal',b'Low',115.8,480.0
4,b'High',b'Low',b'High',b'Nominal',b'Nominal',b'Low',b'Low',b'Nominal',b'Nominal',b'Nominal',b'Nominal',b'High',b'High',b'Nominal',b'Low',29.5,120.0


In [7]:
#removendo linhas com atributos nulos ou vazios

df = df.dropna()

#Convertendo atributos textuais em numeric com escala de atributos (ainda sem normalização) "OrdinalEncoder"

tranform_data = {b"Very_Low": 0, b"Low": 1, b"Nominal": 2, b"High": 3, b"Very_High": 4, b"Extra_High": 5}
df = df.replace(tranform_data)

#normalizando dados - #min-max df=(df-df.min())/(df.max()-df.min()) - #zscore df=(df-df.mean())/df.std()

df = (df-df.mean())/df.std()

#separando atributo classe ACT_Effort para entrada nos modelos #df.columns[-1]

X = df.drop(df.columns[-1], axis=1)
y = df.take([-1], axis=1).values

## Treinando e avaliando o desempenho dos modelos

In [8]:
models = []
#Instanciando os modelos
models.append(['KNN', KNeighborsRegressor(n_neighbors=5)]) 
models.append(['MLP', MLPRegressor(activation='logistic', solver='sgd', alpha=0.02, max_iter=300, hidden_layer_sizes=200)])
models.append(['RFR', RandomForestRegressor()])
models.append(['SVR', svm.SVR()])

cv = ms.KFold(n_splits=3, shuffle=True, random_state=1)

In [9]:
for model in models:
    
    scores = ms.cross_val_score(model[1], X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    #converter para positivo.
    scores = np.absolute(scores)
    s_mean = np.mean(scores)
    s_std = np.std(scores)
    
    print('-', model[0],'------------------------------------')
    print('Scores:', scores, 'MAE: %.4f' % (s_mean), 'STD: %.4f' % (s_std)) 

- KNN ------------------------------------
Scores: [0.65105381 0.41409165 0.37898784] MAE: 0.4814 STD: 0.1208


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


- MLP ------------------------------------
Scores: [0.61380527 0.40160267 0.45728306] MAE: 0.4909 STD: 0.0898


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


- RFR ------------------------------------
Scores: [0.32787999 0.24897435 0.11068234] MAE: 0.2292 STD: 0.0898
- SVR ------------------------------------
Scores: [0.53730604 0.29982776 0.33528374] MAE: 0.3908 STD: 0.1046


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [4]:
models = []
#Instanciando os modelos #0 - nome, 1 - instância, 2 - parâmetros
models.append(['DT', DecisionTreeRegressor(), { "criterion" : ["squared_error", "friedman_mse", "absolute_error"],
                                                "splitter": ["best", "random"],
                                                "max_depth": np.arange(4,12,2),
                                                "random_state": np.arange(2,6)
                                                }])
models.append(['KNN', KNeighborsRegressor(), {  "n_neighbors" : np.arange(1,10),
                                                "weights": ["uniform", "distance"],
                                                "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']
                                                }])
models.append(['MLP', MLPRegressor(), {"hidden_layer_sizes": np.arange(10,100,10),
                                                "activation": ['identity', 'logistic', 'tanh', 'relu'],
                                                "solver": ['lbfgs', 'sgd', 'adam'],
                                                "random_state": np.arange(2,6),
                                                "max_iter": [1500]
                                                }])
models.append(['RFR', RandomForestRegressor(), {"n_estimators": np.arange(5,20,5),
                                                    "max_depth" : np.arange(2,14,2),
                                                    "random_state": np.arange(2,6),
                                                    }])
models.append(['SVR', svm.SVR(), { "kernel" : [ 'poly', 'linear', 'rbf', 'sigmoid'], 
                                        "C": np.arange(2,14,2), #1 a 10
                                        "epsilon": np.arange(0.001,0.1,0.002), # 0.1 a 0.9
                                    }])

NameError: name 'DecisionTreeRegressor' is not defined

In [None]:
for model in models:
    clf = GridSearchCV(model[1], model[2], scoring='neg_mean_absolute_error',  cv=cv,  n_jobs=-1)
    clf.fit(X, y.values.ravel())
    print(">", model[0], ">------ MAE: %.4f" % (np.absolute(clf.best_score_)), " ----- STD: %.4f" % (clf.cv_results_['std_test_score'][clf.best_index_]), " ----- Best Params:", (clf.best_params_))