## Cocomo81

In [None]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

from scipy.io import arff

#algoritmos ia
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor

from sklearn import model_selection as ms
from sklearn.model_selection import GridSearchCV

## Preparando dados

In [None]:
#Carregando dados

data, meta = arff.loadarff('datasets/cocomo81.arff')
df = pd.DataFrame(data)
df.head()

In [None]:
#removendo linhas com atributos nulos ou vazios

df = df.dropna()

#normalizando dados - #min-max df=(df-df.min())/(df.max()-df.min()) - #zscore df=(df-df.mean())/df.std()

df = (df-df.mean())/df.std()

#separando atributo classe actual para entrada nos modelos #df.columns[-1]

X = df.drop(df.columns[-1], axis=1)
y = df.take([-1], axis=1)

## Treinando e avaliando o desempenho dos modelos

In [None]:
models = []
#Instanciando os modelos
models.append(['DT', DecisionTreeRegressor()])
#models.append(['KNN', KNeighborsRegressor(n_neighbors=5)]) 
#models.append(['MLP', MLPRegressor(activation='logistic', solver='sgd', alpha=0.02, max_iter=300, hidden_layer_sizes=200)])
#models.append(['RFR', RandomForestRegressor()])
#models.append(['SVR', svm.SVR()])

cv = ms.KFold(n_splits=3, shuffle=True, random_state=1)

kn = KNeighborsRegressor()



In [558]:
parameters={"p": [1,2], 
            "n_neighbors" : np.arange(1,11), #1 a 10
            "weights": ["uniform", "distance"]
}

clf = GridSearchCV(kn, parameters, scoring='neg_mean_absolute_error',  cv=cv,  n_jobs=-1)
clf.fit(X, y)

print(clf.best_estimator_)
print(clf.best_score_)
print(clf.best_params_)

data = pd.DataFrame(clf.cv_results_)[["mean_test_score","std_test_score"]]

data.loc[clf.best_index_] 

print ("MAE: %.4f" % (data.loc[clf.best_index_]["mean_test_score"]))

KNeighborsRegressor(n_neighbors=4, weights='distance')
-0.3179905720798769
{'n_neighbors': 4, 'p': 2, 'weights': 'distance'}
MAE: -0.3180


In [529]:
scores = ms.cross_val_score(KNeighborsRegressor(n_neighbors=4, weights='distance'), X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

scores = np.absolute(scores)
s_mean = np.mean(scores)
s_std = np.std(scores)
    
print('Scores:', scores, 'MAE: %.4f' % (s_mean), 'STD: %.4f' % (s_std)) 

Scores: [0.11490809 0.57999755 0.25906608] MAE: 0.3180 STD: 0.1944


In [None]:


parameters={"splitter":["best","random"],
            "max_depth" : np.arange(1,10), #[ 1,  3,  5,  7,  9, 11, 13]
            "criterion": ["squared_error", "absolute_error"]
}
dt = DecisionTreeRegressor()
clf = GridSearchCV(dt, parameters, scoring='neg_mean_absolute_error',  cv=3,  n_jobs=-1)
clf.fit(X, y)

print(clf.best_estimator_)
print(clf.best_score_)

pd.DataFrame(clf.cv_results_)[['params',
                                'mean_test_score',
                                'rank_test_score',
                                'std_test_score']].sort_index(by=["rank_test_score"])



In [None]:
for model in models:
    
    scores = ms.cross_val_score(model[1], X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    #converter para positivo.
    scores = np.absolute(scores)
    s_mean = np.mean(scores)
    s_std = np.std(scores)
    
    print('-', model[0],'------------------------------------')
    print('Scores:', scores, 'MAE: %.4f' % (s_mean), 'STD: %.4f' % (s_std)) 