## Seera

In [1]:
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt

#algoritmos ia
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm 
from sklearn.tree import DecisionTreeRegressor

from sklearn import model_selection as ms
from sklearn.model_selection import GridSearchCV

In [2]:
#Carregando dados

data = pd.read_csv('datasets/seera.csv', skiprows=1,sep = ';')
data.head()

Unnamed: 0,ProjID,Year of project,Organization id,Organization type,Role in organization,Size of organization,Size of IT department,Customer organization type,Estimated duration,Actual duration,...,Requirement accuracy level,Technical documentation,Comments within the code,User manual,Required reusability,Performance requirements,Product complexity,Security requirements,Reliability requirements,Specified H/W
0,1,2015,1,1,1,16,7,13,2,3,...,?,1,3,1,2,1,2,2,3,1
1,2,2016,25,5,1,2,1,2,2,3,...,2,2,2,1,1,1,1,4,2,1
2,3,2008,2,5,3,2,2,3,3,5,...,3,2,2,2,4,4,3,5,2,3
3,4,2009,42,4,2,3,2,4,6,6,...,1,1,3,4,4,2,4,3,4,2
4,5,2016,42,4,2,3,2,4,12,24,...,1,1,3,4,4,4,3,3,4,2


## Preparando Dados

In [3]:
def one_hot_encoder(X_transform, transform_data): #X_transform = data, transform_data = colunas
    for col in transform_data:
        if col in X_transform.columns:
            cols = X_transform[col].unique() #retorna os valores da categoria que serão transformados em colunas 
            col_position = X_transform.columns.get_loc(col) #retorna posição da coluna, a partir dessa posição será adicionado as demais
            col_values = X_transform.pop(col) #retorna os valores da coluna primordial e apaga e remove 
            for c in cols: 
                X_transform.insert(col_position, str(col) + "_" + str(c).replace('b\'','').replace('\'',''), col_values.eq(c).astype('int'))
                col_position += 1

    return X_transform  
   

In [4]:
#Definindo os atributos que serão usados. +
cols_drop = [
'ProjID', 
'Year of project',
'Organization id',
'Organization type',
'Role in organization',
'Size of organization',
'Size of IT department',
'Customer organization type',
'Estimated  duration',
'Actual duration',
'% project gain (loss)',
'Development type',
'Application domain',
#Object points; +
'Other sizing method',
'Estimated size',
'Estimated effort',
#Actual effort; + 
'Contract maturity',
'Government policy impact',
'Economic instability impact',
'Organization management structure clarity',
'Developer hiring policy',
'Developer incentives policy ',
'Developer training',
#Development team management; +
'Top management support',
'Top management opinion of previous system',
'Clarity of manual system',
'User resistance',
'User computer experience', 
' Users stability ', 
' Requirment stability ', 
' Requirements flexibility ',
#Project manager experience;+
'Consultant availability',
'DBMS  expert availability',
#Precedentedness;+
'Software tool experience',
#Programmers experience in programming language;+
'Programmers capability ',
'Analysts capability ',
'Team selection',
#Team size; +
'Dedicated team members',
'Daily working hours',
'Team contracts',
'Team continuity ',
'Team cohesion',
'Income satisfaction',
'Schedule quality',
'Development environment adequacy',
'Tool availability ',
'Methodology',
'# Multiple programing languages ',
#Programming language used; +
'DBMS used',
'Technical stability',
'Open source software',
'Level of outsourcing',
'Outsourcing impact',
'Degree of software reuse ',
'Degree of risk management',
#Use of standards; +
'Degree of standards usage', 
' Process reengineering ',
#Requirement accuracy level;+
'Technical documentation',
'Comments within the code',
'User manual',
#Required reusability; +
#Performance requirements; +
#Product complexity; +
#Security requirements; +
#Reliability requirements; +
'Specified H/W'    
]

df = data.drop(cols_drop, axis=1)

In [5]:
#removendo linhas com atributos nulos, vazios ou caracteres

df = df.replace(r'[^0-9a-zA-Z:]+', np.nan, regex=True)
#df[df.isna().any(axis=1)] #show rows with nan values
df = df.dropna()

#Convertendo atributos categóricos em colunas. "OneHotEncoder"

transform_data = ['Programming language used']
df =  one_hot_encoder(df, transform_data)

#convertendo dados to_numeric, para evitar incompatibilidade

for c in range(df.shape[1]):
    col = df.columns[c]
    df[col] = pd.to_numeric(df[col]) 


""" #verificador de atributos com problemas de conversão
for c in range(df.shape[1]):
    try:
        col = df.columns[c]
        df[col] = pd.to_numeric(df[col]) 
    except ValueError:
        mask = pd.to_numeric(df[col], errors='coerce').isna() 
        #if possible missing values
        #mask = pd.to_numeric(df[col].fillna('0'), errors='coerce').isna() 
        L = df.loc[mask, col].tolist()
        #Not converted values are: Wrong date format
        print ("Not converted values are: " + ", ".join(L))

"""


#normalizando dados - #min-max df=(df-df.min())/(df.max()-df.min()) - #zscore df=(df-df.mean())/df.std()

df = (df-df.mean())/df.std()

#separando atributo classe ACT_Effort para entrada nos modelos

X = df.drop('Actual effort', axis=1)
y = df.take([1], axis=1).values.ravel()

## Treinando e avaliando o desempenho dos modelos

In [10]:
cv = ms.KFold(n_splits=3, shuffle=True, random_state=1)
models = []
#Instanciando os modelos #0 - nome, 1 - instância, 2 - parâmetros
models.append([' DT', DecisionTreeRegressor(), { "criterion" : ["squared_error", "friedman_mse", "absolute_error"],
                                                "splitter": ["best", "random"],
                                                "max_depth": np.arange(2,9),
                                                "random_state": [7]#np.arange(3,10)
                                                }])
models.append(['KNN', KNeighborsRegressor(), {  "n_neighbors" : np.arange(2,15),
                                                "weights": ["uniform", "distance"],
                                                "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']
                                                }])
models.append(['MLP', MLPRegressor(), {"hidden_layer_sizes": np.arange(10,22,2), #fixado para reduzir tempo de processamento.
                                                "activation": ['identity', 'logistic', 'tanh', 'relu'],
                                                "solver": ['lbfgs', 'sgd', 'adam'],
                                                "random_state": [1], #np.arange(0,6),  #fixado para reduzir o tempo de processamento.
                                                "max_iter": [3500]
                                                }])
models.append(['RFR', RandomForestRegressor(), {"n_estimators": np.arange(5,20,2),
                                                    "max_depth" : np.arange(5,20,2),
                                                    "random_state": [5] #np.arange(2,8),
                                                    }])
models.append(['SVR', svm.SVR(), { "kernel" : [ 'poly', 'linear', 'rbf', 'sigmoid'], 
                                        "C": np.arange(2,8,2),
                                        "epsilon": np.arange(0.1,0.5,0.01), 
                                    }])

In [11]:
for model in models:
    clf = GridSearchCV(model[1], model[2], scoring='neg_mean_absolute_error',  cv=cv,  n_jobs=-1)
    clf.fit(X, y)
    print(">", model[0], ">------ MAE: %.3f" % (1 + clf.best_score_), " ----- STD: %.3f" 
    % (clf.cv_results_['std_test_score'][clf.best_index_]), " ----- Best Params:", (clf.best_params_)) 

>  DT >------ MAE: 0.679  ----- STD: 0.037  ----- Best Params: {'criterion': 'absolute_error', 'max_depth': 4, 'random_state': 7, 'splitter': 'random'}
> KNN >------ MAE: 0.550  ----- STD: 0.094  ----- Best Params: {'algorithm': 'kd_tree', 'n_neighbors': 7, 'weights': 'distance'}
> MLP >------ MAE: 0.591  ----- STD: 0.018  ----- Best Params: {'activation': 'tanh', 'hidden_layer_sizes': 18, 'max_iter': 3500, 'random_state': 1, 'solver': 'adam'}
> RFR >------ MAE: 0.655  ----- STD: 0.012  ----- Best Params: {'max_depth': 7, 'n_estimators': 7, 'random_state': 5}
> SVR >------ MAE: 0.666  ----- STD: 0.034  ----- Best Params: {'C': 4, 'epsilon': 0.13999999999999999, 'kernel': 'linear'}
