In [42]:
import os
import sys
import pandas as pd
import joblib

from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import pickle

# Make the directory that contains ColumnsProcessing.py importable.
# sys.path entries must be directories (or zip archives); the previous code
# appended the path of the .py file itself, which the import system skips, so
# the import only worked when the working directory was already on sys.path.
_columns_processing_dir = os.path.dirname(os.path.abspath("ColumnsProcessing.py"))
if _columns_processing_dir not in sys.path:
    # Guard keeps sys.path free of duplicates when this cell is re-run.
    sys.path.append(_columns_processing_dir)
# Wildcard import kept for backward compatibility: later cells use the
# ColumnsProcessing class, and other names may be relied on as well.
from ColumnsProcessing import *

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

In [43]:
# Load the pickled vectorizer that is later handed to
# cp.textColumnsVectorizationUsed. The context manager guarantees the file
# handle is closed (the previous one-liner left it open).
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load files that come from a trusted source.
with open("/Users/borisperezg/rebelmodels_storing/models/Vectorizer/vectorizer.pickle", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [44]:
# Helper object from the project-local ColumnsProcessing module; the cells
# below call its text, numeric and categorical processing methods.
cp = ColumnsProcessing()

In [45]:
# Base name (no directory, no suffix) shared by the input CSV
# ('<fileName>_pa.csv') and the output CSV ('<fileName>_done.csv').
fileName = 'facts_prediction_dataset_1616537048521'

In [46]:
# The same input dataset is used for both ATD identification and QA identification.
input_csv_path = '/Users/borisperezg/rebelmodels_storing/datasets_to_classify/' + fileName + '_pa.csv'
myDF = read_csv(input_csv_path)

# Work on an independent (deep) copy so myDF keeps the untouched input rows.
df_ = myDF.copy()

In [47]:
# Display the working copy for a visual check of the loaded data.
df_

Unnamed: 0,factid,driver,goal,sourceelementname,sourceelementtype,layersource,targetelementname,targetelementtype,layertarget,isnewelement,facttype,relatontype,action,incoming,outcoming,ratiolinks,mostlinkedlayer,commitlogs_ngrams,chatlogs_ngrams,adrlogs_ngrams
0,45,cost reduction,fast time to deployment,asignación cita,process,business,,,,,element,,create,1,5,0.29,business,,,
1,46,cost reduction,fast time to deployment,asignación de turno,process,business,,,,,element,,create,2,1,0.14,business,,,
2,47,cost reduction,fast time to deployment,asignación cita,process,business,localización centros de atención,process,business,True,relation,composition,create,0,0,0.0,business,,,
3,48,cost reduction,fast time to deployment,asignación cita,process,business,selección de mecánico,process,business,True,relation,composition,create,0,0,0.0,business,,,
4,49,cost reduction,fast time to deployment,asignación cita,process,business,asignación de turno,process,business,True,relation,composition,create,0,0,0.0,business,,,
5,50,cost reduction,fast time to deployment,asignación cita,process,business,post-asignación cita,process,business,True,relation,triggering,create,0,0,0.0,business,,,
6,51,cost reduction,fast time to deployment,asignación cita,process,business,asignación cita service,service,business,True,relation,realization,create,0,0,0.0,business,,,
7,52,cost reduction,fast time to deployment,localización centros de atención,process,business,asignación de turno,process,business,True,relation,triggering,create,0,0,0.0,business,,,
8,53,cost reduction,fast time to deployment,asignación de turno,process,business,selección de mecánico,process,business,True,relation,triggering,create,0,0,0.0,business,,,
9,54,cost reduction,fast time to deployment,pre-asignación cita,process,business,asignación cita,process,business,True,relation,triggering,create,0,0,0.0,business,,,


In [48]:
# --------------------------------------
# CORRECCION DE NULLS
# --------------------------------------

In [49]:
# Placeholder written into each column wherever the value is missing.
# The insertion order matches the order in which the columns were filled
# before this cell was made table-driven.
NULL_PLACEHOLDERS = {
    'targetelementname': 'NoName',
    'targetelementtype': 'NoType',
    'sourceelementtype': 'NoType',
    'mostlinkedlayer': 'NoLayer',
    'isnewelement': 'NoNew',
    'layertarget': 'NoLayer',
    'layersource': 'NoLayer',
    'relatontype': 'NoRelation',
    'property': 'NoProperty',
    'propertynewvalue': 'NoValue',
    'propertyoldvalue': 'NoValue',
}

# Fail fast, before df_ is modified, when the dataset lacks an expected column.
# Previously a missing column raised KeyError halfway through and left df_
# partially filled. Still a KeyError, but now with the full list of names.
# NOTE(review): the dataset preview shown earlier in this notebook lists no
# 'property*' columns - confirm that the input CSV really provides them.
missing_columns = [column for column in NULL_PLACEHOLDERS if column not in df_.columns]
if missing_columns:
    raise KeyError("Columns missing from the input dataset: {}".format(missing_columns))

for column, placeholder in NULL_PLACEHOLDERS.items():
    df_[column] = df_[column].fillna(value=placeholder)

# isnewelement holds booleans next to the 'NoNew' placeholder; cast everything
# to str so the column contains a single type.
df_['isnewelement'] = df_['isnewelement'].astype(str)

In [50]:
# --------------------------------------
# LEMATIZACION Y REMOCION DE STOPWORDS
# --------------------------------------

In [51]:
# Source text column -> name of the derived column that receives the output of
# cp.remove_stopwords_and_lemma (implemented in ColumnsProcessing, not shown here).
lemma_targets = {
    'driver': 'driver_lemma',
    'goal': 'goal_lemma',
    'sourceelementname': 'sourceelementname_lemma',
    'targetelementname': 'targetelementname_lemma',
}
for source_column, lemma_column in lemma_targets.items():
    df_[lemma_column] = df_[source_column].apply(cp.remove_stopwords_and_lemma)

# The commitlogs_ngrams, chatlogs_ngrams and adrlogs_ngrams fields were already processed in Step 1

In [52]:
# --------------------------------------
# CONCATENACION DE CAMPOS TEXTUALES LEMATIZADOS
# --------------------------------------

In [53]:
# Columns joined, in this order and separated by one space, into 'all_texts'.
text_columns = [
    'driver_lemma', 'goal_lemma', 'sourceelementname_lemma',
    'targetelementname_lemma', 'commitlogs_ngrams', 'chatlogs_ngrams',
    'adrlogs_ngrams',
]

# map(str) turns missing values into the literal text 'nan'.
# NOTE(review): the vectorised output further down contains a 'nan' feature,
# so this looks like what the stored vectorizer expects - confirm before
# changing how missing text is handled.
combined_text = df_[text_columns[0]].map(str)
for column_name in text_columns[1:]:
    combined_text = combined_text + ' ' + df_[column_name].map(str)
df_['all_texts'] = combined_text

In [54]:
# Raw and lemmatised text columns are no longer needed once 'all_texts' exists.
consumed_text_columns = [
    'driver', 'goal', 'sourceelementname', 'driver_lemma', 'goal_lemma',
    'sourceelementname_lemma', 'targetelementname', 'targetelementname_lemma',
    'commitlogs_ngrams', 'chatlogs_ngrams', 'adrlogs_ngrams',
]
df_ = df_.drop(columns=consumed_text_columns)

In [55]:
# factid is an identifier, not a feature; it stays available in myDF.
df_ = df_.drop(columns=['factid'])

In [56]:
# --------------------------------------
# VECTORIZACION DE COLUMNA TEXTUAL
# --------------------------------------

In [57]:
# Vectorise the 'all_texts' column with the previously loaded vectorizer.
# NOTE(review): the "Used" suffix presumably means the vectorizer is applied
# as-is rather than re-fitted - confirm in ColumnsProcessing.
result = cp.textColumnsVectorizationUsed(df_, 'all_texts', vectorizer)

In [58]:
# --------------------------------------
# NORMALIZACION DE COLUMNAS NUMERICAS
# --------------------------------------

In [59]:
# Normalise the 'incoming' link count.
# NOTE(review): in the displayed output the raw values 1 and 2 become 0.5 and
# 1.0, which suggests scaling relative to this batch - confirm that this
# matches how the training data was scaled.
result = cp.numericalNormalization(result, 'incoming')

In [60]:
# Normalise the 'outcoming' link count.
# NOTE(review): in the displayed output the raw values 5 and 1 become 1.0 and
# 0.2, which suggests scaling relative to this batch - confirm that this
# matches how the training data was scaled.
result = cp.numericalNormalization(result, 'outcoming')

In [61]:
# --------------------------------------
# PROCESAMIENTO DE COLUMNAS CATEGORICAS
# --------------------------------------

In [62]:
# Categorical columns, encoded one after another with
# cp.openCategoricalColumnsUsed. The order is significant: it determines the
# column order of `result`, which is later turned into a positional feature
# matrix via `result.values`.
categorical_columns = (
    'sourceelementtype',
    'targetelementtype',
    'propertynewvalue',
    'propertyoldvalue',
    'mostlinkedlayer',
    'isnewelement',
    'layersource',
    'layertarget',
    'relatontype',
    'property',
    'facttype',
    'action',
)
for categorical_column in categorical_columns:
    result = cp.openCategoricalColumnsUsed(result, categorical_column)

In [63]:
# Display the fully processed feature frame for a visual check.
result

Unnamed: 0,incoming,outcoming,ratiolinks,asignacion,asignación,atención,automation,años,cae,centro,centros,cita,cola,componente,cost,cómo,de,dándome,días,hace,llamaba,localización,mal,mecanico,mecánico,mocho,nan,noname,omitida,once,outbound,pata,pav,process,puty,queue,reduction,saludar,selección,service,solicitud,turno,ón,sourceelementtype_component,sourceelementtype_process,sourceelementtype_service,targetelementtype_NoType,targetelementtype_interface,targetelementtype_process,targetelementtype_service,mostlinkedlayer_application,mostlinkedlayer_business,isnewelement_False,isnewelement_NoNew,isnewelement_True,layersource_application,layersource_business,layertarget_NoLayer,layertarget_application,layertarget_business,relatontype_NoRelation,relatontype_composition,relatontype_consume,relatontype_realization,relatontype_triggering,facttype_element,facttype_relation,action_create,action_delete,action_update
0,0.5,1.0,0.29,0.0,0.332204,0.0,0.0,0.0,0.0,0.0,0.0,0.236366,0.0,0.0,0.236366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.709099,0.468201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,1.0,0.2,0.14,0.0,0.285407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203069,0.0,0.312523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609208,0.402246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203069,0.0,0.0,0.0,0.0,0.45328,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.246309,0.347142,0.0,0.0,0.0,0.0,0.450152,0.175251,0.0,0.0,0.175251,0.0,0.26971,0.0,0.0,0.0,0.0,0.4182,0.0,0.0,0.0,0.0,0.525752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.175251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.236523,0.0,0.0,0.0,0.0,0.0,0.0,0.168288,0.0,0.0,0.168288,0.0,0.258994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.518232,0.0,0.504863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.168288,0.0,0.518232,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.538454,0.0,0.0,0.0,0.0,0.0,0.0,0.191557,0.0,0.0,0.191557,0.0,0.294806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.574671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191557,0.0,0.0,0.0,0.0,0.427583,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.341123,0.0,0.0,0.0,0.0,0.0,0.0,0.485424,0.0,0.0,0.242712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.728135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.57279,0.0,0.0,0.0,0.0,0.0,0.0,0.407545,0.0,0.0,0.203772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.611317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.203772,0.0,0.0,0.221503,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.212735,0.299824,0.0,0.0,0.0,0.0,0.388793,0.0,0.0,0.0,0.151363,0.0,0.465894,0.0,0.0,0.0,0.0,0.361196,0.0,0.0,0.0,0.0,0.454089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151363,0.0,0.0,0.0,0.0,0.337864,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.206334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146809,0.0,0.451876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.452089,0.0,0.440426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146809,0.0,0.452089,0.0,0.0,0.327698,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.341123,0.0,0.0,0.0,0.0,0.0,0.0,0.485424,0.0,0.0,0.242712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.728135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


## ---------------------------------------------
## PREDICCION PARA ATD

In [64]:
# --------------------------------------
# PREPARACION DE LOS DATOS PARA PREDECIR
# --------------------------------------

In [65]:
# Feature matrix as a NumPy array; columns follow the column order of `result`.
# NOTE(review): presumably this order must match the one used to train the
# ATD model - confirm.
x = result.values

In [66]:
# --------------------------------------
# CARGA DE MODELO Y PREDICCION
# --------------------------------------

In [67]:
# Path of the persisted ATD identification model.
# Note: `filename` (model path) differs only by case from `fileName` (dataset
# base name) defined earlier - take care not to mix them up.
filename = '/Users/borisperezg/rebelmodels_storing/models/ATDIdentification/atdidentificationmodel.sav'
# joblib.load unpickles the file; only load model files from a trusted source.
model_for_ATD = joblib.load(filename)



In [68]:
# Predict one ATD cause label per input row and show the raw predictions.
y_pred_ATD = model_for_ATD.predict(x)
print(y_pred_ATD)

['noatd' 'noatd' 'dependency violations' 'dependency violations'
 'dependency violations' 'dependency violations' 'dependency violations'
 'dependency violations' 'dependency violations' 'dependency violations'
 'noatd' 'noatd' 'noatd' 'noatd' 'noatd' 'noatd' 'noatd' 'noatd'
 'dependency violations' 'noatd' 'noatd' 'noatd']


In [69]:
# Append the predicted ATD cause as a new column and encode it like the other
# categorical columns; the QA section below builds its feature matrix from
# this extended frame.
result_with_atdcause = result.assign(atdcause=y_pred_ATD)
result = cp.openCategoricalColumnsUsed(result_with_atdcause, 'atdcause')

In [70]:
# Display the original input rows (myDF is not modified by the processing above).
myDF

Unnamed: 0,factid,driver,goal,sourceelementname,sourceelementtype,layersource,targetelementname,targetelementtype,layertarget,isnewelement,facttype,relatontype,action,incoming,outcoming,ratiolinks,mostlinkedlayer,commitlogs_ngrams,chatlogs_ngrams,adrlogs_ngrams
0,45,cost reduction,fast time to deployment,asignación cita,process,business,,,,,element,,create,1,5,0.29,business,,,
1,46,cost reduction,fast time to deployment,asignación de turno,process,business,,,,,element,,create,2,1,0.14,business,,,
2,47,cost reduction,fast time to deployment,asignación cita,process,business,localización centros de atención,process,business,True,relation,composition,create,0,0,0.0,business,,,
3,48,cost reduction,fast time to deployment,asignación cita,process,business,selección de mecánico,process,business,True,relation,composition,create,0,0,0.0,business,,,
4,49,cost reduction,fast time to deployment,asignación cita,process,business,asignación de turno,process,business,True,relation,composition,create,0,0,0.0,business,,,
5,50,cost reduction,fast time to deployment,asignación cita,process,business,post-asignación cita,process,business,True,relation,triggering,create,0,0,0.0,business,,,
6,51,cost reduction,fast time to deployment,asignación cita,process,business,asignación cita service,service,business,True,relation,realization,create,0,0,0.0,business,,,
7,52,cost reduction,fast time to deployment,localización centros de atención,process,business,asignación de turno,process,business,True,relation,triggering,create,0,0,0.0,business,,,
8,53,cost reduction,fast time to deployment,asignación de turno,process,business,selección de mecánico,process,business,True,relation,triggering,create,0,0,0.0,business,,,
9,54,cost reduction,fast time to deployment,pre-asignación cita,process,business,asignación cita,process,business,True,relation,triggering,create,0,0,0.0,business,,,


In [71]:
# Build the output frame from the original input rows plus the ATD prediction.
# A deep copy is used (consistent with how df_ is created from myDF): a shallow
# copy shares its underlying data with myDF, so a later in-place change to
# either frame could leak into the other.
finalDF = myDF.copy(deep=True)
# pandas raises ValueError here if the number of predictions differs from the
# number of rows.
finalDF['atdcause'] = y_pred_ATD
finalDF

Unnamed: 0,factid,driver,goal,sourceelementname,sourceelementtype,layersource,targetelementname,targetelementtype,layertarget,isnewelement,facttype,relatontype,action,incoming,outcoming,ratiolinks,mostlinkedlayer,commitlogs_ngrams,chatlogs_ngrams,adrlogs_ngrams,atdcause
0,45,cost reduction,fast time to deployment,asignación cita,process,business,,,,,element,,create,1,5,0.29,business,,,,noatd
1,46,cost reduction,fast time to deployment,asignación de turno,process,business,,,,,element,,create,2,1,0.14,business,,,,noatd
2,47,cost reduction,fast time to deployment,asignación cita,process,business,localización centros de atención,process,business,True,relation,composition,create,0,0,0.0,business,,,,dependency violations
3,48,cost reduction,fast time to deployment,asignación cita,process,business,selección de mecánico,process,business,True,relation,composition,create,0,0,0.0,business,,,,dependency violations
4,49,cost reduction,fast time to deployment,asignación cita,process,business,asignación de turno,process,business,True,relation,composition,create,0,0,0.0,business,,,,dependency violations
5,50,cost reduction,fast time to deployment,asignación cita,process,business,post-asignación cita,process,business,True,relation,triggering,create,0,0,0.0,business,,,,dependency violations
6,51,cost reduction,fast time to deployment,asignación cita,process,business,asignación cita service,service,business,True,relation,realization,create,0,0,0.0,business,,,,dependency violations
7,52,cost reduction,fast time to deployment,localización centros de atención,process,business,asignación de turno,process,business,True,relation,triggering,create,0,0,0.0,business,,,,dependency violations
8,53,cost reduction,fast time to deployment,asignación de turno,process,business,selección de mecánico,process,business,True,relation,triggering,create,0,0,0.0,business,,,,dependency violations
9,54,cost reduction,fast time to deployment,pre-asignación cita,process,business,asignación cita,process,business,True,relation,triggering,create,0,0,0.0,business,,,,dependency violations


In [72]:
#df_.to_csv('/Users/borisperezg/rebelmodels_storing/datasets_to_classify/multiclass_entrydataset_atdidentification_done1.csv', index=False, index_label=True)

## -----------------------------------------------------------------
## PREDICCION PARA ATRIBUTOS DE CALIDAD

In [73]:
# --------------------------------------
# PREPARACION DE LOS DATOS PARA PREDECIR
# --------------------------------------

In [74]:
# The dataframe is taken as it now stands: at this point `result` also holds
# the encoded 'atdcause' prediction added in the ATD section.
x = result.values

In [75]:
# --------------------------------------
# CARGA DE MODELO Y PREDICCION
# --------------------------------------

In [76]:
# Path of the persisted affected-quality-attribute model (stored in the same
# ATDIdentification directory as the ATD model). Rebinds `filename`.
filename = '/Users/borisperezg/rebelmodels_storing/models/ATDIdentification/affectedqa_model.sav'
# joblib.load unpickles the file; only load model files from a trusted source.
model_for_QA = joblib.load(filename)

In [77]:
# Predict one affected quality attribute label per input row and show the raw predictions.
y_pred_QA = model_for_QA.predict(x)
print(y_pred_QA)

['none' 'none' 'evolvability' 'evolvability' 'evolvability' 'evolvability'
 'evolvability' 'evolvability' 'evolvability' 'evolvability' 'none' 'none'
 'none' 'none' 'none' 'none' 'none' 'none' 'evolvability' 'none' 'none'
 'none']


In [78]:
# Attach the QA prediction to the output frame, next to 'atdcause'.
finalDF['affectedqa'] = y_pred_QA

In [79]:
# Display the input rows together with both predictions.
finalDF

Unnamed: 0,factid,driver,goal,sourceelementname,sourceelementtype,layersource,targetelementname,targetelementtype,layertarget,isnewelement,facttype,relatontype,action,incoming,outcoming,ratiolinks,mostlinkedlayer,commitlogs_ngrams,chatlogs_ngrams,adrlogs_ngrams,atdcause,affectedqa
0,45,cost reduction,fast time to deployment,asignación cita,process,business,,,,,element,,create,1,5,0.29,business,,,,noatd,none
1,46,cost reduction,fast time to deployment,asignación de turno,process,business,,,,,element,,create,2,1,0.14,business,,,,noatd,none
2,47,cost reduction,fast time to deployment,asignación cita,process,business,localización centros de atención,process,business,True,relation,composition,create,0,0,0.0,business,,,,dependency violations,evolvability
3,48,cost reduction,fast time to deployment,asignación cita,process,business,selección de mecánico,process,business,True,relation,composition,create,0,0,0.0,business,,,,dependency violations,evolvability
4,49,cost reduction,fast time to deployment,asignación cita,process,business,asignación de turno,process,business,True,relation,composition,create,0,0,0.0,business,,,,dependency violations,evolvability
5,50,cost reduction,fast time to deployment,asignación cita,process,business,post-asignación cita,process,business,True,relation,triggering,create,0,0,0.0,business,,,,dependency violations,evolvability
6,51,cost reduction,fast time to deployment,asignación cita,process,business,asignación cita service,service,business,True,relation,realization,create,0,0,0.0,business,,,,dependency violations,evolvability
7,52,cost reduction,fast time to deployment,localización centros de atención,process,business,asignación de turno,process,business,True,relation,triggering,create,0,0,0.0,business,,,,dependency violations,evolvability
8,53,cost reduction,fast time to deployment,asignación de turno,process,business,selección de mecánico,process,business,True,relation,triggering,create,0,0,0.0,business,,,,dependency violations,evolvability
9,54,cost reduction,fast time to deployment,pre-asignación cita,process,business,asignación cita,process,business,True,relation,triggering,create,0,0,0.0,business,,,,dependency violations,evolvability


In [80]:
# Keep only the fact identifier and the two predicted labels for export.
exported_columns = ['factid', 'atdcause', 'affectedqa']
finalDFNoOtherFields = finalDF.loc[:, exported_columns]

In [81]:
# Write the predictions to '<fileName>_done.csv' without the row index.
# The former `index_label=True` argument was removed: it only names the index
# column, which is not written when index=False, and True is not a meaningful
# label anyway. The produced file is unchanged.
finalDFNoOtherFields.to_csv('/Users/borisperezg/rebelmodels_storing/datasets_classified/'+fileName+'_done.csv', index=False)
