### **A) Importações e primeiros dados**

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importanto dados
df = pd.read_csv('NGcpu_data.csv', sep=' ')
colunas = pd.read_json('NGcpu_headers.csv')
nome_feature = colunas['name']
df.set_axis(nome_feature, axis=1, inplace=True)

In [3]:
lista=[]
for i in df.columns:
    if 'label' in i:
        lista.append(i)

In [4]:
# Colunas de Golden e Failures
df['Golden']=0
df['Failure']=1

In [5]:
# Criando coluna de experimento_id
experimentos = df['Description'].unique()
experimento_id = {}

for i, experimento in enumerate(experimentos):
    experimento_id[experimento] = i + 1
    
df['exp_id'] = df['Description'].map(experimento_id)

In [6]:
# Guardando relações em um dataframe a parte
df_exp = pd.DataFrame({'exp_id': df['exp_id'].values,
                       'Description': df['Description'].values})

In [7]:
# Eliminando colunas referente aos lead times + window prediction e as linhas cujas labels são 8 e 9
df1 = df[[col for col in df.columns if 'label' not in col]]
df2 = df['label_5_5']
df1 = df1.join(df2)
df3 = df1[df1['label_5_5'] != 8]
df4 = df3[df3['label_5_5'] != 9]

In [8]:
# Reorganizar a ordem e nome das colunas
novas_colunas = [df4.columns[0]] + [df4.columns[-2]] + df4.columns[1:3].tolist() + [df4.columns[-4]] + [df4.columns[-3]] + [df4.columns[-1]] + df4.columns[3:-5].tolist()
df4 = df4[novas_colunas]
df4 = df4.rename(columns=lambda x: x.split('{')[1][:-1] if '{' in x else x)

In [9]:
# Criando dataframes sem injeção, com injeção sem falha e com injeção com falha
df_golden_run = df4[df4['Description'].str.startswith('G')]
df_failure_no = df4[~df4['Description'].str.startswith('G') & ~df4['Description'].str.endswith('f')]
df_failure_si = df4[df4['Description'].str.endswith('f')]

In [10]:
# Ajustando colunas Golden e Failure
df_golden_run['Golden']=1
df_golden_run['Failure']=0
df_failure_no['Failure']=0
df_failure_si['Failure'] = df_failure_si['label_5_5'].apply(lambda x: 0 if x == 0 else 1)

In [11]:
df_failure_si.head(1)

Unnamed: 0,Description,exp_id,Index,Fault,Golden,Failure,label_5_5,"chart=""apps.cpu"",family=""cpu"",dimension=""apps.plugin""","chart=""apps.cpu"",family=""cpu"",dimension=""cron""","chart=""apps.cpu"",family=""cpu"",dimension=""go.d.plugin""",...,"chart=""users.pwrites"",family=""disk"",dimension=""root""","chart=""users.sockets"",family=""net"",dimension=""fields""","chart=""users.sockets"",family=""net"",dimension=""messagebus""","chart=""users.sockets"",family=""net"",dimension=""netdata""","chart=""users.sockets"",family=""net"",dimension=""root""","chart=""users.threads"",family=""processes"",dimension=""fields""","chart=""users.threads"",family=""processes"",dimension=""netdata""","chart=""users.threads"",family=""processes"",dimension=""root""","chart=""users.vmem"",family=""mem"",dimension=""fields""","chart=""users.vmem"",family=""mem"",dimension=""netdata"""
31903,run-0-0438-17_4_25-f,54,0,5,0,0,0,0.999,0.0,0.0,...,3.986172,1.0,5.0,8.0,35.0,5.0,23.0,78.0,15.0,964.9062


### **B) Criando três modos**

#### **B.1) Golden group**

#### B.1.1) Modo 1: Dados brutos

In [12]:
df_golden_run_1 = df_golden_run.drop(['Description', 'Fault', 'Golden', 'Failure', 'label_5_5'], axis=1)

In [13]:
df_golden_run_1.shape

(25868, 372)

#### B.1.2) Modo 2: Transformando features trio em qualitativas

In [14]:
# Criando um df para cada feature
inputs = {}

for i in range(3,371):
    inputs[f'input{i-2}'] = df_golden_run_1[df_golden_run_1.columns[:i].tolist()].drop(df_golden_run_1.columns[1:i-1], axis=1)

# Preenchendo a coluna quebrada
for key, df_ in inputs.items():
    for col in df_.columns[-1].split(','):
        df_[col.split('=')[0]] = col.split('=')[1][1:-1]

    df_['valor'] = df_[df_.columns[1]]
    df_.drop(columns=df_.columns[1], inplace=True)

# Combinando todos mini dfs
df_golden_run_2 = pd.concat(list(inputs.values()), ignore_index=True)

In [15]:
df_golden_run_2.shape

(9519424, 5)

#### B.1.3) Modo 3: Transformando features qualitativas em OHE

In [16]:
print(df_golden_run_2.chart.nunique())
print(df_golden_run_2.family.nunique())
print(df_golden_run_2.dimension.nunique())

108
29
80


In [17]:
colunas_desejadas_gr = ['chart','family','dimension']
# Realiza o one hot encoding nas colunas de interesse
df_golden_run_3 = pd.get_dummies(df_golden_run_2, columns=colunas_desejadas_gr)

In [18]:
df_golden_run_3.shape

(9519424, 219)

#### **B.2) Failure group**

#### B.2.1) Modo 1: Dados brutos

In [19]:
df_failure_si_1 = df_failure_si.drop(['Description', 'Golden', 'Failure'], axis=1)

In [20]:
df_failure_si_1.shape

(6125, 374)

#### B.2.2) Modo 2: Transformando features trio em qualitativas

In [21]:
# Criando um df para cada feature
inputs = {}
    
for i in range(5,373):
    inputs[f'input{i-4}'] = df_failure_si_1[df_failure_si_1.columns[:i].tolist()].drop(df_failure_si_1.columns[4:i-1], axis=1)

# Preenchendo a coluna quebrada
for key, df_ in inputs.items():
    for col in df_.columns[-1].split(','):
        df_[col.split('=')[0]] = col.split('=')[1][1:-1]

    df_['valor'] = df_[df_.columns[4]]
    df_.drop(columns=df_.columns[4], inplace=True)

# Combinando todos mini dfs
df_failure_si_2 = pd.concat(list(inputs.values()), ignore_index=True)

In [22]:
df_failure_si_2.shape

(2254000, 8)

#### B.2.3) Modo 3: Transformando features qualitativas em OHE

In [23]:
print(df_failure_si_2.chart.nunique())
print(df_failure_si_2.family.nunique())
print(df_failure_si_2.dimension.nunique())

108
29
80


In [24]:
colunas_desejadas_ff = ['chart','family','dimension']
# Realiza o one hot encoding nas colunas de interesse
df_failure_si_3 = pd.get_dummies(df_failure_si_2, columns=colunas_desejadas_ff)

In [25]:
df_failure_si_3.shape

(2254000, 222)

### **C) Salvando os docs em CSVs**

In [26]:
gg_m1 = 'golden_modo1.csv'
gg_m2 = 'golden_modo2.csv'
gg_m3 = 'golden_modo3.csv'

# Salvar o DataFrame no arquivo CSV
df_golden_run_1.to_csv(gg_m1, index=False)
df_golden_run_2.to_csv(gg_m2, index=False)
df_golden_run_3.to_csv(gg_m3, index=False)

In [27]:
fg_m1 = 'failure_modo1.csv'
fg_m2 = 'failure_modo2.csv'
fg_m3 = 'failure_modo3.csv'

# Salvar o DataFrame no arquivo CSV
df_failure_si_1.to_csv(fg_m1, index=False)
df_failure_si_2.to_csv(fg_m2, index=False)
df_failure_si_3.to_csv(fg_m3, index=False)