# (3) Logs Analysis using Mining. 
# Data set type: Industrial-Anoki

## Q1: Is it possible to detect smoke errors through logs?
# 💨🔥💨 Smoke Analysis

#### ✅ python, ✅ Gitlab, ✅ Mongodb

## Qs
### Q1: What are the most common problems in pipelines?

## Index

- [Import python libraries](#Import-python-libraries)
- [Page reference](#Page-reference)
- [Create event list](#Create-event-list)
- [Group similar text](#Group-similar-text)
- [Calculate the similarity between texts and apply filters](#Calculate-the-similarity-between-texts-and-apply-filters)

## Nomenclature
    - (STPS) Smoke test possible solution: It is the set of tentative errors that can be avoided by using smoke tests

## Import python libraries

In [2]:
import os
import pymongo
from pymongo import MongoClient
from datetime import date
# Data processing
# ==============================================================================
import numpy as np
import pandas as pd
import string
import re
# Charts
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default='notebook'

import wordninja
# ==============================================================================
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import words as wordsList
from nltk.corpus import stopwords

# Download resources. 
nltk.download('stopwords')
nltk.download('punkt') # first-time use only
nltk.download('wordnet') # first-time use only
nltk.download('words')

import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import math
from collections import Counter
# Warnings configuration
# ==============================================================================
import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Load variables to perform the analysis

In [3]:
# Smoke Test Parameters
# ==============================================================================
# Plotly configuration: template name passed to the plotly figures
plotly_template="plotly_dark"
# plotly_template="plotly"
# Filters:
JobsNameBlackList = ['test'] # Job names to exclude from the analysis
logsWhiteList = ["error","fail", "warning",'err'] # Only log lines containing one of these words are checked
JobsStatusWhiteList = ["failed"] # Only jobs with one of these statuses are checked
similarity = 0.6 # Similarity threshold used to group similar texts
# DATA:
# mongoDbLimit=1000 # Limit of data request in mongodb
mongoDbLimit=False # Maximum number of documents requested from MongoDB; a falsy value requests everything
csvRead=False # True: load the jobs from a CSV file instead of querying MongoDB
csvSave=False # True: export the jobs fetched from MongoDB to a CSV file

csvName="data-02-logsAnalysis-" # CSV file name prefix; the current date and ".csv" are appended
csvFileRead="data-02-logsAnalysis13-08-2021.csv" # Name of a previously exported, dated CSV file
pathExperimentsFiles="/Users/ceciliocannavaciuolo/Documents/workspace/phd/experimentsGitlabColellector" # NOTE(review): absolute, machine-specific path; the CSV files are read from / written to its dataAnalysis/ sub-folder

# References and Pages

- [TextBlob: Library documentation for text analysis](https://textblob.readthedocs.io/en/dev/quickstart.html#get-word-and-noun-phrase-frequencies)
- pipenv: https://pipenv.pypa.io/en/latest/
- pythonplot: https://pythonplot.com/
- [Measuring Similarity Between Texts in Python](https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/)


# Collect data from MongoDB
### Read data from Mongodb database

In [4]:
# Load the GitLab job logs into the `jobs` DataFrame, either from MongoDB
# (csvRead is False) or from a previously exported CSV file (csvRead is True).
today = date.today().strftime("%d-%m-%Y")  # date suffix used in exported CSV names
csvDir = os.path.join(pathExperimentsFiles, 'dataAnalysis')
csvTodayPath = os.path.join(csvDir, csvName + today + '.csv')

if not csvRead:
    MONGODB_URL = os.environ.get('MONGODB_URL')
    NODE_ENV = os.environ.get('NODE_ENV') or "dev"
    APP_NAME = os.environ.get('APP_NAME')
    if not APP_NAME:
        # Previously this surfaced as an opaque "NoneType + str" TypeError.
        raise RuntimeError("Environment variable APP_NAME must be set to build the database name")
    DB_NAME = APP_NAME + "-" + NODE_ENV

    # Connect with DB
    client = MongoClient(MONGODB_URL)
    db = client[DB_NAME]

    # Read all documents, optionally capped by mongoDbLimit.
    cursor = db.gitlablogs.find({})
    if mongoDbLimit:
        cursor = cursor.limit(mongoDbLimit)

    jobs = pd.DataFrame(list(cursor)) # Convert to DataFrame
    print("List of data available iside of db structure")

    # Save CSV
    if csvSave:
        jobs.to_csv(csvTodayPath, index=False)
else:
    print("@Note-01 ---- 1350489220 -----")
    # Prefer today's export (original behaviour); when it does not exist,
    # fall back to the explicitly configured csvFileRead instead of failing.
    csvPath = csvTodayPath
    if not os.path.exists(csvPath):
        csvPath = os.path.join(csvDir, csvFileRead)
    jobs = pd.read_csv(csvPath)

jobs.dtypes

List of data available iside of db structure


_id                                 object
jobId                                int64
projectId                            int64
__v                                  int64
allow_failure                         bool
commitId                            object
commitMessage                       object
commitTitle                         object
committedEmail                      object
created_at                  datetime64[ns]
duration                           float64
jobLog                              object
jobName                             object
jobRef                              object
jobStage                            object
jobStatus                           object
pipelineId                           int64
pipelineRef                         object
pipelineStatus                      object
pipelineUrl                         object
pipelineWebUrl                      object
projectDescriptions                 object
projectName                         object
projectName

# Analysis of data volumes.
## In this section you can obtain general information related to the volume of data.
## Percentage of type jobs

In [1]:
# Data volume report: number of projects and jobs, plus the share of
# success / failed / canceled jobs among the jobs in those three statuses.
def _percentage(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return (part / total) * 100 if total else 0.0

print("------ DATA REPORT ------")
projectsNumber = len(jobs["projectName"].unique())
print("Number of Projects: "+ str(projectsNumber))
numberOfJobs = len(jobs.index)
print("Number of pipelines Jobs (Steps): "+ str(numberOfJobs))
numberOfSuccess = (jobs["jobStatus"] == "success").sum()
numberOfFailed = (jobs["jobStatus"] == "failed").sum()
numberOfCancel = (jobs["jobStatus"] == "canceled").sum()

# Denominator: only jobs in one of the three reported statuses.
numberOfReported = numberOfSuccess + numberOfFailed + numberOfCancel
successPercentage = _percentage(numberOfSuccess, numberOfReported)
failedPercentage = _percentage(numberOfFailed, numberOfReported)
canceledPercentage = _percentage(numberOfCancel, numberOfReported)

print("Number of success Jobs (Steps): "+ str(numberOfSuccess) + " or "+str(successPercentage) + " %")
print("Number of failed Jobs (Steps): "+ str(numberOfFailed)+ " or "+str(failedPercentage) + " %")
print("Number of canceled Jobs (Steps): "+ str(numberOfCancel)+ " or "+str(canceledPercentage) + " %")

------ DATA REPORT ------


NameError: name 'jobs' is not defined

In [7]:
# Pie chart of the jobs split by jobStatus.
# (The former make_subplots() figure was overwritten before use, so it is gone.)
fig = px.pie(jobs, names='jobStatus', title='Pipelines Jobs results',color="jobStatus",template=plotly_template)
fig.show()

# Number of fails by stage number

In [8]:
def createBarGraphByJobStatus(variable):
    """Show a horizontal bar chart with the number of jobs per status, grouped by ``variable``.

    Reads the module-level ``jobs`` DataFrame and ``plotly_template``.

    Args:
        variable: name of the ``jobs`` column to group by (one bar group per
            distinct value of that column).

    The previous implementation only renamed the ``jobStatus`` column before
    counting, so every status series held the total number of jobs of the
    group. The counts are now computed per (group, status) pair.
    """
    # Keep the statuses in first-seen order so the series order is unchanged.
    jobStatusUnique = jobs["jobStatus"].dropna().unique().tolist()

    # Rows: values of `variable`; columns: job statuses; cells: number of jobs.
    counts = pd.crosstab(jobs[variable], jobs["jobStatus"])
    counts = counts.reindex(columns=jobStatusUnique, fill_value=0)

    # Sort by the first status; skipped when there is no status at all.
    if jobStatusUnique:
        counts = counts.sort_values(by=[jobStatusUnique[0]], ascending=True)

    fig = px.bar(counts, orientation='h',template=plotly_template,title="Number of fails by "+ variable)
    fig.show()
    
# Draw one chart per grouping column of `jobs`: projectName, jobStage and pipelineRef.
print(" Number of jobs projectName types")
createBarGraphByJobStatus("projectName")    
print(" Number of jobs stage types")
createBarGraphByJobStatus("jobStage")
# Same chart grouped by the pipelineRef column (no heading is printed for it).
createBarGraphByJobStatus("pipelineRef")


 Number of jobs projectName types


 Number of jobs stage types


# Calculate the similarity between texts using cosine similarity

In [11]:
# Init data
# Translation table that deletes every ASCII punctuation character.
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
lemmer = nltk.stem.WordNetLemmatizer()
# StemTokens() referenced `stemmer` but it was never created (NameError on first use).
stemmer = nltk.stem.PorterStemmer()


#! Create functions
def StemTokens(tokens):
    """Return the stem of every token in ``tokens``."""
    return [stemmer.stem(token) for token in tokens]


def StemNormalize(text):
    """Lowercase ``text``, drop punctuation, tokenize it and stem each token."""
    return StemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))


def LemTokens(tokens):
    """Return the lemma of every token in ``tokens``."""
    return [lemmer.lemmatize(token) for token in tokens]


def LemNormalize(text):
    """Lowercase ``text``, drop punctuation, tokenize it and lemmatize each token."""
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))


def idf(n,df):
    """Return the smoothed inverse document frequency ``log((n + 1) / (df + 1)) + 1``."""
    result = math.log((n+1.0)/(df+1.0)) + 1
    return result


# Bag-of-words vectorizer that tokenizes with LemNormalize.
# NOTE(review): the notebook output shows scikit-learn warning that
# stop_words='english' is inconsistent with this tokenizer.
LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')

# Filter text
def _commonWords(documents, groupIds):
    """Return the words of the later documents of a group that also occur in its first document.

    For every document of ``groupIds`` after the first one, its space-separated
    words are kept (in order, each followed by one space) when they appear as a
    whole word in the first document of the group. Returns "" for groups with
    fewer than two documents or without any shared word.
    """
    if len(groupIds) < 2:
        return ""
    referenceWords = set(documents[groupIds[0]].split())
    kept = []
    for docId in groupIds[1:]:
        for word in documents[docId].split(sep=' '):
            if word in referenceWords:
                kept.append(word + " ")
    return "".join(kept)


def unionSameText(documents, similarity):
    """Collapse texts whose TF-IDF cosine similarity exceeds ``similarity``.

    Args:
        documents: list of strings to group.
        similarity: threshold; two documents are considered similar when their
            cosine similarity is strictly greater than this value.

    Returns:
        A list of unique strings in first-seen order. A document with similar
        neighbours is replaced by the words it shares with them (see
        ``_commonWords``); when no word is shared, the last similar document
        is used. With zero or one document the input is returned unchanged.

    Fixes over the previous version: the shared-word step used
    ``documents[0]`` / ``documents[count]`` instead of the documents that were
    actually found similar, matched by substring instead of whole word, and
    returned the result in a run-dependent order (``set`` de-duplication).
    """
    if len(documents) <= 1:
        return list(documents)

    try:
        # Term counts per document; uses the module-level LemVectorizer.
        tf_matrix = LemVectorizer.fit_transform(documents)
    except ValueError:
        # CountVectorizer raises ValueError when the vocabulary is empty
        # (e.g. only stop words): nothing can be compared, keep the unique texts.
        return list(dict.fromkeys(documents))

    tfidf_matrix = TfidfTransformer(norm="l2").fit_transform(tf_matrix)
    # Rows are L2-normalised, so the Gram matrix holds the cosine similarities.
    cos_similarity_matrix = (tfidf_matrix @ tfidf_matrix.T).toarray()

    merged = []
    for i, row in enumerate(cos_similarity_matrix):
        sameId = [j for j, score in enumerate(row) if score > similarity]
        # Fallback text: the last similar document, or the document itself
        # when nothing exceeds the threshold.
        fallback = documents[sameId[-1]] if sameId else documents[i]
        merged.append(_commonWords(documents, sameId) or fallback)

    # Order-preserving de-duplication.
    return list(dict.fromkeys(merged))

def unique_list(l):
    """Return the items of ``l`` in first-seen order, dropping later duplicates.

    Membership is tested with ``in`` on a list, so the items do not need to be
    hashable.
    """
    seen = []
    for item in l:
        if item in seen:
            continue
        seen.append(item)
    return seen

# Demo: group four sample messages with unionSameText and a 0.4 threshold.
textExample = ['assertion error err assertion global var', 'assertion error err assertion class foo', 'npm err test failed', 'error job failed exit code']
print("==============================")
print(" Before of apply the filters")
print(textExample)
# textExample is overwritten with the grouped result.
textExample = unionSameText(textExample,0.4)
print(" After of apply the filters")
print(textExample)

 Before of apply the filters
['assertion error err assertion global var', 'assertion error err assertion class foo', 'npm err test failed', 'error job failed exit code']
 After of apply the filters
['error job failed exit code', 'npm err test failed', 'assertion error err assertion ']



Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.



# Filter logs data
## Get fragment of text with error 

In [None]:
# Get fragment of text with error
# ==============================================================================
# Emoji ranges removed from every candidate line (compiled once instead of per line).
_EMOJI_PATTERN = re.compile("["
      u"\U0001F600-\U0001F64F"  # emoticons
      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
      u"\U0001F680-\U0001F6FF"  # transport & map symbols
      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
           "]+", flags=re.UNICODE)

# NOTE(review): this pattern can never match. The unescaped "]" after the
# escaped backslash closes the character class early, and the remainder then
# requires a literal sequence followed by a "^" start-of-string anchor.
# It is kept byte-identical so the output of the function does not change;
# the punctuation is in practice removed by the replacements in _normalizeLine.
_SPECIAL_CHARS_REGEX = '[\\!\\"\\#\\>\\<\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\;\\\\]\\<\\=\\,\\>\\?\\:\\-\\|\\@\\@\\\\^_\\`\\{\\|\\}\\~]'

# "=", "_" and "[" become a space; ":", ";", "'", "!" and "]" are deleted.
_CHAR_TABLE = str.maketrans({'=': ' ', '_': ' ', '[': ' ',
                             ':': None, ';': None, "'": None, '!': None, ']': None})


def _stripLineNoise(line):
    """Remove URLs, ``YYYY-MM-DD`` dates, digits and emojis from one log line."""
    text = re.sub(_SPECIAL_CHARS_REGEX, ' ', line)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'\d{4}-\d{2}-\d{2}', ' ', text)  # remove date
    text = re.sub(' +', ' ', text)
    text = re.sub('- - t : :', '', text)
    text = re.sub(r'[0-9]', '', text)
    return _EMOJI_PATTERN.sub(r'', text)


def _normalizeLine(text):
    """Reduce a cleaned line to its first sentence as unique, space-separated words."""
    text = text.strip().translate(_CHAR_TABLE)
    text = re.sub(' +', ' ', text)  # Delete multiple spaces
    # Keep the text before the first "."; the previous slice
    # text[0:text.find(".")] dropped the last character when there was no ".".
    text = text.partition(".")[0]
    text = ' '.join(unique_list(text.split()))  # Delete duplicate words
    # wordninja re-splits the text into words; duplicates are dropped again.
    return ' '.join(unique_list(wordninja.split(text)))


def getErrorText(texto, whiteList=("err",), similarityThreshold=0.4):
    """Extract the distinct error-like messages from a raw job log.

    Args:
        texto: raw log text. Values without a ``lower()`` method (e.g. ``None``
            or a float NaN) are treated as "no log".
        whiteList: substrings (matched against the lowercased line) that mark a
            log line as relevant. Defaults to the previously hard-coded ``"err"``.
        similarityThreshold: threshold passed to ``unionSameText`` to merge
            similar messages. Defaults to the previously hard-coded ``0.4``.

    Returns:
        ``""`` when ``texto`` is not a string; otherwise a list of cleaned
        messages, each made of unique space-separated words.
    """
    try:
        lowered = texto.lower()  # Convert all text to lowercase.
    except AttributeError:
        return ""

    # Web page removal (words beginning with "http")
    lowered = re.sub(r'http\S+', ' ', lowered)

    kept = []  # cleaned messages: unique, non-blank, longer than 2 characters
    for line in lowered.split(sep='\n'):
        if not any(word in line for word in whiteList):
            continue
        text = _stripLineNoise(line)
        # As before, a line whose partially cleaned text is already collected
        # is not normalised again (it would be dropped as a duplicate anyway).
        if text not in kept:
            text = _normalizeLine(text)
        if len(text) > 2 and text.strip() and text not in kept:
            kept.append(text)

    grouped = unionSameText(kept, similarityThreshold)
    return [' '.join(unique_list(text.split())) for text in grouped]


textExample = '''
\u001b[0KRunning with gitlab-runner 13.11.0 (7f7a4bb0)\n\u001b[0;m\u001b[0K  on pax-italia-pot HRhNkEr3\n\u001b[0;msection_start:1628150616:prepare_executor\r\u001b[0K\u001b[0K\u001b[36;1mPreparing the \"shell\" executor\u001b[0;m\n\u001b[0;m\u001b[0KUsing Shell executor...\n\u001b[0;msection_end:1628150616:prepare_executor\r\u001b[0Ksection_start:1628150616:prepare_script\r\u001b[0K\u001b[0K\u001b[36;1mPreparing environment\u001b[0;m\n\u001b[0;mRunning on TEST-POT...\r\nsection_end:1628150616:prepare_script\r\u001b[0Ksection_start:1628150616:get_sources\r\u001b[0K\u001b[0K\u001b[36;1mGetting source from Git repository\u001b[0;m\n\u001b[0;m\u001b[32;1mFetching changes...\u001b[0;m\r\nReinitialized existing Git repository in C:/GitLab-Runner/builds/HRhNkEr3/0/tech-gap-italia/pax-italia-pot/pax-italia-pot-api/.git/\n\u001b[32;1mChecking out fec6b887 as deployTest...\u001b[0;m\r\nRemoving .m2/\nRemoving db/\nRemoving potStatusCode.txt\nRemoving target/\ngit-lfs/2.13.3 (GitHub; windows amd64; go 1.16.2; git a5e65851)\n\r\n\u001b[32;1mSkipping Git submodules setup\u001b[0;m\r\nsection_end:1628150622:get_sources\r\u001b[0Ksection_start:1628150622:restore_cache\r\u001b[0K\u001b[0K\u001b[36;1mRestoring cache\u001b[0;m\n\u001b[0;mVersion:      13.11.0\nGit revision: 7f7a4bb0\nGit branch:   13-11-stable\nGO version:   go1.13.8\nBuilt:        2021-04-20T17:02:32+0000\nOS/Arch:      windows/amd64\n\u001b[32;1mChecking cache for %CI_COMMIT_REF_SLUG%-1...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=3224 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\nNo URL provided, cache will not be downloaded from shared cache server. 
Instead a local version of cache will be extracted.\u001b[0;m \n\u001b[32;1mSuccessfully extracted cache\u001b[0;m\r\nsection_end:1628150623:restore_cache\r\u001b[0Ksection_start:1628150623:download_artifacts\r\u001b[0K\u001b[0K\u001b[36;1mDownloading artifacts\u001b[0;m\n\u001b[0;mVersion:      13.11.0\nGit revision: 7f7a4bb0\nGit branch:   13-11-stable\nGO version:   go1.13.8\nBuilt:        2021-04-20T17:02:32+0000\nOS/Arch:      windows/amd64\n\u001b[32;1mDownloading artifacts for updateDataBase (1478970780)...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=2688 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n\u001b[31;1mERROR: Downloading artifacts from coordinator... error couldn't execute GET against https://gitlab.com/api/v4/jobs/1478970780/artifacts?direct_download=true: Get https://storage.googleapis.com/gitlab-gprd-artifacts/a5/35/a5358b9b40fe2e633b7d371c0dc4767d051c7fc2cbcc5d15e596c9c1bcb4d0ac/2021_08_05/1478970780/1591261703/artifacts.zip?GoogleAccessId=gitlab-object-storage-prd@gitlab-production.iam.gserviceaccount.com&Signature=eO9MxI2VRMDf1Po5llLAjH6nRO%2FbfZmI%2Fpt9%2F8Lvtnl8KW6BsSIdHH%2B4QJmy%0Awb8qlZxqkxl9VKyb1x9C%2FPJrDoH9Pz%2FwqmuVBQjWCiUbl2mWdHKrBiJr1A9y%0A%2BlNkIOp8Cdn1XWe9m50qCh1gBO5I5CRgzslCzMYZn4QcoZtC%2FQuPEzHm2n0x%0AemV8kGdcq3Z2JIom8oqU91wkdxx1a4IvvXCmgIAdPZ26OInj%2BhKA4cVeCp%2FO%0AKbPqLVBz%2BZgnuj5klSJdPei2I3vb%2F9D6v02K3mgqPwim0u9vjLCUHDZZVUU1%0AZbJ7Ms63VIuYY8OWrwmok2ZtZiFj%2BVoUeLav1xECKg%3D%3D&Expires=1628151224: x509: certificate signed by unknown authority\u001b[0;m  \u001b[31;1mid\u001b[0;m=1478970780 \u001b[31;1mtoken\u001b[0;m=gZyEXDvG\n\u001b[0;33mWARNING: Retrying...                              \u001b[0;m  \u001b[0;33merror\u001b[0;m=invalid argument\nDownloading artifacts from coordinator... 
ok      \u001b[0;m  id\u001b[0;m=1478970780 responseStatus\u001b[0;m=200 OK token\u001b[0;m=gZyEXDvG\n\u001b[32;1mDownloading artifacts for compilePackage (1478970783)...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=5644 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n\u001b[31;1mERROR: Downloading artifacts from coordinator... error couldn't execute GET against https://gitlab.com/api/v4/jobs/1478970783/artifacts?direct_download=true: Get https://storage.googleapis.com/gitlab-gprd-artifacts/a5/35/a5358b9b40fe2e633b7d371c0dc4767d051c7fc2cbcc5d15e596c9c1bcb4d0ac/2021_08_05/1478970783/1591271101/artifacts.zip?GoogleAccessId=gitlab-object-storage-prd@gitlab-production.iam.gserviceaccount.com&Signature=1ejAvX%2FpdR%2FbDY4W9t7gNwTj%2F%2FoQKBzwaYvKO7pLWFka30UwPLOJJ7fi4vCe%0A1CZojYoZdQ0Yrb8IMhTG9jg6WTf3b5zsWPbwlwMdU%2BZn4hRP01NjY9GRPeqx%0A4S9A2T85qUjL%2F%2BOfXrorqP%2FiFB%2BgI%2BO0pemERPNtqI27ms%2BS4VJATLzpkpIl%0AMRhZPMd0YYP9StKmoWxPEwNtXjcKh%2FevoourBU2Orn%2BXIbgqlHpZtntZagpv%0ARt8KUTohOwqjah9N53J%2B%2BRBtqmmIN6z2tyglu1kqqVBFdPMRntt8PXjXNrIK%0AZIRt9C1wR%2FKzdJD4ZEcWkmIz8LELSYuFJXYUUTGGzA%3D%3D&Expires=1628151226: x509: certificate signed by unknown authority\u001b[0;m  \u001b[31;1mid\u001b[0;m=1478970783 \u001b[31;1mtoken\u001b[0;m=SEszxyKc\n\u001b[0;33mWARNING: Retrying...                              \u001b[0;m  \u001b[0;33merror\u001b[0;m=invalid argument\nDownloading artifacts from coordinator... ok      \u001b[0;m  id\u001b[0;m=1478970783 responseStatus\u001b[0;m=200 OK token\u001b[0;m=SEszxyKc\n\u001b[32;1mDownloading artifacts for smokeTest (1478970788)...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=2608 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n\u001b[31;1mERROR: Downloading artifacts from coordinator... 
error couldn't execute GET against https://gitlab.com/api/v4/jobs/1478970788/artifacts?direct_download=true: Get https://storage.googleapis.com/gitlab-gprd-artifacts/a5/35/a5358b9b40fe2e633b7d371c0dc4767d051c7fc2cbcc5d15e596c9c1bcb4d0ac/2021_08_05/1478970788/1591285532/artifacts.zip?GoogleAccessId=gitlab-object-storage-prd@gitlab-production.iam.gserviceaccount.com&Signature=HavL1z7T%2BJwTsaIW0zHoB9aUI4vkawvyoDunTn85PukDEltxhxfi4ZHJ2130%0At7R1PDjZE18RfAe%2B80zCY6nY66%2B%2Be8%2F096sHPaqf5vkhBE16ml%2FQH0ALLn7F%0ARmDQoI83rcOTEE5%2BK35RDh8I9gV%2BP62H90zSTlCfxWx4OUqHfssA4OI26B10%0Am9bDXfXORZys78YNFMGMAidb7PaMc7HVDf%2FhhRwo%2BqpaROCttOxyYVi%2Bgsgj%0AXwwK%2BUdAtf7gHi%2BU9CpBXV2wOjjk6wqow62vxsz0%2F5ioBedAkpTX5CFiFCfO%0AgmuptgwEYSeLu%2BOPnXKNFH1kYoIf%2BoqdXrlOmCkxlA%3D%3D&Expires=1628151247: x509: certificate signed by unknown authority\u001b[0;m  \u001b[31;1mid\u001b[0;m=1478970788 \u001b[31;1mtoken\u001b[0;m=mzxPxF9Y\n\u001b[0;33mWARNING: Retrying...                              \u001b[0;m  \u001b[0;33merror\u001b[0;m=invalid argument\nDownloading artifacts from coordinator... 
ok      \u001b[0;m  id\u001b[0;m=1478970788 responseStatus\u001b[0;m=200 OK token\u001b[0;m=mzxPxF9Y\nsection_end:1628150649:download_artifacts\r\u001b[0Ksection_start:1628150649:step_script\r\u001b[0K\u001b[0K\u001b[36;1mExecuting \"step_script\" stage of the job script\u001b[0;m\n\u001b[0;m\u001b[32;1m$ echo \"CREATE PRODUCTION ACTIFACTS\"\u001b[0;m\r\nCREATE PRODUCTION ACTIFACTS\r\nsection_end:1628150650:step_script\r\u001b[0Ksection_start:1628150650:archive_cache\r\u001b[0K\u001b[0K\u001b[36;1mSaving cache for successful job\u001b[0;m\n\u001b[0;mVersion:      13.11.0\nGit revision: 7f7a4bb0\nGit branch:   13-11-stable\nGO version:   go1.13.8\nBuilt:        2021-04-20T17:02:32+0000\nOS/Arch:      windows/amd64\n\u001b[32;1mCreating cache %CI_COMMIT_REF_SLUG%-1...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=1832 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n\u001b[0;33mWARNING: binaries/: no matching files             \u001b[0;m \nArchive is up to date!                            
\u001b[0;m \n\u001b[32;1mCreated cache\u001b[0;m\r\nsection_end:1628150650:archive_cache\r\u001b[0Ksection_start:1628150650:upload_artifacts_on_success\r\u001b[0K\u001b[0K\u001b[36;1mUploading artifacts for successful job\u001b[0;m\n\u001b[0;mVersion:      13.11.0\nGit revision: 7f7a4bb0\nGit branch:   13-11-stable\nGO version:   go1.13.8\nBuilt:        2021-04-20T17:02:32+0000\nOS/Arch:      windows/amd64\n\u001b[32;1mUploading artifacts...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=5780 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n./db: found 20 matching files and directories     \u001b[0;m \n./target/tool-repair-api-0.0.1-SNAPSHOT.jar: found 1 matching files and directories\u001b[0;m \n./flyway.production.properties: found 1 matching files and directories\u001b[0;m \n./InstructionsDeploy.txt: found 1 matching files and directories\u001b[0;m \n./pom.xml: found 1 matching files and directories \u001b[0;m \nUploading artifacts as \"archive\" to coordinator... ok\u001b[0;m  id\u001b[0;m=1478970795 responseStatus\u001b[0;m=201 Created token\u001b[0;m=YqeuAFFa\nsection_end:1628150666:upload_artifacts_on_success\r\u001b[0Ksection_start:1628150666:cleanup_file_variables\r\u001b[0K\u001b[0K\u001b[36;1mCleaning up file based variables\u001b[0;m\n\u001b[0;msection_end:1628150667:cleanup_file_variables\r\u001b[0K\u001b[32;1mJob succeeded\n\u001b[0;m
'''

# Extract the error messages from the sample job log held in textExample and print them.
errorText = getErrorText(textExample)
print(errorText)


# Remove words that do not exist

In [None]:
from spellchecker import SpellChecker

# Shared spell checker instance used by check().
spell = SpellChecker()


def check(word):
    """Return True when the spell checker's correction of ``word`` equals ``word``."""
    return word == spell.correction(word)


check("hello")


def removeWordIfNotExist(textList):
    """Filter every text of `textList`, keeping only the words accepted by `check`.

    Each text is split on single spaces; the surviving words are joined back
    with single spaces. The returned list has one entry per input text (an
    entry may be the empty string when no word survives).
    """
    # if t in wordsList.words() and len(t) >1: # TODO TIME PROBLEM
    return [
        ' '.join(word for word in sentence.split(sep = ' ') if check(word))
        for sentence in textList
    ]

# Small manual check of removeWordIfNotExist on two already-cleaned sentences.
textExample = ['m error downloading artifacts from coordinator', 'm warning retrying']

# Each sentence keeps only the words accepted by check().
text = removeWordIfNotExist(textExample)
print(text)




In [None]:
textExample = '''
\u001b[0KRunning with gitlab-runner 13.11.0 (7f7a4bb0)\n\u001b[0;m\u001b[0K  on pax-italia-pot HRhNkEr3\n\u001b[0;msection_start:1628150616:prepare_executor\r\u001b[0K\u001b[0K\u001b[36;1mPreparing the \"shell\" executor\u001b[0;m\n\u001b[0;m\u001b[0KUsing Shell executor...\n\u001b[0;msection_end:1628150616:prepare_executor\r\u001b[0Ksection_start:1628150616:prepare_script\r\u001b[0K\u001b[0K\u001b[36;1mPreparing environment\u001b[0;m\n\u001b[0;mRunning on TEST-POT...\r\nsection_end:1628150616:prepare_script\r\u001b[0Ksection_start:1628150616:get_sources\r\u001b[0K\u001b[0K\u001b[36;1mGetting source from Git repository\u001b[0;m\n\u001b[0;m\u001b[32;1mFetching changes...\u001b[0;m\r\nReinitialized existing Git repository in C:/GitLab-Runner/builds/HRhNkEr3/0/tech-gap-italia/pax-italia-pot/pax-italia-pot-api/.git/\n\u001b[32;1mChecking out fec6b887 as deployTest...\u001b[0;m\r\nRemoving .m2/\nRemoving db/\nRemoving potStatusCode.txt\nRemoving target/\ngit-lfs/2.13.3 (GitHub; windows amd64; go 1.16.2; git a5e65851)\n\r\n\u001b[32;1mSkipping Git submodules setup\u001b[0;m\r\nsection_end:1628150622:get_sources\r\u001b[0Ksection_start:1628150622:restore_cache\r\u001b[0K\u001b[0K\u001b[36;1mRestoring cache\u001b[0;m\n\u001b[0;mVersion:      13.11.0\nGit revision: 7f7a4bb0\nGit branch:   13-11-stable\nGO version:   go1.13.8\nBuilt:        2021-04-20T17:02:32+0000\nOS/Arch:      windows/amd64\n\u001b[32;1mChecking cache for %CI_COMMIT_REF_SLUG%-1...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=3224 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\nNo URL provided, cache will not be downloaded from shared cache server. 
Instead a local version of cache will be extracted.\u001b[0;m \n\u001b[32;1mSuccessfully extracted cache\u001b[0;m\r\nsection_end:1628150623:restore_cache\r\u001b[0Ksection_start:1628150623:download_artifacts\r\u001b[0K\u001b[0K\u001b[36;1mDownloading artifacts\u001b[0;m\n\u001b[0;mVersion:      13.11.0\nGit revision: 7f7a4bb0\nGit branch:   13-11-stable\nGO version:   go1.13.8\nBuilt:        2021-04-20T17:02:32+0000\nOS/Arch:      windows/amd64\n\u001b[32;1mDownloading artifacts for updateDataBase (1478970780)...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=2688 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n\u001b[31;1mERROR: Downloading artifacts from coordinator... error couldn't execute GET against https://gitlab.com/api/v4/jobs/1478970780/artifacts?direct_download=true: Get https://storage.googleapis.com/gitlab-gprd-artifacts/a5/35/a5358b9b40fe2e633b7d371c0dc4767d051c7fc2cbcc5d15e596c9c1bcb4d0ac/2021_08_05/1478970780/1591261703/artifacts.zip?GoogleAccessId=gitlab-object-storage-prd@gitlab-production.iam.gserviceaccount.com&Signature=eO9MxI2VRMDf1Po5llLAjH6nRO%2FbfZmI%2Fpt9%2F8Lvtnl8KW6BsSIdHH%2B4QJmy%0Awb8qlZxqkxl9VKyb1x9C%2FPJrDoH9Pz%2FwqmuVBQjWCiUbl2mWdHKrBiJr1A9y%0A%2BlNkIOp8Cdn1XWe9m50qCh1gBO5I5CRgzslCzMYZn4QcoZtC%2FQuPEzHm2n0x%0AemV8kGdcq3Z2JIom8oqU91wkdxx1a4IvvXCmgIAdPZ26OInj%2BhKA4cVeCp%2FO%0AKbPqLVBz%2BZgnuj5klSJdPei2I3vb%2F9D6v02K3mgqPwim0u9vjLCUHDZZVUU1%0AZbJ7Ms63VIuYY8OWrwmok2ZtZiFj%2BVoUeLav1xECKg%3D%3D&Expires=1628151224: x509: certificate signed by unknown authority\u001b[0;m  \u001b[31;1mid\u001b[0;m=1478970780 \u001b[31;1mtoken\u001b[0;m=gZyEXDvG\n\u001b[0;33mWARNING: Retrying...                              \u001b[0;m  \u001b[0;33merror\u001b[0;m=invalid argument\nDownloading artifacts from coordinator... 
ok      \u001b[0;m  id\u001b[0;m=1478970780 responseStatus\u001b[0;m=200 OK token\u001b[0;m=gZyEXDvG\n\u001b[32;1mDownloading artifacts for compilePackage (1478970783)...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=5644 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n\u001b[31;1mERROR: Downloading artifacts from coordinator... error couldn't execute GET against https://gitlab.com/api/v4/jobs/1478970783/artifacts?direct_download=true: Get https://storage.googleapis.com/gitlab-gprd-artifacts/a5/35/a5358b9b40fe2e633b7d371c0dc4767d051c7fc2cbcc5d15e596c9c1bcb4d0ac/2021_08_05/1478970783/1591271101/artifacts.zip?GoogleAccessId=gitlab-object-storage-prd@gitlab-production.iam.gserviceaccount.com&Signature=1ejAvX%2FpdR%2FbDY4W9t7gNwTj%2F%2FoQKBzwaYvKO7pLWFka30UwPLOJJ7fi4vCe%0A1CZojYoZdQ0Yrb8IMhTG9jg6WTf3b5zsWPbwlwMdU%2BZn4hRP01NjY9GRPeqx%0A4S9A2T85qUjL%2F%2BOfXrorqP%2FiFB%2BgI%2BO0pemERPNtqI27ms%2BS4VJATLzpkpIl%0AMRhZPMd0YYP9StKmoWxPEwNtXjcKh%2FevoourBU2Orn%2BXIbgqlHpZtntZagpv%0ARt8KUTohOwqjah9N53J%2B%2BRBtqmmIN6z2tyglu1kqqVBFdPMRntt8PXjXNrIK%0AZIRt9C1wR%2FKzdJD4ZEcWkmIz8LELSYuFJXYUUTGGzA%3D%3D&Expires=1628151226: x509: certificate signed by unknown authority\u001b[0;m  \u001b[31;1mid\u001b[0;m=1478970783 \u001b[31;1mtoken\u001b[0;m=SEszxyKc\n\u001b[0;33mWARNING: Retrying...                              \u001b[0;m  \u001b[0;33merror\u001b[0;m=invalid argument\nDownloading artifacts from coordinator... ok      \u001b[0;m  id\u001b[0;m=1478970783 responseStatus\u001b[0;m=200 OK token\u001b[0;m=SEszxyKc\n\u001b[32;1mDownloading artifacts for smokeTest (1478970788)...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=2608 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n\u001b[31;1mERROR: Downloading artifacts from coordinator... 
error couldn't execute GET against https://gitlab.com/api/v4/jobs/1478970788/artifacts?direct_download=true: Get https://storage.googleapis.com/gitlab-gprd-artifacts/a5/35/a5358b9b40fe2e633b7d371c0dc4767d051c7fc2cbcc5d15e596c9c1bcb4d0ac/2021_08_05/1478970788/1591285532/artifacts.zip?GoogleAccessId=gitlab-object-storage-prd@gitlab-production.iam.gserviceaccount.com&Signature=HavL1z7T%2BJwTsaIW0zHoB9aUI4vkawvyoDunTn85PukDEltxhxfi4ZHJ2130%0At7R1PDjZE18RfAe%2B80zCY6nY66%2B%2Be8%2F096sHPaqf5vkhBE16ml%2FQH0ALLn7F%0ARmDQoI83rcOTEE5%2BK35RDh8I9gV%2BP62H90zSTlCfxWx4OUqHfssA4OI26B10%0Am9bDXfXORZys78YNFMGMAidb7PaMc7HVDf%2FhhRwo%2BqpaROCttOxyYVi%2Bgsgj%0AXwwK%2BUdAtf7gHi%2BU9CpBXV2wOjjk6wqow62vxsz0%2F5ioBedAkpTX5CFiFCfO%0AgmuptgwEYSeLu%2BOPnXKNFH1kYoIf%2BoqdXrlOmCkxlA%3D%3D&Expires=1628151247: x509: certificate signed by unknown authority\u001b[0;m  \u001b[31;1mid\u001b[0;m=1478970788 \u001b[31;1mtoken\u001b[0;m=mzxPxF9Y\n\u001b[0;33mWARNING: Retrying...                              \u001b[0;m  \u001b[0;33merror\u001b[0;m=invalid argument\nDownloading artifacts from coordinator... 
ok      \u001b[0;m  id\u001b[0;m=1478970788 responseStatus\u001b[0;m=200 OK token\u001b[0;m=mzxPxF9Y\nsection_end:1628150649:download_artifacts\r\u001b[0Ksection_start:1628150649:step_script\r\u001b[0K\u001b[0K\u001b[36;1mExecuting \"step_script\" stage of the job script\u001b[0;m\n\u001b[0;m\u001b[32;1m$ echo \"CREATE PRODUCTION ACTIFACTS\"\u001b[0;m\r\nCREATE PRODUCTION ACTIFACTS\r\nsection_end:1628150650:step_script\r\u001b[0Ksection_start:1628150650:archive_cache\r\u001b[0K\u001b[0K\u001b[36;1mSaving cache for successful job\u001b[0;m\n\u001b[0;mVersion:      13.11.0\nGit revision: 7f7a4bb0\nGit branch:   13-11-stable\nGO version:   go1.13.8\nBuilt:        2021-04-20T17:02:32+0000\nOS/Arch:      windows/amd64\n\u001b[32;1mCreating cache %CI_COMMIT_REF_SLUG%-1...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=1832 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n\u001b[0;33mWARNING: binaries/: no matching files             \u001b[0;m \nArchive is up to date!                            
\u001b[0;m \n\u001b[32;1mCreated cache\u001b[0;m\r\nsection_end:1628150650:archive_cache\r\u001b[0Ksection_start:1628150650:upload_artifacts_on_success\r\u001b[0K\u001b[0K\u001b[36;1mUploading artifacts for successful job\u001b[0;m\n\u001b[0;mVersion:      13.11.0\nGit revision: 7f7a4bb0\nGit branch:   13-11-stable\nGO version:   go1.13.8\nBuilt:        2021-04-20T17:02:32+0000\nOS/Arch:      windows/amd64\n\u001b[32;1mUploading artifacts...\u001b[0;m\r\nRuntime platform                                  \u001b[0;m  arch\u001b[0;m=amd64 os\u001b[0;m=windows pid\u001b[0;m=5780 revision\u001b[0;m=7f7a4bb0 version\u001b[0;m=13.11.0\n./db: found 20 matching files and directories     \u001b[0;m \n./target/tool-repair-api-0.0.1-SNAPSHOT.jar: found 1 matching files and directories\u001b[0;m \n./flyway.production.properties: found 1 matching files and directories\u001b[0;m \n./InstructionsDeploy.txt: found 1 matching files and directories\u001b[0;m \n./pom.xml: found 1 matching files and directories \u001b[0;m \nUploading artifacts as \"archive\" to coordinator... ok\u001b[0;m  id\u001b[0;m=1478970795 responseStatus\u001b[0;m=201 Created token\u001b[0;m=YqeuAFFa\nsection_end:1628150666:upload_artifacts_on_success\r\u001b[0Ksection_start:1628150666:cleanup_file_variables\r\u001b[0K\u001b[0K\u001b[36;1mCleaning up file based variables\u001b[0;m\n\u001b[0;msection_end:1628150667:cleanup_file_variables\r\u001b[0K\u001b[32;1mJob succeeded\n\u001b[0;m
'''

# Extract the error sentences from the raw log (getErrorText is defined earlier
# in the notebook), then drop the words rejected by the spell checker.
errorText = getErrorText(textExample)
errorText = removeWordIfNotExist(errorText)
print(errorText)


In [None]:
textExample = '''
    ✓ Statements \n ✓ Exceptions \n ✓ Static \n ✓ Class (55ms) \n ✓ Class props (47ms) \n 3) Types \n 19 passing (1s) \n 3 failing \n 1) AST translation \n      Reserved keyword: \n 2) AST translation
       Templates:
      AssertionError [ERR_ASSERTION]: 'global[var];' == 'global[var_r];'
      + expected - actual
      
      at translates (test/helper.js:24:16)
      at Context.<anonymous> (test/translation.js:91:9)
      at processImmediate (internal/timers.js:456:21)
  3) AST translation
       Types:
      AssertionError [ERR_ASSERTION]: 'class Foo {\n' +
  '  annotated(untyped, class: Cls.Name, self: Foo, array: {} | any[], callable: Function, bool: boolean, float: number, int: number, string: string, iter: {} | any[]) {}\n' +
  '\n' +
  '};' == 'class Foo {\n' +
  '        annotated(untyped, class_r: Cls.Name, self: Foo, array: {} | any[], callable: Function, bool: boolean, float: number, int: number, string: string, iter: {} | any[]) {}};'
      + expected - actual
       class Foo {
      -  annotated(untyped, class: Cls.Name, self: Foo, array: {} | any[], callable: Function, bool: boolean, float: number, int: number, string: string, iter: {} | any[]) {}
      -
      -};
      +        annotated(untyped, class_r: Cls.Name, self: Foo, array: {} | any[], callable: Function, bool: boolean, float: number, int: number, string: string, iter: {} | any[]) {}};
      
      at translates (test/helper.js:24:16)
      at Context.<anonymous> (test/translation.js:198:9)
      at processImmediate (internal/timers.js:456:21)
npm ERR! Test failed.  See above for more details.
ERROR: Job failed: exit code 1
'''

# Run the error-text filter on a failing npm test log.
errorText = getErrorText(textExample)
print(errorText)
# Output recorded from a previous run:
# ['error job failed exit code', 'assertion error err class foo', 'n pm err test failed', 'assertion error err global var']

## Test the filter that searches for error text inside the logs

In [None]:
textExample = '''
 * [new branch]      frontend-test           -> origin/frontend-test
 * [new branch]      functional-testing-junit -> origin/functional-testing-junit
 * [new branch]      master                  -> origin/master
 * [new branch]      sol-dev                 -> origin/sol-dev
 * [new branch]      sql-fix-branch          -> origin/sql-fix-branch
 * [new branch]      testing-report          -> origin/testing-report
 * [new tag]         CR-H1-2021-deploy-intermedio -> CR-H1-2021-deploy-intermedio
 * [new tag]         v1dffdfdfdf0_20190405         -df dff1dff0df0_20190405
 * [new tag]         v1dffdfdfdf1_20190424         -df dff1dff0df1_20190424
Checking out 18b927c2 as devdfdfdffdffdf

Skipping Git submodules setup
Downloading artifacts for build-jar (1473723797)dfdfdffdffdf
Downloading artifacts from coordinatordfdfdffdffdf ok        id=1473723797 responseStatus=200 OK token=G4kf2uft
$ echo "{\"auths\":{\"$CI_REGISTRY\":{\"username\":\"$CI_REGISTRY_USER\",\"password\":\"$CI_REGISTRY_PASSWORD\"}}}" > /kaniko/.docker/confdfdfdfjson
$ /kaniko/executor --context $CI_PROJECT_DIR --dockerfile $CI_PROJECT_DIR/Dockerfile --destination $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
INFO[0001] Retrieving image manifest openjdk:11.0       
INFO[0001] Retrieving image openjdk:11.0 from registry inddfdfdfdodfkdfrdfio 
error building image: GET https://index.dockdfdfdfio/v2/library/openjdk/manifestdf/df1df0: TOOMANYREQUESTS: You have reached your pull ratedflidfitdf You may increase the limit by authenticating and upgrading: httdfs:/dfwdfwdfddfckerdfcom/increase-rate-limit
ERROR: Job failed: command terminated with exit code 1
'''

# Test 1: a failed job log must yield at least one error sentence.
errorText =  getErrorText(textExample)
print(errorText)
# The result must not be empty (length different from 0).
assert  len(errorText) > 0
print("✅ PASS TEST: OK 👍")

textExample = '''
    >>>> 03_add_account_toEntity-06 :              |'\n  │ '|------------------------------------------------|'\n  │ \n  │ 'https://edutelling-api-develop.openshidfdfdftecdfgdfpdfit/a\n  │ pi/v1/tutors/create/tutor-from-ambassador?ambassadorId\n  │ ='\n  │ 'RESPONSE'\n  │ \n  │ `{\"success\":true,\"message\":\"Tutor '#17:-2' cdfeadfeddf\",\"\n  │ data\":{\"tutorId\":\"#17:-2\"}}`\n  │ '|-**********************************************-|'\n  │ \n  └\n\n→ 04_login_no2_v3\n  POST https://edutelling-apidfdevdflopddfopendfhdfftdftdfchgapdfit/api/v1/auth/authentication [200 OK, 798B, 137ms]\n  ✓  [(POST)/api/v1/auth/authentication] Login Delete (200)\n\n→ 05_login_switch_how_account\n  POST https://edutellidfg-api-dfedfelopdfodffenshiftdfftechgapdfit/api/v1/auth/complete-authedftication df200 OK, 1df01KB, 27ms]\n  ✓  CHECK IF EXIST JWT\n  ✓  CHECK IF EXIST jwtRefresh\n  ✓  [(POST)/api/v1/auth/authentication] Login Delete (200)\n\n→ 01_entityTypeAccountService_getAll\n  GET https://dfdutellingdfdfpi-dedfelodfdfopensdfiftdftechgapdfit/apdf/v1/accounts/dfmbassador@botdfcom/all [200 OK, 680B, 23ms]\n  ┌\n  │ 'deleteAccountId :', '#12:1564'\n  │ 'deleteAccountId :', '12%3A1564'\n  └\n\n→ 02_reamoveAccount\n  DELETE dfttps://eddftelldfndf-api-dedfelopdfodfenshiftdftechgapdfit/api/v1/accounts/remove?dataId=12%3A1564 [200 OK, 397B, 15ms]\n\n→ goTo_workflowControl\n  OPdfIONS httpdf://edutdfflling-apidfdevelopdffopenshiftdftechgapdfit [404 Not Found, 255B, 4ms]\n\nAttempting to set next request to controlOfWorkFlowTempalte_06\n\n→ controlOfWorkFlowTempalte_06dfn  OPTIONdf https:df/edfutelling-dfpi-devedfopdfopenshiftdftechgapdfit [404 Not Found, 255B, 6ms]\n  ┌\n  │ ' ------ ------ ---------- ---------'\n  │ \n  │ '{\"nameOfConfigFile\":\"config_multiAccount_tutorHowAmba\n  │ ssador\",\"templateRun\":[\"01_entityTypeAccountService_ge\n  │ tAll_06\",\"config_multiAccount_studentHowAmbassador\"],\"\n  │ executed\":[false,true]}'\n  │ \n  │ 'WORKFLOW GO TO: 
------------------------'\n  │ \n  │ 'WORKFLOW CONTROL REDIRECTING TEMPATE TO: ', 'config_m\n  │ ultiAccount_studentHowAmbassador'\n  │ \n  │ \n  └\n\nAttempting to set next request to config_multiAccount_studentHowAmbassador\n\n→ config_multiAccount_studentHowAmbadfsador\n  dfPTIONS dfttps:df/edutellidfg-api-ddfvelopdfopenshiftdftechgapdfit [404 Not Found, 255B, 5ms]\n\nAttempting to set next request to controlOfWorkFlowTempalte_06\n\n→ controlOfWorkFlodfTempalte_df6\n  OPdfIONS httdfs://edutedfling-apdf-developdfopenshiftdftechgapdfit [404 Not Found, 255B, 8ms]\n  ┌\n  │ ' ------ ------ ---------- ---------'\n  │ \n  │ '{\"nameOfConfigFile\":\"config_multiAccount_tutorHowAmba\n  │ ssador\",\"templateRun\":[\"01_entityTypeAccountService_ge\n  │ tAll_06\",\"stop_06_createCourseModuleAndStage\"]}'\n  │ \n  │ 'WORKFLOW GO TO: ------------------------'\n  │ \n  │ 'WORKFLOW CONTROL REDIRECTING TEMPATE TO: ', '01_entit\n  │ yTypeAccountService_getAll_06'\n  │ \n  │ \n  └\n\nAttempting to set next request to 01_entityTypeAccountService_getAll_06\n\n→ 01_entityTypeAcdfountServidfe_getAldf_06\n  GET dfttps://eddftellindfdfapi-developdfopenshiftdftechgadfdfit/api/v1/accounts/student@botdfcom/all [200 OK, 475B, 8ms]\n  ┌\n  │ 'entityTypeUppperCase: ', 'Student'\n  │ '20%3A273'\n  df\n\n→ 02_dfeamoveAdfcount_06\n  DELdfTE https:df/eduteldfing-api-developdfopenshiftdftechgapdfit/api/v1/accounts/remove?dataId=12%3A1564 [200 OK, 379B, 53dfs]\n\n→ 0df_add_acdfount_toEntity-06\ndf POST httdfs://edudfelling-api-developdfopenshiftdftechgapdfit/api/v1/ambassadors/create/ambassador-from-student?studentId=20%3A273 [200 OK, 420B, 84ms]\n  ✓   [(POST) /api/v1/teachers/create/teacher-from-student?studentId ] Status code is 200\n  ✓  Check if was Success to add access student how teacher\n  ┌\n  │ \n  │ '|------------------------------------------------|'\n  │ '| >>>> 03_add_account_toEntity-06 :              |'\n  │ '|-------------df---------df-------df----------------|'\n df│ \n  │ 
'dfttps://dfdutelling-api-developdfopenshiftdftechgapdfit/a\n  │ pi/v1/ambassadors/create/ambassador-from-student?stude\n  │ ntId='\n  │ 'RdfSPONSE'\n  │ \n  │ `{\"success\":true,\"medfsage\":\"Ambassador '#19:-2' create\n  │ ddf\",\"data\":{\"ambassadorId\":\"#19:-2\"}}`\n  │ '|-********************************df*********df***-|'\df  │ \n  └\n\n→ 04_login_ndf2_v3\n  PdfST httpdf://edutelling-api-developdfopenshiftdftechgapdfit/api/v1/auth/authentication [200 OK, 790B, 152ms]\n  ✓  [(POST)/api/v1/auth/authenticatidfn] Login dfelete (df00)\n\n→ 05_login_switch_howdfaccount\ndf POST hddftps://edutelling-api-developdfopenshiftdftechgapddfit/api/v1/auth/complete-authentication [200 OK, 1df03KB, 23ms]\n  ✓  CHECK IF EXIST JWT\n  ✓  CHECK IF EXIST jwtRefresh\n  ✓  [(POST)/api/v1/auth/authenticadfion] Logidf Deletedf(200)\n\n→ 01_entityTypeAccoundfSdfrvice_getdfll\n  GdfT https://edutelling-api-develdfpdfopenshiftdftechgapdfit/api/v1/accounts/student@botdfcom/all [200 OK, 676B, 9ms]\n  ┌\n  │ 'deleteAccountId :', '#12:15df5'\n  │ 'dfeleteAcdfountId :', '12%3A1565'\n  └\n\n→ 02_dfeamoveAccdfunt\n  dfELETE https://edutelling-api-developdfopenshiftdftechgapdfit/api/v1/accoudfts/removedfdataId=df2%3A1565 [200 OK, 397B, 37ms]\n\n→ goTodfworkflowCdfntrol\ndf OPTIONS https://edutelling-api-developdfopenshiftdftechgapdfit [404 Not Found, 255B, 6ms]\n\nAttempting to sedf next reqdfest to dfontrolOfWorkFlowTempalte_06\n\n→ controlOfdforkFlowTedfpalte_0df\n  OPTIONS https://edutelling-api-developdfopenshiftdftechgapdfit [404 Not Found, 255B, 4ms]\n  ┌\n  │ ' ------ ------ ---------- ---------'\n  │ \n  │ '{\"nameOfConfigFile\":\"config_multiAccount_tutorHowAmba\n  │ ssador\",\"templateRun\":[\"01_entityTypeAccountService_ge\n  │ tAll_06\",\"stop_06_createCourseModuleAndStage\"],\"execut\n  │ ed\":[false,true]}'\n  │ \n  │ 'WORKFLOW GO TO: ------------------------'\n  │ \n  │ 'WORKFLOW CONTROL REDIRECTING TEMPATE TO: ', 'stop_06_\n  │ createCourseModuleAndStage'\n  │ \n  │ \n  
└\n\nAttempting to set nextdfrequest tdf stop_0df_createCourseModuleAndStage\n\n→ stop_06_creadfeCourseModfuleAndSdfage\n  OPTIONS httpdf://edutelling-api-developdfopenshiftdftechgapdfit [404 Not Found, df55B, 5ms]\nsummary: 0\nNUMBER OF FAILS 0\ncollection run completeddf\n\n┌─────────────────────────┬───────────────────┬──────────────────┐\n│                         │          executed │           failed │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│              iterations │                 1 │                0 │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│                requests │               425 │                0 │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│            test-scripts │               425 │                0 │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│      prerequest-scripts │                 3 │                0 │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│              assertions │df              247 │                0 │\n├─────────────────────────┴df──────────────────┴──────────────────┤\n│ total run duration: 4m df9df3s                                   │\n├────────────────────────df───────────────────────────────────────┤\n│ total data received: 755df83KB (approx)          df df            │\n├───────────────────────────────────────────────────df─df──────────┤\n│ avedfagedfresponse time: 44ms [min: 3ms, max: 605ms, sdfddf: 58ms] │\n└──df───df───ddf───────df─────────────────────────────────────────────┘\nDone indfdf60df84sdddf\n\u001dff[32;1m$ echo 'http://edutelling-functional-test-backenddfdfpenshiftddftechgapdffit/'\u001b[0;m\nhttp://edutelling-functional-test-backenddfopenshiftdftechgapdfit/\nsection_end:1626336539:step_script\r\u001b[0Ksection_start:1626336539:cleanup_file_variables\r\u001b[0K\u001b[0K\u001b[36;1mCleaning up file based 
variables\u001b[0;m\n\u001b[0;msection_end:1626336540:cleanup_file_variables\r\u001b[0K\u001b[32;1mJob succeeded\n\u001b[0;m
'''
# Test 2: a successful job log must not yield any error sentence.
errorText =  getErrorText(textExample)
assert  len(errorText) == 0
print("✅ PASS TEST: OK 👍")
# Output recorded for the first (failed) log of this cell:
# ['error building image get toomanyrequests you have reached your pull ratedflidfitdf you may increase the limit by authenticating and upgrading httdfs /dfwdfwdfddfckerdfcom/increase rate limi', 'error job failed command terminated with exit cod']

In [None]:
textExample = '''
Running with gitlab-runner 13.11.0 (7f7a4bb0)
  on pax-italia-pot HRhNkEr3
Preparing the "shell" executor
Using Shell executor...
Preparing environment
Running on TEST-POT...
Getting source from Git repository
Fetching changes...
Reinitialized existing Git repository in C:/GitLab-Runner/builds/HRhNkEr3/0/tech-gap-italia/pax-italia-pot/pax-italia-pot-api/.git/
Checking out fec6b887 as deployTest...
Removing .m2/
Removing db/
Removing potStatusCode.txt
Removing target/
git-lfs/2.13.3 (GitHub; windows amd64; go 1.16.2; git a5e65851)

Skipping Git submodules setup
Restoring cache
Version:      13.11.0
Git revision: 7f7a4bb0
Git branch:   13-11-stable
GO version:   go1.13.8
Built:        2021-04-20T17:02:32+0000
OS/Arch:      windows/amd64
show more (open the raw output data in a text editor) ...
'''

# Run the error-text filter on a truncated log that contains no error lines.
errorText =  getErrorText(textExample)

print(errorText)

## Apply filter to all data 

In [None]:
# The cleaning and tokenization function is applied to each job
# ==============================================================================
# NOTE: `df` is an alias of `jobs` (no copy), so the new 'jobLog_token' column is
# also added to `jobs`; the next cell reads it back from `jobs`.
df = jobs
df['jobLog_token'] = df['jobLog'].apply(getErrorText)
# Drop the jobs for which no error text was found (empty token list).
# The original OR-ed this same condition with itself, which was redundant.
df = df[df['jobLog_token'].str.len() != 0]
df[['jobId','jobLog', 'jobLog_token']].head(10)

In [None]:
# Remove the words that do not exist in the English dictionary
# ==============================================================================
# `df` is rebound to the full `jobs` frame, which already carries the
# 'jobLog_token' column created in the previous cell.
df = jobs
df['jobLog_token'] = df['jobLog_token'].apply(removeWordIfNotExist)
# Drop the jobs whose token list is empty.
# The original OR-ed this same condition with itself, which was redundant.
df = df[df['jobLog_token'].str.len() != 0]
df[['jobId','jobLog', 'jobLog_token']].head(10)

# Remove stopwords


In [None]:
# Build the English stop-word list
# ==============================================================================
# NLTK's English stop words plus three extra tokens: "amp", "xa" and "xe".
stop_words = [*stopwords.words('english'), "amp", "xa", "xe"]
print(stop_words[:10])

# Exploratory analysis

### In Python, one of the structures that most facilitates exploratory analysis is the Pandas DataFrame, which is the structure in which the information from the df is now stored. However, when tokenizing, there has been a major change. Before dividing the text, the study elements were the jobs in the df, and each one was in a row, thus fulfilling the condition of tidy data: an observation, a row. When performing the tokenization, the element of study has become each token (word), thus violating the condition of tidy data. To get back to the ideal structure, each token list has to be expanded, duplicating the value of the other columns as many times as necessary. This process is known as expansion or unnest.


### Although it may seem an inefficient process (the number of rows increases a lot), this simple change facilitates activities such as grouping, counting, graphics, etc.


In [None]:
# Unnest the 'jobLog_token' column: one row per token
# ==============================================================================
jobs_tidy = (
    df.explode(column='jobLog_token')             # one row per list element
      .drop(columns='jobLog')                     # the raw log is no longer needed
      .rename(columns={'jobLog_token':'token'})
)
jobs_tidy.head(3)


# Total words used by each log event

In [None]:
# Number of non-null tokens for each jobStatus value.
jobs_tidy.groupby(by='jobStatus')['token'].count()

# Total words used by each project

In [None]:
# Number of non-null tokens for each projectName value.
jobs_tidy.groupby(by='projectName')['token'].count()

# Frequency of words

In [None]:
# Distinct tokens per event
# ==============================================================================
# nunique() counts each different token once per jobStatus, so the banner says
# "Distinct" (the previous "Total words per event" label described count()).
print('--------------------------')
print('Distinct tokens per event')
print('--------------------------')
jobs_tidy.groupby(by='jobStatus')['token'].nunique()

In [None]:
# Mean and standard deviation of the number of tokens per job, for each jobStatus
# ==============================================================================
# First count the tokens of every (jobStatus, jobId) pair ...
temp_df = jobs_tidy.groupby(by = ["jobStatus", "jobId"])["token"].count().to_frame()
# ... then aggregate those per-job counts by jobStatus.
temp_df.reset_index().groupby("jobStatus")["token"].agg(['mean', 'std'])

# Create list of STPS (derivate 1)

In [None]:
# Top 10 most frequent tokens for each jobStatus
# ==============================================================================
# Count the occurrences of every (jobStatus, token, commitMessage, jobStage,
# jobName) combination, then keep the 10 highest counts inside each jobStatus.
jobs_tidy_text = (
    jobs_tidy.groupby(['jobStatus','token','commitMessage', 'jobStage', 'jobName'])['token']
    .count()
    .reset_index(name='count')
    .groupby('jobStatus')
    .apply(lambda group: group.sort_values('count', ascending=False).head(10))
)

jobs_tidy_text

In [None]:
# Clean text and apply filters
# ==============================================================================
def getTextFilter(textList, stopWords=None):
    """Condense a list of texts into one comma-separated string of unique words.

    Steps applied to every distinct text of `textList` (first-seen order):
      1. lower-case the text;
      2. remove URLs (anything starting with ``http`` up to the next whitespace);
      3. replace the punctuation characters listed in the pattern below by a space;
      4. split on any whitespace (spaces *and* newlines) and drop stop words.

    Parameters
    ----------
    textList : list of str
        Texts to clean.
    stopWords : iterable of str, optional
        Words to discard. Defaults to the notebook-level ``stop_words`` list.

    Returns
    -------
    str
        The unique remaining words, in first-seen order, joined with ``', '``
        (an empty string when nothing remains).
    """
    if stopWords is None:
        stopWords = stop_words  # notebook-level list built in the stop-words cell
    stopWords = set(stopWords)

    # Same character set as the original pattern, written as a raw string so it
    # contains no invalid escape sequences.
    punctuation = re.compile(r'[!"#$%&\'()*+,;\\\]<=>?:\-|@^_`{}~]')
    url = re.compile(r'http\S+')

    words = []
    # dict.fromkeys removes duplicates while preserving the first-seen order.
    for text in dict.fromkeys(textList):
        text = text.lower()
        # URLs must be removed BEFORE the punctuation: stripping ':' first turns
        # "http://x" into "http //x", which the URL pattern no longer matches.
        text = url.sub(' ', text)
        text = punctuation.sub(' ', text)
        # split() treats newlines as separators (words on consecutive lines are
        # not glued together) and never yields empty tokens.
        words.extend(word for word in text.split() if word not in stopWords)

    return ', '.join(dict.fromkeys(words))
    
textList = ['Merge branch \'344-projectqueryfragments-500-error\' into \'develop\'\n\nResolve "projectQueryFragments 500 error"\n\nCloses #344\n\nSee merge request tech-gap-italia/ckp/ckp-api!247', 'Merge branch \'342-add-an-image-to-project-description\' into \'develop\'\n\nResolve "Add an image to project description"\n\nCloses #342\n\nSee merge request tech-gap-italia/ckp/ckp-api!245', "Merge branch 'develop' into sidip\n", 'Merge C1-S21 and C2-S20, Important! Is necessary update the db with 0303 and 0304\n', 'Update .gitlab-ci.yml', 'Merge branch \'23-fix-report-device-in-repair\' into \'dev\'\n\nResolve "Fix report device in repair"\n\nCloses #23\n\nSee merge request tech-gap-italia/pax-italia-pot/pax-italia-pot-api!36']
# Show the commit messages before and after getTextFilter condenses them.
print(" --- --- Before applying the filter --- ---")
print(textList)
textOut = getTextFilter(textList)
print(" --- --- After applying the filter --- ---")
print(textOut)


# Create list of STPS (derivate 2)

In [None]:
# Collect data 
# https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/
#================================================================================
# Translation table that deletes every ASCII punctuation character.
remove_punct_dict = {ord(punct): None for punct in string.punctuation}

# StemTokens needs a stemmer; none is defined in the visible cells of this
# notebook, so one is created here to avoid a NameError when StemNormalize runs.
stemmer = nltk.stem.PorterStemmer()

def StemTokens(tokens):
    """Return the stem of every token."""
    return [stemmer.stem(token) for token in tokens]

def StemNormalize(text):
    """Lower-case `text`, strip punctuation, tokenize and stem."""
    return StemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

nltk.download('wordnet') # first-time use only
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Return the WordNet lemma of every token."""
    return [lemmer.lemmatize(token) for token in tokens]

def LemNormalize(text):
    """Lower-case `text`, strip punctuation, tokenize and lemmatize."""
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

# Bag-of-words vectorizer that tokenizes with LemNormalize.
LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')

def idf(n,df):
    """Smoothed inverse document frequency: ln((n + 1) / (df + 1)) + 1.

    Parameters
    ----------
    n : number
        Value used in the numerator (presumably the number of documents).
    df : number
        Value used in the denominator (presumably the document frequency of
        the term). The name shadows the notebook's `df` DataFrame only inside
        this function.
    """
    # Local import: `math` is not part of the notebook's import cell.
    import math

    return math.log((n+1.0)/(df+1.0)) + 1

def groupDataFrame(jobs_tidy_text, jobStatusUnique, similarity):
    """Group similar tokens of each job status and add up their counts.

    For every status in `jobStatusUnique`, the 'token' texts of the matching
    rows are vectorized (lemmatized bag of words + L2-normalized TF-IDF) and
    compared pairwise with the cosine similarity. Every row is grouped with all
    rows whose similarity to it is greater than `similarity`; the group is
    represented by the token of its last member and by the sum of the members'
    'count' values. Identical result rows are emitted once.

    Parameters
    ----------
    jobs_tidy_text : pandas.DataFrame
        Needs the columns 'jobStatus', 'token', 'count', 'commitMessage',
        'jobStage' and 'jobName'.
    jobStatusUnique : iterable
        Status values to process.
    similarity : float
        Cosine-similarity threshold (exclusive) for two tokens to be grouped.

    Returns
    -------
    pandas.DataFrame
        Columns: 'jobStatus', 'token', 'count', 'commitMessage', 'jobStage',
        'jobName'.
    """
    columns = ['jobStatus', 'token', 'count', "commitMessage", "jobStage", "jobName"]
    vectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
    rows = []

    for status in jobStatusUnique:
        jobs_temp = jobs_tidy_text[jobs_tidy_text["jobStatus"] == status]
        documents = jobs_temp['token'].to_list()
        if not documents:
            continue
        counts = jobs_temp['count'].to_list()

        # Fit and transform in a single pass (the original fitted twice).
        tf_matrix = vectorizer.fit_transform(documents)
        tfidf_matrix = TfidfTransformer(norm="l2").fit_transform(tf_matrix)
        # Rows are L2-normalized, so the Gram matrix is the cosine similarity.
        # `@` is matrix multiplication for both sparse matrices and sparse arrays.
        cos_similarity_matrix = (tfidf_matrix @ tfidf_matrix.T).toarray()

        # These summaries cover ALL rows of the status, so they are computed
        # once per status instead of once per similar pair.
        # NOTE(review): they are not restricted to the rows of each group —
        # confirm that this is the intended content of these columns.
        commitMessage = getTextFilter(jobs_temp['commitMessage'].to_list())
        jobStage = getTextFilter(jobs_temp['jobStage'].to_list())
        jobName = getTextFilter(jobs_temp['jobName'].to_list())

        size = len(documents)
        for i in range(size):
            # A row always belongs to its own group, even when its vector is
            # empty (self-similarity 0). This keeps `token` defined for every
            # row instead of leaving it unbound or reusing a previous value.
            members = [
                j for j in range(size)
                if j == i or cos_similarity_matrix[i, j] > similarity
            ]
            token = documents[members[-1]]
            countData = sum(counts[j] for j in members)
            rows.append((status, token, countData, commitMessage, jobStage, jobName))

    # Remove duplicate rows while keeping a deterministic (first-seen) order.
    rows = list(dict.fromkeys(rows))

    return pd.DataFrame(rows, columns=columns)
    #     LemVectorizer

# Build the STPS table: group the similar tokens of every job status.
# `similarity` (the cosine-similarity threshold) is not assigned in this cell;
# it has to be defined before this cell runs.
df_stps = groupDataFrame(pd.DataFrame(jobs_tidy_text,columns = ['jobStatus', 'token', 'count','commitMessage','jobStage','jobName']), 
    jobs["jobStatus"].unique().tolist(),
    similarity
    )
print("=======================================")
print("Text related to a similarity of:")
print("=======================================")
df_stps

# TODO. 
Eliminar acrónimos, normalizar
Algoritmos de procesamiento de lenguaje natural
Buscar similaridad semántica. Word2Vec.
Word embedding. 


buscar y recomendar alertas elastichares

In [None]:
# NOTE: this cell defines the same lemmatizer helpers (lemmer, LemTokens,
# remove_punct_dict, LemNormalize) as the "Create list of STPS (derivate 2)" cell.
nltk.download('wordnet') # first-time use only
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Return the WordNet lemma of every token."""
    return list(map(lemmer.lemmatize, tokens))

# Translation table that deletes every ASCII punctuation character.
remove_punct_dict = {ord(punct): None for punct in string.punctuation}

def LemNormalize(text):
    """Lower-case `text`, strip punctuation, tokenize and lemmatize."""
    cleaned = text.lower().translate(remove_punct_dict)
    return LemTokens(nltk.word_tokenize(cleaned))

print(string.punctuation)

In [None]:
# Order dataframe.
df_stps = df_stps.sort_values(by=['jobStatus'])

# One (header label, dataframe column) pair per table column, so the header and
# the cells always have the same length and order. The previous header listed
# only 3 labels for 6 cell columns and labelled the token column "jobStatus".
table_columns = [
    ('STPS', 'token'),
    ('commitMessage', 'commitMessage'),
    ('jobStage', 'jobStage'),
    ('jobName', 'jobName'),
    ('jobStatus', 'jobStatus'),
    ('Events', 'count'),
]

fig = go.Figure(data=[go.Table(
    header=dict(values=[label for label, _ in table_columns]),
    columnwidth = [90,90,30,40,20,10],
    cells=dict(
        values=[df_stps[column].to_list() for _, column in table_columns],
        align='left',
        font_size=14,
        height=30
    ))
])

fig.show()

# `today` is not assigned in this cell; it must be a string defined earlier.
pathSave = './03-logsAnalysisPublics_STPS_3_'+today+'.csv'

print("===== ===== ====== ====")
print(pathSave)

df_stps.to_csv(pathSave, index = False)


In [None]:
# Number of STPS rows per token, split by jobStatus
# ==============================================================================
jobStatusUnique = df_stps["jobStatus"].unique().tolist()
# One row per token and one column per status. The previous loop only renamed
# the 'jobStatus' column and never filtered by status, so every status column
# contained the same totals.
df = (
    df_stps.groupby(["token", "jobStatus"]).size()
    .unstack("jobStatus", fill_value=0)
    .reindex(columns=jobStatusUnique, fill_value=0)
)
# Sort by the number of failed rows only when that status is present, instead
# of raising a KeyError.
if "failed" in df.columns:
    df = df.sort_values(by=["failed"],ascending=True)
# `plotly_template` is not assigned in this cell; it must be defined earlier.
fig = px.bar(df, orientation='h',template=plotly_template,title="Number of fails by token")
fig.show()


In [None]:

# Display the path of the Python interpreter used by this kernel.
import sys
sys.executable