# (3) Logs Analysis using Mining. 
# Data set type: Industial-Anoki

# 💨🔥💨 Smoke Analysis

#### ✅ python, ✅ Gitlab, ✅ Mongodb

## Qs
### Q1: What is the top of most common problems in pipelines?

## Index

- [Import python libraries](#Import-python-libraries)
- [Page reference](#Page-reference)
- [Create event list](#Create-event-list)
- [Group similar text](#Group-similar-text)

## Nomenclature
    - (STPS) Smoke test possible solution: It is the set of tentative errors that can be avoided by using smoke tests

## Import python libraries

In [349]:
import os
import pymongo
from pymongo import MongoClient

# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import string
import re
# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import plotly.io as pio
pio.renderers.default='notebook'
# ==============================================================================
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
nltk.download('punkt') # first-time use only
nltk.download('wordnet') # first-time use only
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import math
# Configuración warnings
# ==============================================================================
import warnings

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Config Variables

In [350]:
plotly_template="plotly_dark"
# plotly_template="plotly"
# Smoke Test Parameters
# ==============================================================================
JobsNameBlackList = ['test'] # Remove all jobs of analysis with this name
logsWhiteList = ["error","fail", "warning"] # Check lines of logs with this words
JobsStatusWhiteList = ["failed"] # Check jobs with this status 
# Group similar text
similarity = 0.6

# Page reference

- [TextBlob: Librarian documentation to do text analysis](https://textblob.readthedodfdfdfio/en/dev/quicksdfadftdfhtml#get-word-and-noun-phrase-frequencies)
- pipenv: https://pipenv.pydfdfdfio/en/latest/
- pythonplot: https://pythonplot.com/

# Get data from MongoDB
### Read data from Mongodb database

In [351]:
# Connect with DB
MONGODB_URL = os.environ.get('MONGODB_URL')
NODE_ENV = os.environ.get('NODE_ENV') or "dev"
DB_NAME = os.environ.get('APP_NAME') + "-"+ NODE_ENV

client = MongoClient()
client = MongoClient(MONGODB_URL)
db = client[DB_NAME]

jobs = db.gitlablogs.find({}).limit(1000) # Read all data
# jobs = db.gitlablogs.find({}) # Read all data

jobs = pd.DataFrame(list(jobs)) # Convert to DataFrame
print("List of data available iside of db structure")
jobs.dtypes

List of data available iside of db structure


_id                                 object
jobId                                int64
projectId                            int64
__v                                  int64
allow_failure                         bool
commitId                            object
commitMessage                       object
commitTitle                         object
committedEmail                      object
created_at                  datetime64[ns]
duration                           float64
jobName                             object
jobRef                              object
jobStage                            object
jobStatus                           object
pipelineId                           int64
pipelineRef                         object
pipelineStatus                      object
pipelineUrl                         object
pipelineWebUrl                      object
projectDescriptions                 object
projectName                         object
projectNameWithNamespace            object
queued_dura

# Delete pipelines data of the analysis with the name inside of the BlackList
# Check whit @Leo

## This is to avoid including logs from other types of tests in the analysis. A logs of a functional test (to mention an example) could throw large volumes of events from other types of tests

In [352]:
jobsBlackList = ['test']

# Analysis of data volumes.
## In this section you can obtain general information related to the volume of data.

## Percentage of type jobs

In [353]:
# jobs
print("------ DATA REPORT ------")
projectsNumber = len(jobs["projectName"].unique())
print("Number of Projects: "+ str(projectsNumber))
numberOfJobs = len(jobs.index)
print("Number of pipelines Jobs (Steps): "+ str(numberOfJobs))
numberOfSuccess= jobs.loc[jobs["jobStatus"] == "success"]["jobStatus"].count()
numberOfFailed= jobs.loc[jobs["jobStatus"] == "failed"]["jobStatus"].count()
numberOfCancel= jobs.loc[jobs["jobStatus"] == "canceled"]["jobStatus"].count()

successPercentage = (1-((numberOfSuccess+numberOfFailed+numberOfCancel)-numberOfSuccess)/(numberOfSuccess+numberOfFailed+numberOfCancel))*100
failedPercentage = (1-((numberOfSuccess+numberOfFailed+numberOfCancel)-numberOfFailed)/(numberOfSuccess+numberOfFailed+numberOfCancel))*100
canceledPercentage = (1-((numberOfSuccess+numberOfFailed+numberOfCancel)-numberOfCancel)/(numberOfSuccess+numberOfFailed+numberOfCancel))*100

print("Number of success Jobs (Steps): "+ str(numberOfSuccess) + " or "+str(successPercentage) + " %")
print("Number of failed Jobs (Steps): "+ str(numberOfFailed)+ " or "+str(failedPercentage) + " %")
print("Number of canceled Jobs (Steps): "+ str(numberOfCancel)+ " or "+str(canceledPercentage) + " %")

------ DATA REPORT ------
Number of Projects: 8
Number of pipelines Jobs (Steps): 1000
Number of success Jobs (Steps): 612 or 63.816475495307614 %
Number of failed Jobs (Steps): 51 or 5.318039624608972 %
Number of canceled Jobs (Steps): 296 or 30.865484880083415 %


In [354]:
fig = make_subplots(rows=1, cols=2)
fig = px.pie(jobs, names='jobStatus', title='Pipelines Jobs results',color="jobStatus",template=plotly_template)
fig.show()

# Number of fails by stage number. 

In [355]:
def createBarGraphByJobStatus(variable):

    jobStatusUnique = jobs["jobStatus"].unique().tolist()
    df_list = []
    for status in jobStatusUnique:
        jobs_total = jobs.rename(columns={'jobStatus': status})
        total = jobs_total.groupby(by=variable)[status].count()
        df_list.append(total)

    df = pd.concat(df_list,axis=1)
    df = df.sort_values(by=[jobStatusUnique[0]],ascending=True)
    fig = px.bar(df, orientation='h',template=plotly_template,title="Number of fails by "+ variable)
    fig.show()
    
print(" Number of jobs projectName types")
createBarGraphByJobStatus("projectName")    
print(" Number of jobs stage types")
createBarGraphByJobStatus("jobStage")
createBarGraphByJobStatus("pipelineRef")


 Number of jobs projectName types


 Number of jobs stage types


## Test. Measuring Similarity Between Texts in Python
https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/

# Filter logs data


## Get fragment of text with error 

In [356]:
# Get fragment of text with error
# ==============================================================================
def getErrorText(texto):
    #! Get only last range. 
    nuevo_texto = texto
    #! Convert all text to lowercase.
    nuevo_texto = nuevo_texto.lower()
    #! Web page removal (words beginning with "http")
    nuevo_texto = re.sub('http\S+', ' ', nuevo_texto)
    nuevo_texto = nuevo_texto.split(sep = '\n');

    
    whiteList = ["error"]; newTextList = []; nuevoTexto = ""
    for text in nuevo_texto:
        for listI in whiteList:
            if listI in text:
                #! Remove special characters
                regex = '[\\!\\"\\#\\>\\<\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\;\\\\\]\\<\\=\\,\\>\\?\\:\\-\\|\\@\\@\\\\^_\\`\\{\\|\\}\\~]'
                text = re.sub(regex , ' ', text)

                text = re.sub('http\S+', ' ', text)

                #! Remove numbers
                text = re.sub('\d+', ' ', text)
                #! remove date
                text = re.sub('\d{4}-\d{2}-\d{2}', ' ', text)
                text = re.sub(' +', ' ', text)
                text = re.sub('- - t : :','',text)
                #! Removing emojis
                emoji_pattern = re.compile("["
                      u"\U0001F600-\U0001F64F"  # emoticons
                      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                      u"\U0001F680-\U0001F6FF"  # transport & map symbols
                      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
                text = emoji_pattern.sub(r'', text)
                
                if not text in newTextList:
                    #! cut string. 
                    text = text.strip()
                    text.replace('[','').replace(']','')
                    text = text[0:text.find(".")]
                    nuevoTexto = nuevoTexto + "\n" + text

                newTextList.append(text)

                newTextList = [string for string in newTextList if string.strip()]# Remove multiple empty spaces from string List
                newTextList = [string for string in newTextList if string != " "] # Delete empty strings
                newTextList = [string for string in newTextList if string != ""] # Delete empty strings  
                newTextList = [string for string in newTextList if (len(string) > 2)] # Eliminación de tokens con una longitud < 2 o que se encuentren en la lista de palabras ignoradas
    
                unics = set(); newTextList = [string for string in newTextList if string not in unics and (unics.add(string) or True)] # Delete duplicate data

    return newTextList

## Test Filter for search the text error inside of the logs

In [357]:
textExample = '''
 * [new branch]      frontend-test           -> origin/frontend-test
 * [new branch]      functional-testing-junit -> origin/functional-testing-junit
 * [new branch]      master                  -> origin/master
 * [new branch]      sol-dev                 -> origin/sol-dev
 * [new branch]      sql-fix-branch          -> origin/sql-fix-branch
 * [new branch]      testing-report          -> origin/testing-report
 * [new tag]         CR-H1-2021-deploy-intermedio -> CR-H1-2021-deploy-intermedio
 * [new tag]         v1dffdfdfdf0_20190405         -df dff1dff0df0_20190405
 * [new tag]         v1dffdfdfdf1_20190424         -df dff1dff0df1_20190424
Checking out 18b927c2 as devdfdfdffdffdf

Skipping Git submodules setup
Downloading artifacts for build-jar (1473723797)dfdfdffdffdf
Downloading artifacts from coordinatordfdfdffdffdf ok        id=1473723797 responseStatus=200 OK token=G4kf2uft
$ echo "{\"auths\":{\"$CI_REGISTRY\":{\"username\":\"$CI_REGISTRY_USER\",\"password\":\"$CI_REGISTRY_PASSWORD\"}}}" > /kaniko/.docker/confdfdfdfjson
$ /kaniko/executor --context $CI_PROJECT_DIR --dockerfile $CI_PROJECT_DIR/Dockerfile --destination $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
INFO[0001] Retrieving image manifest openjdk:11.0       
INFO[0001] Retrieving image openjdk:11.0 from registry inddfdfdfdodfkdfrdfio 
error building image: GET https://index.dockdfdfdfio/v2/library/openjdk/manifestdf/df1df0: TOOMANYREQUESTS: You have reached your pull ratedflidfitdf You may increase the limit by authenticating and upgrading: httdfs:/dfwdfwdfddfckerdfcom/increase-rate-limit
ERROR: Job failed: command terminated with exit code 1
'''

errorText =  getErrorText(textExample)

print(errorText)
# This need to do defferent to 0.  
assert  len(errorText) > 0
print("✅ PASS TEST: OK 👍")

textExample = '''
    >>>> 03_add_account_toEntity-06 :              |'\n  │ '|------------------------------------------------|'\n  │ \n  │ 'https://edutelling-api-develop.openshidfdfdftecdfgdfpdfit/a\n  │ pi/v1/tutors/create/tutor-from-ambassador?ambassadorId\n  │ ='\n  │ 'RESPONSE'\n  │ \n  │ `{\"success\":true,\"message\":\"Tutor '#17:-2' cdfeadfeddf\",\"\n  │ data\":{\"tutorId\":\"#17:-2\"}}`\n  │ '|-**********************************************-|'\n  │ \n  └\n\n→ 04_login_no2_v3\n  POST https://edutelling-apidfdevdflopddfopendfhdfftdftdfchgapdfit/api/v1/auth/authentication [200 OK, 798B, 137ms]\n  ✓  [(POST)/api/v1/auth/authentication] Login Delete (200)\n\n→ 05_login_switch_how_account\n  POST https://edutellidfg-api-dfedfelopdfodffenshiftdfftechgapdfit/api/v1/auth/complete-authedftication df200 OK, 1df01KB, 27ms]\n  ✓  CHECK IF EXIST JWT\n  ✓  CHECK IF EXIST jwtRefresh\n  ✓  [(POST)/api/v1/auth/authentication] Login Delete (200)\n\n→ 01_entityTypeAccountService_getAll\n  GET https://dfdutellingdfdfpi-dedfelodfdfopensdfiftdftechgapdfit/apdf/v1/accounts/dfmbassador@botdfcom/all [200 OK, 680B, 23ms]\n  ┌\n  │ 'deleteAccountId :', '#12:1564'\n  │ 'deleteAccountId :', '12%3A1564'\n  └\n\n→ 02_reamoveAccount\n  DELETE dfttps://eddftelldfndf-api-dedfelopdfodfenshiftdftechgapdfit/api/v1/accounts/remove?dataId=12%3A1564 [200 OK, 397B, 15ms]\n\n→ goTo_workflowControl\n  OPdfIONS httpdf://edutdfflling-apidfdevelopdffopenshiftdftechgapdfit [404 Not Found, 255B, 4ms]\n\nAttempting to set next request to controlOfWorkFlowTempalte_06\n\n→ controlOfWorkFlowTempalte_06dfn  OPTIONdf https:df/edfutelling-dfpi-devedfopdfopenshiftdftechgapdfit [404 Not Found, 255B, 6ms]\n  ┌\n  │ ' ------ ------ ---------- ---------'\n  │ \n  │ '{\"nameOfConfigFile\":\"config_multiAccount_tutorHowAmba\n  │ ssador\",\"templateRun\":[\"01_entityTypeAccountService_ge\n  │ tAll_06\",\"config_multiAccount_studentHowAmbassador\"],\"\n  │ executed\":[false,true]}'\n  │ \n  │ 'WORKFLOW GO TO: ------------------------'\n  │ \n  │ 'WORKFLOW CONTROL REDIRECTING TEMPATE TO: ', 'config_m\n  │ ultiAccount_studentHowAmbassador'\n  │ \n  │ \n  └\n\nAttempting to set next request to config_multiAccount_studentHowAmbassador\n\n→ config_multiAccount_studentHowAmbadfsador\n  dfPTIONS dfttps:df/edutellidfg-api-ddfvelopdfopenshiftdftechgapdfit [404 Not Found, 255B, 5ms]\n\nAttempting to set next request to controlOfWorkFlowTempalte_06\n\n→ controlOfWorkFlodfTempalte_df6\n  OPdfIONS httdfs://edutedfling-apdf-developdfopenshiftdftechgapdfit [404 Not Found, 255B, 8ms]\n  ┌\n  │ ' ------ ------ ---------- ---------'\n  │ \n  │ '{\"nameOfConfigFile\":\"config_multiAccount_tutorHowAmba\n  │ ssador\",\"templateRun\":[\"01_entityTypeAccountService_ge\n  │ tAll_06\",\"stop_06_createCourseModuleAndStage\"]}'\n  │ \n  │ 'WORKFLOW GO TO: ------------------------'\n  │ \n  │ 'WORKFLOW CONTROL REDIRECTING TEMPATE TO: ', '01_entit\n  │ yTypeAccountService_getAll_06'\n  │ \n  │ \n  └\n\nAttempting to set next request to 01_entityTypeAccountService_getAll_06\n\n→ 01_entityTypeAcdfountServidfe_getAldf_06\n  GET dfttps://eddftellindfdfapi-developdfopenshiftdftechgadfdfit/api/v1/accounts/student@botdfcom/all [200 OK, 475B, 8ms]\n  ┌\n  │ 'entityTypeUppperCase: ', 'Student'\n  │ '20%3A273'\n  df\n\n→ 02_dfeamoveAdfcount_06\n  DELdfTE https:df/eduteldfing-api-developdfopenshiftdftechgapdfit/api/v1/accounts/remove?dataId=12%3A1564 [200 OK, 379B, 53dfs]\n\n→ 0df_add_acdfount_toEntity-06\ndf POST httdfs://edudfelling-api-developdfopenshiftdftechgapdfit/api/v1/ambassadors/create/ambassador-from-student?studentId=20%3A273 [200 OK, 420B, 84ms]\n  ✓   [(POST) /api/v1/teachers/create/teacher-from-student?studentId ] Status code is 200\n  ✓  Check if was Success to add access student how teacher\n  ┌\n  │ \n  │ '|------------------------------------------------|'\n  │ '| >>>> 03_add_account_toEntity-06 :              |'\n  │ '|-------------df---------df-------df----------------|'\n df│ \n  │ 'dfttps://dfdutelling-api-developdfopenshiftdftechgapdfit/a\n  │ pi/v1/ambassadors/create/ambassador-from-student?stude\n  │ ntId='\n  │ 'RdfSPONSE'\n  │ \n  │ `{\"success\":true,\"medfsage\":\"Ambassador '#19:-2' create\n  │ ddf\",\"data\":{\"ambassadorId\":\"#19:-2\"}}`\n  │ '|-********************************df*********df***-|'\df  │ \n  └\n\n→ 04_login_ndf2_v3\n  PdfST httpdf://edutelling-api-developdfopenshiftdftechgapdfit/api/v1/auth/authentication [200 OK, 790B, 152ms]\n  ✓  [(POST)/api/v1/auth/authenticatidfn] Login dfelete (df00)\n\n→ 05_login_switch_howdfaccount\ndf POST hddftps://edutelling-api-developdfopenshiftdftechgapddfit/api/v1/auth/complete-authentication [200 OK, 1df03KB, 23ms]\n  ✓  CHECK IF EXIST JWT\n  ✓  CHECK IF EXIST jwtRefresh\n  ✓  [(POST)/api/v1/auth/authenticadfion] Logidf Deletedf(200)\n\n→ 01_entityTypeAccoundfSdfrvice_getdfll\n  GdfT https://edutelling-api-develdfpdfopenshiftdftechgapdfit/api/v1/accounts/student@botdfcom/all [200 OK, 676B, 9ms]\n  ┌\n  │ 'deleteAccountId :', '#12:15df5'\n  │ 'dfeleteAcdfountId :', '12%3A1565'\n  └\n\n→ 02_dfeamoveAccdfunt\n  dfELETE https://edutelling-api-developdfopenshiftdftechgapdfit/api/v1/accoudfts/removedfdataId=df2%3A1565 [200 OK, 397B, 37ms]\n\n→ goTodfworkflowCdfntrol\ndf OPTIONS https://edutelling-api-developdfopenshiftdftechgapdfit [404 Not Found, 255B, 6ms]\n\nAttempting to sedf next reqdfest to dfontrolOfWorkFlowTempalte_06\n\n→ controlOfdforkFlowTedfpalte_0df\n  OPTIONS https://edutelling-api-developdfopenshiftdftechgapdfit [404 Not Found, 255B, 4ms]\n  ┌\n  │ ' ------ ------ ---------- ---------'\n  │ \n  │ '{\"nameOfConfigFile\":\"config_multiAccount_tutorHowAmba\n  │ ssador\",\"templateRun\":[\"01_entityTypeAccountService_ge\n  │ tAll_06\",\"stop_06_createCourseModuleAndStage\"],\"execut\n  │ ed\":[false,true]}'\n  │ \n  │ 'WORKFLOW GO TO: ------------------------'\n  │ \n  │ 'WORKFLOW CONTROL REDIRECTING TEMPATE TO: ', 'stop_06_\n  │ createCourseModuleAndStage'\n  │ \n  │ \n  └\n\nAttempting to set nextdfrequest tdf stop_0df_createCourseModuleAndStage\n\n→ stop_06_creadfeCourseModfuleAndSdfage\n  OPTIONS httpdf://edutelling-api-developdfopenshiftdftechgapdfit [404 Not Found, df55B, 5ms]\nsummary: 0\nNUMBER OF FAILS 0\ncollection run completeddf\n\n┌─────────────────────────┬───────────────────┬──────────────────┐\n│                         │          executed │           failed │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│              iterations │                 1 │                0 │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│                requests │               425 │                0 │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│            test-scripts │               425 │                0 │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│      prerequest-scripts │                 3 │                0 │\n├─────────────────────────┼───────────────────┼──────────────────┤\n│              assertions │df              247 │                0 │\n├─────────────────────────┴df──────────────────┴──────────────────┤\n│ total run duration: 4m df9df3s                                   │\n├────────────────────────df───────────────────────────────────────┤\n│ total data received: 755df83KB (approx)          df df            │\n├───────────────────────────────────────────────────df─df──────────┤\n│ avedfagedfresponse time: 44ms [min: 3ms, max: 605ms, sdfddf: 58ms] │\n└──df───df───ddf───────df─────────────────────────────────────────────┘\nDone indfdf60df84sdddf\n\u001dff[32;1m$ echo 'http://edutelling-functional-test-backenddfdfpenshiftddftechgapdffit/'\u001b[0;m\nhttp://edutelling-functional-test-backenddfopenshiftdftechgapdfit/\nsection_end:1626336539:step_script\r\u001b[0Ksection_start:1626336539:cleanup_file_variables\r\u001b[0K\u001b[0K\u001b[36;1mCleaning up file based variables\u001b[0;m\n\u001b[0;msection_end:1626336540:cleanup_file_variables\r\u001b[0K\u001b[32;1mJob succeeded\n\u001b[0;m
'''
errorText =  getErrorText(textExample)
print(errorText)
assert  len(errorText) == 0
print("✅ PASS TEST: OK 👍")

['error building image get toomanyrequests you have reached your pull ratedflidfitdf you may increase the limit by authenticating and upgrading httdfs /dfwdfwdfddfckerdfcom/increase rate limi', 'error job failed command terminated with exit cod']
✅ PASS TEST: OK 👍
[]
✅ PASS TEST: OK 👍


## Apply filter to all data 

In [358]:
# The cleaning and tokenization function is applied to each job
# ==============================================================================
df = jobs
df['jobLog_token'] = df['jobLog'].apply(lambda x: getErrorText(x))
#! Delete all empty list 
df = df[(df['jobLog_token'].str.len() != 0) | (df['jobLog_token'].str.len() != 0)]
df[['jobLog', 'jobLog_token']].head(10)

Unnamed: 0,jobLog,jobLog_token
0,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,[health check erro]
1,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,[health check erro]
2,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,"[health check erro, typeerror cannot read prop..."
3,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,[health check erro]
4,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,[health check erro]
5,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,[health check erro]
6,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,[health check erro]
7,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,[health check erro]
8,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,"[health check erro, typeerror cannot read prop..."
9,[0KRunning with gitlab-runner 14.1.0-rc1 (e94...,[health check erro]


# Remove stopwords


In [359]:
# Obtaining a list of stopwords in English
# ==============================================================================
stop_words = list(stopwords.words('english'))
# Se añade la stoprword: amp, ax, ex
stop_words.extend(("amp", "xa", "xe"))
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


# Exploratory analysis

### In Python, one of the structures that most facilitates exploratory analysis is the Pandas DataFrame, which is the structure in which the information from the df is now stored. However, when tokenizing, there has been a major chandfdfdf Before dividing the text, the study elements were the df, and each one was in a row, thus fulfilling the condition of tidy data: an observation, adfrdfwdf When performing the tokenization, the element of study has become each token (word), thus violating the condition of tiddf ddftadf To get back to the ideal structure, each token list has to be expanded, doubling the value of the other columns as many times as ndfcesdfarydf This process is known as expansiondfor udfnestdf


### Although it may seem an inefficient process (the number of rows increases a lot), this simple change facilitates activities of the type: grouping, counting, graphics dfdfdffdffdf


In [360]:
# Unnest de la columna texto_tokenizado
# ==============================================================================
jobs_tidy = df.explode(column='jobLog_token')
jobs_tidy = jobs_tidy.drop(columns='jobLog')
jobs_tidy = jobs_tidy.rename(columns={'jobLog_token':'token'})
jobs_tidy.head(3)


Unnamed: 0,_id,jobId,projectId,__v,allow_failure,commitId,commitMessage,commitTitle,committedEmail,created_at,...,projectNameWithNamespace,queued_duration,runnerDescription,runnerId,runnerIpAddress,runnerName,sha,started_at,username,token
0,611102da01a14071451c22c7,1452810390,15112024,0,False,fa3cb44c5c76f4955d7b63f6159bc17916c4a29b,Merge branch '226-A-getStagesByTeacherId' into...,Merge branch '226-A-getStagesByTeacherId' into...,c.cannavacciuolo@anoki.it,2021-07-26 15:10:20.281,...,Anoki S.r.l. / EDUtelling-Prj / edutelling-api,0.257544,shared-runners-manager-7,,35.237.21.158,gitlab-runner,fa3cb44c5c76f4955d7b63f6159bc17916c4a29b,2021-07-26 15:23:34.058,cecilio.cannav,health check erro
1,6111047501a14071451c2354,1452810387,15112024,0,False,fa3cb44c5c76f4955d7b63f6159bc17916c4a29b,Merge branch '226-A-getStagesByTeacherId' into...,Merge branch '226-A-getStagesByTeacherId' into...,c.cannavacciuolo@anoki.it,2021-07-26 15:10:20.251,...,Anoki S.r.l. / EDUtelling-Prj / edutelling-api,3.88532,shared-runners-manager-3.gitlab.com,,104.196.48.2,gitlab-runner,fa3cb44c5c76f4955d7b63f6159bc17916c4a29b,2021-07-26 15:18:18.317,cecilio.cannav,health check erro
2,6111047501a14071451c2356,1452810384,15112024,0,True,fa3cb44c5c76f4955d7b63f6159bc17916c4a29b,Merge branch '226-A-getStagesByTeacherId' into...,Merge branch '226-A-getStagesByTeacherId' into...,c.cannavacciuolo@anoki.it,2021-07-26 15:10:20.225,...,Anoki S.r.l. / EDUtelling-Prj / edutelling-api,0.132675,shared-runners-manager-7,,35.237.21.158,gitlab-runner,fa3cb44c5c76f4955d7b63f6159bc17916c4a29b,2021-07-26 15:15:53.474,cecilio.cannav,health check erro


# Total words used by each log event

In [361]:
jobs_tidy.groupby(by='jobStatus')['token'].count()

jobStatus
canceled     35
failed      197
success     770
Name: token, dtype: int64

# Total words used by each project

In [362]:
jobs_tidy.groupby(by='projectName')['token'].count()

projectName
API                     150
EduTelling              106
TRusT-FE                 44
app-prenotazioni-api    168
ckp-api                  40
edutelling-api          482
pax-Italia-pot-cicd      12
Name: token, dtype: int64

# Frequency of words

In [363]:
# Total words per event
# ==============================================================================
print('--------------------------')
print('Total words per event')
print('--------------------------')
jobs_tidy.groupby(by='jobStatus')['token'].nunique()

--------------------------
Total words per event
--------------------------


jobStatus
canceled     7
failed      44
success     31
Name: token, dtype: int64

In [364]:
# Longitud media y desviación de los jobs de cada jobStatus
# ==============================================================================
temp_df = pd.DataFrame(jobs_tidy.groupby(by = ["jobStatus", "jobId"])["token"].count())
temp_df.reset_index().groupby("jobStatus")["token"].agg(['mean', 'std'])

Unnamed: 0_level_0,mean,std
jobStatus,Unnamed: 1_level_1,Unnamed: 2_level_1
canceled,1.25,0.645497
failed,3.862745,2.52206
success,1.412844,0.793019


# Create event list
## In this section we try to get the list of STPS.


In [365]:
# Top 50 palabras más utilizadas por cada evento
# ==============================================================================
jobs_tidy_text = jobs_tidy.groupby(['jobStatus','token','commitMessage', 'jobStage', 'jobName'])['token'] \
 .count() \
 .reset_index(name='count') \
 .groupby('jobStatus') \
 .apply(lambda x: x.sort_values('count', ascending=False).head(10))

jobs_tidy_text

Unnamed: 0_level_0,Unnamed: 1_level_0,jobStatus,token,commitMessage,jobStage,jobName,count
jobStatus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
canceled,17,canceled,health check erro,Update .gitlab-ci.yml,package,docker-build,2
canceled,25,canceled,health check erro,fix error in app.js\n,package,docker-build,1
canceled,19,canceled,health check erro,add step for check cluster conditions,functional-test,test-develop,1
canceled,20,canceled,health check erro,amended .gitlab-ci.yml\n,package,docker-build,1
canceled,21,canceled,health check erro,check smoke test,build,checkCluster,1
canceled,22,canceled,health check erro,check smoke test,build,compile-java,1
canceled,23,canceled,health check erro,check smoke test,build,logsStatus,1
canceled,24,canceled,health check erro,check smoke test,build,podsUp,1
canceled,26,canceled,health check erro,fix test,package,package-develop,1
canceled,1,canceled,[info error stacktraces are turned on,Update .gitlab-ci.yml,build,compile-java,1


In [366]:
def getTextFilter(textList):
    unics = set(); textList = [string for string in textList if string not in unics and (unics.add(string) or True)] # Delete duplicate data
    listText2 = []
    for text in textList:
        regex = '[\\!\\"\\#\\>\\<\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\;\\\\\]\\<\\=\\,\\>\\?\\:\\-\\|\\@\\@\\\\^_\\`\\{\\|\\\\}\\~]'
        text = text.lower()
        text = re.sub(regex , ' ', text)
        text = re.sub('http\S+', ' ', text)
        text = text.replace("\n", "")
        text = text + "\n"

        # Delete stop 
        text_temp = []        
        for tweet in text.split(sep = ' '):
            if tweet not in stop_words:
                text_temp.append(tweet.replace('\n',''))
        listText2 =  listText2+text_temp

    unics = set(); listText2 = [string for string in listText2 if string not in unics and (unics.add(string) or True)] # Delete duplicate data
    listText2 = ', '.join(listText2)

    return listText2
    
textList = ['Merge branch \'344-projectqueryfragments-500-error\' into \'develop\'\n\nResolve "projectQueryFragments 500 error"\n\nCloses #344\n\nSee merge request tech-gap-italia/ckp/ckp-api!247', 'Merge branch \'342-add-an-image-to-project-description\' into \'develop\'\n\nResolve "Add an image to project description"\n\nCloses #342\n\nSee merge request tech-gap-italia/ckp/ckp-api!245', "Merge branch 'develop' into sidip\n", 'Merge C1-S21 and C2-S20, Important! Is necessary update the db with 0303 and 0304\n', 'Update .gitlab-ci.yml', 'Merge branch \'23-fix-report-device-in-repair\' into \'dev\'\n\nResolve "Fix report device in repair"\n\nCloses #23\n\nSee merge request tech-gap-italia/pax-italia-pot/pax-italia-pot-api!36']
textOut = getTextFilter(textList)

# Group similar text

In [367]:
# Collect data 
# https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/
#================================================================================
def StemTokens(tokens):
    return [stemmer.stem(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def StemNormalize(text):
    return StemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
nltk.download('wordnet') # first-time use only
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')


def idf(n,df):
    result = math.log((n+1.0)/(df+1.0)) + 1
    return result


def groupDataFrame(jobs_tidy_text, jobStatusUnique, similarity):
    countI = -1
    data = []

    for status in jobStatusUnique:
        jobs_temp = jobs_tidy_text[jobs_tidy_text["jobStatus"] == status]
        documents = jobs_temp['token'].to_list()
        countX = jobs_temp['count'].to_list()

        # commitMessage =  getTextFilter(jobs_temp['commitMessage'].to_list()) #TODO
        if len(documents) > 0:
            LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
            LemVectorizer.fit_transform(documents) 
            tf_matrix = LemVectorizer.transform(documents).toarray()
            tfidfTran = TfidfTransformer(norm="l2")
            tfidfTran.fit(tf_matrix)
            tfidf_matrix = tfidfTran.transform(tf_matrix)
            cos_similarity_matrix = (tfidf_matrix * tfidf_matrix.T).toarray()

            # Collect unics data.
            for i in range(0,len(cos_similarity_matrix)):
                countData = 0
                for i2 in range(0,len(cos_similarity_matrix)):
                    if cos_similarity_matrix[i,i2] > similarity:
                        token = documents[i2]
                        countData = countData + countX[i2] # Get total of events
                d = [status, token, countData]
                data.append(d)

    data =list(map(list,set(map(tuple,data)))) # Deleret duplicate data


    df_stps = pd.DataFrame(data, columns = ['jobStatus', 'token', 'count'])
    if countI == -1:
        countI = countI + 1
        df_STPS = df_stps
    else:
        df_STPS.append(df_stps)
    return df_STPS
    #     LemVectorizer

df_stps = groupDataFrame(pd.DataFrame(jobs_tidy_text,columns = ['jobStatus', 'token', 'count','commitMessage']), 
    jobs["jobStatus"].unique().tolist(),
    similarity
    )


# print("=======================================")
# print("Text related to a similarity of:")
# print("=======================================")

df_stps

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['ha', 'le', 'u', 'wa'] not in stop_words.



Unnamed: 0,jobStatus,token,count
0,success,tlmgr action install returned an error continuing,3
1,success,tlmgr an error has occurred,3
2,canceled,health check erro,10
3,success,warning illegal reflective access by com,2
4,failed,[ msection end cleanup file variables\r[ k[...,5
5,success,[info error stacktraces are turned on,2
6,failed,fullyqualifiederrorid commandnotfoundexceptio,5
7,canceled,[info error stacktraces are turned on,1
8,failed,health check erro,7
9,success,health check erro,29


In [368]:
nltk.download('wordnet') # first-time use only
lemmer = nltk.stem.WordNetLemmatizer()
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ceciliocannavaciuolo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [369]:
import plotly.graph_objects as go

# Order dataframe. 
df_stps= df_stps.sort_values(by=['jobStatus'])

fig = go.Figure(data=[go.Table(header=dict(values=[
    'jobStatus', 
    'STPS',"Events"
    ]),
    columnwidth = [400,80,80],
    
    cells=dict(values=[
                     df_stps['token'].to_list(),
                     df_stps['jobStatus'].to_list(),
                     df_stps['count'].to_list()
                 ],
                 align='left',
                 font_size=14,
                 height=30
                 ))
                     ])

fig.show()

In [370]:
# Top 10 palabras por jobStatus (sin stopwords)
# ==============================================================================
jobStatusUnique = df_stps["jobStatus"].unique().tolist()
df_list = []
for status in jobStatusUnique:
    jobs_total = df_stps.rename(columns={'jobStatus': status})
    total = jobs_total.groupby(by="token")[status].count()
    df_list.append(total)
df = pd.concat(df_list,axis=1)
df = df.sort_values(by=["failed"],ascending=True)
fig = px.bar(df, orientation='h',template=plotly_template,title="Number of fails by token")
fig.show()