# Setup

In [1]:
# Import initial dependencies 

import pandas as pd
import numpy as np
import os

In [2]:
# Build dataframe with all csv files

directory = "/Users/annadeniz/Documents/py_scripts/digital_procurement/data/raw data/compras"

csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]

compras_df = pd.DataFrame()

for file in csv_files:
    file_path = os.path.join(directory, file)
    df = pd.read_csv(file_path, encoding='latin-1', delimiter=';')
    compras_df = pd.concat([compras_df, df], ignore_index=True)

compras_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452617 entries, 0 to 452616
Data columns (total 24 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Número do Contrato                  452617 non-null  object 
 1   Objeto                              452407 non-null  object 
 2   Fundamento Legal                    380758 non-null  object 
 3   Modalidade Compra                   452617 non-null  object 
 4   Situação Contrato                   452617 non-null  object 
 5   Código Órgão Superior               452617 non-null  int64  
 6   Nome Órgão Superior                 452617 non-null  object 
 7   Código Órgão                        452617 non-null  int64  
 8   Nome Órgão                          452617 non-null  object 
 9   Código UG                           452617 non-null  int64  
 10  Nome UG                             452617 non-null  object 
 11  Data Assinatura Contrato  

# Data preprocessing

In [3]:
# Import dependencies for data preprocessing

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Convert the text column to string type

compras_df['Objeto'] = compras_df['Objeto'].astype(str) 

In [5]:
# Tokenization

compras_df['tokens'] = compras_df['Objeto'].apply(lambda x: word_tokenize(x, language='portuguese'))

In [6]:
# Lowercasing

compras_df['tokens'] = compras_df['tokens'].apply(lambda x: [word.lower() for word in x])

In [7]:
# Stopwords Removal

stop_words = set(stopwords.words('portuguese'))

extra_stop_words = ['objeto', 'contrato', 'fornecimento', 'contratação', ':', 'serviços', 'empresa', 'execução', 'prestação', 'aquisição', 'br', 'material', 'serviço', 'especializada', 'elaboração', '.', '/', ',','!','?']
stop_words.update(extra_stop_words)

compras_df['tokens'] = compras_df['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

In [8]:
# Check new dataframe with tokens

compras_df.head()

Unnamed: 0,Número do Contrato,Objeto,Fundamento Legal,Modalidade Compra,Situação Contrato,Código Órgão Superior,Nome Órgão Superior,Código Órgão,Nome Órgão,Código UG,...,Código Contratado,Nome Contratado,Valor Inicial Compra,Valor Final Compra,Número Licitação,Código UG Licitação,Nome UG Licitação,Código Modalidade Compra Licitação,Modalidade Compra Licitação,tokens
0,22017,"Objeto: Execução dos serviços de dedetização, ...","Fundamento Legal: Lei 10520/2002, Lei 8666/93 ...",Pregão,Publicado,40000,Ministério do Trabalho,37202,Instituto Nacional do Seguro Social,511080,...,15501741000123,P. R. DE ALMEIDA & CIA LTDA,1199888600,1199888600,22016,511080.0,GERENCIA EXECUTIVA CUIABA,5.0,Pregão,"[dedetização, englobando, desinsetização, desr..."
1,32017,Objeto: Contratação de serviços de apoio oper...,Fundamento Legal: Lei nº 8.666/93,Pregão - Registro de Preço,Publicado,26000,Ministério da Educação,26403,Instituto Federal do Amazonas,158446,...,4465383000124,SUP SERVICOS DE CONSTRUCOES E MANUTENCAO EIRELI,82406000,82406000,22015,194008.0,COORDENACAO REGIONAL DO RIO NEGRO/AM,-99.0,Pregão - Registro de Preço,"[apoio, operacional, prestados, condições, est..."
2,12017,Objeto: CONTRATAÇÃO DE SERVIÇOS GRÁFICOS PARA ...,Fundamento Legal: 10520/2002 E 8666/93,Pregão,Não se aplica,25000,Ministério da Economia,25000,Ministério da Economia - Unidades com vínculo,170010,...,17615848000128,TEIXEIRA IMPRESSAO DIGITAL E SOLUCOES GRAFICAS...,333189400,333189400,112016,170010.0,SECRETARIA ESP. DA RECEITA FEDERAL DO BRASIL,5.0,Pregão,"[gráficos, unidades, receita, federal, brasil,..."
3,2332016,"Objeto: SERVIÇOS DE LIMPEZA, CONSERVAÇÃO E HIG...",LEI 8.666 DE 1993 E SUAS ALTERAÇÕES POSTERIORES.,Sem Informação,Fechado,32000,Ministério de Minas e Energia,91081,Empresas de Energia,910808,...,29212545000143,NOVA RIO SERVICOS GERAIS LTDA,9603480000,9603480000,-2,,,,,"[limpeza, conservação, higienização, serem, re..."
4,12017,Objeto: Contratação de empresa especializada n...,"Fundamento Legal: lei 8666/93, artg 24 - ii",Dispensa de Licitação,Publicado,25000,Ministério da Economia,25000,Ministério da Economia - Unidades com vínculo,170258,...,7346326000114,REPROS SOLUCOES EM DOCUMENTOS LTDA,72000000,72000000,12017,170258.0,INSPETORIA DA RFB NO RIO DE JANEIRO,6.0,Dispensa de Licitação,"[impressao, equipamentos, manutenção, preventi..."


In [9]:
# Vectorization (Word Embeddings)

vectorizer = TfidfVectorizer(max_df=0.9, min_df=100)
X = vectorizer.fit_transform(compras_df['tokens'].apply(lambda x: ' '.join(x)))

# Topic modelling

In [10]:
# Perform topic modelling using Latent Dirichlet Allocation (LDA)

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=100, random_state=42)
lda.fit(X)

In [11]:
# Get words and corresponding weights resulting from the topic model in a table

def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    
    return pd.DataFrame(topic_dict)

In [12]:
# Get name of each title: augment the number of words per topic and base it on the results 

no_top_words = 15

display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,...,Topic 95 words,Topic 95 weights,Topic 96 words,Topic 96 weights,Topic 97 words,Topic 97 weights,Topic 98 words,Topic 98 weights,Topic 99 words,Topic 99 weights
0,água,1244.5,reabilitação,842.3,mão,2508.4,receita,858.2,instalação,474.6,...,bens,668.8,ml,370.5,tensão,397.3,meses,1217.0,assinatura,408.0
1,esgoto,1193.6,próteses,526.8,obra,2393.5,brasil,749.2,montagem,434.1,...,imóveis,662.8,lavagem,256.2,alta,296.9,12,1065.3,descrito,277.6
2,laboratorial,942.6,órteses,470.5,exclusiva,2037.4,federal,556.3,divisórias,318.3,...,gráficos,530.6,patrocínio,240.4,subestação,237.9,período,749.0,primeira,275.2
3,univasf,789.9,profissional,429.8,dedicação,1948.8,delegacia,510.9,desmontagem,209.0,...,manutenção,346.4,realização,235.6,baixa,206.9,doze,635.9,ifpr,258.9
4,coleta,602.2,medida,417.9,continuados,1208.9,chaveiro,222.7,goiânia,205.3,...,móveis,339.2,mesa,231.0,média,196.2,60,230.9,cláusula,232.7
5,uso,551.8,favor,373.8,disponibilização,1136.3,dtcea,222.0,persianas,203.2,...,áreas,326.3,roupas,228.5,ufrgs,185.4,prazo,222.6,anual,187.5
6,tratamento,461.9,paciente,343.3,regime,1088.0,monitoramento,216.5,ifg,201.9,...,externas,240.6,evento,213.7,corpo,162.7,periodo,182.6,digital,156.6
7,contidas,452.7,segurados,340.0,conservação,553.0,cftv,196.7,contrataçao,175.2,...,internas,236.6,cultural,195.3,nobreak,120.1,anatel,154.8,revista,150.9
8,características,445.6,prontuário,289.1,limpeza,548.7,sistema,170.0,cabeamento,163.0,...,ifce,215.8,bndes,187.7,subestações,119.5,2019,140.9,conforme,150.3
9,demais,363.4,sob,288.8,prestados,520.9,alarme,168.8,go,162.2,...,conjunto,199.8,oficina,160.0,entrada,114.9,sessenta,138.8,líquido,142.4


In [13]:
# Create df with topics per item

W = lda.transform(X)

df_w = pd.DataFrame(W)

df_w.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.001736,0.001736,0.001736,0.001736,0.001736,0.001736,0.001736,0.001736,0.138207,0.001736,...,0.001736,0.001736,0.001736,0.001736,0.118934,0.001736,0.001736,0.001736,0.001736,0.001736
1,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,...,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761
2,0.001803,0.001803,0.001803,0.291804,0.001803,0.001803,0.001803,0.071687,0.001803,0.001803,...,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803
3,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,...,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439
4,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.564944,0.002396,0.002396,...,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396


In [14]:
# Merge compras_df with df of topics per item

compras_topics_df = pd.concat([compras_df,df_w], axis=1)

compras_topics_df.head()

Unnamed: 0,Número do Contrato,Objeto,Fundamento Legal,Modalidade Compra,Situação Contrato,Código Órgão Superior,Nome Órgão Superior,Código Órgão,Nome Órgão,Código UG,...,90,91,92,93,94,95,96,97,98,99
0,22017,"Objeto: Execução dos serviços de dedetização, ...","Fundamento Legal: Lei 10520/2002, Lei 8666/93 ...",Pregão,Publicado,40000,Ministério do Trabalho,37202,Instituto Nacional do Seguro Social,511080,...,0.001736,0.001736,0.001736,0.001736,0.118934,0.001736,0.001736,0.001736,0.001736,0.001736
1,32017,Objeto: Contratação de serviços de apoio oper...,Fundamento Legal: Lei nº 8.666/93,Pregão - Registro de Preço,Publicado,26000,Ministério da Educação,26403,Instituto Federal do Amazonas,158446,...,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761,0.002761
2,12017,Objeto: CONTRATAÇÃO DE SERVIÇOS GRÁFICOS PARA ...,Fundamento Legal: 10520/2002 E 8666/93,Pregão,Não se aplica,25000,Ministério da Economia,25000,Ministério da Economia - Unidades com vínculo,170010,...,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803,0.001803
3,2332016,"Objeto: SERVIÇOS DE LIMPEZA, CONSERVAÇÃO E HIG...",LEI 8.666 DE 1993 E SUAS ALTERAÇÕES POSTERIORES.,Sem Informação,Fechado,32000,Ministério de Minas e Energia,91081,Empresas de Energia,910808,...,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439,0.002439
4,12017,Objeto: Contratação de empresa especializada n...,"Fundamento Legal: lei 8666/93, artg 24 - ii",Dispensa de Licitação,Publicado,25000,Ministério da Economia,25000,Ministério da Economia - Unidades com vínculo,170258,...,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396,0.002396


# Set initial dataset for labelling

In [15]:
# Analyse topics with potential for digital technology

topics_df = pd.DataFrame(display_topics(lda, vectorizer.get_feature_names_out(), no_top_words))

topics_df.to_csv('topics_df.csv', index=False)

In [16]:
# Topics with potential for digital technology

digital_topics = topics_df.loc[:,['Topic 36 words', 'Topic 64 words', 'Topic 75 words', 'Topic 92 words']]

digital_topics

Unnamed: 0,Topic 36 words,Topic 64 words,Topic 75 words,Topic 92 words
0,solução,digitais,processos,software
1,garantia,dados,secretaria,suporte
2,suporte,processamento,direitos,licenças
3,instalação,pacote,tic,técnico
4,rede,certificados,tecnologia,atualização
5,site,meio,informação,licença
6,configuração,correios,técnicos,manutenção
7,on,emissão,ministério,uso
8,armazenamento,produtos,desenvolvimento,softwares
9,meses,digital,gestão,sistema


In [17]:
# Create subset of the dataset with most relevant data for labelling

subset_1 = compras_topics_df[compras_topics_df[92] > 0.75] # 732 entries

subset_2 = compras_topics_df[compras_topics_df[92] < 0.75]

subset_2 = subset_2.sample(frac=0.0008, random_state=42) # 362 entries

subset_1.info()
subset_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 732 entries, 697 to 451686
Columns: 125 entries, Número do Contrato to 99
dtypes: float64(102), int64(4), object(19)
memory usage: 720.6+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 362 entries, 371484 to 148642
Columns: 125 entries, Número do Contrato to 99
dtypes: float64(102), int64(4), object(19)
memory usage: 356.3+ KB


In [18]:
# Join datasets

initial_df = pd.concat([subset_1,subset_2])

initial_df = initial_df.sample(frac=1)

initial_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1094 entries, 25617 to 165164
Columns: 125 entries, Número do Contrato to 99
dtypes: float64(102), int64(4), object(19)
memory usage: 1.1+ MB


# Label data

In [19]:
# Export dataset for labeling

initial_df.to_csv('initial_df.csv')

In [20]:
# Launch Open Source Data Labeling Platform - https://labelstud.io/

In [21]:
# Import labeled data 

# Defined categories: 'Digital Technology' and 'Other'

seed_df = pd.read_json('labeled_data.json')

seed_df.head()

Unnamed: 0.1,Unnamed: 0,Número do Contrato,Objeto,Fundamento Legal,Modalidade Compra,Situação Contrato,Código Órgão Superior,Nome Órgão Superior,Código Órgão,Nome Órgão,...,97,98,99,id,sentiment,annotator,annotation_id,created_at,updated_at,lead_time
0,25564,1122018,Objeto: PRESTAÇÃO DE SERVIÇO DE PACOTE DE ACES...,Fundamento Legal: LEI 8666/93,Inexigibilidade de Licitação,Não se aplica,26000,Ministério da Educação,26247,Universidade Federal de Santa Maria,...,0.003679,0.003679,0.003679,1362,Other,1,48,2023-03-22 17:25:52.126185+00:00,2023-03-22 17:25:52.126224+00:00,39.786
1,307257,352021,Objeto: O OBJETO DO PRESENTE INSTRUMENTO É A C...,,Inexigibilidade de Licitação,Não se aplica,36000,Ministério da Saúde,36201,Fundação Oswaldo Cruz,...,0.002161,0.002161,0.002161,1363,Other,1,49,2023-03-22 17:26:16.896686+00:00,2023-03-22 17:26:16.896715+00:00,24.531
2,86384,22015,Objeto: PRESTAÇÃO DE SERVIÇOS DE LOCAÇÃO DE CE...,Fundamento Legal: LEI 8.666/93,Pregão - Registro de Preço,Não se aplica,25000,Ministério da Economia,25000,Ministério da Economia - Unidades com vínculo,...,0.002157,0.002157,0.002157,1364,Other,1,50,2023-03-22 17:26:24.304412+00:00,2023-03-22 17:26:24.304443+00:00,7.204
3,133339,102018,Objeto: Contrato de prestação de serviços de s...,Fundamento Legal: LEI 8.666/93 E DEMAIS CORREL...,Pregão,Publicado,22000,"Ministério da Agricultura, Pecuária e Abastec",22202,Empresa Brasileira de Pesquisa Agropecuária,...,0.002524,0.002524,0.002524,1365,Digital Technology,1,51,2023-03-22 17:28:00.333478+00:00,2023-03-22 17:28:00.333501+00:00,95.832
4,444569,262013,Objeto: Aquisição de licenças perpétuas de sol...,"Fundamento Legal: Leis 8666/93, 10520/02 e Dec...",Pregão,Publicado,25000,Ministério da Economia,25000,Ministério da Economia - Unidades com vínculo,...,0.00163,0.00163,0.00163,1366,Digital Technology,1,52,2023-03-22 17:28:12.296439+00:00,2023-03-22 17:28:12.296489+00:00,11.782


In [22]:
# Check distribution of categories in the dataset

categories_labelled = seed_df.groupby('sentiment')

categories_labelled['Objeto'].count()

sentiment
Digital Technology    586
Other                 508
Name: Objeto, dtype: int64

# Train and evaluate classification model

In [23]:
# Import dependencies for training and evaluating model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [24]:
# Set features and target variables

X_seed = seed_df['tokens']
y = seed_df['sentiment']

In [25]:
# Split the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X_seed, y, test_size=0.25, random_state=42)

In [26]:
# Vectorize the text using the TF-IDF vectorizer

X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [27]:
# Train a logistic regression model

classifier = LogisticRegression()
classifier.fit(X_train_vec, y_train)

In [28]:
# Predict the labels for the test set

y_pred = classifier.predict(X_test_vec)

In [29]:
# Evaluate the accuracy of the model

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.96


In [30]:
# Print the classification report with further evaluations

report = classification_report(y_test, y_pred, zero_division=0)
print(report)

                    precision    recall  f1-score   support

Digital Technology       0.97      0.95      0.96       148
             Other       0.94      0.97      0.95       126

          accuracy                           0.96       274
         macro avg       0.96      0.96      0.96       274
      weighted avg       0.96      0.96      0.96       274



In [31]:
# Get the number of parameters in the model

num_params = classifier.coef_.size

print(f"Number of parameters in the model: {num_params}")

Number of parameters in the model: 5409


In [32]:
# Check model classes

classifier.classes_

array(['Digital Technology', 'Other'], dtype=object)

In [33]:
# Get most relevant features for 'Digital Technology' class

feature_names = vectorizer.get_feature_names_out()

coefs = classifier.coef_

print(f"Top 20 features for class 'Digital Technology':")
coef_abs = abs(coefs[0])
top10_idx = coef_abs.argsort()[-20:][::-1]
for feature in top10_idx:
    print(f"{feature_names[feature]}: {coefs[0][feature]}")


Top 20 features for class 'Digital Technology':
software: -4.097781289999977
licenças: -3.2584363700368755
atualização: -2.7044657702411
suporte: -2.603873437477209
dados: -2.4516997210995517
softwares: -2.370901460705264
solução: -2.2769216949297135
microsoft: -2.057119782629076
uso: -1.9814468118452255
técnico: -1.6309383783885174
licença: -1.5470595958444389
subscrição: -1.5232830611343329
versão: -1.2856923743007609
enterprise: -1.2762595132436283
sistemas: -1.2712556073317824
treinamento: -1.2626865644447918
gerenciamento: -1.2617815536403716
banco: -1.191091656000277
server: -1.127502130655172
tecnologia: -1.0918165070056312


In [34]:
# Save trained model

import pickle

with open('trained_classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)

# Predict labels for the whole dataset

In [35]:
# Vectorize the text using the same vectorizer as before

X_unlabeled_vec = X

In [36]:
# Predict the labels for the unlabeled data

y_pred = classifier.predict(X_unlabeled_vec)

In [37]:
# Store the predicted labels in original dataframe

compras_df['category'] = y_pred

In [38]:
# Check final distribution of categories in the dataset

categories = compras_df.groupby('category')

categories['Objeto'].count()

category
Digital Technology      9740
Other                 442877
Name: Objeto, dtype: int64

# Exploratory data analysis

Set up

In [39]:
# Import dependencies and check current dataframe

import matplotlib.pyplot as plt
import seaborn as sns

compras_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452617 entries, 0 to 452616
Data columns (total 26 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Número do Contrato                  452617 non-null  object 
 1   Objeto                              452617 non-null  object 
 2   Fundamento Legal                    380758 non-null  object 
 3   Modalidade Compra                   452617 non-null  object 
 4   Situação Contrato                   452617 non-null  object 
 5   Código Órgão Superior               452617 non-null  int64  
 6   Nome Órgão Superior                 452617 non-null  object 
 7   Código Órgão                        452617 non-null  int64  
 8   Nome Órgão                          452617 non-null  object 
 9   Código UG                           452617 non-null  int64  
 10  Nome UG                             452617 non-null  object 
 11  Data Assinatura Contrato  

In [40]:
# Create new dataframe with relevant columns for exploration and analysis

eda_df = compras_df.drop(['Fundamento Legal', 'Modalidade Compra', 'Situação Contrato', 'Data Assinatura Contrato', 
                          'Data Início Vigência', 'Data Fim Vigência', 'Valor Inicial Compra', 'Número Licitação', 
                          'Código UG Licitação', 'Nome UG Licitação', 'Código Modalidade Compra Licitação', 'Modalidade Compra Licitação', 'tokens'], axis=1)

eda_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452617 entries, 0 to 452616
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Número do Contrato     452617 non-null  object
 1   Objeto                 452617 non-null  object
 2   Código Órgão Superior  452617 non-null  int64 
 3   Nome Órgão Superior    452617 non-null  object
 4   Código Órgão           452617 non-null  int64 
 5   Nome Órgão             452617 non-null  object
 6   Código UG              452617 non-null  int64 
 7   Nome UG                452617 non-null  object
 8   Data Publicação DOU    452617 non-null  object
 9   Código Contratado      452617 non-null  object
 10  Nome Contratado        452614 non-null  object
 11  Valor Final Compra     452617 non-null  object
 12  category               452617 non-null  object
dtypes: int64(3), object(10)
memory usage: 44.9+ MB


In [41]:
# Reorganise columns

# Get Dates
eda_df['Data Publicação DOU'] = pd.to_datetime(eda_df['Data Publicação DOU'], format='%d/%m/%Y')

eda_df['Ano Contrato'] = eda_df['Data Publicação DOU'].dt.year

eda_df['Mês Contrato'] = eda_df['Data Publicação DOU'].dt.month

# Get values R$
eda_df['Valor Contrato'] = eda_df['Valor Final Compra'].str.replace(',', '.').astype(float)

# Drop unecessary columns
eda_df = eda_df.drop(['Data Publicação DOU', 'Valor Final Compra'], axis=1)

eda_df = eda_df.rename(columns={'category': 'Classification'})

eda_df = eda_df[eda_df['Valor Contrato'] > 0]

eda_df['Valor Contrato em MM'] = eda_df['Valor Contrato']/1000000

eda_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 449488 entries, 0 to 452616
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Número do Contrato     449488 non-null  object 
 1   Objeto                 449488 non-null  object 
 2   Código Órgão Superior  449488 non-null  int64  
 3   Nome Órgão Superior    449488 non-null  object 
 4   Código Órgão           449488 non-null  int64  
 5   Nome Órgão             449488 non-null  object 
 6   Código UG              449488 non-null  int64  
 7   Nome UG                449488 non-null  object 
 8   Código Contratado      449488 non-null  object 
 9   Nome Contratado        449486 non-null  object 
 10  Classification         449488 non-null  object 
 11  Ano Contrato           449488 non-null  int64  
 12  Mês Contrato           449488 non-null  int64  
 13  Valor Contrato         449488 non-null  float64
 14  Valor Contrato em MM   449488 non-nu

Overall

In [42]:
# Number and values of digital technology contracts

digital_eda_df = eda_df[eda_df['Classification'] == 'Digital Technology']

num_digital = len(digital_eda_df)

sum_digital = (digital_eda_df['Valor Contrato'].sum())/1000000

perc_num_digital = (num_digital / len(eda_df) * 100)

perc_sum_digital = (sum_digital / (eda_df['Valor Contrato'].sum()/1000000) * 100)

avr_digital = sum_digital / num_digital

table_1 = pd.DataFrame({
    'Number of digital technology contracts': [num_digital],
    'Sum of values of digital technology contracts in R$ MM': [sum_digital],
    'Average value of digital technlogy contracts in R$ MM': [avr_digital],
    '% of digital technology contracts': [perc_num_digital],
    '% of values of digital technology contracts': [perc_sum_digital]
})

table_1 = table_1.transpose()

table_1.columns = ['Value']

table_1 = table_1.round(2)

table_1

Unnamed: 0,Value
Number of digital technology contracts,9698.0
Sum of values of digital technology contracts in R$ MM,29853.06
Average value of digital technlogy contracts in R$ MM,3.08
% of digital technology contracts,2.16
% of values of digital technology contracts,2.38


In [43]:
# Distribution of digital technology contract values

digital_eda_df = eda_df[eda_df['Classification'] == 'Digital Technology']

bins = [0, 0.5, 1, 2, 3, 4, 5, 10, 50, 5000]

digital_eda_df['Value Range'] = pd.cut(digital_eda_df['Valor Contrato em MM'], bins=bins)

value_counts = digital_eda_df['Value Range'].value_counts().sort_index()

table_11 = pd.DataFrame({'Value Range in R$ MM': value_counts.index, 'Number of digital technology contracts': value_counts.values})

table_11

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  digital_eda_df['Value Range'] = pd.cut(digital_eda_df['Valor Contrato em MM'], bins=bins)


Unnamed: 0,Value Range in R$ MM,Number of digital technology contracts
0,"(0.0, 0.5]",6027
1,"(0.5, 1.0]",982
2,"(1.0, 2.0]",830
3,"(2.0, 3.0]",469
4,"(3.0, 4.0]",286
5,"(4.0, 5.0]",207
6,"(5.0, 10.0]",451
7,"(10.0, 50.0]",385
8,"(50.0, 5000.0]",61


In [44]:
# Check for big contracts

table_12 = digital_eda_df.sort_values('Valor Contrato em MM', ascending=False)

pd.set_option('display.max_colwidth', None)

table_12.head(5)


Unnamed: 0,Número do Contrato,Objeto,Código Órgão Superior,Nome Órgão Superior,Código Órgão,Nome Órgão,Código UG,Nome UG,Código Contratado,Nome Contratado,Classification,Ano Contrato,Mês Contrato,Valor Contrato,Valor Contrato em MM,Value Range
35451,162021,"Objeto: CONTRATAÇÃO DE SERVIÇOS DE TECNOLOGIA DA INFORMAÇÃO E COMUNICAÇÃO, DE EMPRESA ESPECIALIZADA PARA PRESTAÇÃO DE SERVIÇOS GERENCIADOS DE COMPUTAÇÃO EM NUVEM, SOB O MODELO DE CLOUD BROKER (INTEGRADOR) DE MULTI-NUVEM, QUE INCLUI A CONCEPÇÃO, PROJETO, PROVISIONAMENTO, CONFIGURAÇÃO, MIGRAÇÃO, SUPORTE, MANUTENÇÃO E GESTÃO DE TOPOLOGIAS DE SERVIÇOS EM DOIS OU MAIS PROVEDORES DE NUVEM PÚBLICA.",26000,Ministério da Educação,26000,Ministério da Educação - Unidades com vínculo,150004,SUBSECRETARIA DE TECNOLOGIA DA INFORMACAO E C,14139773000168,EXTREME DIGITAL CONSULTORIA E REPRESENTACOES LTDA,Digital Technology,2021,6,3388022000.0,3388.022202,"(50.0, 5000.0]"
217803,1062012,Objeto: Prestação dos serviços de tecnologia e informação.,40000,Ministério do Trabalho,37202,Instituto Nacional do Seguro Social,512006,COORDENACAO DE ORCAMENTO E FINANCAS,42422253000101,EMPRESA DE TECNOLOGIA E INFORMACOES DA PREVIDENCIA - DATAPREV S.A.,Digital Technology,2013,1,1598982000.0,1598.982341,"(50.0, 5000.0]"
257456,192017,Objeto: Serviços especializados de tecnologia da informação.,25000,Ministério da Economia,25000,Ministério da Economia - Unidades com vínculo,170007,SUBSECRETARIA DE ASSUNTOS CORPORATIVOS- SUCOP,33683111000107,SERVICO FEDERAL DE PROCESSAMENTO DE DADOS (SERPRO),Digital Technology,2017,6,885631200.0,885.631217,"(50.0, 5000.0]"
209211,452018,"Objeto: Contratação de serviços estratégicos de Tecnologia da Informação - TI voltados, direta ou indiretamente, ao suporte necessário para manutenção dos sistemas estruturantes de Governo e departamentais, que atendem as unidades do Ministério do Planejamento, Desenvolvimento e Gestão - MP, em todo o território nacional. Tais serviços consistem na manutenção de ambiente de sistemas, nuvem, desenvolvimento e manutenção de sistemas, apoio à infraestrutura, consultoria técnica, entre outros serviços técnicos.",25000,Ministério da Economia,25000,Ministério da Economia - Unidades com vínculo,201004,COORDENACAO-GERAL DE AQUISICOES - CGEAQ,33683111000107,SERVICO FEDERAL DE PROCESSAMENTO DE DADOS (SERPRO),Digital Technology,2018,10,674788600.0,674.788609,"(50.0, 5000.0]"
373943,62018,"Objeto: Contratação de seviço especializados de Solução de Tecnologia da Informação e Comunicação (TIC), a serem prestados pela DATAPREV, relativos a sistemas informatizados para operacionalização das ações do FAT.",40000,Ministério do Trabalho,38901,Fundo de Amparo ao Trabalhador,380918,GERAL DE RECURSOS LOGISTICOS - CGRL,42422253000101,EMPRESA DE TECNOLOGIA E INFORMACOES DA PREVIDENCIA - DATAPREV S.A.,Digital Technology,2018,4,491207200.0,491.207215,"(50.0, 5000.0]"


Over time

In [45]:
# Number and values of digital technology contracts over the years

group_year = eda_df.groupby('Ano Contrato')

table_2 = pd.DataFrame(columns=['Year', 'Number of digital technology contracts', 
                                'Sum of values of digital technology contracts in R$ MM', 
                                'Average value of digital technlogy contracts in R$ MM', 
                                '% of digital technology contracts', 
                                '% of values of digital technology contracts'])

for year, group in group_year:
    digital_eda_df = group[group['Classification'] == 'Digital Technology']

    num_digital = len(digital_eda_df)

    sum_digital = (digital_eda_df['Valor Contrato'].sum())/1000000

    perc_num_digital = (num_digital / len(group) * 100)

    perc_sum_digital = (sum_digital / (group['Valor Contrato'].sum()/1000000) * 100)

    avr_digital = sum_digital / num_digital

    table_2 = table_2.append({'Year': year,
                              'Number of digital technology contracts': num_digital,
                              'Sum of values of digital technology contracts in R$ MM': sum_digital,
                              'Average value of digital technlogy contracts in R$ MM': avr_digital,
                              '% of digital technology contracts': perc_num_digital,
                              '% of values of digital technology contracts': perc_sum_digital},
                             ignore_index=True)

table_2.set_index('Year', inplace=True)

table_2 = table_2.round(2)

table_2

Unnamed: 0_level_0,Number of digital technology contracts,Sum of values of digital technology contracts in R$ MM,Average value of digital technlogy contracts in R$ MM,% of digital technology contracts,% of values of digital technology contracts
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013.0,962.0,3606.21,3.75,2.09,3.84
2014.0,1007.0,2801.23,2.78,1.96,3.29
2015.0,782.0,2097.81,2.68,1.85,2.86
2016.0,912.0,2959.85,3.25,2.08,3.1
2017.0,1091.0,4551.57,4.17,2.42,2.72
2018.0,1238.0,4015.11,3.24,2.49,5.2
2019.0,985.0,1747.84,1.77,1.96,2.69
2020.0,1202.0,2359.1,1.96,2.25,2.9
2021.0,858.0,4775.44,5.57,2.32,4.52
2022.0,661.0,938.91,1.42,2.16,0.23


In [46]:
# Number and values of digital technology contracts over months

group_month = eda_df.groupby('Mês Contrato')

table_3 = pd.DataFrame(columns=['Month', 'Number of digital technology contracts', 
                                'Sum of values of digital technology contracts in R$ MM', 
                                'Average value of digital technlogy contracts in R$ MM', 
                                '% of digital technology contracts', 
                                '% of values of digital technology contracts'])

for month, group in group_month:
    digital_eda_df = group[group['Classification'] == 'Digital Technology']

    num_digital = len(digital_eda_df)

    sum_digital = (digital_eda_df['Valor Contrato'].sum())/1000000

    perc_num_digital = (num_digital / len(group) * 100)

    perc_sum_digital = (sum_digital / (group['Valor Contrato'].sum()/1000000) * 100)

    avr_digital = sum_digital / num_digital

    table_3 = table_3.append({'Month': month,
                              'Number of digital technology contracts': num_digital,
                              'Sum of values of digital technology contracts in R$ MM': sum_digital,
                              'Average value of digital technlogy contracts in R$ MM': avr_digital,
                              '% of digital technology contracts': perc_num_digital,
                              '% of values of digital technology contracts': perc_sum_digital},
                             ignore_index=True)

table_3.set_index('Month', inplace=True)

table_3 = table_3.round(2)

table_3

Unnamed: 0_level_0,Number of digital technology contracts,Sum of values of digital technology contracts in R$ MM,Average value of digital technlogy contracts in R$ MM,% of digital technology contracts,% of values of digital technology contracts
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,1238.0,4129.92,3.34,2.96,5.2
2.0,648.0,1103.05,1.7,1.85,1.57
3.0,639.0,1766.02,2.76,1.89,2.75
4.0,565.0,1911.65,3.38,1.83,3.28
5.0,597.0,1097.99,1.84,1.79,1.57
6.0,588.0,5910.79,10.05,1.82,11.24
7.0,636.0,2103.77,3.31,1.75,4.01
8.0,681.0,1496.55,2.2,1.84,1.04
9.0,715.0,1220.86,1.71,2.03,1.71
10.0,793.0,3082.31,3.89,2.14,0.76


Over departments

In [47]:
# Reorganise ministries

departments_group = {'Ministério da Educação': 'Ministry of Education', 
                     'Ministério da Defesa': 'Ministry of Defence', 
                     'Ministério do Trabalho': 'Ministry of Labour', 
                     'Ministério da Saúde': 'Ministry of Health', 
                     'Ministério da Economia': 'Ministry of Economy', 
                     'Ministério de Minas e Energia': 'Ministry of Mines and Energy', 
                     'Ministério da Justiça e Segurança Pública': 'Ministry of Justice and Public Security', 
                     'Ministério da Infraestrutura': 'Ministry of Infrastructure', 
                     'Ministério da Agricultura, Pecuária e Abastec': 'Ministry of Agriculture, Livestock and Supply', 
                     'Ministério do Desenvolvimento Regional': 'Ministry of Regional Development', 
                     'Banco Central do Brasil - Orçamento Fiscal e ': 'Central Bank of Brazil', 
                     'Ministério da Ciência, Tecnologia, Inovações ': 'Ministry of Science, Technology and Innovations', 
                     'Ministério do Turismo': 'Ministry of Tourism', 
                     'Ministério do Meio Ambiente': 'Ministry of Environment', 
                     'MINISTERIO DE MINAS E ENERGIA': 'Ministry of Mines and Energy', 
                     'Ministério das Comunicações': 'Ministry of Communications', 
                     'Presidência da República': 'Presidency of the Republic', 
                     'Advocacia-Geral da União': 'Advocacy General of the Union', 
                     'Ministério da Cidadania': 'Ministry of Citizenship', 
                     'PRESIDENCIA DA REPUBLICA': 'Presidency of the Republic', 
                     'Ministério das Relações Exteriores': 'Ministry of Foreign Affairs', 
                     'MINISTERIO DA ECONOMIA': 'Ministry of Economy', 
                     'MINISTERIO DAS COMUNICACOES': 'Ministry of Communications', 
                     'Ministério do Trabalho e Emprego': 'Ministry of Labour', 
                     'Ministério do Desenvolvimento Agrário': 'Ministry of Agriculture, Livestock and Supply', 
                     'Ministério da Mulher, Família e Direitos Huma': 'Ministry of Citizenship', 
                     'Controladoria-Geral da União': 'Office of the Comptroller General', 
                     'Justiça Federal': 'Federal Justice', 
                     'MINISTERIO DA SAUDE': 'Ministry of Health', 
                     'Ministério da Pesca e Aquicultura': 'Ministry of Agriculture, Livestock and Supply', 
                     'Justiça do Trabalho': 'Labour Justice', 
                     'MINIST. DA AGRICUL.,PECUARIA E ABASTECIMENTO': 'Ministry of Agriculture, Livestock and Supply', 
                     'Ministério da Previdência Social': 'Ministry of Economy', 
                     'Ministério do Planejamento, Desenvolvimento e': 'Ministry of Economy', 
                     'MINISTERIO DOS TRANSPORTES': 'Ministry of Infrastructure', 
                     'MINISTERIO DO PLANEJAMENTO,DESENV. E GESTÃO': 'Ministry of Economy', 
                     'Ministério das Mulheres, Igualdade Racial, da': 'Ministry of Citizenship'}

eda_df['Ministério'] = eda_df['Nome Órgão Superior'].replace(departments_group)

In [48]:
# Number and values of digital technology contracts over ministries

group_ministry = eda_df.groupby('Ministério')

table_4 = pd.DataFrame(columns=['Ministry', 'Number of digital technology contracts', 
                                'Sum of values of digital technology contracts in R$ MM', 
                                'Average value of digital technlogy contracts in R$ MM', 
                                '% of digital technology contracts', 
                                '% of values of digital technology contracts'])

for ministry, group in group_ministry:
    digital_eda_df = group[group['Classification'] == 'Digital Technology']

    num_digital = len(digital_eda_df)

    sum_digital = (digital_eda_df['Valor Contrato'].sum())/1000000

    perc_num_digital = (num_digital / len(group) * 100)

    perc_sum_digital = (sum_digital / (group['Valor Contrato'].sum()/1000000) * 100)

    avr_digital = sum_digital / num_digital

    table_4 = table_4.append({'Ministry': ministry,
                              'Number of digital technology contracts': num_digital,
                              'Sum of values of digital technology contracts in R$ MM': sum_digital,
                              'Average value of digital technlogy contracts in R$ MM': avr_digital,
                              '% of digital technology contracts': perc_num_digital,
                              '% of values of digital technology contracts': perc_sum_digital},
                             ignore_index=True)

table_4.set_index('Ministry', inplace=True)

table_4 = table_4.round(2)

table_4 = table_4.sort_values('Number of digital technology contracts', ascending=False)

table_4

Unnamed: 0_level_0,Number of digital technology contracts,Sum of values of digital technology contracts in R$ MM,Average value of digital technlogy contracts in R$ MM,% of digital technology contracts,% of values of digital technology contracts
Ministry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ministry of Education,2662,6039.27,2.27,2.0,4.07
Ministry of Defence,1191,2237.76,1.88,1.22,0.45
Ministry of Economy,1129,6281.65,5.56,3.66,10.11
Ministry of Mines and Energy,912,1306.7,1.43,3.45,3.21
Ministry of Health,655,2717.52,4.15,2.02,1.21
Ministry of Infrastructure,425,2337.95,5.5,3.51,1.77
"Ministry of Science, Technology and Innovations",374,516.33,1.38,6.06,7.0
Ministry of Justice and Public Security,320,1528.7,4.78,1.48,5.62
"Ministry of Agriculture, Livestock and Supply",290,675.25,2.33,2.35,5.86
Ministry of Regional Development,270,456.84,1.69,2.81,2.17


In [49]:
# Number and values of digital technology contracts over departments

pd.set_option('display.max_rows', 1000)

group_dep = eda_df.groupby('Nome Órgão')

table_5 = pd.DataFrame(columns=['Department', 'Number of digital technology contracts', 
                                'Sum of values of digital technology contracts in R$ MM', 
                                'Average value of digital technlogy contracts in R$ MM', 
                                '% of digital technology contracts', 
                                '% of values of digital technology contracts'])

for department, group in group_dep:
    digital_eda_df = group[group['Classification'] == 'Digital Technology']

    num_digital = len(digital_eda_df)

    sum_digital = (digital_eda_df['Valor Contrato'].sum())/1000000

    perc_num_digital = (num_digital / len(group) * 100)

    perc_sum_digital = (sum_digital / (group['Valor Contrato'].sum()/1000000) * 100)

    avr_digital = sum_digital / num_digital

    table_5 = table_5.append({'Department': department,
                              'Number of digital technology contracts': num_digital,
                              'Sum of values of digital technology contracts in R$ MM': sum_digital,
                              'Average value of digital technlogy contracts in R$ MM': avr_digital,
                              '% of digital technology contracts': perc_num_digital,
                              '% of values of digital technology contracts': perc_sum_digital},
                             ignore_index=True)

table_5.set_index('Department', inplace=True)

table_5 = table_5.round(2)

table_5 = table_5.sort_values('Number of digital technology contracts', ascending=False)

table_5.head(10)

  avr_digital = sum_digital / num_digital


Unnamed: 0_level_0,Number of digital technology contracts,Sum of values of digital technology contracts in R$ MM,Average value of digital technlogy contracts in R$ MM,% of digital technology contracts,% of values of digital technology contracts
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Comando do Exército,496,576.25,1.16,0.8,0.14
Empresas de Energia,382,399.51,1.05,2.34,1.37
Ministério da Economia - Unidades com vínculo,320,3927.61,12.27,1.81,8.44
Comando da Aeronáutica,247,1152.14,4.66,1.39,3.36
Serviço Federal de Processamento de Dados,240,1540.96,6.42,13.64,30.13
Banco Central do Brasil - Orçamento Fiscal e,235,566.44,2.41,3.74,6.54
Ministério da Saúde - Unidades com vínculo di,233,1203.98,5.17,1.24,1.24
Comando da Marinha,225,241.1,1.07,2.01,0.94
Fundação Oswaldo Cruz,203,980.48,4.83,2.36,0.82
"Ministério da Ciência, Tecnologia, Inovações",185,287.79,1.56,5.05,6.03


In [50]:
# Number and values of digital technology contracts over federal universities

group_ug = eda_df[eda_df['Nome UG'].str.contains('UNIVERSIDADE FEDERAL')]

group_ug = group_ug.groupby('Nome UG')

table_6 = pd.DataFrame(columns=['Management Unit', 'Number of digital technology contracts', 
                                'Sum of values of digital technology contracts in R$ MM', 
                                'Average value of digital technlogy contracts in R$ MM', 
                                '% of digital technology contracts', 
                                '% of values of digital technology contracts'])

for ug, group in group_ug:
    digital_eda_df = group[group['Classification'] == 'Digital Technology']

    num_digital = len(digital_eda_df)

    sum_digital = (digital_eda_df['Valor Contrato'].sum())/1000000

    perc_num_digital = (num_digital / len(group) * 100)

    perc_sum_digital = (sum_digital / (group['Valor Contrato'].sum()/1000000) * 100)

    avr_digital = sum_digital / num_digital

    table_6 = table_6.append({'Management Unit': ug,
                              'Number of digital technology contracts': num_digital,
                              'Sum of values of digital technology contracts in R$ MM': sum_digital,
                              'Average value of digital technlogy contracts in R$ MM': avr_digital,
                              '% of digital technology contracts': perc_num_digital,
                              '% of values of digital technology contracts': perc_sum_digital},
                             ignore_index=True)

table_6.set_index('Management Unit', inplace=True)

table_6 = table_6.round(2)

table_6 = table_6.sort_values('% of values of digital technology contracts', ascending=False)

table_6.head(1000)

  avr_digital = sum_digital / num_digital


Unnamed: 0_level_0,Number of digital technology contracts,Sum of values of digital technology contracts in R$ MM,Average value of digital technlogy contracts in R$ MM,% of digital technology contracts,% of values of digital technology contracts
Management Unit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
UNIVERSIDADE FEDERAL DE CAMPINA GRANDE,9,36.51,4.06,1.6,6.47
UNIVERSIDADE FEDERAL DE RORAIMA,24,13.7,0.57,2.47,5.2
UNIVERSIDADE FEDERAL DE CATALAO,2,2.0,1.0,4.26,4.48
UNIVERSIDADE FEDERAL DO AGRESTE DE PERNAMBUCO,2,0.1,0.05,14.29,3.81
FUNDACAO UNIVERSIDADE FEDERAL DE S.J.DEL-REI,12,10.97,0.91,1.41,2.8
UNIVERSIDADE FEDERAL DE JATAI,2,1.17,0.59,4.55,2.79
UNIVERSIDADE FEDERAL DE RONDONOPOLIS,3,1.25,0.42,6.0,2.73
UNIVERSIDADE FEDERAL DO OESTE DA BAHIA,21,3.48,0.17,7.64,2.45
FUNDACAO UNIVERSIDADE FEDERAL DO ABC,29,7.78,0.27,4.65,1.86
UNIVERSIDADE FEDERAL DA FRONTEIRA SUL,39,6.57,0.17,4.45,1.71


Over suppliers

In [51]:
# Overview of suppliers

digital_eda_df = eda_df[eda_df['Classification'] == 'Digital Technology']

unique_count = digital_eda_df['Nome Contratado'].nunique()
objeto_mean = digital_eda_df['Objeto'].count() / digital_eda_df['Nome Contratado'].nunique()
valor_mean = digital_eda_df['Valor Contrato em MM'].sum() / unique_count

table_71 = pd.DataFrame({
    'Total number of suppliers': [unique_count],
    'Average number of digital technology contracts per supplier': [objeto_mean],
    'Average value of digital technology contracts per supplier in R$ MM': [valor_mean]
})

table_71 = table_71.transpose()

table_71.columns = ['Value']

table_71 = table_71.round(2)

table_71

Unnamed: 0,Value
Total number of suppliers,1939.0
Average number of digital technology contracts per supplier,5.0
Average value of digital technology contracts per supplier in R$ MM,15.4


In [52]:
# Distribution of suppliers per number of contracts

digital_eda_df = eda_df[eda_df['Classification'] == 'Digital Technology']

contract_counts = digital_eda_df['Nome Contratado'].value_counts()

bins = [0, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100, 250]

table_72 = pd.cut(contract_counts, bins=bins).value_counts().sort_index()
table_72.index = pd.IntervalIndex(table_72.index).astype(str)

table_72.index.name = 'Number of digital technology contracts per supplier'
table_72.name = 'Number of suppliers'

table_72 = pd.DataFrame(table_72)

table_72

Unnamed: 0_level_0,Number of suppliers
Number of digital technology contracts per supplier,Unnamed: 1_level_1
"(0, 1]",878
"(1, 2]",355
"(2, 3]",177
"(3, 4]",110
"(4, 5]",58
"(5, 10]",162
"(10, 20]",108
"(20, 30]",46
"(30, 40]",13
"(40, 50]",8


In [53]:
# Distribution of suppliers per sum of value of contracts

digital_eda_df = eda_df[eda_df['Classification'] == 'Digital Technology']

contract_sum_df = digital_eda_df.groupby('Nome Contratado')['Valor Contrato em MM'].sum()
bins = [0, 1, 2, 3, 4, 5, 10, 25, 50, 100, 250, 500, 1000, 5000]

supplier_range = pd.cut(contract_sum_df, bins)
suppliers_by_range = supplier_range.value_counts().sort_index()

table_df = pd.DataFrame({'Number of suppliers': suppliers_by_range})
table_df.index = pd.Series(table_df.index).astype(str)

table_df.index.name = 'Sum of values of digital technology contracts in R$ MM'

table_df

Unnamed: 0_level_0,Number of suppliers
Sum of values of digital technology contracts in R$ MM,Unnamed: 1_level_1
"(0, 1]",1223
"(1, 2]",150
"(2, 3]",93
"(3, 4]",48
"(4, 5]",46
"(5, 10]",116
"(10, 25]",117
"(25, 50]",61
"(50, 100]",44
"(100, 250]",24


In [54]:
# Number and values of digital technology contracts over suppliers - 1

group_sup = eda_df.groupby('Nome Contratado')

table_7 = pd.DataFrame(columns=['Supplier', 'Number of digital technology contracts', 
                                'Sum of values of digital technology contracts in R$ MM', 
                                'Average value of digital technlogy contracts in R$ MM'])

for supplier, group in group_sup:
    digital_eda_df = group[group['Classification'] == 'Digital Technology']

    num_digital = len(digital_eda_df)

    sum_digital = (digital_eda_df['Valor Contrato'].sum())/1000000

    avr_digital = sum_digital / num_digital

    table_7 = table_7.append({'Supplier': supplier,
                              'Number of digital technology contracts': num_digital,
                              'Sum of values of digital technology contracts in R$ MM': sum_digital,
                              'Average value of digital technlogy contracts in R$ MM': avr_digital},
                             ignore_index=True)

table_7.set_index('Supplier', inplace=True)

table_7 = table_7.round(2)

table_7 = table_7.sort_values('Number of digital technology contracts', ascending=False)

table_7.head(10)

  avr_digital = sum_digital / num_digital


Unnamed: 0_level_0,Number of digital technology contracts,Sum of values of digital technology contracts in R$ MM,Average value of digital technlogy contracts in R$ MM
Supplier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BRASOFTWARE INFORMATICA LTDA,208,755.87,3.63
ORACLE DO BRASIL SISTEMAS LTDA,181,480.78,2.66
PRIMASOFT INFORMATICA LTDA.,157,6.62,0.04
MCR SISTEMAS E CONSULTORIA LTDA,149,55.13,0.37
ASSOCIACAO PARANAENSE DE CULTURA - APC,124,3.43,0.03
SERVICO FEDERAL DE PROCESSAMENTO DE DADOS (SERPRO),112,4470.97,39.92
ALLEN RIO SERV. E COM. DE PROD. DE INFORMATICA LTDA,108,285.51,2.64
IMAGEM GEOSISTEMAS E COMERCIO LTDA,100,106.45,1.06
NCT INFORMATICA LTDA,94,80.52,0.86
INGRAM MICRO INFORMATICA LTDA,76,239.47,3.15


In [55]:
# Number and values of digital technology contracts over suppliers - 2

group_sup = eda_df.groupby('Nome Contratado')

table_7 = pd.DataFrame(columns=['Supplier', 'Number of digital technology contracts', 
                                'Sum of values of digital technology contracts in R$ MM', 
                                'Average value of digital technlogy contracts in R$ MM'])

for supplier, group in group_sup:
    digital_eda_df = group[group['Classification'] == 'Digital Technology']

    num_digital = len(digital_eda_df)

    sum_digital = (digital_eda_df['Valor Contrato'].sum())/1000000

    avr_digital = sum_digital / num_digital

    table_7 = table_7.append({'Supplier': supplier,
                              'Number of digital technology contracts': num_digital,
                              'Sum of values of digital technology contracts in R$ MM': sum_digital,
                              'Average value of digital technlogy contracts in R$ MM': avr_digital},
                             ignore_index=True)

table_7.set_index('Supplier', inplace=True)

table_7 = table_7.round(2)

table_7 = table_7.sort_values('Sum of values of digital technology contracts in R$ MM', ascending=False)

table_7 = table_7.head(10)

  avr_digital = sum_digital / num_digital


# Results to latex

In [56]:
# Import dependencies

from tabulate import tabulate
import plotly.express as px

In [57]:
# Table 1: Overview of digital technology contracts.

headers_1 = ["","Value"]

print(tabulate(table_1, headers_1, tablefmt="latex"))

\begin{tabular}{lr}
\hline
                                                        &    Value \\
\hline
 Number of digital technology contracts                 &  9698    \\
 Sum of values of digital technology contracts in R\$ MM & 29853.1  \\
 Average value of digital technlogy contracts in R\$ MM  &     3.08 \\
 \% of digital technology contracts                      &     2.16 \\
 \% of values of digital technology contracts            &     2.38 \\
\hline
\end{tabular}


In [58]:
# Table 2: Distribution of digital technology contract values.

headers_2 = ["Value Range in R$ MM","Number of digital technology contracts"]

print(tabulate(table_11, headers_2, tablefmt="latex"))

\begin{tabular}{rlr}
\hline
    & Value Range in R\$ MM   &   Number of digital technology contracts \\
\hline
  0 & (0.0, 0.5]             &                                     6027 \\
  1 & (0.5, 1.0]             &                                      982 \\
  2 & (1.0, 2.0]             &                                      830 \\
  3 & (2.0, 3.0]             &                                      469 \\
  4 & (3.0, 4.0]             &                                      286 \\
  5 & (4.0, 5.0]             &                                      207 \\
  6 & (5.0, 10.0]            &                                      451 \\
  7 & (10.0, 50.0]           &                                      385 \\
  8 & (50.0, 5000.0]         &                                       61 \\
\hline
\end{tabular}


In [59]:
# Table 3: Digital technology contracts from 2013 to 2022.

headers_3 = ["Year","Number of digital technology contracts",
             "Sum of values of digital technology contracts in R$ MM",
             "Average value of digital technlogy contracts in R$ MM",
            "% of digital technology contracts","% of values of digital technology contracts"]

print(tabulate(table_2, headers_3, tablefmt="latex"))

\begin{tabular}{rrrrrr}
\hline
   Year &   Number of digital technology contracts &   Sum of values of digital technology contracts in R\$ MM &   Average value of digital technlogy contracts in R\$ MM &   \% of digital technology contracts &   \% of values of digital technology contracts \\
\hline
   2013 &                                      962 &                                                  3606.21 &                                                    3.75 &                                2.09 &                                          3.84 \\
   2014 &                                     1007 &                                                  2801.23 &                                                    2.78 &                                1.96 &                                          3.29 \\
   2015 &                                      782 &                                                  2097.81 &                                                    2.68 &                  

In [60]:
# Table 4: Distribution of digital technology contracts over months.

headers_4 = ["Month","Amount of contracts",
             "Sum of values in R$ MM",
             "Average value in R$ MM",
            "% of total contracts","% of total contracts values"]

print(tabulate(table_3, headers_4, tablefmt="latex"))

\begin{tabular}{rrrrrr}
\hline
   Month &   Amount of contracts &   Sum of values in R\$ MM &   Average value in R\$ MM &   \% of total contracts &   \% of total contracts values \\
\hline
       1 &                  1238 &                  4129.92 &                     3.34 &                   2.96 &                          5.2  \\
       2 &                   648 &                  1103.05 &                     1.7  &                   1.85 &                          1.57 \\
       3 &                   639 &                  1766.02 &                     2.76 &                   1.89 &                          2.75 \\
       4 &                   565 &                  1911.65 &                     3.38 &                   1.83 &                          3.28 \\
       5 &                   597 &                  1097.99 &                     1.84 &                   1.79 &                          1.57 \\
       6 &                   588 &                  5910.79 &               

In [61]:
# Table 5: Overview of digital technology contracts per ministry.

headers_5 = ["Ministry","Amount of contracts",
             "Sum of values in R$ MM",
             "Average value in R$ MM",
            "% of total contracts","% of total contracts values"]

print(tabulate(table_4, headers_5, tablefmt="latex_longtable"))

\begin{longtable}{lrrrrr}
\hline
 Ministry                                        &   Amount of contracts &   Sum of values in R\$ MM &   Average value in R\$ MM &   \% of total contracts &   \% of total contracts values \\
\hline
\endhead
 Ministry of Education                           &                  2662 &                  6039.27 &                     2.27 &                   2    &                          4.07 \\
 Ministry of Defence                             &                  1191 &                  2237.76 &                     1.88 &                   1.22 &                          0.45 \\
 Ministry of Economy                             &                  1129 &                  6281.65 &                     5.56 &                   3.66 &                         10.11 \\
 Ministry of Mines and Energy                    &                   912 &                  1306.7  &                     1.43 &                   3.45 &                          3.21 \\
 Ministry of

In [62]:
# Table 6: Digital technology contracts in federal universities.

headers_6 = ["Federal University","Amount of contracts",
             "Sum of values in R$ MM",
             "Average value in R$ MM",
            "% of total contracts","% of total contracts values"]

print(tabulate(table_6, headers_6, tablefmt="latex_longtable"))

\begin{longtable}{lrrrrr}
\hline
 Federal University                            &   Amount of contracts &   Sum of values in R\$ MM &   Average value in R\$ MM &   \% of total contracts &   \% of total contracts values \\
\hline
\endhead
 UNIVERSIDADE FEDERAL DE CAMPINA GRANDE        &                     9 &                    36.51 &                     4.06 &                   1.6  &                          6.47 \\
 UNIVERSIDADE FEDERAL DE RORAIMA               &                    24 &                    13.7  &                     0.57 &                   2.47 &                          5.2  \\
 UNIVERSIDADE FEDERAL DE CATALAO               &                     2 &                     2    &                     1    &                   4.26 &                          4.48 \\
 UNIVERSIDADE FEDERAL DO AGRESTE DE PERNAMBUCO &                     2 &                     0.1  &                     0.05 &                  14.29 &                          3.81 \\
 FUNDACAO UNIVERSIDADE

In [63]:
# Table 7: Overview of suppliers of digital technology contracts.

headers_1 = ["","Value"]

print(tabulate(table_71, headers_1, tablefmt="latex"))

\begin{tabular}{lr}
\hline
                                                                     &   Value \\
\hline
 Total number of suppliers                                           &  1939   \\
 Average number of digital technology contracts per supplier         &     5   \\
 Average value of digital technology contracts per supplier in R\$ MM &    15.4 \\
\hline
\end{tabular}


In [64]:
# Table 8: Distribution of suppliers per number of contracts.

headers_7 = ["Number of digital technology contracts per supplier","Number of suppliers"]

print(tabulate(table_72, headers_7, tablefmt="latex"))

\begin{tabular}{lr}
\hline
 Number of digital technology contracts per supplier   &   Number of suppliers \\
\hline
 (0, 1]                                                &                   878 \\
 (1, 2]                                                &                   355 \\
 (2, 3]                                                &                   177 \\
 (3, 4]                                                &                   110 \\
 (4, 5]                                                &                    58 \\
 (5, 10]                                               &                   162 \\
 (10, 20]                                              &                   108 \\
 (20, 30]                                              &                    46 \\
 (30, 40]                                              &                    13 \\
 (40, 50]                                              &                     8 \\
 (50, 100]                                             &        

In [65]:
# Table 9: Distribution of suppliers per sum of value of contracts.

headers_8 = ["Sum of values of digital technology contracts in R$ MM","Number of suppliers"]

print(tabulate(table_df, headers_8, tablefmt="latex"))

\begin{tabular}{lr}
\hline
 Sum of values of digital technology contracts in R\$ MM   &   Number of suppliers \\
\hline
 (0, 1]                                                   &                  1223 \\
 (1, 2]                                                   &                   150 \\
 (2, 3]                                                   &                    93 \\
 (3, 4]                                                   &                    48 \\
 (4, 5]                                                   &                    46 \\
 (5, 10]                                                  &                   116 \\
 (10, 25]                                                 &                   117 \\
 (25, 50]                                                 &                    61 \\
 (50, 100]                                                &                    44 \\
 (100, 250]                                               &                    24 \\
 (250, 500]                   

In [66]:
# Table 10: Main suppliers per sum of values.

headers_9 = ["Supplier",
             "Amount of contracts",
             "Sum of values in R$ MM",
             "Average value in R$ MM",
             "% of total contracts",
             "% of total contracts values"]

print(tabulate(table_7, headers_8, tablefmt="latex"))

\begin{tabular}{lrrr}
\hline
                                                                    &     &   Sum of values of digital technology contracts in R\$ MM &   Number of suppliers \\
\hline
 SERVICO FEDERAL DE PROCESSAMENTO DE DADOS (SERPRO)                 & 112 &                                                  4470.97 &                 39.92 \\
 EXTREME DIGITAL CONSULTORIA E REPRESENTACOES LTDA                  &  59 &                                                  3549.13 &                 60.15 \\
 EMPRESA DE TECNOLOGIA E INFORMACOES DA PREVIDENCIA - DATAPREV S.A. &  19 &                                                  2222.76 &                116.99 \\
 BRASOFTWARE INFORMATICA LTDA                                       & 208 &                                                   755.87 &                  3.63 \\
 SERPRO - SEDE - BRASILIA                                           &  19 &                                                   731.34 &                 38.49 \\
 IB

In [101]:
# Figure 1: Digital technology contracts over time.

df = eda_df[eda_df['Classification'] == 'Digital Technology']

df = df.groupby("Ano Contrato")['Objeto'].count()

df = pd.DataFrame(df).reset_index()

fig = px.bar(df, x="Ano Contrato", y="Objeto",
             labels={"Objeto": "Number of contracts", "Ano Contrato": "Year"})

fig.update_traces(marker_color='darkred', width=0.5)
fig.update_layout(showlegend=True)
