## Conversão e extração dos dados das imagens

#### Importação das dependências

In [1]:
import re
import pandas as pd
from google.cloud import vision

#### Constantes

In [2]:
# Date such as 29/06/2020 (two- to four-digit year). One or two slashes are
# accepted between the parts, which is the same language the earlier
# `\/?\/` spelling matched.
REGEX_DATA = r"([0-9]{2}/{1,2}[0-9]{2}/{1,2}[0-9]{2,4})"

# Counter patterns: locate the line that starts with the label and capture the
# token that ends that same line (digits, or the letter "o" that is later
# replaced by 0).
#   * `\b` + `+` require a real, non-empty token that starts at a word
#     boundary, so the final "o" of the label itself (e.g. "Investigação\n")
#     is no longer captured as if it were a value.
#   * `|` was removed from the character class: inside `[...]` it is a literal
#     pipe, not an alternation operator.
REGEX_INVESTIGADO = r"\nInvestiga.*?\b([o0-9]+)\n"
REGEX_DESCARTADO = r"\nDescartado.*?\b([o0-9]+)\n"
REGEX_CONFIRMADO = r"\nConfirmado.*?\b([o0-9]+)\n"

#### Lendo DataSet de Boletins

In [3]:
# Load the bulletin posts; each row carries the post id and the image URL
# that the OCR step below reads.
posts_csv_path = "../dataset/posts_boletins_refine.csv"
df_posts = pd.read_csv(posts_csv_path)
df_posts.head()

Unnamed: 0.1,Unnamed: 0,post_id,source,date,text,image,url
0,0,962170830896016,Prefeitura de Icém,"[datetime.datetime(2020, 6, 29, 17, 0, 11)]",🚨Confira o Boletim Informativo🚨 de Icém refere...,https://scontent.fcgh7-1.fna.fbcdn.net/v/t1.0-...,/story.php?story_fbid=962170830896016&id=23449...
1,1,961495100963589,Prefeitura de Icém,"[datetime.datetime(2020, 6, 28, 17, 0, 15)]",🚨Confira o Boletim Informativo🚨 de Icém refere...,https://scontent.fcgh7-1.fna.fbcdn.net/v/t1.0-...,/story.php?story_fbid=961495100963589&id=23449...
2,3,960726124373820,Prefeitura de Icém,"[datetime.datetime(2020, 6, 27, 17, 32, 15)]",🚨Confira o Boletim Informativo🚨 de Icém refere...,https://scontent.fcgh7-1.fna.fbcdn.net/v/t1.0-...,/story.php?story_fbid=960726124373820&id=23449...
3,5,959994087780357,Prefeitura de Icém,"[datetime.datetime(2020, 6, 26, 17, 17, 51)]",🚨Confira o Boletim Informativo🚨 de Icém refere...,https://scontent.fcgh7-1.fna.fbcdn.net/v/t1.0-...,/story.php?story_fbid=959994087780357&id=23449...
4,6,959277597852006,Prefeitura de Icém,"[datetime.datetime(2020, 6, 25, 17, 0, 19)]",🚨Confira o Boletim Informativo🚨 de Icém refere...,https://scontent.fcgh7-1.fna.fbcdn.net/v/t1.0-...,/story.php?story_fbid=959277597852006&id=23449...


#### Extraindo texto das imagens

In [12]:
def detect_text_uri(uri, client=None):
    """Detects text in the file located in Google Cloud Storage or on the Web.

    Args:
        uri: Location of the image; assigned to ``image.source.image_uri``.
        client: Optional ``vision.ImageAnnotatorClient`` to reuse across
            calls. When omitted, a new client is created for this call.

    Returns:
        The ``description`` of the first text annotation in the response, or
        an empty string when the response contains no text annotations.

    Raises:
        Exception: If the response carries an error message.
    """
    if client is None:
        client = vision.ImageAnnotatorClient()

    # NOTE(review): `vision.types` appears to be the pre-2.0 client API;
    # confirm the pinned google-cloud-vision version before upgrading.
    image = vision.types.Image()
    image.source.image_uri = uri

    response = client.text_detection(
        image=image, image_context={"language_hints": ["pt_BR",]},
    )
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    annotations = response.text_annotations
    if len(annotations) == 0:
        # No text found in the image: return an empty transcript instead of
        # failing with IndexError on annotations[0].
        return ""
    return annotations[0].description

In [13]:
# Run OCR on every bulletin image; `words` keeps one transcript per post,
# in the same order as the rows of df_posts.
words = []
for post_id, image_url in zip(df_posts["post_id"], df_posts["image"]):
    print(f"Processando {post_id}")
    transcript = detect_text_uri(image_url)
    words.append(transcript)

Processando 962170830896016
Processando 961495100963589
Processando 960726124373820
Processando 959994087780357
Processando 959277597852006
Processando 957834814662951
Processando 957074851405614
Processando 956420174804415
Processando 955575804888852
Processando 954818228297943
Processando 954101798369586
Processando 953440648435701
Processando 953350978444668
Processando 952634735182959
Processando 951911955255237
Processando 951201068659659
Processando 950399455406487
Processando 949655008814265
Processando 948927698886996
Processando 948225618957204
Processando 947524912360608
Processando 946797012433398
Processando 945961455850287
Processando 945164655929967
Processando 944375739342192
Processando 943599962753103
Processando 942869009492865
Processando 942045649575201
Processando 941331232979976
Processando 940632983049801
Processando 939895953123504
Processando 938486406597792
Processando 937799159999850
Processando 937118353401264
Processando 936438846802548
Processando 93574464

#### Gerando dataset com dados de: Data, Investigação, Descartados, Confirmados

In [6]:
def extract_text_by_regex(string):
    """Pull the bulletin fields out of one block of text.

    Each field is searched with its module-level REGEX_* pattern. The value
    stored is the first capture group of the first match, or an empty string
    when the pattern does not match.

    Args:
        string: Text to search.

    Returns:
        dict with the keys "data", "investigado", "descartado" and
        "confirmado", in that order.
    """
    field_patterns = (
        ("data", REGEX_DATA),
        ("investigado", REGEX_INVESTIGADO),
        ("descartado", REGEX_DESCARTADO),
        ("confirmado", REGEX_CONFIRMADO),
    )

    extracted = {}
    for field, pattern in field_patterns:
        found = re.search(pattern, string, flags=re.MULTILINE)
        extracted[field] = found.group(1) if found else ""
    return extracted

In [7]:
# Turn every OCR transcript into one record of extracted fields, then build
# the table in a single step.
extracted_rows = list(map(extract_text_by_regex, words))
field_names = ["data", "investigado", "descartado", "confirmado"]
data = pd.DataFrame(extracted_rows, columns=field_names)
data.head()

Unnamed: 0,data,investigado,descartado,confirmado
0,29/06/2020,01,27,3.0
1,28/06/2020,02,24,3.0
2,27/06/2020,02,24,3.0
3,26/06/2020,02,24,
4,25/06/2020,o,23,2.0


In [8]:
# Cells whose whole value is the letter "o" become the number 0
# (presumably an OCR misread of zero -- confirm against the source images).
data = data.replace(to_replace="o", value=0)
data.to_csv(path_or_buf="../dataset/dados_covid_trusted.csv")
# NOTE: the dataset needed manual corrections after this export.