In [1]:
# Install tabula-py, the Python wrapper that provides the read_pdf function used below.
# NOTE(review): no version is pinned, so a later run may pick up a different release.
!pip install tabula-py



In [0]:
from tabula import read_pdf
import pandas as pd
from datetime import date

In [0]:
# Source PDF published on www.gob.mx (table of suspected COVID-19 cases).
# TODO: find out how their CMS names uploads; it looks like four PDFs are
# published every day. Check the file/<number>/ path segment and the date
# at the end of the file name.

sospechosos = (
    "https://www.gob.mx/cms/uploads/attachment/file/542965/"
    "Tabla_casos_sospechosos_COVID-19_2020.03.23.pdf"
)

In [0]:
# Page 1 is read on its own with an explicit extraction area, because tabula
# does not pick up the column names on that page.
# Primer on the PDF coordinate system used for the `area` values:
# https://www.leadtools.com/help/leadtools/v19/dh/to/pdf-topics-pdfcoordinatesystem.html
df = read_pdf(
    sospechosos,
    pages="1",
    # Original note: "92-99 depending on their format" — presumably the first
    # value (100 here) has to be tuned when the PDF layout changes; confirm.
    area=[100, 0, 792, 792],
)

In [0]:
# Read every page of the document into a second result (page 1 included).
# TODO: the first row of each table does not come back as data, maybe
# because it doesn't have the upper border.
df2 = read_pdf(
    sospechosos,
    pages="all",
)

# Cleaning the first data frame

In [6]:
# Preview the page-1 table as parsed by tabula: the real headers were not
# detected, so every column carries a placeholder "Unnamed: N" name.
df[0].head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,1,SAN LUIS POTOSÍ,RIO VERDE,F,22,11/03/2020,Sospechoso,España,06/03/2020,
1,2,PUEBLA,PUEBLA,M,49,15/03/2020,Sospechoso,Contacto,,
2,3,SAN LUIS POTOSÍ,SAN LUIS POTOSI,M,25,13/03/2020,Sospechoso,Contacto,,
3,4,GUERRERO,ZONA NORTE,M,29,15/03/2020,Sospechoso,Contacto,,
4,5,HIDALGO,TULANCINGO,F,72,14/03/2020,Sospechoso,Estados Unidos,14/03/2020,


In [0]:
# Map tabula's placeholder column names to the real headers of the PDF table.
# 'Unnamed: 9' is deliberately not listed: it is an extra column that gets
# removed in a later step.
newcols = {
    'Unnamed: 0': 'No de caso', 
    'Unnamed: 1': 'Estado', 
    'Unnamed: 2': 'Localidad',
    'Unnamed: 3': 'Sexo',
    'Unnamed: 4': 'Edad',
    'Unnamed: 5': 'Fecha de Inicio de síntomas',
    'Unnamed: 6': 'Identificación de COVID-19 por RTPCR en tiempo real',
    'Unnamed: 7': 'Procedencia',
    'Unnamed: 8': 'Fecha de llegada a México'
}

# Take the single table extracted from page 1 and rename its columns.
# rename() without inplace returns a new frame, so the table stored in `df`
# keeps its original labels and this cell gives the same result on every run.
# (The previous in-place rename on the aliased df[0] raised a warning in Colab.)
df0 = df[0].rename(columns=newcols)

In [8]:
# Preview the page-1 frame after the rename; 'Unnamed: 9' is still present here.
df0.head()

Unnamed: 0,No de caso,Estado,Localidad,Sexo,Edad,Fecha de Inicio de síntomas,Identificación de COVID-19 por RTPCR en tiempo real,Procedencia,Fecha de llegada a México,Unnamed: 9
0,1,SAN LUIS POTOSÍ,RIO VERDE,F,22,11/03/2020,Sospechoso,España,06/03/2020,
1,2,PUEBLA,PUEBLA,M,49,15/03/2020,Sospechoso,Contacto,,
2,3,SAN LUIS POTOSÍ,SAN LUIS POTOSI,M,25,13/03/2020,Sospechoso,Contacto,,
3,4,GUERRERO,ZONA NORTE,M,29,15/03/2020,Sospechoso,Contacto,,
4,5,HIDALGO,TULANCINGO,F,72,14/03/2020,Sospechoso,Estados Unidos,14/03/2020,


In [0]:
# Remove the extra trailing column that tabula produced for page 1
# (it holds no values in the preview of the frame).
columns = ['Unnamed: 9']
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second run is a no-op instead of raising KeyError.
df0 = df0.drop(columns=columns, errors='ignore')

In [11]:
# Preview the cleaned page-1 frame: only the nine named columns remain.
df0.head()

Unnamed: 0,No de caso,Estado,Localidad,Sexo,Edad,Fecha de Inicio de síntomas,Identificación de COVID-19 por RTPCR en tiempo real,Procedencia,Fecha de llegada a México
0,1,SAN LUIS POTOSÍ,RIO VERDE,F,22,11/03/2020,Sospechoso,España,06/03/2020
1,2,PUEBLA,PUEBLA,M,49,15/03/2020,Sospechoso,Contacto,
2,3,SAN LUIS POTOSÍ,SAN LUIS POTOSI,M,25,13/03/2020,Sospechoso,Contacto,
3,4,GUERRERO,ZONA NORTE,M,29,15/03/2020,Sospechoso,Contacto,
4,5,HIDALGO,TULANCINGO,F,72,14/03/2020,Sospechoso,Estados Unidos,14/03/2020


## Concatenate with the other data frames

In [12]:
# df2 was read with pages="all", so it should hold one entry per page
# (page 1 included); show how many entries came back.
len(df2)

16

In [0]:
# Drop the first entry (page 1), since that page is already covered by df0.
# NOTE(review): df2 is reassigned from itself, so every extra run of this cell
# removes one more page; re-run the read_pdf cell before repeating it.
df2 = df2[1:]
# Preview what is now the first remaining entry
df2[0].head()

In [0]:
# Add column headers to data frames
# Canonical column names, in the order the columns appear in the PDF table.
cols = ['No de caso', 
    'Estado', 
    'Localidad', 
    'Sexo',
    'Edad',
    'Fecha de Inicio de síntomas',
    'Identificación de COVID-19 por RTPCR en tiempo real',
    'Procedencia',
    'Fecha de llegada a México'
]


def _header_as_row(header, names):
    """Rebuild the data row that was parsed as a table header.

    On the remaining pages the first data row ends up as the column labels
    of the frame. Empty cells in that row show up as placeholder labels of
    the form "Unnamed: N" (the same placeholders seen on page 1), which
    must not be copied into the data as if they were real values.

    Args:
        header: the column labels of one parsed frame.
        names: the real column names, positionally aligned with ``header``.

    Returns:
        dict mapping each real column name to the recovered cell value, with
        placeholder labels replaced by None (a missing value).
    """
    row = {}
    for name, value in zip(names, header):
        is_placeholder = isinstance(value, str) and value.startswith('Unnamed: ')
        row[name] = None if is_placeholder else value
    return row


# Initialize list with the cleaned df 0 
dataFrames = [df0]

# Iterate through the dataframes in df2 and add the column headers
for dfx in df2:
    # The labels of dfx are really its first data row: turn them back into a
    # one-row frame under the proper column names.
    header_row = pd.DataFrame([_header_as_row(dfx.columns, cols)], columns=cols)
    # Re-label the remaining rows positionally with the proper column names.
    # NOTE(review): this raises ValueError if a page yields a different
    # number of columns than `cols` has.
    body = pd.DataFrame(dfx.values, columns=cols)
    # Recovered row first, then the rest of the page; collect for the final concat.
    dataFrames.append(pd.concat([header_row, body]))


In [0]:
# Stack the cleaned page-1 frame and the frames built from the remaining pages
# into a single table; ignore_index=True renumbers the rows from 0.
data = pd.concat(dataFrames,ignore_index=True)
# Display the combined table
data

## Export the data set:

In [0]:
# Write the combined table to a CSV named after the day the notebook is run,
# formatted as sospechosos-YYYY-MM-DD.csv.
# NOTE(review): this is the run date, not the report date embedded in the PDF URL.
today = date.today()
d1 = f"{today:%Y-%m-%d}"
filename = f"sospechosos-{d1}.csv"
data.to_csv(filename)