In [0]:
#Install tabula python wrapper
!pip install tabula-py

In [0]:
from tabula import read_pdf
import pandas as pd
from datetime import date

In [0]:
# Use the url from www.gob.mx
# TODO: check how their cms works, seems like every day they upload 4 pdfs... 
# check file/<number>/ and date at the end

sospechosos ="https://www.gob.mx/cms/uploads/attachment/file/542103/Tabla_casos_sospechosos_COVID-19_2020.03.18.pdf"

In [0]:
# specify the area for page 1 since the column names are not being picked up by tabula
# A primer on PDF coordinate system to pass values to area param
# https://www.leadtools.com/help/leadtools/v19/dh/to/pdf-topics-pdfcoordinatesystem.html
df = read_pdf(sospechosos, area=[90, 0, 792, 792], pages="1")

In [0]:
# Get the rest of the pages in a second data frame 
# TODO: seems like is not reading the first row, maybe because it doesnt have the upper border
df2 = read_pdf(sospechosos, pages="all")

# Cleaning the first data frame

In [153]:
# Tabula outputs unformatted headers...
df[0].head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,de síntomas,Unnamed: 8,PCR en tiempo real,Unnamed: 9,Unnamed: 10,Unnamed: 11,México,Unnamed: 12
0,,,,,,PCR en tiempo real,México,,,,,,,,,
1,1.0,BAJA CALIFORNIA,M,56.0,13/03/2020,Sospechoso,Contacto,05/03/2020,,,,,,,,
2,2.0,SAN LUIS POTOSÍ,F,22.0,11/03/2020,Sospechoso,España,06/03/2020,,,,,,,,
3,3.0,CIUDAD DE MÉXICO,F,26.0,13/03/2020,Sospechoso,España,13/03/2020,,,,,,,,
4,4.0,MÉXICO,M,23.0,03/03/2020,Sospechoso,España,02/03/2020,,,,,,,,


In [121]:
# drop the first row with non usable data
df0 = df[0].iloc[1:] 

# Rename the columns
newcols = {
    'Unnamed: 0': 'No de caso', 
    'Unnamed: 1': 'Estado', 
    'Unnamed: 2': 'Sexo',
    'Unnamed: 3': 'Edad',
    'Unnamed: 4': 'Fecha de Inicio de síntomas',
    'Unnamed: 5': 'Identificación de COVID-19 por RTPCR en tiempo real',
    'Unnamed: 6': 'Procedencia',
    'Unnamed: 7': 'Fecha de llegada a México'
}

# Throws a warning in colab, unfortunatly I'm still not proficient with python...
df0.rename(columns=newcols, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [0]:
# Remove the extra columns:
columns = ['de síntomas', 
           'PCR en tiempo real', 
           'Unnamed: 8',
           'Unnamed: 9', 
           'Unnamed: 10', 
           'Unnamed: 11', 
           'México', 
           'Unnamed: 12']
df0 = df1.drop(labels=columns, axis=1)

In [160]:
df0.head()

Unnamed: 0,No de caso,Estado,Sexo,Edad,Fecha de Inicio de síntomas,Identificación de COVID-19 por RTPCR en tiempo real,Procedencia,Fecha de llegada a México
1,1.0,BAJA CALIFORNIA,M,56.0,13/03/2020,Sospechoso,Contacto,05/03/2020
2,2.0,SAN LUIS POTOSÍ,F,22.0,11/03/2020,Sospechoso,España,06/03/2020
3,3.0,CIUDAD DE MÉXICO,F,26.0,13/03/2020,Sospechoso,España,13/03/2020
4,4.0,MÉXICO,M,23.0,03/03/2020,Sospechoso,España,02/03/2020
5,5.0,COAHUILA,M,21.0,10/03/2020,Sospechoso,España,13/03/2020


## Concatenate with the other data frames

In [157]:
# Data frame 2 should have all pages...
len(df2)

6

In [158]:
## Remove page 1 since we already have it
df2 = df2[1:]
df2[0].head()

Unnamed: 0,51,CIUDAD DE MÉXICO,F,76,15/03/2020,Sospechoso,Contacto,NA
0,52,QUERETARO,M,25,08/03/2020,Sospechoso,España,16/03/2020
1,53,GUERRERO,M,29,15/03/2020,Sospechoso,Contacto,
2,54,HIDALGO,F,72,14/03/2020,Sospechoso,Estados Unidos,14/03/2020
3,55,SONORA,M,44,12/03/2020,Sospechoso,Italia,10/03/2020
4,56,MÉXICO,F,4,04/03/2020,Sospechoso,Estados Unidos,07/03/2020


In [0]:
# Add column headers to data frames
cols = ['No de caso', 
    'Estado', 
    'Sexo',
    'Edad',
    'Fecha de Inicio de síntomas',
    'Identificación de COVID-19 por RTPCR en tiempo real',
    'Procedencia',
    'Fecha de llegada a México'
]

# Initialize list with the cleaned df 0 
dataFrames = [df0]

# Iterate through the dataframes in df2 and add the column headers
# TODO: seems like we need to add the header value too
for dfx in df2:
    dataframe = pd.DataFrame(dfx.values, columns=cols)
    dataFrames.append(dataframe)


In [179]:
# Concatenate the cleaned data frames
# TODO: adjust the index column :/
data = pd.concat(dataFrames)
data

Unnamed: 0,No de caso,Estado,Sexo,Edad,Fecha de Inicio de síntomas,Identificación de COVID-19 por RTPCR en tiempo real,Procedencia,Fecha de llegada a México
1,1,BAJA CALIFORNIA,M,56,13/03/2020,Sospechoso,Contacto,05/03/2020
2,2,SAN LUIS POTOSÍ,F,22,11/03/2020,Sospechoso,España,06/03/2020
3,3,CIUDAD DE MÉXICO,F,26,13/03/2020,Sospechoso,España,13/03/2020
4,4,MÉXICO,M,23,03/03/2020,Sospechoso,España,02/03/2020
5,5,COAHUILA,M,21,10/03/2020,Sospechoso,España,13/03/2020
...,...,...,...,...,...,...,...,...
34,310,MÉXICO,M,33,10/03/2020,Sospechoso,Estados Unidos,10/03/2020
35,311,CIUDAD DE MÉXICO,M,40,09/03/2020,Sospechoso,Italia,08/03/2020
36,312,GUANAJUATO,M,48,15/03/2020,Sospechoso,España,10/03/2020
37,313,MÉXICO,M,15,08/03/2020,Sospechoso,Estados Unidos,08/03/2020


## Export the data set:

In [0]:
today = date.today()
d1 = today.strftime("%Y-%m-%d")
filename = "sospechosos-"+ d1+".csv"
data.to_csv(filename)