In [1]:
from collections import OrderedDict

import camelot
import pandas as pd

# Updated Process

## 1. Create empty dataframe with proper column names

In [2]:
# Final dataframe for export: the cleaned rows from each parsed page are added here.

In [3]:
# Schema of the exported table. The frame starts with zero rows and is
# filled once the per-page tables have been cleaned.
export_columns = [
    "Wissenschaftlicher_Name", "BE", "Bestand", "Trend_lang", "Trend_kurz",
    "RF", "BB", "D", "GS", "GfU", "Deutscher Name",
]
df = pd.DataFrame(columns=export_columns)
df

Unnamed: 0,Wissenschaftlicher_Name,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Deutscher Name


In [4]:
# Sanity check: the empty export frame should report 0 rows and 11 columns.
df.shape

(0, 11)

## 2. Set up a mechanism to store the accuracy reports (QA step)

In [5]:
# Collects one camelot parsing report per parsed page (appended in the parsing loop).
accuracy = []

## 3. Dictionary of dataframes, filled page by page during parsing

In [6]:
# Maps page number -> dataframe built from the first table found on that page.
d = {}

## 4. Adding the list of pages to be parsed

In [7]:
# PDF pages to parse: 7 through 10 inclusive (the stop value of range() is
# exclusive). Using range() avoids a hand-written counter loop and does not
# leave a stray loop variable behind in the notebook namespace.
pages = list(range(7, 11))
print(pages)

[7, 8, 9, 10]


## 5. Parsing through the pages

In [8]:
for page_no in pages:
    # Log which page is being processed.
    print(page_no)
    # Read the page with camelot's stream flavor; camelot expects the page
    # selection as a string.
    tables = camelot.read_pdf(
        'rote_liste_libellen_petzold.pdf',
        pages=str(page_no),
        flavor='stream',
        edge_tol=1000,
        row_tol=10,
    )
    if len(tables) == 0:
        # No table detected on this page: store an empty frame so the page
        # still has an entry in `d`, instead of failing on tables[0].
        d[page_no] = pd.DataFrame()
        print(d[page_no].shape)
        continue
    # Only the first table of each page is used.
    first_table = tables[0]
    # Keep the parsing report for the QA step.
    accuracy.append(first_table.parsing_report)
    page_df = first_table.df
    # Keep only rows whose second or third column (positions 1 and 2) is "ex".
    is_ex = (page_df.iloc[:, 1] == "ex") | (page_df.iloc[:, 2] == "ex")
    # .copy() makes the stored frame independent of the full page table, so
    # later cells can add or overwrite columns without pandas
    # chained-assignment warnings.
    d[page_no] = page_df[is_ex].copy()
    # Print the shape to check the column count; if it differs from the
    # other pages, the read_pdf parameters may need changing.
    print(d[page_no].shape)
# Signal that every page has been parsed.
print("Done")

7
(0, 11)
8
(1, 10)
9
(3, 11)
10
(0, 11)
Done


In [9]:
# Review the parsing reports collected in the loop (one dict per parsed page).
accuracy

[{'accuracy': 100.0, 'whitespace': 37.12, 'order': 1, 'page': 7},
 {'accuracy': 100.0, 'whitespace': 23.08, 'order': 1, 'page': 8},
 {'accuracy': 100.0, 'whitespace': 32.17, 'order': 1, 'page': 9},
 {'accuracy': 100.0, 'whitespace': 29.22, 'order': 1, 'page': 10}]

In [10]:
# Inspect the rows kept from page 7.
d[7]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10


In [11]:
# Inspect the rows kept from page 8.
d[8]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
7,"Nehalennia speciosa (CHARPENTIER, 1840)*",0,ex,,,,2,1,§§,"1a, 1c, 2d, 10c, \nZwerglibelle"


In [12]:
# Inspect the rows kept from page 9.
d[9]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
3,"Onychogomphus forcipatus (LINNAEUS, 1758)*",0,ex,,,,3,V,§,"3a, 3b, 5a, 5b,",Kleine Zangenlibelle
7,"Cordulegaster boltonii (DONOVAN, 1807)*",0,ex,,,,3,*,§,"1a, 5b, 11c",Zweigestreifte
19,"Leucorrhinia dubia (VANDER LINDEN, 1825)*",0,ex,,,,2,3,§,"2d, 7e, 10c, 11c",Kleine Moosjungfer


In [13]:
# Inspect the rows kept from page 10.
d[10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10


# Data cleanup steps

In [14]:
# List each page number followed by the shape of its frame, to see whether
# every page produced the same number of columns.
for page_no, frame in d.items():
    print(page_no, frame.shape, sep="\n")

7
(0, 11)
8
(1, 10)
9
(3, 11)
10
(0, 11)


In [15]:
# Drop every page whose frame is empty (no "ex" rows were kept). Selecting the
# keys from the data instead of hard-coding page numbers keeps this cell
# correct when the page range changes, and safe to run more than once.
# The keys are collected first because a dict must not change size while it
# is being iterated.
empty_pages = [page_no for page_no, frame in d.items() if frame.empty]
for page_no in empty_pages:
    del d[page_no]

In [16]:
# Re-list page numbers and frame shapes after the empty frames were removed.
for page_no, frame in d.items():
    print(page_no, frame.shape, sep="\n")

8
(1, 10)
9
(3, 11)


In [17]:
# Page 8 has only 10 columns: its last cell holds two values separated by a newline.
d[8]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
7,"Nehalennia speciosa (CHARPENTIER, 1840)*",0,ex,,,,2,1,§§,"1a, 1c, 2d, 10c, \nZwerglibelle"


In [18]:
# Removing empty dataframes
# del d[64]
# del d[74]

In [19]:
# On page 8 the last two values were extracted into a single cell separated
# by a newline; split that cell into column 9 and a new column 10.
# Work on a copy so the assignment never targets a filtered slice of the
# original page table.
page8 = d[8].copy()
# Skip the split if column 10 already exists (cell was run before): a second
# split would find no newline, yield one column, and fail on assignment.
if 10 not in page8.columns:
    # n=1 splits at the first newline only, so the result always has at most
    # two columns to assign.
    page8[[9, 10]] = page8[9].str.split('\n', n=1, expand=True)
d[8] = page8
d[8]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
7,"Nehalennia speciosa (CHARPENTIER, 1840)*",0,ex,,,,2,1,§§,"1a, 1c, 2d, 10c,",Zwerglibelle


In [20]:
# Confirm that page 8 now has 11 columns.
d[8].shape

(1, 11)

In [21]:
# Build a dictionary ordered by page number so the frames are later
# concatenated in page order. OrderedDict is imported in the notebook's
# import cell at the top. Sorting the keys directly avoids ever comparing
# the dataframes themselves.
d2 = OrderedDict((page_no, d[page_no]) for page_no in sorted(d))

In [22]:
# Print the page numbers in iteration order to confirm the dictionary is sorted.
for page_no in d2.keys():
    print(page_no)

8
9


In [23]:
# Rename the columns of every frame in the sorted dictionary to the column
# names of the export frame `df`, so the schema is defined in one place and
# all frames line up for concatenation.
target_columns = list(df.columns)
for key in d2:
    print(key)
    page_frame = d2[key]
    # Fail with a message naming the page when its column count does not
    # match the export schema (pandas would raise a ValueError here too, but
    # without saying which page is affected).
    if page_frame.shape[1] != len(target_columns):
        raise ValueError(
            f"Page {key} has {page_frame.shape[1]} columns, "
            f"expected {len(target_columns)}"
        )
    page_frame.columns = target_columns
print("Rename completed for concat")

8
9
Rename completed for concat


In [24]:
# Concatenate all non-empty frames from the sorted dictionary onto the empty
# export schema in a single call. df.iloc[:0] keeps only the column layout of
# `df`, so re-running this cell rebuilds the table instead of appending the
# same rows a second time. One concat call also avoids re-copying the growing
# frame on every loop iteration.
df = pd.concat([df.iloc[:0], *d2.values()], axis=0)
df

Unnamed: 0,Wissenschaftlicher_Name,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Deutscher Name
7,"Nehalennia speciosa (CHARPENTIER, 1840)*",0,ex,,,,2,1,§§,"1a, 1c, 2d, 10c,",Zwerglibelle
3,"Onychogomphus forcipatus (LINNAEUS, 1758)*",0,ex,,,,3,V,§,"3a, 3b, 5a, 5b,",Kleine Zangenlibelle
7,"Cordulegaster boltonii (DONOVAN, 1807)*",0,ex,,,,3,*,§,"1a, 5b, 11c",Zweigestreifte
19,"Leucorrhinia dubia (VANDER LINDEN, 1825)*",0,ex,,,,2,3,§,"2d, 7e, 10c, 11c",Kleine Moosjungfer


In [25]:
# Export the combined dataframe as CSV to the updated export location;
# the row index is not written to the file.
csv_target = '../../Transformation/Raw_csv/Odonata.csv'
df.to_csv(csv_target, index=False)