In [1]:
import camelot
import pandas as pd

# Updated Process

## 1. Create empty dataframe with proper column names

In [2]:
# The final dataframe for export: data is added to it after cleaning.

In [3]:
# Column names of the final export table.
final_columns = [
    "Species",
    "BE",
    "Bestand",
    "Trend_lang",
    "Trend_kurz",
    "RF",
    "BB",
    "D",
    "GS",
    "GfU",
    "Letzter_Nachweis",
    "Common_name",
]
# Start from an empty frame that carries only the schema; rows are added later.
df = pd.DataFrame(columns=final_columns)
df

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Letzter_Nachweis,Common_name


In [4]:
# Confirm the frame has no rows yet and the expected number of columns
df.shape

(0, 12)

## 2. Set up a mechanism to store the accuracy report (QA step)

In [6]:
# Collects the camelot parsing report of the first table of every parsed page
accuracy = []

## 3. Dictionary of dataframes, filled as pages are parsed

In [7]:
# Dictionary to collect dataframes created from each table, keyed by page number
d = {}

## 4. Adding the list of pages to be parsed

In [8]:
# Inclusive range of PDF pages to parse; raise last_page to process more pages.
first_page = 7
last_page = 7
# range() excludes its upper bound, hence the + 1
pages = list(range(first_page, last_page + 1))
print(pages)

[7]


## 5. Parsing through the pages

In [9]:
# Parse every requested page with camelot; keep the first table of each page
# in `d` (keyed by page number) and its parsing report in `accuracy`.
pdf_path = 'rote_liste_amphibien_kuehnel_et_al.pdf'
for page in pages:
    # print page number
    print(page)
    # read the page; camelot takes the page selection as a string
    tables = camelot.read_pdf(pdf_path,
                              pages=str(page),
                              flavor='stream', edge_tol=1000, row_tol=10)
    # A page without any detected table would make tables[0] raise IndexError
    # and abort the whole run, so report it and move on instead.
    if len(tables) == 0:
        print("No table found on page", page, "- skipped")
        continue
    first_table = tables[0]
    # add the accuracy report
    accuracy.append(first_table.parsing_report)
    # store the first table as this page's dataframe
    d[page] = first_table.df
    print(d[page].shape)
# Print Done once all pages are parsed
print("Done")

7
(28, 12)
Done


In [10]:
# Check accuracy reports for the process (one entry is appended per parsed page)
accuracy

[{'accuracy': 99.77, 'whitespace': 47.02, 'order': 1, 'page': 7}]

# Data cleanup steps

In [11]:
# Print each page number followed by the shape of its dataframe,
# to check that all dataframes are of the same size
for page, frame in d.items():
    print(page)
    print(frame.shape)

7
(28, 12)


In [12]:
# Removing empty dataframes
# del d[64]
# del d[74]

In [13]:
# Rebuild the page -> dataframe mapping with its keys in ascending order
from collections import OrderedDict
d2 = OrderedDict((page, d[page]) for page in sorted(d))

In [14]:
# Print the keys of d2 in iteration order to confirm they are sorted
for page in d2.keys():
    print(page)

7


In [15]:
# Give every dataframe in the sorted dictionary the column names of the export table
column_names = [
    "Species",
    "BE",
    "Bestand",
    "Trend_lang",
    "Trend_kurz",
    "RF",
    "BB",
    "D",
    "GS",
    "GfU",
    "Letzter_Nachweis",
    "Common_name",
]
for page, frame in d2.items():
    print(page)
    # raises if a table does not have exactly as many columns as names
    frame.columns = column_names
print("Rename completed for concat")

7
Rename completed for concat


In [16]:
# Concatenate all the dataframes in the sorted dictionary onto the column schema of df.
# df.iloc[:0] keeps only the columns, so re-running this cell rebuilds the table
# instead of appending the same pages a second time; a single concat call also
# avoids re-copying the growing frame once per page.
# Row labels are kept as delivered by each page's table (no renumbering).
df = pd.concat([df.iloc[:0]] + list(d2.values()), axis=0)
df

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Letzter_Nachweis,Common_name
0,Tabelle 3: Rote Liste und Gesamtartenliste der...,,,,,,,,,,,
1,Wissenschaftlicher Name,BE,Bestand,Trend,Trend,RF,BB,D,GS,GfU,Letzter,Deutscher Name
2,,,,lang,kurz,,,,,,Nachweis,
3,Schwanzlurche (Urodela),,,,,,,,,,,
4,"Ichthyosaura alpestris (LAURENTI, 1768)",♦,nb,,,,2,*,§,"10c, 11c, 14l",,Bergmolch
5,"Lissotriton vulgaris (LINNAEUS, 1758)",*,sh,<<,=,=,**,*,§,"1a, 2a, 6e, 8a, 9a, 10c,",,Teichmolch
6,,,,,,,,,,11c,,
7,"Triturus cristatus (LAURENTI, 1768)",2,s,<<,(cid:114)(cid:114)(cid:3),=,3,V,"§§, II,","1a, 2a, 6e, 8a 9a, 10c,",,"Nördlicher Kammmolch,"
8,,,,,,,,,IV,11c,,Kammmolch
9,Froschlurche (ANURA),,,,,,,,,,,


In [17]:
# Definition of extinction: rows whose "BE" value is the string "0"
# NOTE(review): cells that wrap onto a second line in the PDF show up as separate
# rows with an empty "BE", so such continuation rows are not selected here --
# confirm whether they need to be merged into their parent row first.
is_extinct = df["BE"] == "0"
df2 = df.loc[is_extinct]
df2.head()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Letzter_Nachweis,Common_name
19,"Hyla arborea (LINNAEUS, 1758)",0,ex,,,,2,3,"§§, IV","1a, 2a, 4a, 6a,10c, 11c,",vor 1960,Mitteleuropäischer
22,"Pelophylax lessonae (CAMERANAO, 1782)",0,ex,,,,G,G,"§, IV","2d, 10c, 11c, 12b",1991,Kleiner Wasserfrosch


In [18]:
# Move the old row labels into a regular "index" column and renumber the rows from 0.
# Run this cell only once per selection: each run inserts one more column.
df2 = df2.reset_index()
df2.head()

Unnamed: 0,index,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Letzter_Nachweis,Common_name
0,19,"Hyla arborea (LINNAEUS, 1758)",0,ex,,,,2,3,"§§, IV","1a, 2a, 4a, 6a,10c, 11c,",vor 1960,Mitteleuropäischer
1,22,"Pelophylax lessonae (CAMERANAO, 1782)",0,ex,,,,G,G,"§, IV","2d, 10c, 11c, 12b",1991,Kleiner Wasserfrosch


In [19]:
# Remove the "index" column that holds the old row labels
df2 = df2.drop(columns=['index'])
df2.head()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Letzter_Nachweis,Common_name
0,"Hyla arborea (LINNAEUS, 1758)",0,ex,,,,2,3,"§§, IV","1a, 2a, 4a, 6a,10c, 11c,",vor 1960,Mitteleuropäischer
1,"Pelophylax lessonae (CAMERANAO, 1782)",0,ex,,,,G,G,"§, IV","2d, 10c, 11c, 12b",1991,Kleiner Wasserfrosch


In [20]:
# Select the rows whose "Bestand" value is "ex"
# (presumably a cross-check of the BE == "0" selection -- confirm).
# reset_index(drop=True) renumbers the rows from 0 and discards the old labels
# in one step, without modifying a filtered slice in place.
df3 = df.loc[df["Bestand"] == "ex"].reset_index(drop=True)
df3.head()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Letzter_Nachweis,Common_name
0,"Hyla arborea (LINNAEUS, 1758)",0,ex,,,,2,3,"§§, IV","1a, 2a, 4a, 6a,10c, 11c,",vor 1960,Mitteleuropäischer
1,"Pelophylax lessonae (CAMERANAO, 1782)",0,ex,,,,G,G,"§, IV","2d, 10c, 11c, 12b",1991,Kleiner Wasserfrosch


In [21]:
# Summary statistics (count / unique / top / freq) of the BE == "0" selection
df2.describe()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Letzter_Nachweis,Common_name
count,2,2,2,2.0,2.0,2.0,2,2,2,2,2,2
unique,2,1,1,1.0,1.0,1.0,2,2,2,2,2,2
top,"Hyla arborea (LINNAEUS, 1758)",0,ex,,,,2,3,"§§, IV","1a, 2a, 4a, 6a,10c, 11c,",vor 1960,Mitteleuropäischer
freq,1,2,2,2.0,2.0,2.0,1,1,1,1,1,1


In [22]:
# Summary statistics (count / unique / top / freq) of the Bestand == "ex" selection
df3.describe()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,BB,D,GS,GfU,Letzter_Nachweis,Common_name
count,2,2,2,2.0,2.0,2.0,2,2,2,2,2,2
unique,2,1,1,1.0,1.0,1.0,2,2,2,2,2,2
top,"Hyla arborea (LINNAEUS, 1758)",0,ex,,,,2,3,"§§, IV","1a, 2a, 4a, 6a,10c, 11c,",vor 1960,Mitteleuropäischer
freq,1,2,2,2.0,2.0,2.0,1,1,1,1,1,1


In [24]:
# Export the BE == "0" selection (df2) as CSV; index=False leaves out the row labels
df2.to_csv('Amphibia.csv', index=False)