In [1]:
import camelot
import pandas as pd

# Updated Process

## 1. Create empty dataframe with proper column names

In [2]:
# This is where data will be added after cleaning. The final dataframe for export

In [3]:
df = pd.DataFrame(columns = [
    "Species",
    "RL", 
    "Gefährdungsursachen", 
    "Letzter Nachweis in Berlin"
])
df

Unnamed: 0,Species,RL,Gefährdungsursachen,Letzter Nachweis in Berlin


In [4]:
df.shape

(0, 4)

## 2. Setup mechanism to store accuracy report : QA step

In [5]:
accuracy = []

## 3. Dictionary of dataframes on the go

In [6]:
# Dictionary to collect dataframes created from each table
d = {}

## 4. Adding the list of pages to be parsed

In [7]:
pages = []
i = 19
while i < 23:
    pages.append(i)
    i = i + 1
print(pages)

[19, 20, 21, 22]


## 5. Parsing through the pages

In [8]:
for i in pages:
    # print page number
    print (i)
    # create dataframe with page number
    d[i] = pd.DataFrame()
    #read the page
    tables = camelot.read_pdf('rote_liste_wasserkaefer_hendrich_mueller.pdf', 
                              pages= str(i), 
                              flavor='stream', edge_tol=1000, row_tol=10)
    #tables = camelot.read_pdf('rote_liste_wasserkaefer_hendrich_mueller.pdf', pages= str(i), flavor='lattice')
    # add the accuracy report
    accuracy.append(tables[0].parsing_report)
    # assign the first table to a dataframe
    d[i] = tables[0].df
    print(d[i].shape)
# Print Done once all pages are parsed
print ("Done")

19
(28, 3)
20
(45, 4)
21
(45, 4)
22
(45, 4)
Done


In [9]:
tables = camelot.read_pdf('rote_liste_wasserkaefer_hendrich_mueller.pdf', 
                              pages= str(19), 
                              flavor='stream', edge_tol=1000, row_tol=10)
accuracy.append(tables[0].parsing_report)
d[19] = tables[1].df
print(d[19].shape)

(16, 4)


In [10]:
# Check accuracy reports for the process
accuracy

[{'accuracy': 68.9, 'whitespace': 47.62, 'order': 1, 'page': 19},
 {'accuracy': 100.0, 'whitespace': 20.0, 'order': 1, 'page': 20},
 {'accuracy': 100.0, 'whitespace': 18.33, 'order': 1, 'page': 21},
 {'accuracy': 100.0, 'whitespace': 30.0, 'order': 1, 'page': 22},
 {'accuracy': 68.9, 'whitespace': 47.62, 'order': 1, 'page': 19}]

# Data cleanup steps

In [11]:
# Check whether all dataframes are of the same size
for key in d:
    print(key)
    print(d[key].shape)

19
(16, 4)
20
(45, 4)
21
(45, 4)
22
(45, 4)


In [12]:
for key in d:
    d[key] = d[key].drop([0,1], axis=0)

In [13]:
for key in d:
    print(key)
    print(d[key].shape)

19
(14, 4)
20
(43, 4)
21
(43, 4)
22
(43, 4)


In [14]:
d[19] = d[19].drop([2,3,4], axis=0)
d[19].head()

Unnamed: 0,0,1,2,3
5,Hygrobiidae,,,
6,Hygrobia hermanni,0.0,?,"Weißensee, ca. 1900, HORION (1941) (ZMB)"
7,Haliplidae,,,
8,Brychius elevatus,0.0,5b,"Tempelhof, ca. 1900, coll. Weiß (ZMB)"
9,Haliplus confinis,3.0,"12a, 14a","Zehlendorf, Teich am Teltowkanal im Forst Düp-"


In [15]:
# Removing empty dataframes
# del d[64]
# del d[74]

In [16]:
# Creates a sorted dictionary (sorted by key)
from collections import OrderedDict
d2 = OrderedDict(sorted(d.items()))

In [17]:
# Check whether the dictionary is sorted
for key in d2:
    print(key)

19
20
21
22


In [18]:
# Rename all the dataframes in the sorted dictionary
for key in d2:
    print(key)
    d2[key].columns = [
    "Species",
    "RL", 
    "Gefährdungsursachen", 
    "Letzter Nachweis in Berlin"]
print("Rename completed for concat")

19
20
21
22
Rename completed for concat


In [19]:
# Concatanate all the non-empty dataframes in the sorted dictionary into the empty df
for key in d2:
    df = pd.concat([df, d2[key]], axis = 0)
df

Unnamed: 0,Species,RL,Gefährdungsursachen,Letzter Nachweis in Berlin
5,Hygrobiidae,,,
6,Hygrobia hermanni,0,?,"Weißensee, ca. 1900, HORION (1941) (ZMB)"
7,Haliplidae,,,
8,Brychius elevatus,0,5b,"Tempelhof, ca. 1900, coll. Weiß (ZMB)"
9,Haliplus confinis,3,"12a, 14a","Zehlendorf, Teich am Teltowkanal im Forst Düp-"
...,...,...,...,...
40,Dryops anglicanus,2,"2d, 11c","Schmöckwitz, ND Langes Luch 2006, Hendrich leg."
41,Dryops griseus,0,"2d, 11c","Spandau, NSG Teufelsbruch, 1973, Korge leg. (CK)"
42,Dryops nitidulus,0,?,"Lichterfelde, US-Truppenübungsplatz, 1983, Hen-"
43,,,,drich leg.


In [20]:
# Definition of extinction
df2 = df.loc[df["RL"] == "0"]
df2.head()

Unnamed: 0,Species,RL,Gefährdungsursachen,Letzter Nachweis in Berlin
6,Hygrobia hermanni,0,?,"Weißensee, ca. 1900, HORION (1941) (ZMB)"
8,Brychius elevatus,0,5b,"Tempelhof, ca. 1900, coll. Weiß (ZMB)"
12,Agabus biguttatus,0,"5b, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."
13,Agabus clypealis,0,"2d, 11c","Dahlem, NSG Langes Luch, 1987, BH leg."
19,Agabus guttatus,0,"5b, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."


In [21]:
df2.reset_index(inplace = True)
df2.head()

Unnamed: 0,index,Species,RL,Gefährdungsursachen,Letzter Nachweis in Berlin
0,6,Hygrobia hermanni,0,?,"Weißensee, ca. 1900, HORION (1941) (ZMB)"
1,8,Brychius elevatus,0,5b,"Tempelhof, ca. 1900, coll. Weiß (ZMB)"
2,12,Agabus biguttatus,0,"5b, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."
3,13,Agabus clypealis,0,"2d, 11c","Dahlem, NSG Langes Luch, 1987, BH leg."
4,19,Agabus guttatus,0,"5b, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."


In [22]:
df2 = df2.drop(['index'], axis=1)
df2.head()

Unnamed: 0,Species,RL,Gefährdungsursachen,Letzter Nachweis in Berlin
0,Hygrobia hermanni,0,?,"Weißensee, ca. 1900, HORION (1941) (ZMB)"
1,Brychius elevatus,0,5b,"Tempelhof, ca. 1900, coll. Weiß (ZMB)"
2,Agabus biguttatus,0,"5b, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."
3,Agabus clypealis,0,"2d, 11c","Dahlem, NSG Langes Luch, 1987, BH leg."
4,Agabus guttatus,0,"5b, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."


In [23]:
#df3 = df.loc[df["Bestand"] == "ex"]
#df3.reset_index(inplace = True)
#df3 = df3.drop(['index'], axis=1)
#df3.head()

In [24]:
df2.describe()

Unnamed: 0,Species,RL,Gefährdungsursachen,Letzter Nachweis in Berlin
count,31,31,31,31
unique,31,1,13,29
top,Hygrobia hermanni,0,"2d, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."
freq,1,31,10,2


In [25]:
df2

Unnamed: 0,Species,RL,Gefährdungsursachen,Letzter Nachweis in Berlin
0,Hygrobia hermanni,0,?,"Weißensee, ca. 1900, HORION (1941) (ZMB)"
1,Brychius elevatus,0,5b,"Tempelhof, ca. 1900, coll. Weiß (ZMB)"
2,Agabus biguttatus,0,"5b, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."
3,Agabus clypealis,0,"2d, 11c","Dahlem, NSG Langes Luch, 1987, BH leg."
4,Agabus guttatus,0,"5b, 11c","Frohnau, Bieselfließ, 1996, Hendrich leg."
5,Agabus labiatus,0,"1a, 5b","Tegel, Jungfernheide, ca. 1900, coll. Schilsky"
6,Agabus striolatus,0,"2d, 11c","Reinickendorf, ehem. Hermsdorfer See, 1994, BH"
7,Dytiscus latissimus,0,11c,"Tegel, Tegeler See, 1949, Weinhold leg."
8,Dytiscus semisulcatus,0,"1a, 5b","Pankow, Buch, 1926, Beck leg."
9,Graphoderus bilineatus,0,"5b, 11c, 12c","Berlin 10.1.1905, Sammlung Zimmermann (ZSM)"


In [26]:
#df3.describe()

In [27]:
# export the dataframe as CSV
df2.to_csv('../../Transformation/Raw_csv/Coleoptera_Wasserkäfer.csv', index=False)