In [1]:
import camelot
import pandas as pd

# Updated Process

## 1. Create empty dataframe with proper column names

In [2]:
# The dataframe created in the next cell is where the data is added after cleaning; it is the final dataframe for export.

In [3]:
# Column layout of the final table. The frame starts out with no rows;
# the cleaned per-page tables are appended to it later in the notebook.
column_names = [
    "Species",
    "BE",
    "Bestand",
    "Trend_lang",
    "Trend_kurz",
    "RF",
    "D",
    "GfU",
    "Letzter_Nachweis",
]
df = pd.DataFrame(columns=column_names)
df

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,D,GfU,Letzter_Nachweis


In [4]:
# Sanity check: the frame should have no rows yet and one column per field.
df.shape

(0, 9)

## 2. Set up a mechanism to store the accuracy reports (QA step)

In [5]:
# Parsing reports are appended here, one per parsed table, for the QA check.
accuracy = list()

## 3. Dictionary to collect the dataframes as pages are parsed

In [6]:
# Maps a page number to the dataframe extracted from that page.
d = dict()

## 4. Adding the list of pages to be parsed

In [7]:
# Inclusive range of PDF pages to parse. range() excludes its stop value,
# hence the "+ 1" so that LAST_PAGE itself is included.
FIRST_PAGE = 7
LAST_PAGE = 8
pages = list(range(FIRST_PAGE, LAST_PAGE + 1))
print(pages)

[7, 8]


## 5. Parsing through the pages

In [8]:
# Parse every requested page with camelot's "stream" flavor. For each page the
# first detected table is stored in d[page] and its parsing report is appended
# to `accuracy` for the QA step.
pdf_path = 'rote_liste_raubfliegen_degen.pdf'
for i in pages:
    # print page number
    print(i)
    # Placeholder for this page: it stays in d[i] if reading the page fails
    # or if no table is detected, so the page remains visible in later checks.
    d[i] = pd.DataFrame()
    # read the page (flavor='lattice' is the alternative parser; 'stream' is used here)
    tables = camelot.read_pdf(pdf_path,
                              pages=str(i),
                              flavor='stream', edge_tol=1000, row_tol=10)
    if len(tables) == 0:
        # No table detected: tables[0] would raise IndexError and abort the
        # whole run. Keep the empty placeholder and move on to the next page.
        # NOTE(review): an empty placeholder has no columns, so it presumably
        # has to be removed from d before the rename/concat steps - confirm.
        print(d[i].shape)
        continue
    first_table = tables[0]
    # add the accuracy report
    accuracy.append(first_table.parsing_report)
    # assign the first table to a dataframe
    d[i] = first_table.df
    print(d[i].shape)
# Print Done once all pages are parsed
print("Done")

7
(26, 9)
8
(20, 9)
Done


In [9]:
# Check accuracy reports for the process: display the parsing reports
# collected while the pages were parsed (one dict per stored table).
accuracy

[{'accuracy': 99.7, 'whitespace': 28.21, 'order': 1, 'page': 7},
 {'accuracy': 99.35, 'whitespace': 22.78, 'order': 1, 'page': 8}]

In [10]:
# Inspect the raw table parsed from page 8. Rows 0 and 1 hold the column
# header, which the PDF splits over two lines (e.g. "Trend" / "lang").
d[8]

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Wissenschaftlicher Name,BE,Bestand,Trend,Trend,RF,D,GfU,Letzter
1,,,,lang,kurz,,,,Nachweis
2,"Leptogaster guttiventris ZETTERSTEDT, 1842",*,ss,=,=,=,*,,
3,"Leptogaster subtilis LOEW, 1847",*,ss,>,(cid:113)(cid:3),=,*,,
4,"Machimus arthriticus (ZELLER, 1840)",1,es,<<,(cid:114)(cid:114)(cid:3),=,*,,
5,"Machimus chrysitis (MEIGEN, 1820)",*,ss,=,(cid:113)(cid:3),=,2,,
6,"Machimus gonatistes (ZELLER, 1840)",1,es,=,((cid:114)),=,2,,
7,"Machimus rusticus (MEIGEN, 1820)",*,ss,=,=,=,V,,
8,"Neoepitriptus setosulus (ZELLER, 1840)",*,s,=,(cid:113)(cid:3),=,3,,
9,"Neoitamus cothurnatus (MEIGEN, 1820)*",*,ss,?,(cid:113)(cid:3),=,G,,


# Data cleanup steps

In [11]:
# Drop the first two rows (labels 0 and 1) of the page-7 table - presumably the
# same two-line column header that the raw page-8 preview shows.
# errors='ignore' keeps this cell re-runnable: once the rows are gone, a second
# run is a no-op instead of raising KeyError for the missing labels.
d[7] = d[7].drop([0, 1], axis=0, errors='ignore')
d[7]

Unnamed: 0,0,1,2,3,4,5,6,7,8
2,"Andrenosoma atrum (LINNAEUS, 1758)",0,ex,,,,2,,1904
3,"Antipalus varipes (MEIGEN, 1820)",*,h,=,=,=,V,,
4,"Asilus crabroniformis LINNAEUS, 1758",0,ex,,,,2,,1936
5,"Choerades femorata (MEIGEN, 1804)",*,ss,=,=,=,*,,
6,"Choerades gilva (LINNAEUS, 1758)",0,ex,,,,G,,1958
7,"Choerades ignea (MEIGEN, 1820)",1,es,<<,(cid:114)(cid:114)(cid:3),=,V,12b,
8,"Choerades marginata (LINNAEUS, 1758)",*,s,=,=,=,*,,
9,"Dasypogon diadema (FABRICIUS, 1781)",V,s,=,((cid:114)),=,2,"7a, 14e",
10,"Dioctria atricapilla MEIGEN, 1804",*,mh,=,=,=,*,,
11,"Dioctria cothurnata MEIGEN, 1820",0,ex,,,,*,,1951


In [12]:
# Drop rows 0 and 1 of the page-8 table: they hold the two-line column header
# that was parsed as data.
# errors='ignore' keeps this cell re-runnable: once the rows are gone, a second
# run is a no-op instead of raising KeyError for the missing labels.
d[8] = d[8].drop([0, 1], axis=0, errors='ignore')
d[8]

Unnamed: 0,0,1,2,3,4,5,6,7,8
2,"Leptogaster guttiventris ZETTERSTEDT, 1842",*,ss,=,=,=,*,,
3,"Leptogaster subtilis LOEW, 1847",*,ss,>,(cid:113)(cid:3),=,*,,
4,"Machimus arthriticus (ZELLER, 1840)",1,es,<<,(cid:114)(cid:114)(cid:3),=,*,,
5,"Machimus chrysitis (MEIGEN, 1820)",*,ss,=,(cid:113)(cid:3),=,2,,
6,"Machimus gonatistes (ZELLER, 1840)",1,es,=,((cid:114)),=,2,,
7,"Machimus rusticus (MEIGEN, 1820)",*,ss,=,=,=,V,,
8,"Neoepitriptus setosulus (ZELLER, 1840)",*,s,=,(cid:113)(cid:3),=,3,,
9,"Neoitamus cothurnatus (MEIGEN, 1820)*",*,ss,?,(cid:113)(cid:3),=,G,,
10,"Neoitamus cyanurus (LOEW, 1849)",*,mh,=,=,=,*,,
11,"Neoitamus socius (LOEW, 1871)",1,es,<<,?,=,*,,


In [13]:
# Print each page number followed by the (rows, columns) shape of its table,
# so pages whose column count differs from the others stand out.
for key, page_table in d.items():
    print(key)
    print(page_table.shape)

7
(24, 9)
8
(18, 9)


In [14]:
# Removing empty dataframes (not needed here: pages 7 and 8 both produced non-empty tables)
# del d[64]
# del d[74]

In [15]:
# Build an ordered copy of the dictionary whose entries follow ascending page
# number. The dataframes themselves are shared with d, not copied.
from collections import OrderedDict
d2 = OrderedDict((page, d[page]) for page in sorted(d))

In [16]:
# Confirm the iteration order of d2: keys should come out in ascending page order.
for key in d2.keys():
    print(key)

7
8


In [17]:
# Give every per-page table the final column names so the frames line up
# column-for-column when they are concatenated.
column_names = [
    "Species",
    "BE",
    "Bestand",
    "Trend_lang",
    "Trend_kurz",
    "RF",
    "D",
    "GfU",
    "Letzter_Nachweis",
]
for key, page_table in d2.items():
    print(key)
    # Assigning .columns renames in place, so the frame stored in d2 is updated.
    page_table.columns = list(column_names)
print("Rename completed for concat")

7
8
Rename completed for concat


In [18]:
# Concatenate all the non-empty dataframes in the sorted dictionary into df
# with a single pd.concat call instead of growing df inside a loop.
# df.iloc[:0] contributes only df's column layout (no rows), so re-running this
# cell rebuilds df from d2 instead of appending the same pages a second time.
# Row labels from the individual pages are kept as they are.
df = pd.concat([df.iloc[:0], *d2.values()], axis=0)
df

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,D,GfU,Letzter_Nachweis
2,"Andrenosoma atrum (LINNAEUS, 1758)",0,ex,,,,2,,1904
3,"Antipalus varipes (MEIGEN, 1820)",*,h,=,=,=,V,,
4,"Asilus crabroniformis LINNAEUS, 1758",0,ex,,,,2,,1936
5,"Choerades femorata (MEIGEN, 1804)",*,ss,=,=,=,*,,
6,"Choerades gilva (LINNAEUS, 1758)",0,ex,,,,G,,1958
7,"Choerades ignea (MEIGEN, 1820)",1,es,<<,(cid:114)(cid:114)(cid:3),=,V,12b,
8,"Choerades marginata (LINNAEUS, 1758)",*,s,=,=,=,*,,
9,"Dasypogon diadema (FABRICIUS, 1781)",V,s,=,((cid:114)),=,2,"7a, 14e",
10,"Dioctria atricapilla MEIGEN, 1804",*,mh,=,=,=,*,,
11,"Dioctria cothurnata MEIGEN, 1820",0,ex,,,,*,,1951


In [19]:
# Definition of extinction: keep the rows whose BE value is the string "0".
extinct_by_be = df["BE"].eq("0")
df2 = df.loc[extinct_by_be]
df2.head()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,D,GfU,Letzter_Nachweis
2,"Andrenosoma atrum (LINNAEUS, 1758)",0,ex,,,,2,,1904
4,"Asilus crabroniformis LINNAEUS, 1758",0,ex,,,,2,,1936
6,"Choerades gilva (LINNAEUS, 1758)",0,ex,,,,G,,1958
11,"Dioctria cothurnata MEIGEN, 1820",0,ex,,,,*,,1951
18,"Erax barbatus SCOPOLI, 1763",0,ex,,,,2,,vor 1945


In [20]:
# Renumber the rows from 0; the previous row labels move into a new "index" column.
df2 = df2.reset_index()
df2.head()

Unnamed: 0,index,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,D,GfU,Letzter_Nachweis
0,2,"Andrenosoma atrum (LINNAEUS, 1758)",0,ex,,,,2,,1904
1,4,"Asilus crabroniformis LINNAEUS, 1758",0,ex,,,,2,,1936
2,6,"Choerades gilva (LINNAEUS, 1758)",0,ex,,,,G,,1958
3,11,"Dioctria cothurnata MEIGEN, 1820",0,ex,,,,*,,1951
4,18,"Erax barbatus SCOPOLI, 1763",0,ex,,,,2,,vor 1945


In [21]:
# Discard the "index" column that holds the old row labels.
df2 = df2.drop(columns=['index'])
df2.head()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,D,GfU,Letzter_Nachweis
0,"Andrenosoma atrum (LINNAEUS, 1758)",0,ex,,,,2,,1904
1,"Asilus crabroniformis LINNAEUS, 1758",0,ex,,,,2,,1936
2,"Choerades gilva (LINNAEUS, 1758)",0,ex,,,,G,,1958
3,"Dioctria cothurnata MEIGEN, 1820",0,ex,,,,*,,1951
4,"Erax barbatus SCOPOLI, 1763",0,ex,,,,2,,vor 1945


In [22]:
# Alternative selection: keep the rows whose Bestand value is "ex".
extinct_by_bestand = df["Bestand"].eq("ex")
df3 = df.loc[extinct_by_bestand]
df3.head()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,D,GfU,Letzter_Nachweis
2,"Andrenosoma atrum (LINNAEUS, 1758)",0,ex,,,,2,,1904
4,"Asilus crabroniformis LINNAEUS, 1758",0,ex,,,,2,,1936
6,"Choerades gilva (LINNAEUS, 1758)",0,ex,,,,G,,1958
11,"Dioctria cothurnata MEIGEN, 1820",0,ex,,,,*,,1951
18,"Erax barbatus SCOPOLI, 1763",0,ex,,,,2,,vor 1945


In [23]:
# Summary statistics (count / unique / top / freq) for the BE == "0" selection.
df2.describe()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,D,GfU,Letzter_Nachweis
count,8,8,8,8.0,8.0,8.0,8,8.0,8
unique,8,1,1,1.0,1.0,1.0,5,1.0,7
top,"Andrenosoma atrum (LINNAEUS, 1758)",0,ex,,,,2,,vor 1945
freq,1,8,8,8.0,8.0,8.0,4,8.0,2


In [24]:
# Same summary for the Bestand == "ex" selection, for comparison with the
# BE == "0" selection.
df3.describe()

Unnamed: 0,Species,BE,Bestand,Trend_lang,Trend_kurz,RF,D,GfU,Letzter_Nachweis
count,8,8,8,8.0,8.0,8.0,8,8.0,8
unique,8,1,1,1.0,1.0,1.0,5,1.0,7
top,"Andrenosoma atrum (LINNAEUS, 1758)",0,ex,,,,2,,vor 1945
freq,1,8,8,8.0,8.0,8.0,4,8.0,2


In [26]:
# Export df2 (the rows selected with BE == "0") as CSV to the updated
# location; index=False leaves the row labels out of the file.
output_csv = '../../Transformation/Raw_csv/Diptera_Asilidae_Raubfliegen.csv'
df2.to_csv(output_csv, index=False)