In [1]:
import camelot
import pandas as pd

# Updated Process

## 1. Create empty dataframe with proper column names

In [2]:
# The cleaned data from every page will be added to this dataframe; it is the basis for the final export.

In [3]:
# Empty frame that fixes the column layout used for the combined data
export_columns = [
    "Species",
    "BE",
    "Wirtsart",
    "Anzahl_Fundorte_vor1945",
    "Anzahl_Fundorte_1945_1990",
    "Anzahl_Fundorte_1991_2001",
]
df = pd.DataFrame(columns=export_columns)
df

Unnamed: 0,Species,BE,Wirtsart,Anzahl_Fundorte_vor1945,Anzahl_Fundorte_1945_1990,Anzahl_Fundorte_1991_2001


In [4]:
# Sanity check: the frame should have no rows yet and six columns
df.shape

(0, 6)

## 2. Set up a mechanism to store the accuracy reports (QA step)

In [5]:
# Collects one camelot parsing report per parsed page (filled by the parsing loop)
accuracy = []

## 3. Dictionary of dataframes on the go

In [6]:
# Dictionary to collect the dataframes created from each table,
# keyed by the page number the table was read from
d = {}

## 4. Adding the list of pages to be parsed

In [7]:
# Pages of the PDF to parse: FIRST_PAGE through LAST_PAGE, both inclusive.
FIRST_PAGE = 2
LAST_PAGE = 6
# range() excludes its stop value, hence the + 1
pages = list(range(FIRST_PAGE, LAST_PAGE + 1))
print(pages)

[2, 3, 4, 5, 6]


## 5. Parsing through the pages

In [8]:
# Read every requested page with camelot and keep the first table found on it.
# The lattice flavor is used for this PDF; an earlier variant of this cell read a
# different PDF with flavor='stream' (edge_tol=1000, row_tol=10) instead.
PDF_PATH = '06_brandpilze_print.pdf'

# Start from a clean slate so that re-running this cell does not append a second
# copy of every parsing report or keep tables from a previous page selection.
accuracy.clear()
d.clear()

for page_number in pages:
    # print page number
    print(page_number)
    # read the page
    tables = camelot.read_pdf(PDF_PATH, pages=str(page_number), flavor='lattice')
    if len(tables) == 0:
        # Indexing tables[0] on an empty result would raise IndexError and abort
        # the whole run; skip the page so the remaining pages are still parsed.
        print("No table found on page", page_number, "- skipped")
        continue
    first_table = tables[0]
    # add the accuracy report
    accuracy.append(first_table.parsing_report)
    # store the first table's dataframe under its page number
    d[page_number] = first_table.df
    print(d[page_number].shape)
# Print Done once all pages are parsed
print("Done")

2
(22, 6)
3
(23, 6)
4
(23, 6)
5
(23, 6)
6
(14, 6)
Done


In [9]:
# QA check: inspect the parsing reports collected while reading the pages
# (one dict per parsed page)
accuracy

[{'accuracy': 99.17, 'whitespace': 4.55, 'order': 1, 'page': 2},
 {'accuracy': 99.29, 'whitespace': 4.35, 'order': 1, 'page': 3},
 {'accuracy': 99.3, 'whitespace': 4.35, 'order': 1, 'page': 4},
 {'accuracy': 99.33, 'whitespace': 4.35, 'order': 1, 'page': 5},
 {'accuracy': 99.3, 'whitespace': 7.14, 'order': 1, 'page': 6}]

# Data cleanup steps

In [10]:
# Compare the (rows, columns) size of every per-page dataframe
for page, frame in d.items():
    print(page)
    print(frame.shape)

2
(22, 6)
3
(23, 6)
4
(23, 6)
5
(23, 6)
6
(14, 6)


In [11]:
# Removing empty dataframes
# (nothing to remove here: the shape check above shows no empty dataframe, and the
#  keys below are page numbers outside the pages parsed in this notebook)
# del d[64]
# del d[74]

In [12]:
# Build a copy of the dictionary whose entries are ordered by page number
from collections import OrderedDict
d2 = OrderedDict((page, d[page]) for page in sorted(d))

In [13]:
# Print the keys in iteration order to confirm the dictionary is sorted
for page in d2.keys():
    print(page)

2
3
4
5
6


In [14]:
# Give every dataframe in the sorted dictionary the same column names
# so that the frames line up when they are concatenated
column_names = [
    "Species",
    "BE",
    "Wirtsart",
    "Anzahl_Fundorte_vor1945",
    "Anzahl_Fundorte_1945_1990",
    "Anzahl_Fundorte_1991_2001",
]
for page, frame in d2.items():
    print(page)
    frame.columns = column_names
print("Rename completed for concat")

2
3
4
5
6
Rename completed for concat


In [15]:
# Concatenate all the dataframes in the sorted dictionary onto the empty df.
# df.iloc[0:0] keeps only df's column layout and no rows, so re-running this cell
# rebuilds the combined frame instead of appending the same pages a second time.
# One concat call over all frames also avoids re-copying the growing result once
# per page. Row labels are kept as they come from each page (they restart at 0
# for every page), exactly as before.
df = pd.concat([df.iloc[0:0]] + list(d2.values()), axis=0)
df

Unnamed: 0,Species,BE,Wirtsart,Anzahl_Fundorte_vor1945,Anzahl_Fundorte_1945_1990,Anzahl_Fundorte_1991_2001
0,Wissenschaftlicher Name,BE Wirtsart,,Anzahl der Fundorte,,
1,,,,vor \n1945,1945-\n1990,1991-\n2001
2,Anthracoidea angulata (H. SYDOW) \nBOIDOL & POELT,0,Carex hirta,3,-,-
3,Anthracoidea arenaria (H. SYDOW) J. \nNANNFELDT,1,Carex arenaria \nCarex ligerica \nCarex praecox,4 \n3 \n1,- \n- \n1,- \n- \n-
4,Anthracoidea caricis (PERSOON) \nBREFELD,0,Carex pilulifera,2,-,-
...,...,...,...,...,...,...
9,Ustilago syntherismae (SCHWEINITZ) \nPECK,1,Digitaria ischaemum (A) \nDigitaria sanguinali...,7 \n1,- \n-,1 \n1
10,Ustilago trichophora (LINK) KÖRNICKE,1,Echinochloa crus-galli (A),-,1,-
11,Ustilago tritici (PERSOON) ROSTRUP,V,Hordeum distichon (cult.) \nHordeum vulgare (c...,3 \n3 \n2,3 \n- \n2,- \n- \n-
12,Vankya ornithogali (SCHMIDT & \nKUNZE) ERSHAD ...,-,Gagea lutea \nGagea minima (?) \nGagea pratens...,1 \n- \n2 \n3,- \n2 \n17 \n1,- \n- \n1 \n1


In [16]:
# Definition of extinction: keep only the rows whose "BE" value is exactly the
# string "0" (presumably the category code for extinct species - TODO confirm
# against the legend of the source PDF).
# reset_index(drop=True) renumbers the rows 0..n-1 directly, without mutating the
# filtered slice in place and without creating an "index" column that then has
# to be dropped again.
df2 = df.loc[df["BE"] == "0"].reset_index(drop=True)
df2.head()

Unnamed: 0,Species,BE,Wirtsart,Anzahl_Fundorte_vor1945,Anzahl_Fundorte_1945_1990,Anzahl_Fundorte_1991_2001
0,Anthracoidea angulata (H. SYDOW) \nBOIDOL & POELT,0,Carex hirta,3,-,-
1,Anthracoidea caricis (PERSOON) \nBREFELD,0,Carex pilulifera,2,-,-
2,Anthracoidea caryophylleae KUKKO-\nNEN,0,Carex supina,5,-,-
3,Anthracoidea limosa (H. SYDOW) \nKUKKONEN,0,Carex limosa,1,-,-
4,Doassansia alismatis (NEES) CORNU,0,Alisma plantago-aquatica,4,-,-


In [17]:
# Disabled alternative filter, kept as a bare string so the cell only echoes it.
# NOTE(review): the combined frame has no "Bestand" column, so enabling this code
# as written would fail on the column lookup.
'''
df3 = df.loc[df["Bestand"] == "ex"]
df3.reset_index(inplace = True)
df3 = df3.drop(['index'], axis=1)
df3.head()
'''

'\ndf3 = df.loc[df["Bestand"] == "ex"]\ndf3.reset_index(inplace = True)\ndf3 = df3.drop([\'index\'], axis=1)\ndf3.head()\n'

In [18]:
# Summary statistics of the filtered frame (count / unique / top / freq per column)
df2.describe()

Unnamed: 0,Species,BE,Wirtsart,Anzahl_Fundorte_vor1945,Anzahl_Fundorte_1945_1990,Anzahl_Fundorte_1991_2001
count,44,44,44,44,44,44
unique,44,1,42,16,5,4
top,Anthracoidea angulata (H. SYDOW) \nBOIDOL & POELT,0,Sorghum bicolor (cult.),1,-,-
freq,1,44,2,19,34,35


In [19]:
# df3 is only created inside the disabled (string-quoted) cell, so this stays commented out
#df3.describe()

In [20]:
# Export the filtered dataframe (rows with BE == "0") as CSV, without the row index
df2.to_csv('Fungi_Ustilaginales.csv', index=False)