# Create script to extract weekly Grippeindex from PDF reports and store it in one file

In [66]:
import camelot
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Collect the file names of all PDF reports in the 'reports' folder, sorted by name
reportlist = sorted(name for name in os.listdir('reports') if name.endswith('.pdf'))

In [3]:
# Seed the Grippeindex table with a single column listing the AGI regions
# (ending with the overall 'Gesamt' row), one region per row.
grippe_index = pd.DataFrame({
    'AGI-(Groß-)Region': [
        'Süden',
        'Baden-Württemberg',
        'Bayern',
        'Mitte (West)',
        'Hessen',
        'Nordrhein-Westfalen',
        'Rheinland-Pfalz, Saarland',
        'Norden (West)',
        'Niedersachsen, Bremen',
        'Schleswig-Holstein, Hamburg',
        'Osten',
        'Brandenburg, Berlin',
        'Mecklenburg-Vorpommern',
        'Sachsen',
        'Sachsen-Anhalt',
        'Thüringen',
        'Gesamt',
    ],
})

# Go through every report, locate the table that holds the index per AGI region
# and copy its weekly columns into grippe_index.
for report in reportlist:
    path = os.path.join('reports', report)
    # the first four characters of the file name are used as the year prefix
    year = report[:4]
    tables = camelot.read_pdf(path, flavor='stream', pages='1,2,3,4,5,6')

    # only extract table with Praxisindex for each State: it is the table whose
    # first column contains the header cell 'AGI-(Groß-)Region'.
    # Every extracted table is inspected (including the last one), and the cells
    # are compared directly instead of searching str(Series), because the string
    # form of a long Series is abbreviated and can hide the header row.
    df = None
    for table in tables:
        candidate = table.df
        if (candidate[0] == 'AGI-(Groß-)Region').any():
            df = candidate
            break
    if df is None:
        raise ValueError("No table with an 'AGI-(Groß-)Region' header found in " + path)

    header_rows = df.index[df[0] == 'AGI-(Groß-)Region'].tolist()
    total_rows = df.index[df[0] == 'Gesamt'].tolist()
    if not total_rows:
        raise ValueError("No 'Gesamt' row found in the region table of " + path)

    # manipulate df so that it only contains the relevant information and has
    # the right header: keep the rows from the header row through the first
    # 'Gesamt' row, then promote the header row to column names
    df = df[header_rows[0]:total_rows[0] + 1]
    df = pd.DataFrame(df.values[1:], columns=df.iloc[0])

    for col in df.columns.drop('AGI-(Groß-)Region'):
        # build the column label as <year><first two characters of the header>
        # NOTE(review): assumes each header starts with a two-digit week number
        # and that every week shown belongs to the year in the file name -
        # confirm for single-digit weeks and for reports around the turn of
        # the year.
        new_col = year + col[:2]

        # update grippe index dataframe
        # NOTE(review): rows are matched by position, not by region name, so
        # this relies on the report listing the regions in the same order as
        # grippe_index - verify against the PDFs.
        grippe_index.loc[:, new_col] = df[col]

print('Reports collected')



Reports collected


In [54]:
# Reshape to long format: one row per region and per weekly column
test = grippe_index.melt(id_vars='AGI-(Groß-)Region', var_name='week', value_name='index')
# Split each column label into its first four characters (year) and the rest (week)
test = test.assign(year=test['week'].str[:4], week=test['week'].str[4:])
# Simplify the region names: ',' and ' ' become '_', '-' is removed
test['AGI-(Groß-)Region'] = (
    test['AGI-(Groß-)Region']
    .str.replace(',', '_', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '', regex=False)
)

In [55]:
# Empty frame indexed by every calendar day from 2015-01-01 through 2020-01-07.
# NOTE(review): the original end date '1/7/2020' is parsed month-first, i.e. as
# 7 January 2020; if 1 July 2020 was intended, change the end date.
date_df = pd.DataFrame(index=pd.date_range(start='2015-01-01', end='2020-01-07'))

In [56]:
# Add the week number and the calendar year of every day.
# DatetimeIndex.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar() (pandas >= 1.1) provides the same ISO 8601 week numbers.
# to_numpy() keeps the plain int64 column that weekofyear used to produce.
# NOTE(review): 'week' is the ISO week while 'year' is the calendar year, so
# the first days of January can carry week 52/53 of the previous ISO year -
# confirm this is what the later steps expect.
date_df['week'] = date_df.index.isocalendar()['week'].to_numpy(dtype='int64')
date_df['year'] = date_df.index.year