**0.0 / IMPORTS & INIT**

In [1]:
import pandas as pd
import time
import os
import dash
from jupyter_plotly_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
import matplotlib.pyplot as plt
import plotly.express as px


import pandas as pd
import time
import os
import pygal # Python SVG graph plotting library
from pygal.style import NeonStyle # sexy af

**1.1 / CSV IMPORTING**

We import the 4 working databases from csv files in the subdirectory "Base" at the root of the current working directory through Pandas and assign them as Dataframes.                          
We also display progress information, since the process can be quite lengthy.

We declare constants listing the columns to fetch from each csv, so that only the needed columns are loaded; this massively reduces memory and compute usage.

~~Caution : When all bases are imported at once, they are all loaded into RAM and they can take up to 20G. If your machine can't handle it, you should comment out what you don't need and work sequentially.~~ 
Not necessary anymore now that we fetch only the columns we need. Base still takes about 10G.

In [3]:
# Columns kept from each csv; every other column is skipped at read time.
COLS_AVANTAGE = [
    'ligne_identifiant',
    'denomination_sociale',
    'categorie',
    'qualite',
    'benef_codepostal',
    'benef_ville',
    'pays',
    'benef_titre_libelle',
    'benef_speicalite_libelle',
    'benef_etablissement_codepostal',
    'ligne_type',
    'avant_date_signature',
    'avant_montant_ttc',
]
COLS_CONVENTION = [
    'ligne_identifiant',
    'denomination_sociale',
    'categorie',
    'qualite',
    'benef_codepostal',
    'benef_ville',
    'pays',
    'benef_titre_libelle',
    'benef_speicalite_libelle',
    'benef_etablissement_codepostal',
    'ligne_type',
    'conv_date_signature',
    'conv_montant_ttc',
]
COLS_REMUNERATION = [
    'entreprise_identifiant',
    'denomination_sociale',
    'benef_categorie_code',
    'qualite',
    'benef_codepostal',
    'pays',
    'benef_titre_libelle',
    'benef_speicalite_libelle',
    'benef_etablissement_codepostal',
    'remu_date',
    'remu_montant_ttc',
]
COLS_ENTREPRISE = ['pays', 'secteur', 'code_postal', 'ville']


def _read_base(label, path, separator, columns):
    '''
    Announces the import of one csv, reads only `columns` from it and
    tags the resulting dataframe with `label` as its `name` attribute.
    '''
    print('Importing %s...' % label)
    frame = pd.read_csv(path, sep=separator, usecols=columns)
    # comparator3000() reads df.name for its log messages.
    frame.name = label
    return frame


# Wall-clock reference for the whole import
start = time.perf_counter()

print('Started import.')
print('-----------------------')

# NOTE(review): column dtypes are left to pandas' inference here; if
# benef_codepostal is inferred as numeric, leading zeros would be lost.
# Confirm against the data before relying on the 2-character prefix.
D_avantage = _read_base(
    'D_avantage',
    "Base/declaration_avantage_2020_02_19_04_00.csv",
    ";",
    COLS_AVANTAGE,
)
print('D_avantage successfully imported. 3 more to go.')

D_Convention = _read_base(
    'D_Convention',
    "Base/declaration_convention_2020_02_19_04_00.csv",
    ";",
    COLS_CONVENTION,
)
print('D_Convention successfully imported. 2 more to go.')

D_Remuneration = _read_base(
    'D_Remuneration',
    "Base/declaration_remuneration_2020_02_19_04_00.csv",
    ";",
    COLS_REMUNERATION,
)
print('D_Remuneration successfully imported. 1 more to go.')

# The Entreprise file is comma-separated, unlike the three declaration files.
Entreprise = _read_base(
    'Entreprise',
    "Base/entreprise_2020_02_19_04_00.csv",
    ",",
    COLS_ENTREPRISE,
)
print('Entreprise successfully imported.')

# Elapsed time, truncated to whole seconds
success = time.perf_counter()
import_time = int(success - start)
print('-----------------------')
print('All csv successfully imported in %s seconds.\n\n' % (import_time))



Starting import...


Importing D_avantage...


**1.2 / DATAFRAMES CLEANING**

We use a dictionary because we'll need it later.

Order of operations :

- Create the return dictionary
- Get dataframe row indexes in a list
- Iterate over that list and assign the values of the benef_codepostal column and the target indicator column
- Format the values of benef_codepostal to keep only 2 digits, ignoring it if it is NaN, and turn it to a string
- Cast the values of the target as an int, ignoring it if it's NaN
- Iterate over the dictionary to check if the formatted value of benef_codepostal is already in there. If it's not, put it in along with its corresponding
target column value. If it is, add the target column value to the existing dict value for that key.
- Print various stats

We return a dictionary whose keys are 2-digit postal code prefixes (departments) and whose values are the summed amounts, which are most of the time absurdly large numbers.


In [1]:
def comparator3000(df, fetch):
    '''
    Creates a dictionary {'Postal Code' : 'Total € given'} with a 
    dataframe and a column of that dataframe.

    Parameters:
        df    : dataframe holding a 'benef_codepostal' column and the
                `fetch` column. Its `name` attribute is only used in
                the log messages.
        fetch : name of the column of `df` whose values are summed.

    Returns:
        dict mapping the first two characters of the postal code,
        zero-padded to 2 characters (str), to the sum of the
        int-truncated `fetch` values (int).

    Rows whose postal code prefix or amount cannot be converted to an
    int (NaN, free text, ...) are skipped and counted as omitted.
    '''

    print('Started importing %s from %s.'%(fetch, df.name) )

    # Init 
    dic = dict() 
    start = time.perf_counter()
    total = len(df)
    # Report progress about every 1% of the rows. The step is clamped to 1
    # so that frames with fewer than 100 rows do not divide by zero.
    step = max(1, total // 100)
    fc = 0  # rows omitted (unusable postal code or amount)
    sc = 0  # rows used

    # Core
    # Iterating the two columns in lockstep works by position, so it does
    # not depend on the dataframe having a default 0..n-1 index.
    for pos, (cp, ttc) in enumerate(zip(df['benef_codepostal'], df[fetch])):

        # Progress bar
        if pos % step == 0:
            print('%s %% processed.'%(str(int(pos / total * 100))))

        # Type verification, splicing, and success/fail counts.
        try:
            # Keep the first two characters, check they are numeric and
            # restore the leading zero dropped by the int round-trip.
            cp = str(int(str(cp)[:2])).zfill(2)
            ttc = int(ttc)
        except ValueError:
            fc += 1
            continue
        sc += 1

        # Populating dictionary: sum the amounts per postal code prefix
        dic[cp] = dic.get(cp, 0) + ttc

    # Reporting compute time and successes/fails
    success = time.perf_counter()
    ns = int(success - start) 
    # Omitted rows come first in the message, usable rows second.
    print('Succesfully imported %s from %s in %s seconds | %s rows had one or more missing values and were omitted | %s rows were usable\n'%(fetch, df.name, ns, fc, sc))

    return dic


**2.1 / MAP VISUALIZATION WITH PYGAL (PYthon svg GrAph plotting Library)**

Pygal lets us create dynamic maps as vector (SVG/XML) plots optimized for HTML5 integration. It comes bundled with a detailed map of France with regions and departments.

We use it to import the France departments map, and pass it the values of our dictionary, along with a title for the top and a subtitle on each value.
We then export it to a file that will be used by the HTML/CSS/JS renderer in an iframe.

In [None]:
def get_map(dic, title, subtitle):
    '''
    Creates a html file from a dictionary generated by comparator3000().
    Asks for a Title (displayed at the top of the page) and a subtitle
    (displayed over each department).

    Parameters:
        dic      : dictionary of values passed to the departments map.
        title    : chart title; also used as the output file name
                   ('<title>.html').
        subtitle : label of the data series added to the map.

    Returns nothing; the rendered chart is written to disk.
    '''

    # Core
    fr_chart = pygal.maps.fr.Departments(human_readable=True, width=1080, height=1080, style=NeonStyle)
    fr_chart.title = str(title)
    fr_chart.add(str(subtitle), dic)

    # Renders it and outputs to file in the current working directory.
    # render_to_file() only takes the file name positionally, so the stray
    # extra positional argument that used to be passed here was removed.
    fr_chart.render_to_file('%s.html'%(title))

**DASH**


In [None]:
# Stylesheet loaded by the Dash page
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

# Two hard-coded bar series used as placeholder content for the graph
_demo_traces = [
    dict(x=[1, 2, 3], y=[1, 1, 2], type='bar', name=':hap:'),
    dict(x=[1, 2, 3], y=[2, 4, 5], type='bar', name=':noel:'),
]
_demo_figure = dict(
    data=_demo_traces,
    layout=dict(title='Dash Data Visualization'),
)

# Page layout: heading, short description, then the graph
app.layout = html.Div(children=[
    html.H1(children='Transparence Santé'),
    html.Div(children='''
        Visualisation de données à partir de la base de données publique Transparence - Santé
    '''),
    dcc.Graph(id='example-graph', figure=_demo_figure),
])


# Only start the server when this code runs as the main module
if __name__ == '__main__':
    app.run_server(debug=True)