In [1]:
from os.path import abspath
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Run PySpark on the same interpreter as this kernel and pass "notebook"
# as the driver Python options.
os.environ.update({
    'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'PYSPARK_PYTHON': sys.executable,
})

# Reuse the active session if one exists, otherwise start one named "renovation".
spark = (
    SparkSession.builder
    .appName("renovation")
    .getOrCreate()
)

# Switch date/time parsing to the LEGACY policy.
# Original note: required because of a date issue before 1582 in the DPE database.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [3]:
# Input file, its format, and the logical name of the dataset being loaded.
file_location = "../../data/bdnd/dpe_logement.csv"
file_type = "csv"
name = "dpe_bdnd"

# CSV reader settings (passed to Spark as strings).
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options target CSV input; for other file types they will be ignored.
df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

# Only for the dataset named "weather": strip commas out of every column name.
if name == "weather":
    new_column_name_list = [column.replace(',', '') for column in df.columns]
    df = df.toDF(*new_column_name_list)

# Disabled export of the loaded frame to a parquet table (kept for reference):
# df.write.mode('overwrite')\
#     .format("parquet") \
#     .saveAsTable(f"Datalake.{name}")

In [4]:
# Preview the raw frame: first 20 rows, long cell values truncated (the defaults).
df.show(n=20, truncate=True)

+---------------+----------------------+-----------+--------------------+-----------------+------------------------+----------------------+-------+----------------------+-------------------+----------------------+----------------------+--------------------------+--------------------------+--------------------+--------------------+------------------------+----------------+-------------------+--------------------------------+-------------------------------+--------------------------------+---------------------------------------+---------------------------+----------------------+-------------------------+------------------------------------+------------------------------+---------------------------------+--------------------------------------------+-----------------+-----------------------+-------------------------+--------------------------+-----------------------------+----------------------------------------+---------------------+----------------+--------------------+------------------

In [5]:
# Keep only the rows whose `type_dpe` value contains the substring "2021".
# `type_dpe` is a string column, so the needle is passed as a string literal:
# the match is then string-vs-string instead of depending on Spark implicitly
# coercing an integer literal.
df_filtered = df.filter(F.col('type_dpe').contains('2021'))

# Number of rows kept by the filter (triggers a full pass over the data).
df_filtered.count()

1441016

In [6]:
# Preview the filtered frame: first 20 rows, long cell values truncated (the defaults).
df_filtered.show(n=20, truncate=True)

# Last expression of the cell: list of (column name, Spark type) pairs.
df_filtered.dtypes

+---------------+----------------------+-----------+--------------------+-----------------+------------------------+----------------------+-------+----------------------+-------------------+----------------------+----------------------+--------------------------+--------------------------+--------------------+--------------------+------------------------+----------------+-------------------+--------------------------------+-------------------------------+--------------------------------+---------------------------------------+---------------------------+----------------------+-------------------------+------------------------------------+------------------------------+---------------------------------+--------------------------------------------+-----------------+-----------------------+-------------------------+--------------------------+-----------------------------+----------------------------------------+---------------------+----------------+-------------------+-------------------

[('identifiant_dpe', 'string'),
 ('code_departement_insee', 'string'),
 ('arrete_2021', 'int'),
 ('type_dpe', 'string'),
 ('type_batiment_dpe', 'string'),
 ('periode_construction_dpe', 'string'),
 ('annee_construction_dpe', 'int'),
 ('version', 'double'),
 ('date_etablissement_dpe', 'string'),
 ('date_reception_dpe', 'string'),
 ('nombre_niveau_logement', 'int'),
 ('nombre_niveau_immeuble', 'int'),
 ('surface_habitable_immeuble', 'double'),
 ('surface_habitable_logement', 'double'),
 ('conso_5_usages_ep_m2', 'double'),
 ('conso_5_usages_ef_m2', 'double'),
 ('emission_ges_5_usages_m2', 'double'),
 ('classe_bilan_dpe', 'string'),
 ('classe_emission_ges', 'string'),
 ('classe_conso_energie_arrete_2012', 'string'),
 ('classe_emission_ges_arrete_2012', 'string'),
 ('conso_3_usages_ep_m2_arrete_2012', 'double'),
 ('emission_ges_3_usages_ep_m2_arrete_2012', 'double'),
 ('type_installation_chauffage', 'string'),
 ('type_energie_chauffage', 'string'),
 ('type_generateur_chauffage', 'string'),
 

In [7]:
# Parse `date_etablissement_dpe` (read as a string) with the pattern yyyy/MM/dd,
# keep only that column, and display it with the most recent dates first.
(
    df_filtered
    .select(
        F.to_date('date_etablissement_dpe', 'yyyy/MM/dd').alias('date_etablissement_dpe')
    )
    .orderBy(F.col('date_etablissement_dpe').desc())
    .show()
)


+----------------------+
|date_etablissement_dpe|
+----------------------+
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
|            2022-08-29|
+----------------------+
only showing top 20 rows

