In [1]:
# Spark entry point and column-expression helpers used throughout the notebook.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os
import sys
import warnings
# Add ../lib (resolved to an absolute path) to the module search path.
# NOTE(review): presumably this is where `merge` lives, so this line must stay
# above the `from merge import ...` statement below.
sys.path.append(os.path.abspath("../lib"))
from merge import prepare_train_show

from sklearn.exceptions import ConvergenceWarning
# Suppress warnings of category ConvergenceWarning from here on.
warnings.filterwarnings("ignore", category=ConvergenceWarning) 

In [2]:
# Point the PySpark driver/worker Python at the interpreter running this kernel.
os.environ.update({
    'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'PYSPARK_PYTHON': sys.executable,
})

# Create the "renovation" Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("renovation")
    .getOrCreate()
)

# Use the LEGACY time parser policy to work around the issue with dates
# before 1582 (dpe database).
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [3]:
# Input files: path on disk, key under which the DataFrame is stored,
# and the field separator used by that file.
file_location_array = [
    {
        "location": "../training/pred_tremi_old.csv",
        "name": "pred_tremi_old",
        "delimiter": ",",
    },
    {
        "location": "../training/pred_tremi.csv",
        "name": "pred_tremi_full",
        "delimiter": ",",
    },
    {
        "location": "../training/dico.csv",
        "name": "dictionary",
        "delimiter": "\t",
    },
]
file_type = "csv"

# Reader options shared by every file (CSV-specific; other formats ignore them).
infer_schema = "true"
first_row_is_header = "true"

# Loaded DataFrames, keyed by each entry's "name".
dataframes = {}

for file in file_location_array:
    file_location = file["location"]
    name = file["name"]
    delimiter = file["delimiter"]
    dataframes[name] = (
        spark.read.format(file_type)
        .options(
            inferSchema=infer_schema,
            header=first_row_is_header,
            sep=delimiter,
        )
        .load(file_location)
    )

In [10]:
# Give the main datasets their working names and cast the target columns
# to integers.
training_tremi_old = dataframes['pred_tremi_old'].withColumns({
    column: F.col(column).cast('int')
    for column in ('surface', 'heating_production', 'heating_emission')
})

# The full dataset only has 'surface' and 'heating_production' cast here.
training_tremi = dataframes['pred_tremi_full'].withColumns({
    column: F.col(column).cast('int')
    for column in ('surface', 'heating_production')
})

# Dictionary table, used as loaded.
dictionary = dataframes['dictionary']

In [11]:
# For each target column of the old dataset: drop the two other targets, then
# split the rows into those where the target is known (training) and those
# where it is missing (predicting).

# surface
surf_only_old = training_tremi_old.drop('heating_emission', 'heating_production')
training_surf_old = surf_only_old.filter(F.col('surface').isNotNull())
predicting_surf_old = surf_only_old.filter(F.col('surface').isNull())

# heating_production
prod_only_old = training_tremi_old.drop('heating_emission', 'surface')
training_prod_old = prod_only_old.filter(F.col('heating_production').isNotNull())
predicting_prod_old = prod_only_old.filter(F.col('heating_production').isNull())

# heating_emission
em_only_old = training_tremi_old.drop('surface', 'heating_production')
training_em_old = em_only_old.filter(F.col('heating_emission').isNotNull())
predicting_em_old = em_only_old.filter(F.col('heating_emission').isNull())

# Report the row count of every split.
print(f"""
{training_surf_old.count() =}
{predicting_surf_old.count() =}\n
{training_prod_old.count() =}
{predicting_prod_old.count() =}\n
{training_em_old.count() =}
{predicting_em_old.count() =}\n
""")


training_surf_old.count() =12565
predicting_surf_old.count() =26933

training_prod_old.count() =11128
predicting_prod_old.count() =28370

training_em_old.count() =12558
predicting_em_old.count() =26940




In [7]:
# For each target column of the full dataset: drop the other target, then
# split the rows into those where the target is known (training) and those
# where it is missing (predicting).

# surface
surf_only = training_tremi.drop('heating_production')
training_surf = surf_only.filter(F.col('surface').isNotNull())
predicting_surf = surf_only.filter(F.col('surface').isNull())

# heating_production
prod_only = training_tremi.drop('surface')
training_prod = prod_only.filter(F.col('heating_production').isNotNull())
predicting_prod = prod_only.filter(F.col('heating_production').isNull())

# Report the row count of every split.
print(f"""
{training_surf.count() =}
{predicting_surf.count() =}\n
{training_prod.count() =}
{predicting_prod.count() =}\n
""")


training_surf.count() =12565
predicting_surf.count() =26933

training_prod.count() =11128
predicting_prod.count() =28370


