# Production history converter

This notebook aims at gathering the data published semi-annually by RTE (.xlsx files in ./RTE_gross_data/) into a single Pandas DataFrame (.csv.xz) to allow further processing.

In [94]:
# Modules import
import math

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#### Filenames list generation

In [95]:
# As RTE files are splited by semester, using .5 to differentiate between the first semester (integers) and second semester (half-integer)
history_start_year = 2011.5                                         # First set available
history_end_year = 2023.5                                           # Last set available

# Type of RTE data to cast in a global pandas dataframe
history_files_folder = "./RTE_gross_data/"                          # Relative path
history_file_type = "ProductionGroupe"                              # Common root in the name

# Generation of the filenames' list
filenames_list = []
for semester in range(int(10*history_start_year), int(10*(history_end_year+0.5)), 5):
    filename_year = str(int((semester/10)//1))
    filename_semester = str(int(1+2*((semester/10)%1)))
    filenames_list.append(history_files_folder+history_file_type+"_"+filename_year+"-semestre"+filename_semester+".xlsx")
#end for

print(len(filenames_list), "filenames generated between", filenames_list[0], "and", filenames_list[-1])

25 filenames generated between ./RTE_gross_data/ProductionGroupe_2011-semestre2.xlsx and ./RTE_gross_data/ProductionGroupe_2023-semestre2.xlsx


#### Units' names extraction
Browsing into each Excel file (costly step requiring 4 minutes to run)

In [97]:
unit_names = []

for filename in filenames_list:
    # For each file in the list
    print(filename)                                                 # Printing filename is optional but helps monitoring the process while running
    df = pd.read_excel(filename, header=1)                          # Filtering the technologies written on the first line, technologies will be fetched later

    i = 0                                                           # Initialization of the counter of new names
    for name in df.columns:
        # For each column in the current file
        if name in unit_names:
            # If the name is already known, skip to the next column (watch out for the poor duplicates with typographic deltas)
            pass
        else:
            #print(name, "ajouté à la liste")
            unit_names.append(name)
            i+=1                                                    # Increments the counter
        #end if
    #end for
        
    print(i, "noms ajoutés à la liste")                             # Printing is optional but helps locating a possible error
#end for
            
print(len(unit_names), "noms extraits")                             # As of 2023, 255 must be extracted
#print(unit_names)

./RTE_gross_data/ProductionGroupe_2011-semestre2.xlsx
148 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2012-semestre1.xlsx
2 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2012-semestre2.xlsx
1 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2013-semestre1.xlsx
0 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2013-semestre2.xlsx
0 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2014-semestre1.xlsx
81 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2014-semestre2.xlsx
0 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2015-semestre1.xlsx
0 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2015-semestre2.xlsx
0 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2016-semestre1.xlsx
4 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2016-semestre2.xlsx
0 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_2017-semestre1.xlsx
0 noms ajoutés à la liste
./RTE_gross_data/ProductionGroupe_201

#### Units' names duplicates removal :
As the gross list has some poor duplicates, it is required to build manually a dictionary to assign the correct values to each gross value

In [98]:
unit_dict = {}                                                      # Net units' names dictionary initialization

for elt in unit_names:
    # It is required to manually add lines if duplicates are found
    elt2 = elt.upper()
    elt3 = elt2.replace("-"," ")
    elt4 = elt3.replace(" (LE) ", " ")
    elt5 = elt4.replace(" (LA) ", " ")
    elt6 = elt5.replace(" ( SUR SEINE) ", " ")
    elt7 = elt6.replace(" (L ) ", " ")
    elt8 = elt7.replace("_", " ")
    elt9 = elt8.replace("'", " ")
    elt10 = elt9.replace("COMBIGOLFE CCG", "COMBIGOLFE")
    elt11 = elt10.replace("DAMPIERRE EN BURLY", "DAMPIERRE")
    elt12 = elt11.replace(" SUR SEINE ", " ")
    elt13 = elt12.replace("ST ALBAN ST MAURICE", "ST ALBAN")
    elt14 = elt13.replace("LUCY 3", "LUCY")
    elt15 = elt14.replace("ST LAURENT DES EAUX B", "ST LAURENT")
    elt16 = elt15.replace("AMFARD", "AMFARD ")
    elt17 = elt16.replace("SPEM CCG", "SPEM")
    elt18 = elt17.replace("CYCOFOS PL1", "CYCOFOS")
    elt19 = elt18.replace("CHINON 1", "CHINON B 1")
    elt20 = elt19.replace("CHINON 2", "CHINON B 2")
    elt21 = elt20.replace("CHINON 3", "CHINON B 3")
    elt22 = elt21.replace("CHINON 4", "CHINON B 4")
    elt23 = elt22.replace("CHOOZ 1", "CHOOZ B 1")
    elt24 = elt23.replace("CHOOZ 2", "CHOOZ B 2")
    elt25 = elt24.replace("PORCHEVILLE 1", "PORCHEVILLE B 1")
    elt26 = elt25.replace("PORCHEVILLE 2", "PORCHEVILLE B 2")
    elt27 = elt26.replace("PORCHEVILLE 3", "PORCHEVILLE B 3")
    elt28 = elt27.replace("PORCHEVILLE 4", "PORCHEVILLE B 4")
    unit_dict.update({elt: elt28})
#end for
    
# At that stage, a problem remains about DK6 plant: from 2011 to 2014, there is only DK6 1 and DK6 2. After 2014 and until 2016, there is a duplicate for...
# ...each DK6 with two DK6-TG and two DK6-TV. After 2016.5, there is only four DK6 plants: two DK6-TG and two DK6-TV. There is also a question remaining open...
# ...about Saint-Pierre dam that seems to be inserted twice in RTE statements.
# A conservative option will be to discard later CYCOFOS and DK6 plants, whose yearly contribution is rather low, to ensure a good fusion of datas

# Names not to include in the final DataFrame
corrupted_names = ["CYCOFOS", "CYCOFOS PL2", "DK6 1", "DK6 2", "DK6-TG1", "DK6-TG2", "DK6-TV1", "DK6-TV2"]

# Commented section but can prove itself useful: prints the units' names conversion dictionary
#content_dict = []
#for key in unit_dict.keys():
#    if not unit_dict[key] in content_dict:
#        content_dict.append(unit_dict[key])
#    #end if
##end for
#        
##print(unit_dict)        
#content_dict.sort()
#print(content_dict)
#print(len(content_dict))

#### Concatenating data from semi-annual statements
Once the list of unit names is cleared of duplicates, we can start gathering data by concatenating vertically semesters. Before that, it will be required to modify the names of columns according to the dictionary created above to ensure continuity.

This step can prove quite long: anticipate 3-4 minutes duration for 2011 - 2023 concatenation

In [99]:
i = 0                                                               # Flag variable that turns off concatenation if null

for filename in filenames_list:
    print(filename)                                                 # Printing filename is optional but helps monitoring the process while running
    df = pd.read_excel(filename, header=1)                          # Gross dataframe with original names including duplicates
    #print(df)
    #print(df.shape)

    for corrupted_name in corrupted_names:                          # Remove the columns with corrupted names (default: CyCoFos and DK6 units)
        if corrupted_name in df.columns:                            # Avoiding "key not found in axis" error
            df = df.drop(columns=corrupted_name)                    # Remove the column if name is corrupted
            print(corrupted_name, "column dropped")                 # Printing dropped columns is optional but helps monitoring the process while running

    df = df.rename(columns=unit_dict)                               # Renaming the remaining columns according to the dictionary
    #print(df)

    if i==0:
        # First dataframe to be extracted, initializes the global dataframe
        global_df = df
    else:
        # Concatenate the current dataframe to the global one
        global_df = pd.concat([global_df, df], sort=False, ignore_index=True)
    #end if
        
    i += 1
        
    print(global_df.shape)                                          # Printing shape is optional but helps monitoring the process while running
#end for

./RTE_gross_data/ProductionGroupe_2011-semestre2.xlsx


CYCOFOS column dropped
DK6 1 column dropped
DK6 2 column dropped
(456, 145)
./RTE_gross_data/ProductionGroupe_2012-semestre1.xlsx
CYCOFOS column dropped
DK6 1 column dropped
DK6 2 column dropped
(4824, 147)
./RTE_gross_data/ProductionGroupe_2012-semestre2.xlsx
CYCOFOS column dropped
DK6 1 column dropped
DK6 2 column dropped
(9240, 148)
./RTE_gross_data/ProductionGroupe_2013-semestre1.xlsx
CYCOFOS column dropped
DK6 1 column dropped
DK6 2 column dropped
(13583, 148)
./RTE_gross_data/ProductionGroupe_2013-semestre2.xlsx
CYCOFOS column dropped
DK6 1 column dropped
DK6 2 column dropped
(17981, 148)
./RTE_gross_data/ProductionGroupe_2014-semestre1.xlsx
CYCOFOS column dropped
CYCOFOS PL2 column dropped
DK6 1 column dropped
DK6 2 column dropped
DK6-TG1 column dropped
DK6-TG2 column dropped
DK6-TV1 column dropped
DK6-TV2 column dropped
(22325, 156)
./RTE_gross_data/ProductionGroupe_2014-semestre2.xlsx
CYCOFOS column dropped
CYCOFOS PL2 column dropped
DK6 1 column dropped
DK6 2 column dropped
D

#### Units' technologies extraction
As the source files have the technology as a first row, we skipped it during the previous step. Let's browse again the files to extract units' technologies and add it to the DataFrame.
This step can prove quite long: anticipate 3-4 minutes duration for 2011 - 2023 technology extraction

In [100]:
techno_dict = {}                                                    # Units' technologies dictionary initialization

for filename in filenames_list:
    print(filename)
    df = pd.read_excel(filename, nrows=1)                           # All the file can be skipped after the first row

    # At that stage we have a DataFrame with technologies as header.
    # As DataFrame cannot support multiple columns with the same name, columns were renamed automatically with ".x" and x an increasing value.
    # It is impossible to swap header and rows that easily, so lets extract the data into a proper dictionary

    for key in df.columns:
        # For each column of the current file
        unit_name = str(df[key].values[0])                          # We keep only the first cell with the unit's name
        techno_name = key.split(".")[0]                             # The technology name per se is located before the ".", we keep only this part

        #print("Clé :", key)
        #print("Unité :", unit_name)
        #print("Techno :", techno_name)

        corrected_name = unit_dict[unit_name]                       # As the name can change from one statements to another, using the name correction dictionary will prevent errors

        # To avoid duplicates, we add the technology to the dictionary if and only if it's not already known or corrupted
        if corrected_name in global_df.columns:
            if corrected_name not in techno_dict.keys():
                techno_dict.update({corrected_name : techno_name})
                #print(corrected_name, "added")
            #end if
            else:
                #print(corrected_name, "rejected because duplicate")
                pass
        else:
            #print(unit_name, "rejected, may be corrupted")
            pass
        #end if
    #end for
#end for

techno_dict['DATE'] = ""                                            # Modifies the "DATE" value from "Unnamed: 0" to ""

#print(techno_dict)
#print(len(techno_dict))

./RTE_gross_data/ProductionGroupe_2011-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2012-semestre1.xlsx
./RTE_gross_data/ProductionGroupe_2012-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2013-semestre1.xlsx
./RTE_gross_data/ProductionGroupe_2013-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2014-semestre1.xlsx
./RTE_gross_data/ProductionGroupe_2014-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2015-semestre1.xlsx
./RTE_gross_data/ProductionGroupe_2015-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2016-semestre1.xlsx
./RTE_gross_data/ProductionGroupe_2016-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2017-semestre1.xlsx
./RTE_gross_data/ProductionGroupe_2017-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2018-semestre1.xlsx
./RTE_gross_data/ProductionGroupe_2018-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2019-semestre1.xlsx
./RTE_gross_data/ProductionGroupe_2019-semestre2.xlsx
./RTE_gross_data/ProductionGroupe_2020-semestre1.xlsx
./RTE_gross_data/ProductionG

#### Gathering aditionnal unit's technology and capacity datas into two DataFrame lines

In [101]:
# Adjusting dictionary content to add the model for NPPs
filename =  "./NPP_France_models.xlsx"
df = pd.read_excel(filename, index_col=0, header=0)

for unit in df.index.values:
    # For each nuclear power plant name known in the current file
    techno_dict.update({unit : df["NPP model"][unit]})              # Updates the technology by adding the model of the nuclear reactor
    pass
#end for
#print(techno_dict)

# Extracting the capacity of each unit from another Excel file
filename =  "./Units_France_capacity.xlsx"
df = pd.read_excel(filename, index_col=0, header=0)

capacity_dict = techno_dict.copy()                                  # Instanciates capacity_dict on the same keys than techno_dict
for key in capacity_dict.keys():
    # For each unit in the dictionary
    if key != "DATE":
        capacity_dict.update({key : df["Capacity (MW)"][key]})      # Fetches the capacity in the .xlsx file and updates the dictionary
    else:
        pass
    #end if
#end for
#print(capacity_dict)
    


# Convert the resulting technology and capacity dictionaries into a Pandas DataFrame
df = pd.DataFrame(data=techno_dict, index=["Technology"])           # DataFrame line with technologies (and model for nuclear power plants)
df2 = pd.DataFrame(data=capacity_dict, index=["Capacity (MW)"])     # DataFrame line with capacity (expressed in MW)

# Adds the new row to the dataframe
global_df_with_tech = pd.concat([global_df, df, df2], sort=False)   # Concatenates the two lines at the end of the global dataframe

# To reorganise the rows to have technology and capacity lines just below the header, it is necessary to create an index list

new_indexes = ["Technology", "Capacity (MW)"]                       # DataFrame will begin with those indexes
for i in range(global_df_with_tech.shape[0]-2):
    # For all other lines, we append their index at the end of the index list
    new_indexes.append(i)
#end for
#print(new_indexes[:3], "...", new_indexes[-3:])                     # Optional print to validate the swap

final_df = global_df_with_tech.reindex(new_indexes)                 # Swaping lines per se
#print(final_df)

### Global DataFrame export
This step can prove quite long: anticipate 1 minute duration for 2011 - 2023 export

In [102]:
final_df.to_csv("./Processed_data/dataframe_RTE.csv.xz")