# ENV/ENERGY 716 F2020 - Lesson 9 - More on pandas - revisiting eGRID

In [43]:
import pandas as pd
import numpy as np

In [44]:
# Load the plant-level sheet of the eGRID workbook.
# header=1 takes the column names from the second spreadsheet row, and
# keep_default_na=False stops pandas from turning its default NA markers
# (for example empty cells or the text "NA") into NaN.
PLNT = pd.read_excel(
    "egrid2020_data.xlsx",
    sheet_name="PLNT20",
    header=1,
    keep_default_na=False,
)

In [45]:
# Load the generator-level sheet of the eGRID workbook.
# header=1 takes the column names from the second spreadsheet row, and
# keep_default_na=False stops pandas from turning its default NA markers
# (for example empty cells or the text "NA") into NaN.
GEN = pd.read_excel(
    "egrid2020_data.xlsx",
    sheet_name="GEN20",
    header=1,
    keep_default_na=False,
)

## H9.1. According to eGRID2020, what is the average year plants came online? Assume the year a plant came online is equal to the year the oldest generator came online.

### Let's take a look at the column names in the PLNT and GEN dataframes

In [46]:
# DataFrame.dtypes lists every column together with its datatype.
# Some columns, such as YEAR, hold numeric data ("int64") while others are
# "object". Keep in mind that object columns are not numbers, so they must be
# converted to numbers before any mathematical operation.
print("The column names and types in the PLNT tab of eGRID 2020 are:", PLNT.dtypes)

The column names and types in the PLNT tab of eGRID 2020 are: SEQPLT20     int64
YEAR         int64
PSTATABB    object
PNAME       object
ORISPL       int64
             ...  
PLTNPR      object
PLTRPR      object
PLTHPR      object
PLCYPR      object
PLCNPR      object
Length: 140, dtype: object


In [47]:
# Same inspection for the generator sheet: every column with its datatype.
print("The column names and types in the GEN tab of eGRID 2020 are:", GEN.dtypes)

The column names and types in the GEN tab of eGRID 2020 are: SEQGEN20     int64
YEAR         int64
PSTATABB    object
PNAME       object
ORISPL       int64
GENID       object
NUMBLR       int64
GENSTAT     object
PRMVR       object
FUELG1      object
NAMEPCAP    object
CFACT       object
GENNTAN     object
GENNTOZ     object
GENERSRC    object
GENYRONL    object
GENYRRET    object
dtype: object


### We can use ORISPL (the plant code) to map the generators to the plants. The column GENYRONL is the year a generator came online

In [48]:
# We do not need the whole GEN dataframe, so keep a subset with only the columns of interest.
# .loc refers to the columns by name; if you know the positions of the columns
# you can use .iloc instead.
gen_columns = ["ORISPL", "GENYRONL", "GENSTAT"]
GENsub = GEN.loc[:, gen_columns]

In [49]:
# Let's find how many rows and columns GENsub has: .shape returns (rows, columns)
GENsub.shape

(30193, 3)

In [50]:
# From the PLNT tab keep one row per plant with only the columns used in this lesson.
plant_columns = ["ORISPL", "PLPRMFL", "NAMEPCAP", "CAPFAC"]
PLNTsub = PLNT.loc[:, plant_columns]
PLNTsub.shape

(12668, 4)

### Now we are going to merge the two dataframes using the pd.merge function.
#### The two required parameters are the two dataframes to merge.
#### The default type of merge is "inner", which means that only the rows whose key values appear in both dataframes will be in the merged one


In [51]:
# Inner merge (the default): keeps only the rows whose ORISPL appears in both dataframes.
# The key is named explicitly with `on`. Without it, pd.merge joins on every
# column name the two dataframes share, so the result would silently change if
# GENsub ever gained another shared column such as NAMEPCAP.
merge_inner = pd.merge(PLNTsub, GENsub, on="ORISPL")
merge_inner.shape

(30193, 6)

#### See below that now we have one row for each generator. If generators belong to the same plant, then their info on PLPRMFL, NAMEPCAP, CAPFAC will be the same

In [52]:
# Display the first 20 rows of the inner merge.
merge_inner.head(20)

Unnamed: 0,ORISPL,PLPRMFL,NAMEPCAP,CAPFAC,GENYRONL,GENSTAT
0,60814,WND,1.8,,,CN
1,54452,NG,21.6,,1977.0,OS
2,54452,NG,21.6,,1977.0,OS
3,54452,NG,21.6,,1977.0,OS
4,54452,NG,21.6,,1977.0,OS
5,54452,NG,21.6,,1977.0,OS
6,54452,NG,21.6,,,RE
7,54452,NG,21.6,,,RE
8,54452,NG,21.6,,,RE
9,54452,NG,21.6,,,RE


### The pd.merge function can take multiple optional parameters.
#### One important parameter specifies HOW to merge the dataframes. The type of merge can be "left", "right", "outer", or the default "inner"
#### In the case of the GEN and PLNT dataframes, this does not matter, because each plant has at least one generator and each generator belongs to one plant listed in the plant dataframe. Other cases will be discussed in detail next week

# Right merge: keeps every row of the right-hand dataframe (GENSub's rows), matched
# to the plant columns through ORISPL.
# The key is named explicitly with `on`. Without it, pd.merge joins on every
# column name the two dataframes share, so the result would silently change if
# GENsub ever gained another shared column such as NAMEPCAP.
merge_right = pd.merge(PLNTsub, GENsub, how="right", on="ORISPL")
merge_right.shape

In [54]:
# Display the first 20 rows of the right merge.
merge_right.head(20)

Unnamed: 0,ORISPL,PLPRMFL,NAMEPCAP,CAPFAC,GENYRONL,GENSTAT
0,60814,WND,1.8,,,CN
1,54452,NG,21.6,,1977.0,OS
2,54452,NG,21.6,,1977.0,OS
3,54452,NG,21.6,,1977.0,OS
4,54452,NG,21.6,,1977.0,OS
5,54452,NG,21.6,,1977.0,OS
6,54452,NG,21.6,,,RE
7,54452,NG,21.6,,,RE
8,54452,NG,21.6,,,RE
9,54452,NG,21.6,,,RE


In [55]:
# Left merge: keeps every row of the left-hand dataframe (PLNTsub), repeated once
# for each generator that shares its ORISPL.
# The key is named explicitly with `on`. Without it, pd.merge joins on every
# column name the two dataframes share, so the result would silently change if
# GENsub ever gained another shared column such as NAMEPCAP.
merge_left = pd.merge(PLNTsub, GENsub, how="left", on="ORISPL")
merge_left.shape

(30193, 6)

In [56]:
# Display the first 20 rows of the left merge.
merge_left.head(20)

Unnamed: 0,ORISPL,PLPRMFL,NAMEPCAP,CAPFAC,GENYRONL,GENSTAT
0,60814,WND,1.8,,,CN
1,54452,NG,21.6,,1977.0,OS
2,54452,NG,21.6,,1977.0,OS
3,54452,NG,21.6,,1977.0,OS
4,54452,NG,21.6,,1977.0,OS
5,54452,NG,21.6,,1977.0,OS
6,54452,NG,21.6,,,RE
7,54452,NG,21.6,,,RE
8,54452,NG,21.6,,,RE
9,54452,NG,21.6,,,RE


### Now we need to use the function df.groupby to get rid of the rows that are repeated for a plant and save only one row per plant, with information on the oldest generator
#### The "groupby" requires specifying the column by which we will group the rows, and the aggregation for grouping.
#### It also has many other optional parameters.  

In [73]:
# This groups the generators by plant code (ORISPL) and returns a dataframe with
# two columns, ORISPL and GENYRONL. The result has only one row per ORISPL, and
# GENYRONL holds the minimum value over all the generators of that plant, i.e.
# the year the oldest generator came online.
# GENYRONL is read as an "object" column, so it is converted to numbers first:
# entries that are not numbers (such as blanks) become NaN and are ignored by min().
# Below we will see how to return all the columns of the merge_left dataframe,
# which in this case may be preferred.
year_online = pd.to_numeric(merge_left["GENYRONL"], errors="coerce")
PLNT_YRONL = (
    merge_left.assign(GENYRONL=year_online)
    .groupby(["ORISPL"], sort=False, as_index=False)["GENYRONL"]
    .min()
)
PLNT_YRONL.shape

(12668, 2)

In [74]:
# Display the first 20 plants with the minimum GENYRONL found for each.
PLNT_YRONL.head(20)

Unnamed: 0,ORISPL,GENYRONL
0,60814,
1,54452,1977.0
2,57053,2013.0
3,58982,2016.0
4,60243,1990.0
5,75,1972.0
6,7462,1990.0
7,7182,1988.0
8,62,1915.0
9,7250,1980.0


In [79]:
# The dataframe PLNT_YRONL (columns ORISPL and GENYRONL) is merged onto the plant
# subset PLNTsub, so that every plant row also carries its GENYRONL.
# The key is named explicitly so the join cannot silently switch to other
# shared columns.
PLNT_Final = pd.merge(PLNTsub, PLNT_YRONL, how="left", on="ORISPL")

In [81]:
# Average of the per-plant online year.
# GENYRONL comes from an "object" column, so it is converted to numbers before
# averaging: entries that are not numbers (such as blanks) become NaN, and
# mean() skips NaN values.
AverageYear = pd.to_numeric(PLNT_Final["GENYRONL"], errors="coerce").mean()
print("the average year is", AverageYear)

the average year is 1997.7958408508214


In [116]:
# To format the average year we can use str.format: the spec :.0f inside the
# placeholder displays the number with 0 decimals. Learn more about formatting below.
average_year_text = "the average year is {:.0f}".format(AverageYear)
print(average_year_text)

the average year is 1998


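### Here is a second (and in this case, preferred) way to use groupby to get all the columns in the dataframe we are grouping. It uses df.groupby(by=[Colname]).aggfunc(), where aggfunc is the aggregation function (here, min) and it is applied to each of the other columns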

In [122]:
# Same grouping of the generators by plant code, but .min() is now called on the
# whole grouped dataframe, so all the columns are returned. The result has only
# one row per ORISPL, and GENYRONL holds the minimum value over all the
# generators of that plant.
# NOTE(review): the minimum is taken column by column, so a column that differs
# between the generators of a plant (e.g. GENSTAT) holds its own minimum, not
# necessarily the value of the oldest generator - confirm this is intended.
plants_grouped = merge_left.groupby(by=["ORISPL"], sort=False, as_index=False)
PLNT_YRONL2 = plants_grouped.min()
PLNT_YRONL2.shape

(12668, 6)

### Now it is time to talk about how to format numbers so we do not get so many decimal places
#### There are two methods to format the values that go inside a string to be printed.
#### Both require that we mark the placeholders for the values with curly brackets inside the sentence
#### Method 1: "formatted string literals", or f-strings. They allow including the value of Python expressions inside a string by prefixing the string with f or F and writing expressions as {expression}.
#### Method 2: the "format" method. This is a method of "str" objects (strings of characters).
#### The syntax is: string.format(value1, value2,..)

In [101]:
# Method one: put the letter f before the string to print (formatted string literal)
Country, Fuel, Percentage = "China", "Coal", 50
print(f"The country {Country}")
print(f"The country {Country} burns {Percentage} of the {Fuel} of the world")
# Note that Percentage is a number (an integer); the f-string converts it to
# text, which is then printed.
# Inside the curly brackets we can specify the format we want after a colon.
# Here we ask for 4 decimals by typing :.4f inside the curly brackets for Percentage.
print(f"The country {Country} burns {Percentage:.4f} of the {Fuel} of the world")

The country China
The country China burns 50 of the Coal of the world
The country China burns 50.0000 of the Coal of the world


In [123]:
# Method two: the string.format method. The values for the placeholders are
# written inside the parentheses of .format().
print("The country {Country}".format(Country="China"))
# Alternatively we can leave the curly brackets empty.
print("The country {}".format("China"))
# Naming what goes inside the curly brackets is handy when there is more than one value.
# Empty placeholders are filled with the values given to .format() in the order
# in which they are written.
# The sentence below is all mixed up because the values inside format() are in the wrong order.
print("The country {} burns {} of the {} of the world".format("China", "Coal", 50))
# With named placeholders the order of the values does not matter.
named_sentence = "The country {Country} burns {Percentage} of the {Fuel} of the world"
print(named_sentence.format(Fuel="Coal", Percentage=50, Country="China"))
# Note that Percentage is a number; .format converts it to text, which is then printed.
# Inside the curly brackets we can specify the format we want after a colon.
# Here we ask for 1 decimal by typing :.1f inside the curly brackets for Percentage.
one_decimal_sentence = "The country {Country} burns {Percentage:.1f} of the {Fuel} of the world"
print(one_decimal_sentence.format(Fuel="Coal", Percentage=50, Country="China"))

The country China
The country China
The country China burns Coal of the 50 of the world
The country China burns 50 of the Coal of the world
The country China burns 50.0 of the Coal of the world


In [35]:
# Another way to see how the format method works: the sentence is stored in a
# variable first, and .format is then called on that string.
text = "In a scale from 1 to 10 I feel {score} today"
filled_sentence = text.format(score=10)
print(filled_sentence)

In a scale from 1 to 10 I feel 10 today


In [36]:
# If I want four decimals, :.4f goes right after the placeholder name.
# There must be no blank after the colon: a space there is a sign option that
# puts an extra leading blank in front of positive numbers.
text = "In a scale from 1 to 10 I feel {score:.4f} today"
print(text.format(score=10))

In a scale from 1 to 10 I feel  10.0000 today
