# H6 - Exploring eGRID III with python and dataframes

In [22]:
import pandas as pd # this imports the pandas library which is necessary for data frames and calls that library
                    # can call all the functions of this library with "pd."
import numpy as np  # same for numpy
# ---------------------------------------------------------------------------
# Step 1 - load the eGRID2020 plant-level sheet into a pandas DataFrame
# ---------------------------------------------------------------------------
# A DataFrame is a two-dimensional labelled table whose columns may each hold
# a different type of data.
#
# Arguments passed to pd.read_excel:
#   "egrid2020_data.xlsx"  - the workbook (relative path: it must sit in the
#                            notebook's working directory)
#   sheet_name="PLNT20"    - the sheet to read
#   header=1               - column names are on the SECOND spreadsheet row
#                            (Python starts counting at 0, so that is row 1)
#   keep_default_na=False  - do not apply pandas' default list of missing-value
#                            markers (a list that includes the text "NA")
# The file and the sheet name are what locate the data; the last two arguments
# are optional but helpful.
egrid = pd.read_excel(
    "egrid2020_data.xlsx",
    sheet_name="PLNT20",
    header=1,
    keep_default_na=False,
)

# ---------------------------------------------------------------------------
# Step 2 - confirm the columns were read by showing the first three rows
# ---------------------------------------------------------------------------
# head() with no argument would show five rows; here we ask for three.
egrid.head(3)

Unnamed: 0,SEQPLT20,YEAR,PSTATABB,PNAME,ORISPL,OPRNAME,OPRCODE,UTLSRVNM,UTLSRVID,SECTOR,...,PLWIPR,PLSOPR,PLGTPR,PLOFPR,PLOPPR,PLTNPR,PLTRPR,PLTHPR,PLCYPR,PLCNPR
0,1,2020,AK,7-Mile Ridge Wind Project,60814,Alaska Power and Telephone Co,219,Alaska Power and Telephone Co,219,Electric Utility,...,,,,,,,,,,
1,2,2020,AK,Agrium Kenai Nitrogen Operations,54452,Homer Electric Assn Inc,19558,Agrium US Inc,179,Industrial CHP,...,,,,,,,,,,
2,3,2020,AK,Alakanuk,57053,"Alaska Village Elec Coop, Inc",221,"Alaska Village Elec Coop, Inc",221,Electric Utility,...,,,,,,,,,,


## a. How many plants are in the eGRID PLNT spreadsheet?

In [23]:
# The number of plants is the number of data rows in egrid.
# DataFrame.shape is a (rows, columns) pair; the header row is not counted as a row.
egrid.shape[0]

12668

In [25]:
# Alternative: store the row count of the dataframe in a variable called
# "NumPlants" first, then use that variable inside printed sentences.
NumPlants = egrid.shape[0]

# type() reports what kind of value NumPlants holds (the output shows "int").
print("The variable NumPlants is of type", type(NumPlants))
print("There are", NumPlants,"plants in eGRID2020")


The variable NumPlants is of type <class 'int'>
There are 12668 plants in eGRID2020


## What is the data type of the data stored in the columns of the dataframe eGRID?

In [26]:
# DataFrame.dtypes is an attribute of the dataframe (not a function of "pd"):
# it lists every column name together with the data type stored in that column.
# Some columns, such as YEAR, hold numeric data ("int64") while others are "object".
# Keep in mind that object columns are not numbers, so they must be converted to a
# numeric type before any mathematical operation.
#
# Note: a bare `egrid.dtypes` line is only displayed by Jupyter when it is the LAST
# line of a cell; anywhere else its result is silently discarded, so we pass it to
# print() instead.
print("The column names and types in the PLNT tab of eGRID 2020 are:", egrid.dtypes)

The column names and types in the PLNT tab of eGRID 2020 are: SEQPLT20     int64
YEAR         int64
PSTATABB    object
PNAME       object
ORISPL       int64
             ...  
PLTNPR      object
PLTRPR      object
PLTHPR      object
PLCYPR      object
PLCNPR      object
Length: 140, dtype: object


## b. What is the total nameplate capacity of the power plants recorded in eGRID?

In [27]:
# NAMEPCAP is the column with the nameplate capacity of each plant.
# Before summing it, check whether pandas read it as a numeric column.
# The dtype of one column is available as dataframe["columnheader"].dtype.
print(
    "The data type of the NAMEPCAP column in the PLNT tab of eGRID 2020 is",
    egrid["NAMEPCAP"].dtype,
)
# Equivalent attribute-style spelling of the column lookup: egrid.NAMEPCAP.dtype

The data type of the NAMEPCAP column in the PLNT tab of eGRID 2020 is object


In [28]:
# Math cannot be done on "object" data, so NAMEPCAP has to become numeric first.
# pd.to_numeric(dataframe["columnname"]) returns the same values converted to numbers;
# assigning the result back to the same column name replaces the object column
# inside the existing dataframe.
egrid["NAMEPCAP"] = pd.to_numeric(egrid["NAMEPCAP"])
# Equivalent attribute-style spelling:
# egrid.NAMEPCAP = pd.to_numeric(egrid.NAMEPCAP)

In [29]:
# Confirm the conversion worked: NAMEPCAP should now report a numeric dtype
# (a float, i.e. a real number with decimals).
print(
    "The data type of the NAMEPCAP column in the PLNT tab of eGRID 2020 is now",
    egrid["NAMEPCAP"].dtype,
)
# Equivalent attribute-style spelling of the column lookup: egrid.NAMEPCAP.dtype

The data type of the NAMEPCAP column in the PLNT tab of eGRID 2020 is now float64


In [30]:
# NAMEPCAP is numeric now, so it can be added up across all plants.
# The total is kept in a variable called US_MW_Cap and then printed in a sentence.
US_MW_Cap = egrid["NAMEPCAP"].sum()
print("The total name plate capacity of US plants according to eGRID2020 is", US_MW_Cap, "MW")
# Equivalent attribute-style spelling:
# US_MW_Cap = egrid.NAMEPCAP.sum()

The total name plate capacity of US plants according to eGRID2020 is 1656187.7000000002 MW


## c. What was the total power generation in 2020 from these plants?

In [31]:
# Before summing the generation column PLNGENAN, check whether pandas read it
# as a numeric column or as "object".
print(
    "The data type of the PLNGENAN column in the PLNT tab of eGRID 2020 is ",
    egrid["PLNGENAN"].dtype,
)

The data type of the PLNGENAN column in the PLNT tab of eGRID 2020 is  object


In [34]:
# As with NAMEPCAP, PLNGENAN must be converted to a numeric dtype before doing math.
# pd.to_numeric(...) does the conversion; the result overwrites the column in egrid,
# and the dtype is printed straight away to confirm the change.
egrid["PLNGENAN"] = pd.to_numeric(egrid["PLNGENAN"])
print(
    "Data type of the PLNGENAN column in the PLNT tab of eGRID 2020 is now",
    egrid["PLNGENAN"].dtype,
)

Data type of the PLNGENAN column in the PLNT tab of eGRID 2020 is now float64


In [35]:
# PLNGENAN is numeric now, so it can be added up across all plants.
# The total is kept in a variable called US_MWh_Gen and then printed in a sentence.
US_MWh_Gen = egrid["PLNGENAN"].sum()
print("The total GENERATION from US plants according to eGRID2020 was", US_MWh_Gen, "MWh")

The total GENERATION from US plants according to eGRID2020 was 4021549453.402 MWh


## d. How many solar plants were in eGRID 2020?

In [36]:
# First find out what label eGRID uses for plants that run on solar energy.
# The fuel category column is PLFUELCT; the pandas "unique" method returns an
# array containing each distinct value that appears in that column.
egrid["PLFUELCT"].unique()

array(['WIND', 'GAS', 'OIL', 'HYDRO', 'COAL', 'OTHF', 'BIOMASS', 'OFSL',
       'SOLAR', 'NUCLEAR', '', 'GEOTHERMAL'], dtype=object)

In [37]:
# Build a new dataframe holding only the solar plants:
# keep the rows of egrid whose fuel category (PLFUELCT) equals "SOLAR".
egridSolar = egrid.loc[egrid["PLFUELCT"].eq("SOLAR")]

In [38]:
# The number of solar plants is the number of rows in the egridSolar dataframe.
NumSolarPlants = egridSolar.shape[0]
print("The total number of SOLAR plants in the US according to eGRID2020 was", NumSolarPlants)

The total number of SOLAR plants in the US according to eGRID2020 was 4820


## e. What was the power generation of SOLAR power plants in 2020?

In [42]:
# Add up the generation column (PLNGENAN) over the solar-only dataframe.
# The total is kept in a variable called US_Solar_MWh_Gen.
US_Solar_MWh_Gen = egridSolar["PLNGENAN"].sum()
print("The total GENERATION from SOLAR US plants according to eGRID2020 was", US_Solar_MWh_Gen, "MWh")

The total GENERATION from SOLAR US plants according to eGRID2020 was 88969043.38800001 MWh


## f. What percentage of the US 2020 solar generation was from South Carolina and North Carolina?

In [43]:
# From the solar-only dataframe, keep the plants whose state abbreviation
# (PSTATABB) is either "NC" or "SC".
NCSC_Solar = egridSolar.loc[egridSolar["PSTATABB"].isin(["NC", "SC"])]
print("There are",len(NCSC_Solar),"SOLAR plants in NC and SC")

There are 916 SOLAR plants in NC and SC


In [44]:
# Add up the generation column (PLNGENAN) for the solar plants in the Carolinas.
NCSC_Solar_MWh_Gen = NCSC_Solar["PLNGENAN"].sum()
print("The total GENERATION from SOLAR plants In the CAROLINAS according to eGRID2020 was", NCSC_Solar_MWh_Gen, "MWh")

The total GENERATION from SOLAR plants In the CAROLINAS according to eGRID2020 was 9964585.003 MWh


In [45]:
# Percentage = (Carolinas solar generation / total US solar generation) * 100.
# The division is done first and the result is then scaled to a percentage.
NCSC_Solar_Gen_Percent = (NCSC_Solar_MWh_Gen / US_Solar_MWh_Gen) * 100
print("The CAROLINAS according to eGRID2020 generated ", NCSC_Solar_Gen_Percent, "% of US solar power in 2020")

The CAROLINAS according to eGRID2020 generated  11.200058608637358 % of US solar power in 2020
