In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import time

from sqlalchemy import create_engine

## Energy potential: Bhumi (read pdf and figure out terms) & Ryan Cheng (Cleaning the data) US_Renewable_Energy_Technical_Potential

In [2]:
# opening the file and creating a df
potential_energy_path = 'Resources/Raw Data/US_Renewable_Energy_Technical_Potential.csv'
potential_df = pd.read_csv(potential_energy_path)

In [3]:
# dropping the unnessecary columns of the df
potential_df = potential_df.drop({'urbanUtilityScalePV_GW', 'urbanUtilityScalePV_km2', 
                                'ruralUtilityScalePV_GW', 'ruralUtilityScalePV_km2', 'rooftopPV_GW', 
                                'CSP_GW', 'CSP_km2', 'onshoreWind_GW', 'onshoreWind_km2', 'offshoreWind_GW', 
                                'offshoreWind_km2', 'biopowerSolid_GW', 'biopowerSolid_BDT', 
                                'biopowerGaseous_GW', 'biopowerGaseous_Tonnes-CH4', 
                                'geothermalHydrothermal_GW', 'EGSGeothermal_GW', 'hydropower_GW', 
                                'hydropower_countOfSites'}, axis = 1)
# naming the 1st column as it was not named before
potential_df.rename(columns = {'Unnamed: 0':'States'}, inplace=True)
potential_df1 = potential_df.fillna(0)
potential_df1.head()

Unnamed: 0,States,urbanUtilityScalePV_GWh,ruralUtilityScalePV_GWh,rooftopPV_GWh,CSP_GWh,onshoreWind_GWh,offshoreWind_GWh,biopowerSolid_GWh,biopowerGaseous_GWh,geothermalHydrothermal_GWh,EGSGeothermal_GWh,hydropower_GWh
0,Alabama,35850,3706838,15475.0,0,283,0.0,11193,1533,0,535489.0,4102
1,Alaska,166,8282976,0.0,0,1373433,0.0,513,61,15437,0.0,23675
2,Arizona,121305,11867693,22736.0,12544333,26036,0.0,1087,837,8329,1239147.0,1303
3,Arkansas,28960,4986388,8484.0,0,22892,0.0,14381,1063,0,628621.0,6093
4,California,246008,8855917,106411.0,8490916,89862,2662579.0,12408,15510,130921,1344179.0,30023


In [4]:
# creating new columns for each type of energy (solar, wind, bio, geothermal, hydro) by summing the
# respective data
solar_list = ['urbanUtilityScalePV_GWh', 'ruralUtilityScalePV_GWh', 'rooftopPV_GWh', 
                                    'CSP_GWh']
potential_df1['Solar Energy Total Potential (GWh)'] = potential_df1[solar_list].sum(axis=1)

wind_list = ['onshoreWind_GWh', 'offshoreWind_GWh']
potential_df1['Wind Energy Total Potential (GWh)'] = potential_df1[wind_list].sum(axis=1)

bio_list = ['biopowerSolid_GWh', 'biopowerGaseous_GWh']
potential_df1['Bio Energy Total Potential (GWh)'] = potential_df1[bio_list].sum(axis=1)

geo_list = ['geothermalHydrothermal_GWh', 'EGSGeothermal_GWh']
potential_df1['Geothermal Energy Total Potential (GWh)'] = potential_df1[geo_list].sum(axis=1)

potential_df1['Hydropower Energy Total Potential (GWh)'] = potential_df1['hydropower_GWh']
potential_df1.head()

Unnamed: 0,States,urbanUtilityScalePV_GWh,ruralUtilityScalePV_GWh,rooftopPV_GWh,CSP_GWh,onshoreWind_GWh,offshoreWind_GWh,biopowerSolid_GWh,biopowerGaseous_GWh,geothermalHydrothermal_GWh,EGSGeothermal_GWh,hydropower_GWh,Solar Energy Total Potential (GWh),Wind Energy Total Potential (GWh),Bio Energy Total Potential (GWh),Geothermal Energy Total Potential (GWh),Hydropower Energy Total Potential (GWh)
0,Alabama,35850,3706838,15475.0,0,283,0.0,11193,1533,0,535489.0,4102,3758163.0,283.0,12726,535489.0,4102
1,Alaska,166,8282976,0.0,0,1373433,0.0,513,61,15437,0.0,23675,8283142.0,1373433.0,574,15437.0,23675
2,Arizona,121305,11867693,22736.0,12544333,26036,0.0,1087,837,8329,1239147.0,1303,24556067.0,26036.0,1924,1247476.0,1303
3,Arkansas,28960,4986388,8484.0,0,22892,0.0,14381,1063,0,628621.0,6093,5023832.0,22892.0,15444,628621.0,6093
4,California,246008,8855917,106411.0,8490916,89862,2662579.0,12408,15510,130921,1344179.0,30023,17699252.0,2752441.0,27918,1475100.0,30023


## Energy_consumption: Rafael (TBD) & Ryan Callaghan (TBD) US_Total_Energy_Consumption

In [5]:
# opening the file and creating a df
energy_consumption_path = 'Resources/Raw Data/US_Total_Energy_Consumption.csv'
energy_consumption_df = pd.read_csv(energy_consumption_path)
energy_consumption_df.head()

Unnamed: 0.1,Unnamed: 0,Rank,State,"Total Energy Consumed per Capita, million Btu"
0,0,1,Wyoming,932
1,1,2,Louisiana,922
2,2,3,North Dakota,875
3,3,4,Alaska,839
4,4,5,Iowa,517


In [6]:
# Change state abb. to names
# Artem to take a look

dict = {
    "AK": "Alaska", "AL":"Alabama", "AR": "Arkansas", "AZ": "Arizona", "CA": "California","CO": "Colorado", "CT": "Connecticut",
"DC": "District of Columbia","DE":"Delaware","FL":"Florida","GA": "Georgia",
"HI":"Hawaii","IA": "Iowa","ID":"Idaho","IL": "Illinois","IN":"Indiana",
"KS": "Kansas","KY" :"Kentucky","LA": "Louisiana","MA":"Massachusetts","MD":"Maryland",
"ME" :"Maine","MI":"Michigan","MN":"Minnesota","MO": "Missouri","MS" : "Mississippi","MT":"Montana",
"NC": "North Carolina","ND": "North Dakota","NE":"Nebraska","NH":"New Hampshire","NJ":"New Jersey","NM": "New Mexico",
"NV": "Nevada","NY": "New York","OH": "Ohio","OK": "Oklahoma","OR": "Oregon","PA":"Pennsylvania",
"RI":"Rhode Island","SC": "South Carolina","SD":"South Dakota","TN": "Tennessee","TX": "Texas",
"UT": "Utah","VA": "Virginia","VT":"Vermont","WA":"Washington","WI":"Wisconsin",
"WV": "West Virginia","WY":"Wyoming"
}
energy_consumption_df["State"] = energy_consumption_df["State"].map(dict)
energy_consumption_df.head()

Unnamed: 0.1,Unnamed: 0,Rank,State,"Total Energy Consumed per Capita, million Btu"
0,0,1,,932
1,1,2,,922
2,2,3,,875
3,3,4,,839
4,4,5,,517


In [7]:
# Save the dataframe as csv
energy_consumption_df.to_csv("Resources/Raw Data/US_Total_Energy_Consumption.csv")

## Artem - Webscraping & lats/Longs

In [None]:
# proving url address
url  = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population#cite_note-5"

# sleep for 1 second before reading url
time.sleep(1)

# scrape wiki url
wiki_data = pd.read_html(url)
# type(wiki_data)

# select the first table from the list of tables and convert to DataFrame
population_table = pd.DataFrame(wiki_data[0])

# dropping first level of multi index column headers
population_2021 = population_table.droplevel(0, axis=1) 

# selecting state and population as of 2021 columns
population_2021 = population_2021 [['State or territory','July 1, 2021']]

# rename columns 
population_2021 = population_2021.rename(columns=({'State or territory':'State','July 1, 2021':'Population'}))

# removing extra rows
population_2021 = population_2021[0:52]

# remving territories
population_2021 = population_2021.drop(labels=[29,49], axis=0)
population_2021.head()



## TBD (most likely Ryan + someone else) - Postgress -> push to DB