In [1]:
# Dependencies and Setup
import os
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

## Energy potential: Bhumi (read pdf and figure out terms) & Ryan Cheng (Cleaning the data) US_Renewable_Energy_Technical_Potential

In [2]:
# Read the raw renewable-energy technical-potential CSV into a DataFrame.
potential_energy_path = 'Resources/Raw Data/US_Renewable_Energy_Technical_Potential.csv'
potential_df = pd.read_csv(filepath_or_buffer=potential_energy_path)

In [3]:
# Drop every column whose name does not end in `_GWh`, then give the unnamed
# first column a proper header.
columns_to_drop = [
    'urbanUtilityScalePV_GW', 'urbanUtilityScalePV_km2',
    'ruralUtilityScalePV_GW', 'ruralUtilityScalePV_km2',
    'rooftopPV_GW',
    'CSP_GW', 'CSP_km2',
    'onshoreWind_GW', 'onshoreWind_km2',
    'offshoreWind_GW', 'offshoreWind_km2',
    'biopowerSolid_GW', 'biopowerSolid_BDT',
    'biopowerGaseous_GW', 'biopowerGaseous_Tonnes-CH4',
    'geothermalHydrothermal_GW', 'EGSGeothermal_GW',
    'hydropower_GW', 'hydropower_countOfSites',
]

# errors='ignore' keeps the cell idempotent: potential_df is overwritten here,
# so on a second run the columns are already gone and a plain drop() would
# raise KeyError. rename() already ignores labels that are absent.
potential_df = (
    potential_df
    .drop(columns=columns_to_drop, errors='ignore')
    .rename(columns={'Unnamed: 0': 'States'})
)
potential_df.head()

Unnamed: 0,States,urbanUtilityScalePV_GWh,ruralUtilityScalePV_GWh,rooftopPV_GWh,CSP_GWh,onshoreWind_GWh,offshoreWind_GWh,biopowerSolid_GWh,biopowerGaseous_GWh,geothermalHydrothermal_GWh,EGSGeothermal_GWh,hydropower_GWh
0,Alabama,35850,3706838,15475.0,0,283,0.0,11193,1533,0,535489.0,4102
1,Alaska,166,8282976,,0,1373433,,513,61,15437,,23675
2,Arizona,121305,11867693,22736.0,12544333,26036,,1087,837,8329,1239147.0,1303
3,Arkansas,28960,4986388,8484.0,0,22892,,14381,1063,0,628621.0,6093
4,California,246008,8855917,106411.0,8490916,89862,2662579.0,12408,15510,130921,1344179.0,30023


In [4]:
# Component columns that roll up into each energy family.
solar_list = [
    'urbanUtilityScalePV_GWh',
    'ruralUtilityScalePV_GWh',
    'rooftopPV_GWh',
    'CSP_GWh',
]
wind_list = ['onshoreWind_GWh', 'offshoreWind_GWh']
bio_list = ['biopowerSolid_GWh', 'biopowerGaseous_GWh']
geo_list = ['geothermalHydrothermal_GWh', 'EGSGeothermal_GWh']

# Row-wise totals per family. DataFrame.sum skips NaN by default, so a missing
# component contributes nothing to its row's total.
potential_df['Solar Energy Total Potential (GWh)'] = potential_df.loc[:, solar_list].sum(axis='columns')
potential_df['Wind Energy Total Potential (GWh)'] = potential_df.loc[:, wind_list].sum(axis='columns')
potential_df['Bio Energy Total Potential (GWh)'] = potential_df.loc[:, bio_list].sum(axis='columns')
potential_df['Geothermal Energy Total Potential (GWh)'] = potential_df.loc[:, geo_list].sum(axis='columns')

# Hydropower has a single source column, so its total is a straight copy.
potential_df['Hydropower Energy Total Potential (GWh)'] = potential_df['hydropower_GWh']
potential_df.head()

Unnamed: 0,States,urbanUtilityScalePV_GWh,ruralUtilityScalePV_GWh,rooftopPV_GWh,CSP_GWh,onshoreWind_GWh,offshoreWind_GWh,biopowerSolid_GWh,biopowerGaseous_GWh,geothermalHydrothermal_GWh,EGSGeothermal_GWh,hydropower_GWh,Solar Energy Total Potential,Wind Energy Total Potential,Bio Energy Total Potential,Geothermal Energy Total Potential,Hydropower Energy Total Potential
0,Alabama,35850,3706838,15475.0,0,283,0.0,11193,1533,0,535489.0,4102,3758163.0,283.0,12726,535489.0,4102
1,Alaska,166,8282976,,0,1373433,,513,61,15437,,23675,8283142.0,1373433.0,574,15437.0,23675
2,Arizona,121305,11867693,22736.0,12544333,26036,,1087,837,8329,1239147.0,1303,24556067.0,26036.0,1924,1247476.0,1303
3,Arkansas,28960,4986388,8484.0,0,22892,,14381,1063,0,628621.0,6093,5023832.0,22892.0,15444,628621.0,6093
4,California,246008,8855917,106411.0,8490916,89862,2662579.0,12408,15510,130921,1344179.0,30023,17699252.0,2752441.0,27918,1475100.0,30023


## Old  code for reference only

In [5]:

# Legacy Chicago-crimes ETL, kept for reference only. It reads CSVs from
# input_data/; the saved output of this cell is a FileNotFoundError for the
# 2019 file, so it does not run as-is.

# Load the three yearly crime extracts and the two code lookup tables.
crimes_19 = pd.read_csv('input_data/Chicago_Crimes_2019.csv')
crimes_20 = pd.read_csv('input_data/Chicago_Crimes_2020.csv')
crimes_21 = pd.read_csv('input_data/Chicago_Crimes_2021.csv')
fbi_code = pd.read_csv('input_data/FBI_Code.csv', encoding = "ISO-8859-1")
iucr_code = pd.read_csv('input_data/IUCR_Codes.csv', encoding = "ISO-8859-1")

# Combine the three years into a single frame. With no `on=` argument, merge
# joins on every column the frames share. This is done once; the original
# repeated the identical merge further down.
crimes_data = pd.merge(
    pd.merge(crimes_19, crimes_20, how="outer"),
    crimes_21,
    how="outer",
)
print(len(crimes_data))

# FBI code lookup: drop the columns at positions 2 and 3, name the remaining
# two columns, and remove duplicate rows.
df = fbi_code.drop(fbi_code.columns[[2, 3]], axis=1)
df.columns = ['code', 'description']
df = df.drop_duplicates()
# Optional CSV export for importing into pgAdmin (left disabled):
#df.to_csv(r'./Output_data/FBI_code.csv', encoding='utf-8', index=False)

# IUCR code lookup: drop the columns at positions 3 and 4, remove duplicate
# rows, and replace the spaces in the description headers with underscores.
df2 = (
    iucr_code
    .drop(iucr_code.columns[[3, 4]], axis=1)
    .drop_duplicates()
    .rename(columns={'PRIMARY DESCRIPTION': 'PRIMARY_DESCRIPTION',
                     'SECONDARY DESCRIPTION': 'SECONDARY_DESCRIPTION'})
)
# Optional CSV export for importing into pgAdmin (left disabled):
#df2.to_csv(r'./Output_data/ICUS_code.csv', encoding='utf-8', index=False)

# Crimes: drop the coordinate/location and 'Updated On' columns, then remove
# duplicate rows and any row that still has a missing value.
crimes_data = (
    crimes_data
    .drop(columns=['Latitude', 'Location', 'Longitude',
                   'X Coordinate', 'Y Coordinate', 'Updated On'])
    .drop_duplicates()
    .dropna()
)

def whitespace_remover(crimes_data):
    """Strip leading and trailing whitespace from every text cell, in place.

    Parameters
    ----------
    crimes_data : pandas.DataFrame
        Frame to clean. Its text columns are overwritten in place; the
        function returns None.

    Notes
    -----
    Only columns with object or string dtype are touched. Inside those
    columns, values that are not ``str`` (NaN, None, numbers in a mixed
    column) are passed through unchanged. The previous ``map(str.strip)``
    raised ``TypeError`` on the first such value.
    """
    for column in crimes_data.columns:
        values = crimes_data[column]
        is_text = (pd.api.types.is_object_dtype(values)
                   or pd.api.types.is_string_dtype(values))
        if not is_text:
            # Numeric, boolean, datetime, ... columns are left alone.
            continue
        crimes_data[column] = values.map(
            lambda cell: cell.strip() if isinstance(cell, str) else cell
        )
        
# Trim surrounding whitespace from the text columns; the helper edits the
# frame in place.
whitespace_remover(crimes_data)

# Replace the spaces in the multi-word column names with underscores.
crimes_data = crimes_data.rename(
    columns={
        'Case Number': 'Case_Number',
        'Location Description': 'Location_Description',
        'Community Area': 'Community_Area',
        'FBI Code': 'FBI_Code',
    }
)

# Optional CSV export for importing into pgAdmin (left disabled):
#crimes_data.to_csv(r'./Output_data/crimes_data.csv', encoding='utf-8', index=False)
crimes_data.head(20)

# Build the PostgreSQL connection. The password is read from the PGPASSWORD
# environment variable rather than being committed in the notebook, and is
# URL-quoted so special characters cannot break the connection URL.
db_password = quote_plus(os.environ["PGPASSWORD"])
connection_string = f"postgres:{db_password}@localhost:5432/new_crime_db"
engine = create_engine(f'postgresql://{connection_string}')

# Confirm the connection by listing the existing tables. Engine.table_names()
# was removed in SQLAlchemy 2.0; the inspector API replaces it.
print(inspect(engine).get_table_names())

# Lower-case the column names before loading.
# NOTE(review): presumably this matches the column names of the target
# tables; verify against the database schema.
df2 = df2.rename(columns={'IUCR': 'iucr',
                          'PRIMARY_DESCRIPTION': 'primary_description',
                          'SECONDARY_DESCRIPTION': 'secondary_description'})
# Positional assignment: the list length must equal the frame's column count.
crimes_data.columns = ['id', 'case_number', 'date', 'block', 'iucr', 'location_description',
                       'arrest', 'domestic', 'beat', 'district', 'district_name', 'district_population',
                       'ward', 'community_area', 'fbi_code'
                       ]

# Load the lookup tables first, then the crimes table.
# if_exists='append' adds rows to an existing table, so re-running this cell
# inserts the same rows again.
df2.to_sql(name='iucr_codes', con=engine, if_exists='append', index=False)
df.to_sql(name='fbi_description', con=engine, if_exists='append', index=False)
crimes_data.to_sql(name='crime_table', con=engine, if_exists='append', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'input_data/Chicago_Crimes_2019.csv'

## Energy_consumption: Rafael (TBD) & Ryan Callaghan (TBD) US_Total_Energy_Consumption

## Artem - Web scraping & lat/longs

In [None]:
# Wikipedia page listing U.S. states and territories by population.
url  = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population#cite_note-5"

# read_html fetches the page and returns a list of DataFrames, one per HTML
# table it can parse. Only one request is made, so no delay is needed first.
wiki_data = pd.read_html(url)

# The list items are already DataFrames, so no re-wrapping is needed.
# NOTE(review): assumes the population table is the first table on the page;
# verify, since the page layout can change.
population_table = wiki_data[0]

# Preview only; the full table is still available in `population_table`.
population_table.head()



## TBD (most likely Ryan + someone else) - Postgres -> push to DB