In [1]:
######################################## STEP 1 - Data acquisition #######################################################

##### ########## ######### Alternative Fuel Stations Dataset ######################################################

## Downloading the Alternative Fuel Station dataset

import json
import os

import numpy as np
import pandas as pd
import requests

# URL and API KEY
# NOTE(review): the API key is committed in this notebook. Supply it through the
# NREL_API_KEY environment variable instead; the embedded value is kept only as a
# fallback so existing runs keep working, and should be rotated and removed.
api_key = os.environ.get("NREL_API_KEY", "Nv1cnxwgtq1cIaOqIKYRTA3jg744nVGal8W7LbLh")
api_url = f"https://developer.nrel.gov/api/alt-fuel-stations/v1.csv?api_key={api_key}"

# File path to save the downloaded CSV file. It is defined before the request so
# the name always exists, whatever the outcome of the download.
file_path = "alternative_fuel_stations.csv"

# Make an HTTP GET request to the API endpoint. The timeout keeps a stalled
# connection from blocking the kernel forever.
response = requests.get(api_url, timeout=60)
print(response)

if response.status_code == 200:
    # Open the file in binary write mode and write the content of the response to it
    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"File downloaded and saved as '{file_path}'")
else:
    # Stop here with an explicit error: the following steps read the downloaded
    # file, so continuing would either fail obscurely or process stale data.
    raise RuntimeError(
        f"Failed to download the file. HTTP status code: {response.status_code}"
    )
    

#######  ######### Showing the dataset #########   ######
# Read the downloaded CSV into a DataFrame. low_memory=False turns off pandas'
# chunked parsing, which can otherwise infer mixed types within a column.
df_AlternativeFuelStations = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Preview the first rows of the freshly loaded dataset
preview_rows = df_AlternativeFuelStations.head()
print("Downloaded Dataset - Alternative Fuel Stations:")
display(preview_rows)

############################################################ STEP 2 - Data Processing ##########################################

######### ######    Remove duplicates ########### ########################
# Count rows that are identical across every column.
# DataFrame.duplicated() already treats missing values in the same column as
# equal, so no placeholder is needed. Filling NaNs with a string and replacing
# it back would turn numeric columns into object columns and would also erase
# any genuine 'temporary' value in the data.
duplicates = df_AlternativeFuelStations.duplicated().sum()
print("number of duplicate rows found: ", duplicates)

# Working copy of the untouched dataset, used by the coordinate-based duplicate
# inspection that follows.
df = df_AlternativeFuelStations.copy()

# The exact-row comparison removed nothing, so a KEY pair is used instead:
# two entries are treated as duplicates when they share 'Latitude' and 'Longitude'.
# NOTE(review): coordinate-only matching also discards differently named stations
# that share a location - confirm this is the intended definition of a duplicate.
coordinate_key = ['Latitude', 'Longitude']

# Flag every member of a repeated coordinate pair (keep=False marks all of them),
# then order the flagged rows so matching coordinates sit next to each other
shares_coordinates = df.duplicated(subset=coordinate_key, keep=False)
duplicates = df.loc[shares_coordinates]
sorted_duplicates = duplicates.sort_values(by=coordinate_key)

# Show a sample of the rows that share coordinates
print("Duplicate rows based on Latitude and Longitude, ordered:")
display(sorted_duplicates.head())

# Row count before de-duplication
num_total = df_AlternativeFuelStations.shape[0]
display("Total number of entries BEFORE removing duplicates", num_total)

# Keep only the first row of each 'Latitude'/'Longitude' pair
df_AlternativeFuelStations = df_AlternativeFuelStations.drop_duplicates(subset=coordinate_key)

# Row count after de-duplication
num_total = df_AlternativeFuelStations.shape[0]
display("Total number of entries AFTER removing duplicates", num_total)

# Preview the de-duplicated DataFrame
display(df_AlternativeFuelStations.head())

############### ############### Removing Outliers ############## ##############################

# Keep only the rows whose 'Country' is US or CA; every other row is discarded
allowed_countries = ['US', 'CA']
in_allowed_country = df_AlternativeFuelStations['Country'].isin(allowed_countries)
df_AlternativeFuelStations = df_AlternativeFuelStations[in_allowed_country]
display(df_AlternativeFuelStations.head())


################################################### STEP 3 - DATA TRANSFORMATION ##########################################
# STEP 3 -- Data Transformation
# Convert the three date-like columns from their loaded dtype to pandas datetimes.
date_columns = ['Date Last Confirmed', 'Updated At', 'Open Date']

# Record the dtypes before the conversion (same order as date_columns)
date_column_dtype, date_column_dtype2, date_column_dtype3 = (
    df_AlternativeFuelStations[column].dtypes for column in date_columns
)

print("before: Data type of 'Date Last Confirmed ' column:", date_column_dtype)
print("before: Data type of 'Updated At' column:", date_column_dtype2)
print("before: Data type of 'Open Date' column:", date_column_dtype3)

# Parse each column with pd.to_datetime. assign() returns a new DataFrame instead
# of writing into the frame produced by the earlier boolean filter, which avoids
# pandas' SettingWithCopyWarning; existing columns keep their position.
df_AlternativeFuelStations = df_AlternativeFuelStations.assign(
    **{column: pd.to_datetime(df_AlternativeFuelStations[column]) for column in date_columns}
)

# Record the dtypes again to confirm the conversion took effect
date_column_dtype, date_column_dtype2, date_column_dtype3 = (
    df_AlternativeFuelStations[column].dtypes for column in date_columns
)

print("after: Data type of 'Date Last Confirmed ' column:", date_column_dtype)
print("after: Data type of 'Updated At' column:", date_column_dtype2)
print("after: Data type of 'Open Date' column:", date_column_dtype3)

# Show the converted columns to confirm the values are still intact
selected_columns = df_AlternativeFuelStations[date_columns]
display(selected_columns.head())


################### Saving the file ###################
# Write the transformed DataFrame to disk. The header row is written by default,
# and index=False leaves the row index out of the file.
csv_file = "AlternativeFuelStations_Transformed.csv"
df_AlternativeFuelStations.to_csv(path_or_buf=csv_file, index=False)

print("DataFrame saved to CSV file with header.")


<Response [200]>
File downloaded and saved as 'alternative_fuel_stations.csv'
Downloaded Dataset - Alternative Fuel Stations:


Unnamed: 0,Fuel Type Code,Station Name,Street Address,Intersection Directions,City,State,ZIP,Plus4,Station Phone,Status Code,...,Restricted Access,RD Blends,RD Blends (French),RD Blended with Biodiesel,RD Maximum Biodiesel Level,NPS Unit Name,CNG Station Sells Renewable Natural Gas,LNG Station Sells Renewable Natural Gas,Maximum Vehicle Class,EV Workplace Charging
0,CNG,Spire - Montgomery Operations Center,2951 Chestnut St,,Montgomery,AL,36107,,,E,...,,,,,,,False,,MD,
1,CNG,Metropolitan Atlanta Rapid Transit Authority,2424 Piedmont Rd NE,,Atlanta,GA,30324,,,E,...,,,,,,,,,LD,
2,CNG,United Parcel Service,270 Marvin Miller Dr,,Atlanta,GA,30336,,,E,...,,,,,,,,,HD,
3,CNG,Arkansas Oklahoma Gas Corp,2100 S Waldron Rd,,Fort Smith,AR,72903,,479-783-3181,E,...,False,,,,,,False,,MD,
4,CNG,Clean Energy - Logan International Airport,1000 Cottage St Ext,"From Route 1, take the first exit after Callah...",East Boston,MA,2128,,866-809-4869,E,...,False,,,,,,True,,MD,


number of duplicate rows found:  0
Duplicate rows based on Latitude and Longitude, ordered:


Unnamed: 0,Fuel Type Code,Station Name,Street Address,Intersection Directions,City,State,ZIP,Plus4,Station Phone,Status Code,...,Restricted Access,RD Blends,RD Blends (French),RD Blended with Biodiesel,RD Maximum Biodiesel Level,NPS Unit Name,CNG Station Sells Renewable Natural Gas,LNG Station Sells Renewable Natural Gas,Maximum Vehicle Class,EV Workplace Charging
31560,ELEC,Davenport - 3,NA NA,,,FL,0,,855-900-7584,E,...,,,,,,,,,,False
31561,ELEC,Davenport - 4,NA NA,,,FL,0,,855-900-7584,E,...,,,,,,,,,,False
31574,ELEC,Orlando - 4,NA NA,,,FL,0,,855-900-7584,E,...,,,,,,,,,,False
43960,ELEC,Home2Suites Asheville Airport,NA NA,,,NC,0,,855-900-7584,E,...,,,,,,,,,,False
7733,ELEC,Kona Nissan,76-6353 Kuakini Hwy,,Kailua-Kona,HI,96740,,808-329-4408,E,...,False,,,,,,,,,False


'Total number of entries BEFORE removing duplicates'

78437

'Total number of entries AFTER removing duplicates'

76077

Unnamed: 0,Fuel Type Code,Station Name,Street Address,Intersection Directions,City,State,ZIP,Plus4,Station Phone,Status Code,...,Restricted Access,RD Blends,RD Blends (French),RD Blended with Biodiesel,RD Maximum Biodiesel Level,NPS Unit Name,CNG Station Sells Renewable Natural Gas,LNG Station Sells Renewable Natural Gas,Maximum Vehicle Class,EV Workplace Charging
0,CNG,Spire - Montgomery Operations Center,2951 Chestnut St,,Montgomery,AL,36107,,,E,...,,,,,,,False,,MD,
1,CNG,Metropolitan Atlanta Rapid Transit Authority,2424 Piedmont Rd NE,,Atlanta,GA,30324,,,E,...,,,,,,,,,LD,
2,CNG,United Parcel Service,270 Marvin Miller Dr,,Atlanta,GA,30336,,,E,...,,,,,,,,,HD,
3,CNG,Arkansas Oklahoma Gas Corp,2100 S Waldron Rd,,Fort Smith,AR,72903,,479-783-3181,E,...,False,,,,,,False,,MD,
4,CNG,Clean Energy - Logan International Airport,1000 Cottage St Ext,"From Route 1, take the first exit after Callah...",East Boston,MA,2128,,866-809-4869,E,...,False,,,,,,True,,MD,


Unnamed: 0,Fuel Type Code,Station Name,Street Address,Intersection Directions,City,State,ZIP,Plus4,Station Phone,Status Code,...,Restricted Access,RD Blends,RD Blends (French),RD Blended with Biodiesel,RD Maximum Biodiesel Level,NPS Unit Name,CNG Station Sells Renewable Natural Gas,LNG Station Sells Renewable Natural Gas,Maximum Vehicle Class,EV Workplace Charging
0,CNG,Spire - Montgomery Operations Center,2951 Chestnut St,,Montgomery,AL,36107,,,E,...,,,,,,,False,,MD,
1,CNG,Metropolitan Atlanta Rapid Transit Authority,2424 Piedmont Rd NE,,Atlanta,GA,30324,,,E,...,,,,,,,,,LD,
2,CNG,United Parcel Service,270 Marvin Miller Dr,,Atlanta,GA,30336,,,E,...,,,,,,,,,HD,
3,CNG,Arkansas Oklahoma Gas Corp,2100 S Waldron Rd,,Fort Smith,AR,72903,,479-783-3181,E,...,False,,,,,,False,,MD,
4,CNG,Clean Energy - Logan International Airport,1000 Cottage St Ext,"From Route 1, take the first exit after Callah...",East Boston,MA,2128,,866-809-4869,E,...,False,,,,,,True,,MD,


before: Data type of 'Date Last Confirmed ' column: object
before: Data type of 'Updated At' column: object
before: Data type of 'Open Date' column: object
after: Data type of 'Date Last Confirmed ' column: datetime64[ns]
after: Data type of 'Updated At' column: datetime64[ns, UTC]
after: Data type of 'Open Date' column: datetime64[ns]


Unnamed: 0,Date Last Confirmed,Updated At,Open Date
0,2023-04-06,2023-05-30 18:46:28+00:00,2010-12-01
1,2023-01-10,2023-08-01 22:53:06+00:00,1996-12-15
2,2022-06-14,2023-08-03 03:41:49+00:00,1997-01-01
3,2023-08-10,2023-08-10 16:58:49+00:00,1997-01-01
4,2023-09-14,2023-09-14 14:01:49+00:00,1996-11-15


DataFrame saved to CSV file with header.
