In [1]:
######################################## STEP 1 - Data acquisition #######################################################

##### ########## ######### Vehicle Fuel Economy Dataset ######################################################

## Downloading the Vehicle Fuel Economy dataset
#import numpy as np

import pandas as pd
import requests
import numpy as np 
from datetime import datetime

# API endpoint URL of the Vehicle Fuel Economy CSV export
api_url = "https://www.fueleconomy.gov/feg/epadata/vehicles.csv"

# Local path of the CSV file. It is defined before the request (not inside the
# success branch) so the load step below can always reference it; when it was
# only assigned after a successful download, a failed request left the name
# undefined and the notebook stopped with a NameError that hid the real cause.
file_path = "vehicleFuelEconomy.csv"

# Make an HTTP GET request to the API endpoint. requests applies no timeout by
# default, so one is set explicitly to keep a stalled connection from hanging
# the notebook.
response = requests.get(api_url, timeout=60)
print(response)

if response.status_code == 200:
    # Open the file in binary write mode and write the response body to it
    with open(file_path, "wb") as file:
        file.write(response.content)

    print(f"File downloaded and saved as '{file_path}'")
else:
    print(f"Failed to download the file. HTTP status code: {response.status_code}")
    # Best effort: fall through to the load step, which uses a copy saved by an
    # earlier run if one exists.
    print(f"Trying to load a previously downloaded copy from '{file_path}'")



################ Showing the dataset ###############
# Load the dataset into a DataFrame. If the download failed and no earlier copy
# exists on disk, read_csv raises FileNotFoundError for `file_path`.
df_VehicleFuelEconomyInfo = pd.read_csv(file_path, low_memory=False)
print(" Downloaded Dataset - Vehicle Fuel Economy Information:")
display(df_VehicleFuelEconomyInfo.head())  # Display the first few rows of the DataFrame



############################################################### STEP 2 - DATA PROCESSING ######################################
######## ############    Remove outliers ################## ####################

# Inspect which categories occur in the 'atvType' column
unique_atv_types = df_VehicleFuelEconomyInfo['atvType'].unique()
print(unique_atv_types)


# Keep only the rows whose 'atvType' is exactly "EV" (missing values compare
# as not equal and are therefore dropped)
is_ev_row = df_VehicleFuelEconomyInfo['atvType'].eq('EV')
filtered_df = df_VehicleFuelEconomyInfo[is_ev_row]

# Column used for outlier detection and the maximum absolute Z-score allowed
column_to_clean = 'cityE'
threshold = 2.0

# Absolute Z-score of every EV row: |value - mean| / standard deviation,
# with mean and standard deviation taken over the EV rows only
ev_values = filtered_df[column_to_clean]
z_scores = ((ev_values - ev_values.mean()) / ev_values.std()).abs()

# Drop the rows whose Z-score is above the threshold
within_threshold = z_scores.le(threshold)
filtered_df = filtered_df[within_threshold]

# Show the cleaned DataFrame
display(filtered_df)


###### Null values Handling ############### ######

## pandas already loads missing (NULL) values from the CSV as NaN, so they are kept as NaN and not imputed or dropped

######################################## STEP 3 - DATA TRANSFORMATION ###########################################

# Work on an independent copy of the cleaned frame. `filtered_df` was produced
# by boolean indexing, so adding a column to it through a plain alias triggers
# pandas' SettingWithCopyWarning and also changes `filtered_df` itself, which
# makes re-running this step non-idempotent. With the copy, `filtered_df`
# stays the cleaned-only frame.
df_VehicleFuelEconomyInfo = filtered_df.copy()

# Min-Max normalization of the 'youSaveSpend' column:
#     NormalizedSavings = (value - min) / (max - min)
# Identify the minimum and maximum values in the 'youSaveSpend' column
min_savings = df_VehicleFuelEconomyInfo['youSaveSpend'].min()
max_savings = df_VehicleFuelEconomyInfo['youSaveSpend'].max()
savings_range = max_savings - min_savings

# Distance of every value from the minimum (missing values stay NaN)
centered_savings = df_VehicleFuelEconomyInfo['youSaveSpend'] - min_savings

if savings_range == 0:
    # Every value is identical, so dividing would be 0/0. The centered values
    # are already all zero, so use them directly (as floats) instead of
    # producing a column of NaN.
    df_VehicleFuelEconomyInfo['NormalizedSavings'] = centered_savings.astype(float)
else:
    # Apply Min-Max scaling to the 'youSaveSpend' column
    df_VehicleFuelEconomyInfo['NormalizedSavings'] = centered_savings / savings_range

# Display the DataFrame with the normalized 'youSaveSpend' column
display(df_VehicleFuelEconomyInfo.head())


########## ###################  new column -- CAR AGE  ################ ############################
# Car age = calendar year at the time the notebook runs minus the value in the
# 'year' column, so the result depends on when the notebook is executed
current_year = datetime.today().year
df_VehicleFuelEconomyInfo['CarAge'] = df_VehicleFuelEconomyInfo['year'].rsub(current_year)

# Preview the first rows, now including the 'CarAge' column
display(df_VehicleFuelEconomyInfo.head())


############# ############# remove OLD CARS ############# ###################
# Define the threshold: rows with a 'year' value below it count as "old cars"
old_threshold = 1985

# Remove rows for "old cars" (keep rows whose 'year' is at or above the threshold)
df = df_VehicleFuelEconomyInfo[df_VehicleFuelEconomyInfo['year'] >= old_threshold]
display(df.head())


################### Saving the file ###################
csv_file = "VehicleFuelEconomy_Transformed.csv"

# Save the frame with the old cars removed. `df` is written rather than
# `df_VehicleFuelEconomyInfo`; writing the latter would silently discard the
# filter applied above and leave the old cars in the exported file.
# to_csv() includes the header by default; index=False excludes the index column.
df.to_csv(csv_file, index=False)

print("DataFrame VehicleFuelEconomy saved to CSV file with header.")



<Response [200]>
File downloaded and saved as 'vehicleFuelEconomy.csv'
 Downloaded Dataset - Vehicle Fuel Economy Information:


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,14.167143,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,27.046364,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,11.018889,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,27.046364,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,15.658421,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


[nan 'Diesel' 'Hybrid' 'Bifuel (CNG)' 'CNG' 'FFV' 'EV' 'Bifuel (LPG)'
 'Plug-in Hybrid']


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
7138,0.0960,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0
7139,0.1128,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0
8143,0.1128,0.0,0.0,0.0,81,0.0000,0,0.0,0.0,41.0000,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0
8144,0.1248,0.0,0.0,0.0,74,0.0000,0,0.0,0.0,46.0000,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0
8147,0.1080,0.0,0.0,0.0,84,0.0000,0,0.0,0.0,40.0000,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40674,0.1032,0.0,0.0,13.0,82,81.7384,0,0.0,0.0,41.2352,...,RIV,,0.0,,Mon Jul 31 00:00:00 EDT 2023,Mon Jul 31 00:00:00 EDT 2023,N,0,0,0
40675,0.1056,0.0,0.0,13.0,80,80.2542,0,0.0,0.0,41.9978,...,RIV,,0.0,,Mon Jul 31 00:00:00 EDT 2023,Mon Jul 31 00:00:00 EDT 2023,N,0,0,0
40676,0.1056,0.0,0.0,13.0,80,80.2542,0,0.0,0.0,41.9978,...,RIV,,0.0,,Mon Jul 31 00:00:00 EDT 2023,Mon Jul 31 00:00:00 EDT 2023,N,0,0,0
40677,0.0936,0.0,0.0,12.0,89,89.3900,0,0.0,0.0,37.7056,...,VTP,,0.0,,Mon Jul 31 00:00:00 EDT 2023,Mon Jul 31 00:00:00 EDT 2023,N,0,0,0


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb,NormalizedSavings
7138,0.096,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.5
7139,0.1128,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.285714
8143,0.1128,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.285714
8144,0.1248,0.0,0.0,0.0,74,0.0,0,0.0,0.0,46.0,...,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.142857
8147,0.108,0.0,0.0,0.0,84,0.0,0,0.0,0.0,40.0,...,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.357143


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb,NormalizedSavings,CarAge
7138,0.096,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.5,23
7139,0.1128,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.285714,23
8143,0.1128,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.285714,22
8144,0.1248,0.0,0.0,0.0,74,0.0,0,0.0,0.0,46.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.142857,22
8147,0.108,0.0,0.0,0.0,84,0.0,0,0.0,0.0,40.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.357143,22


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb,NormalizedSavings,CarAge
7138,0.096,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.5,23
7139,0.1128,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.285714,23
8143,0.1128,0.0,0.0,0.0,81,0.0,0,0.0,0.0,41.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.285714,22
8144,0.1248,0.0,0.0,0.0,74,0.0,0,0.0,0.0,46.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.142857,22
8147,0.108,0.0,0.0,0.0,84,0.0,0,0.0,0.0,40.0,...,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jun 02 00:00:00 EDT 2020,N,0,0,0,0.357143,22


DataFrame VehicleFuelEconomy saved to CSV file with header.
