In [61]:
# IPython Imports
# `display`, `HTML` and `clear_output` all come from the public
# `IPython.display` module; importing `display` from `IPython.core.display`
# is deprecated.
from IPython.display import HTML, clear_output, display

# Used to make notebook wider, comment out for normal notebook
display(HTML("<style>.container { width:90% !important; }</style>"))

In [62]:
# Import Libraries
import csv
from typing import Any, Callable, Dict, List, Optional

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [63]:
# GLOBAL HELPER FUNCTIONS
def print_ref(d_obj:Dict[str, Any], row_len:int=4) -> None:
    """Print a banner followed by the keys of ``d_obj``.

    Each key is rendered as a `` key |`` cell. Once ``row_len`` cells have
    been written to the current line, a line break is inserted before the
    next key.

    Args:
        d_obj: Dictionary whose keys are listed; its values are never read.
        row_len: Number of keys written per output line.

    Returns:
        None. The assembled text is written with ``print``.
    """
    # Plain string literal: the banner has no placeholders, so no f-prefix.
    ref = (
        " ---------------------------\n"
        "| OBJECT KEYS FOR REFERENCE |\n"
        " ---------------------------\n"
    )
    row = 0
    for key in d_obj.keys():
        if row == row_len:
            # The current line is full: wrap before adding this key.
            row = 0
            ref += "\n"
        ref += f" {key} |"
        row += 1
    print(ref)
    
def parse_col(col_list:List[str], data_frame:Optional[pd.DataFrame]=None) -> List[str]:
    """Interactively choose columns by answering a y/n prompt for each one.

    For every entry of ``col_list`` the user is shown the item's position,
    the column name and (when available) an example value, then asked to
    answer ``y`` or ``n``. The notebook output is cleared after each answer.

    Args:
        col_list: Column names to ask about, in order.
        data_frame: Optional frame used to look up an example value (the
            first row of each column). When omitted or empty, the example
            is shown as the text ``None``.

    Returns:
        The column names for which the answer was ``y`` (case-insensitive,
        surrounding whitespace ignored), in their original order.
    """
    # Total shown in the "Item #: i/total" counter; this is exactly the number
    # of columns (the previous `+ 1` made the last item read e.g. "3/4").
    res, len_ = [], len(col_list)
    # `.iloc[0]` raises IndexError on a frame without rows, so only sample
    # example values when there is at least one row to read.
    has_rows = data_frame is not None and len(data_frame) > 0
    for num, col in enumerate(col_list, start=1):
        example = data_frame[col].iloc[0] if has_rows else "None"
        ans = input(f"""
Item #: {num}/{len_}
Column: {col}
Example: {example}
[y/n] Default[n] -> """).strip().lower()
        clear_output()
        # Anything other than "y" (including an empty answer) means "no".
        if ans == "y":
            res.append(col)
    return res
            

In [64]:
# GLOBAL VARS
# Bucket name passed to the S3 client when fetching the dataset
BKT_NAME = "ds-data-2020"

# DATASET NAMES
the_cc = "thecarconnectiondataset.csv"  # Main Dataset
motortrend = "motortrend1974.csv"  # Currently Not Using
car_feat_msrp = "carfeaturesmsrp.csv"  # Currently Not Using

# SET DATA PATH/KEYS
# Object key of the dataset that gets loaded; point it at another name above to switch
KEY = the_cc

In [65]:
# Build the boto3 S3 client
s3 = boto3.client("s3")
# Request the object stored under KEY in the bucket; the response is indexed like a dict
obj = s3.get_object(Key=KEY, Bucket=BKT_NAME)

# Print the response's keys for reference
print_ref(obj)


 ---------------------------
| OBJECT KEYS FOR REFERENCE |
 ---------------------------
 ResponseMetadata | AcceptRanges | LastModified | ContentLength |
 ETag | ContentType | Metadata | Body |


In [66]:
# Chunk size reserved for a possible chunked read (the read below does not use it)
chk_size = 2000


# Parse the response body as CSV with no header row and the first column as
# the index, then transpose the resulting pd.DataFrame
auto_df = pd.read_csv(
    obj["Body"],
    header=None,
    index_col=0,
    low_memory=False,
).T

In [67]:
# Give the first column (noted by the author as a known NaN label) the name "Model"
auto_df.rename(
    columns={auto_df.columns[0]: "Model"},
    inplace=True,
)

In [68]:
# Get only wanted columns
# col = parse_col(auto_df.columns, auto_df)

# col = ["Model", 'MSRP', 'Gas Mileage', 'Engine', 'EPA Class', 'Style Name', 'Drivetrain', 'Passenger Capacity', 'Passenger Doors', 'Body Style', 'Transmission', 'Base Curb Weight (lbs)', 'Wheelbase (in)', 'Min Ground Clearance (in)', 'Track Width, Front (in)', 'Track Width, Rear (in)', 'Height, Overall (in)', 'Fuel Economy Est-Combined (MPG)', 'SAE Net Torque @ RPM', 'Fuel System', 'Engine Type', 'SAE Net Horsepower @ RPM', 'Displacement', 'First Gear Ratio (:1)', 'Sixth Gear Ratio (:1)', 'Fourth Gear Ratio (:1)', 'Seventh Gear Ratio (:1)', 'Second Gear Ratio (:1)', 'Reverse Ratio (:1)', 'Fifth Gear Ratio (:1)', 'Eighth Gear Ratio (:1)', 'Third Gear Ratio (:1)', 'Final Drive Axle Ratio (:1)', 'Steering Type', 'Front Tire Size', 'Rear Tire Size']

In [69]:
# Save wanted col to txt file
# with open("wanted_col.txt", "w") as text_file:
#     for item in col:
#         text_file.write(item + "\n")
    

In [71]:
# Get list of column from wanted_col.txt (one column name per line)
# A context manager closes the file handle as soon as the names are read;
# the bare open(...).read() never closed it explicitly.
with open("wanted_col.txt", "r") as col_file:
    wanted_col = col_file.read().splitlines()
# Extract columns to new df (copy so later edits do not touch auto_df)
auto_df_clean = auto_df[wanted_col].copy()

In [72]:
# Preview the first rows of the frame holding only the wanted columns
auto_df_clean.head()

Unnamed: 0,Model,MSRP,Gas Mileage,Engine,EPA Class,Style Name,Drivetrain,Passenger Capacity,Passenger Doors,Body Style,...,Seventh Gear Ratio (:1),Second Gear Ratio (:1),Reverse Ratio (:1),Fifth Gear Ratio (:1),Eighth Gear Ratio (:1),Third Gear Ratio (:1),Final Drive Axle Ratio (:1),Steering Type,Front Tire Size,Rear Tire Size
1,2019 Acura RDX Specs: FWD w/Technology Pkg,"$40,600",22 mpg City/28 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,FWD w/Technology Pkg,Front Wheel Drive,5,4,Sport Utility,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P235/55HR19,P235/55HR19
2,2019 Acura RDX Specs: FWD w/Advance Pkg,"$45,500",22 mpg City/28 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,FWD w/Advance Pkg,Front Wheel Drive,5,4,Sport Utility,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P235/55HR19,P235/55HR19
3,2019 Acura RDX Specs: FWD w/A-Spec Pkg,"$43,600",22 mpg City/27 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,FWD w/A-Spec Pkg,Front Wheel Drive,5,4,Sport Utility,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P255/45VR20,P255/45VR20
4,2019 Acura RDX Specs: FWD,"$37,400",22 mpg City/28 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,FWD,Front Wheel Drive,5,4,Sport Utility,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P235/55HR19,P235/55HR19
5,2019 Acura RDX Specs: AWD w/Technology Pkg,"$42,600",21 mpg City/27 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 4WD,AWD w/Technology Pkg,All Wheel Drive,5,4,Sport Utility,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P235/55HR19,P235/55HR19
