In [2]:
# IPython Imports
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, clear_output, display

# Used to make notebook wider, comment out for normal notebook
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
# Import Libraries
import csv
import re
from typing import Any, Callable, Dict, List, Optional

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
# GLOBAL HELPER FUNCTIONS
def print_ref(d_obj: Dict[str, Any], row_len: int = 4) -> None:
    """Print the keys of ``d_obj`` under a banner, ``row_len`` keys per line.

    Args:
        d_obj: Mapping whose keys are listed; its values are never read.
        row_len: Number of keys written on a line before a line break is
            inserted.

    Returns:
        None. The text is written to stdout with ``print``.
    """
    ref = \
""" ---------------------------
| OBJECT KEYS FOR REFERENCE |
 ---------------------------
"""
    row = 0
    for key in d_obj:
        if row == row_len:
            # The current line already holds `row_len` keys: start a new one.
            row = 0
            ref += "\n"
        ref += f" {key} |"
        row += 1
    print(ref)
    
def parse_col(col_list: List[str], data_frame: Optional[pd.DataFrame] = None) -> List[str]:
    """Interactively pick columns: prompt once per column and keep the "y" answers.

    Args:
        col_list: Column names to ask about, in order.
        data_frame: Optional frame used to show the first value of each column
            as an example. When it is omitted or has no rows, the example is
            shown as the text "None".

    Returns:
        The columns for which the answer was "y" (case-insensitive), in the
        order they were asked. Any other answer skips the column.
    """
    selected: List[str] = []
    # Total shown in the progress counter; counting from 1, the last prompt
    # reads N/N.
    total = len(col_list)
    # Only look up an example when there is at least one row to read it from.
    has_rows = data_frame is not None and len(data_frame) > 0
    for num, col in enumerate(col_list, start=1):
        example = data_frame[col].iloc[0] if has_rows else "None"
        ans = input(f"""
Item #: {num}/{total}
Column: {col}
Example: {example}
[y/n] Default[n] -> """).lower()
        # Clear the previous prompt so only the current question is visible.
        clear_output()
        if ans == "y":
            selected.append(col)
    return selected


In [5]:
# GLOBAL VARS
# Name of the S3 bucket passed as `Bucket` when the dataset is fetched
BKT_NAME = 'ds-data-2020'

# DATASET NAMES
the_cc = "thecarconnectiondataset.csv" # Main Dataset
motortrend = "motortrend1974.csv" # Currently Not Using
car_feat_msrp = "carfeaturesmsrp.csv" # Currently Not Using

# SET DATA PATH/KEYS
# KEY is the object key requested from the bucket; point it at another
# dataset name above to load a different file
KEY = the_cc

In [6]:
# Create boto3 s3 Client Object
s3 = boto3.client('s3')
# Request the dataset object from the bucket; the response is indexed like a
# dict, and its 'Body' entry is what gets handed to pd.read_csv afterwards
obj = s3.get_object(Bucket=BKT_NAME, Key=KEY)

# Print the keys of the response for reference
print_ref(obj)


 ---------------------------
| OBJECT KEYS FOR REFERENCE |
 ---------------------------
 ResponseMetadata | AcceptRanges | LastModified | ContentLength |
 ETag | ContentType | Metadata | Body |


In [7]:
# Chunk size kept around in case the read has to be chunked;
# the read below does not use it
chk_size = 2000


# Read the bucket file as a header-less CSV indexed by its first column,
# then transpose so that those index labels become the column labels
auto_df = pd.read_csv(
    obj["Body"],
    index_col=0,
    header=None,
    low_memory=False,
).transpose()

In [8]:
# The first column is a known NaN-labelled column: name it "Model"
auto_df.rename({auto_df.columns[0]: "Model"}, axis="columns", inplace=True)

In [9]:
# Get only wanted columns
# col = parse_col(auto_df.columns, auto_df)

# col = ["Model", 'MSRP', 'Gas Mileage', 'Engine', 'EPA Class', 'Style Name', 'Drivetrain', 'Passenger Capacity', 'Passenger Doors', 'Body Style', 'Transmission', 'Base Curb Weight (lbs)', 'Wheelbase (in)', 'Min Ground Clearance (in)', 'Track Width, Front (in)', 'Track Width, Rear (in)', 'Height, Overall (in)', 'Fuel Economy Est-Combined (MPG)', 'SAE Net Torque @ RPM', 'Fuel System', 'Engine Type', 'SAE Net Horsepower @ RPM', 'Displacement', 'First Gear Ratio (:1)', 'Sixth Gear Ratio (:1)', 'Fourth Gear Ratio (:1)', 'Seventh Gear Ratio (:1)', 'Second Gear Ratio (:1)', 'Reverse Ratio (:1)', 'Fifth Gear Ratio (:1)', 'Eighth Gear Ratio (:1)', 'Third Gear Ratio (:1)', 'Final Drive Axle Ratio (:1)', 'Steering Type', 'Front Tire Size', 'Rear Tire Size']

In [10]:
# Save wanted col to txt file
# with open("data/wanted_col.txt", "w") as text_file:
#     for item in col:
#         text_file.write(item + "\n")

In [11]:
# Get list of column from wanted_col.txt (one column name per line)
# Read through a context manager so the file handle is closed as soon as
# the names have been loaded
with open("data/wanted_col.txt", "r") as wanted_col_file:
    wanted_col = wanted_col_file.read().splitlines()
# Extract columns to new df; the copy keeps later edits from touching auto_df
auto_df_clean = auto_df[wanted_col].copy()

In [12]:
# Keep only the text before the first "Specs" in each Model value,
# which leaves the year and the name
auto_df_clean["Model"] = [
    title.split("Specs", 1)[0] for title in auto_df_clean["Model"]
]

In [13]:
# Create Year Column from Model Column
# The year is taken as the first run of non-whitespace characters.
# The pattern is a raw string: "\s" inside a plain string literal is an
# invalid escape sequence.
auto_df_clean.insert(0, "Year", [re.search(r"\S+", title).group() for title in auto_df_clean["Model"]])

In [14]:
# Clean Model Column to only contain name
# group(1) keeps only the text AFTER the first whitespace character, so the
# value no longer starts with the separator that followed the year
# (plain .group() returned the whole match, leading whitespace included).
auto_df_clean["Model"] = [re.search(r"\s(.*)", title).group(1) for title in auto_df_clean["Model"]]

In [15]:
# Create Brand Column from Model Column
# The brand is taken as the first run of non-whitespace characters (raw-string
# pattern avoids the invalid "\s" escape of a plain literal).
# NOTE(review): only the first word is kept, so multi-word makes are cut short
# (the unique brands listed further down include 'Alfa', 'Aston' and 'Land').
auto_df_clean.insert(1, "Brand", [re.search(r"\S+", title).group() for title in auto_df_clean["Model"]])

In [29]:
# Clean Model Column to only contain model
# Remove the first word (the brand) together with any whitespace around it,
# then trim. Leading whitespace in the value is tolerated, so a single run of
# this cell is enough to leave just the model text.
# NOTE(review): only one word is removed; for a multi-word make the remaining
# words of the make presumably stay in Model -- confirm against the data.
auto_df_clean["Model"] = [
    re.sub(r"^\s*\S+\s*", "", title, count=1).strip()
    for title in auto_df_clean["Model"]
]

In [17]:
# Get rid of all nan values in Gas Mileage
# NOTE: despite its name, `nans` is True for the rows that DO have a
# Gas Mileage value -- those are the rows that are kept.
nans = pd.notnull(auto_df_clean['Gas Mileage'])
# Take an explicit copy: this frame gets new and reassigned columns later on,
# and a filtered selection without .copy() can trigger SettingWithCopyWarning.
auto_df_clean = auto_df_clean[nans].copy()

In [18]:
# Split Gas Mileage into City and Hwy
# City: text before the first "/", cut at its first space.
auto_df_clean.insert(
    4,
    "Gas Mileage (City)",
    [rating.split("/", 1)[0].split(" ", 1)[0] for rating in auto_df_clean["Gas Mileage"]],
)
# Hwy: text after the first "/" (empty when there is no "/"), cut at its first space.
auto_df_clean.insert(
    5,
    "Gas Mileage (Hwy)",
    [rating.partition("/")[-1].split(" ", 1)[0] for rating in auto_df_clean["Gas Mileage"]],
)

In [19]:
# Convert both mileage columns from text to numbers.
# pd.to_numeric picks the dtype per column (the info() output further down
# shows int64 for City and float64 for Hwy, which has a few missing values).
for mileage_col in ('Gas Mileage (City)', 'Gas Mileage (Hwy)'):
    auto_df_clean[mileage_col] = pd.to_numeric(auto_df_clean[mileage_col])

In [20]:
# The combined "Gas Mileage" text column is no longer needed once it has been
# split into City/Hwy. Reassign instead of dropping in place, and ignore a
# missing column so re-running this cell does not raise KeyError.
auto_df_clean = auto_df_clean.drop(columns=["Gas Mileage"], errors="ignore")

In [30]:
# Preview the first five rows of the cleaned frame
auto_df_clean.head()

Unnamed: 0,Year,Brand,Model,MSRP,Gas Mileage (City),Gas Mileage (Hwy),Engine,EPA Class,Style Name,Drivetrain,...,Seventh Gear Ratio (:1),Second Gear Ratio (:1),Reverse Ratio (:1),Fifth Gear Ratio (:1),Eighth Gear Ratio (:1),Third Gear Ratio (:1),Final Drive Axle Ratio (:1),Steering Type,Front Tire Size,Rear Tire Size
1,2019,Acura,RDX,"$40,600",22,28.0,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,FWD w/Technology Pkg,Front Wheel Drive,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P235/55HR19,P235/55HR19
2,2019,Acura,RDX,"$45,500",22,28.0,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,FWD w/Advance Pkg,Front Wheel Drive,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P235/55HR19,P235/55HR19
3,2019,Acura,RDX,"$43,600",22,27.0,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,FWD w/A-Spec Pkg,Front Wheel Drive,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P255/45VR20,P255/45VR20
4,2019,Acura,RDX,"$37,400",22,28.0,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,FWD,Front Wheel Drive,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P235/55HR19,P235/55HR19
5,2019,Acura,RDX,"$42,600",21,27.0,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 4WD,AWD w/Technology Pkg,All Wheel Drive,...,0.78,3.27,3.97,1.3,0.65,2.19,4.17,Rack-Pinion,P235/55HR19,P235/55HR19


In [22]:
# Summarise the cleaned frame: column dtypes and non-null counts
auto_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26292 entries, 1 to 32316
Data columns (total 39 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Year                             26292 non-null  object 
 1   Brand                            26292 non-null  object 
 2   Model                            26292 non-null  object 
 3   MSRP                             26269 non-null  object 
 4   Gas Mileage (City)               26292 non-null  int64  
 5   Gas Mileage (Hwy)                26284 non-null  float64
 6   Engine                           26291 non-null  object 
 7   EPA Class                        26291 non-null  object 
 8   Style Name                       26292 non-null  object 
 9   Drivetrain                       26292 non-null  object 
 10  Passenger Capacity               26292 non-null  object 
 11  Passenger Doors                  26292 non-null  object 
 12  Body Style        

In [28]:
# auto_df_clean["Displacement"]

In [27]:
# List the distinct values of the Brand column
auto_df_clean["Brand"].unique()

array(['Acura', 'Alfa', 'Aston', 'Audi', 'Bentley', 'BMW', 'Buick',
       'Cadillac', 'Chevrolet', 'Chrysler', 'Dodge', 'Ferrari', 'FIAT',
       'Ford', 'Genesis', 'GMC', 'Honda', 'Hyundai', 'INFINITI', 'Jaguar',
       'Jeep', 'Kia', 'Lamborghini', 'Land', 'Lexus', 'Lincoln', 'Lotus',
       'Maserati', 'Mazda', 'McLaren', 'Mercedes-Benz', 'MINI',
       'Mitsubishi', 'Nissan', 'Porsche', 'Ram', 'Rolls-Royce', 'smart',
       'Subaru', 'Toyota', 'Volkswagen', 'Volvo'], dtype=object)

In [31]:
# List the distinct values of the Year column (stored as strings)
auto_df_clean["Year"].unique()

array(['2019', '2018', '2016', '2015', '2014', '2013', '2012', '2011',
       '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003',
       '2002', '2001', '2000', '1999', '1998', '1997', '1996'],
      dtype=object)