In [113]:
# IPython Imports
# `display`, `HTML` and `clear_output` all come from the public `IPython.display`
# module; importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, clear_output, display

# Used to make notebook wider (90% of the window), comment out for normal notebook
display(HTML("<style>.container { width:90% !important; }</style>"))

In [75]:
# Import Libraries
import boto3
import csv
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import re
from typing import List, Dict, Callable

In [76]:
# GLOBAL HELPER FUNCTIONS
def print_ref(d_obj:Dict[str, object], row_len:int=4) -> None:
    """Print the keys of ``d_obj`` underneath a fixed reference banner.

    Every key is written as a `` {key} |`` cell. Once ``row_len`` cells have
    been written on the current row, a line break is emitted before the next key.

    Args:
        d_obj: Mapping whose keys are listed; its values are never read.
        row_len: Number of keys written per row before the line break.

    Returns:
        None. The assembled text is written to stdout with ``print``.
    """
    # Plain string: the banner has no placeholders, so no f-prefix is needed.
    ref = \
""" ---------------------------
| OBJECT KEYS FOR REFERENCE |
 ---------------------------
"""
    cells_in_row = 0
    for key in d_obj.keys():
        if cells_in_row == row_len:
            # Current row is full: start a new one before writing this key.
            cells_in_row = 0
            ref += "\n"
        ref += f" {key} |"
        cells_in_row += 1
    print(ref)
    
def parse_col(col_list:List[str], data_frame:pd.DataFrame=None) -> List[str]:
    """Interactively pick which of the given columns to keep.

    For every column a prompt is shown with its position, its name and, when
    ``data_frame`` is supplied and has at least one row, the first value of that
    column as an example. The column is kept only when the answer is ``y``
    (case-insensitive, surrounding whitespace ignored); anything else, including
    an empty answer, skips it. ``clear_output()`` is called after each answer.

    Args:
        col_list: Column names to ask about, in order.
        data_frame: Optional frame used to look up an example value per column.

    Returns:
        The accepted column names, in the order they were asked.
    """
    res = []
    # The total shown in the prompt is the real number of columns
    # (it used to be len + 1, so the last item read e.g. "3/4").
    total = len(col_list)
    # An absent or row-less frame cannot supply an example value.
    has_example = data_frame is not None and len(data_frame) > 0
    for num, col in enumerate(col_list, start=1):
        example = data_frame[col].iloc[0] if has_example else "None"
        ans = input(f"""
Item #: {num}/{total}
Column: {col}
Example: {example}
[y/n] Default[n] -> """).strip().lower()
        clear_output()
        if ans == "y":
            res.append(col)
    return res


In [77]:
# GLOBAL VARS
# Bucket name passed to the S3 `get_object` call
BKT_NAME = "ds-data-2020"

# DATASET NAMES
the_cc = 'thecarconnectiondataset.csv'  # main dataset
motortrend = 'motortrend1974.csv'  # currently not used
car_feat_msrp = 'carfeaturesmsrp.csv'  # currently not used

# SET DATA PATH/KEYS
# Object key that is fetched from the bucket
KEY = the_cc

In [78]:
# Build the boto3 S3 client
s3 = boto3.client("s3")
# Fetch the object stored under KEY in bucket BKT_NAME -> Dict
obj = s3.get_object(Key=KEY, Bucket=BKT_NAME)

# obj_keys = obj.keys()
# List the keys of the response for reference
print_ref(obj)


 ---------------------------
| OBJECT KEYS FOR REFERENCE |
 ---------------------------
 ResponseMetadata | AcceptRanges | LastModified | ContentLength |
 ETag | ContentType | Metadata | Body |


In [79]:
# Chunk size kept in case the data has to be read in chunks (not used by the read below)
chk_size = 2000


# Parse the bucket file body as CSV (no header row, first column as index),
# then transpose the result into the working pd.DataFrame
auto_df = pd.read_csv(
    obj["Body"],
    header=None,
    index_col=0,
    low_memory=False,
).transpose()

In [80]:
# Label the first column (known to be NaN) as "Model"
auto_df.rename({auto_df.columns[0]: "Model"}, axis="columns", inplace=True)

In [81]:
# Get only wanted columns
# col = parse_col(auto_df.columns, auto_df)

# col = ["Model", 'MSRP', 'Gas Mileage', 'Engine', 'EPA Class', 'Style Name', 'Drivetrain', 'Passenger Capacity', 'Passenger Doors', 'Body Style', 'Transmission', 'Base Curb Weight (lbs)', 'Wheelbase (in)', 'Min Ground Clearance (in)', 'Track Width, Front (in)', 'Track Width, Rear (in)', 'Height, Overall (in)', 'Fuel Economy Est-Combined (MPG)', 'SAE Net Torque @ RPM', 'Fuel System', 'Engine Type', 'SAE Net Horsepower @ RPM', 'Displacement', 'First Gear Ratio (:1)', 'Sixth Gear Ratio (:1)', 'Fourth Gear Ratio (:1)', 'Seventh Gear Ratio (:1)', 'Second Gear Ratio (:1)', 'Reverse Ratio (:1)', 'Fifth Gear Ratio (:1)', 'Eighth Gear Ratio (:1)', 'Third Gear Ratio (:1)', 'Final Drive Axle Ratio (:1)', 'Steering Type', 'Front Tire Size', 'Rear Tire Size']

In [82]:
# Save wanted col to txt file
# with open("data/wanted_col.txt", "w") as text_file:
#     for item in col:
#         text_file.write(item + "\n")

In [83]:
# Get list of columns from wanted_col.txt (one column name per line).
# The context manager closes the file handle, which a bare open(...).read() leaves open.
with open("data/wanted_col.txt", "r") as wanted_col_file:
    wanted_col = wanted_col_file.read().splitlines()
# Extract those columns into a new, independent df
auto_df_clean = auto_df[wanted_col].copy()

In [84]:
# Keep only the text that precedes 'Specs' in each Model entry (year and name)
auto_df_clean["Model"] = list(
    map(lambda title: title.split("Specs", 1)[0], auto_df_clean["Model"])
)

In [85]:
# Create Year Column from Model Column: the first run of non-whitespace characters.
# Raw string keeps `\s` a regex escape instead of an invalid Python string escape.
auto_df_clean.insert(0, "Year", [re.search(r"([^\s]+)", x).group() for x in auto_df_clean["Model"]])

In [86]:
# Convert the Year column to an integer dtype
auto_df_clean["Year"] = auto_df_clean.Year.astype(int)

In [87]:
# Clean Model Column to only contain name: keep everything from the first
# whitespace character onward (`.group()` is the whole match, so that leading
# whitespace character is still part of the value).
# Raw string keeps `\s` a regex escape instead of an invalid Python string escape.
auto_df_clean["Model"] =  [(re.search(r"\s(.*)", i).group()) for i in auto_df_clean.Model]

In [88]:
# Create Brand Column from Model Column: the first run of non-whitespace characters.
# Raw string keeps `\s` a regex escape instead of an invalid Python string escape.
auto_df_clean.insert(1, "Brand", [re.search(r"([^\s]+)", x).group() for x in auto_df_clean["Model"]])

In [89]:
# Clean Model Column to only contain model: keep what follows the first
# whitespace character (`.group(1)` excludes that whitespace character itself).
# Raw string keeps `\s` a regex escape instead of an invalid Python string escape.
auto_df_clean["Model"] =  [(re.search(r"\s(.*)", i).group(1)) for i in auto_df_clean.Model]

In [90]:
# Keep only the rows that have a Gas Mileage value
nans = auto_df_clean["Gas Mileage"].notna()  # True where a value is present
auto_df_clean = auto_df_clean.loc[nans]

In [91]:
# Split Gas Mileage into City and Hwy
# City: the text before the first "/", cut at its first space
auto_df_clean.insert(
    4,
    "Gas Mileage (City)",
    [mileage.split("/", 1)[0].split(" ", 1)[0] for mileage in auto_df_clean["Gas Mileage"]],
)
# Hwy: the text after the first "/", cut at its first space
auto_df_clean.insert(
    5,
    "Gas Mileage (Hwy)",
    [mileage.partition("/")[2].split(" ", 1)[0] for mileage in auto_df_clean["Gas Mileage"]],
)

In [92]:
# Convert both Gas Mileage columns to a numeric dtype
for mpg_col in ("Gas Mileage (City)", "Gas Mileage (Hwy)"):
    auto_df_clean[mpg_col] = pd.to_numeric(auto_df_clean[mpg_col])

In [93]:
# Remove the original Gas Mileage column now that it has been split
auto_df_clean.drop("Gas Mileage", axis="columns", inplace=True)

In [94]:
# Gear ratio columns, listed in the order they should appear in the frame
gears = [
    "First Gear Ratio (:1)",
    "Second Gear Ratio (:1)",
    "Third Gear Ratio (:1)",
    "Fourth Gear Ratio (:1)",
    "Fifth Gear Ratio (:1)",
    "Sixth Gear Ratio (:1)",
    "Seventh Gear Ratio (:1)",
    "Eighth Gear Ratio (:1)",
    "Reverse Ratio (:1)",
    "Final Drive Axle Ratio (:1)",
]

# Re-insert each gear column at consecutive positions starting at index 26
for position, gear_name in enumerate(gears, start=26):
    # pop() removes the column and hands back its values for the re-insert
    auto_df_clean.insert(position, gear_name, auto_df_clean.pop(gear_name))

# Replace every entry that contains "TBD" with NaN
for gear_name in gears:
    is_tbd = auto_df_clean[gear_name].str.contains("TBD", na=False)
    auto_df_clean.loc[is_tbd, gear_name] = np.nan

In [95]:
# Cast gear ratio columns to float.
# Note: some gear columns hold text separated by (,), (/), (-) or ( ) delimiters;
# only the text before the first matching delimiter is converted.
def _leading_gear_value(value):
    """Return the text before the first delimiter found in ``value``.

    Delimiters are tried in priority order: ",", "/", "-", " ". Values that are
    not strings (e.g. NaN) and strings without any delimiter are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    for delimiter in (",", "/", "-", " "):
        if delimiter in value:
            return value.partition(delimiter)[0]
    return value


for gear in gears:
    # errors="coerce" turns anything that still is not a number into NaN
    auto_df_clean[gear] = pd.to_numeric(
        [_leading_gear_value(x) for x in auto_df_clean[gear]],
        downcast="float",
        errors="coerce",
    )

In [96]:
# Strip "$" and "," from MSRP and cast it to numeric (downcast to integer where
# possible; entries that cannot be parsed become NaN).
# Raw string keeps `\$` a regex escape instead of an invalid Python string escape.
auto_df_clean["MSRP"] = pd.to_numeric(
    auto_df_clean["MSRP"].replace(r'[\$,]', '', regex=True),
    downcast="integer",
    errors="coerce",
)
# Rename MSRP to indicate $
auto_df_clean.rename(columns={ "MSRP": "MSRP($)" }, inplace = True)

In [97]:
# Keep only the rows that have an 'SAE Net Torque @ RPM' value
# NOTE(review): only the torque column is checked here, although the header
# comment used to mention Horsepower as well — confirm that is intended.
nans = auto_df_clean["SAE Net Torque @ RPM"].notna()  # True where a value is present
auto_df_clean = auto_df_clean.loc[nans]

In [98]:
# Split SAE Horsepower into Horsepower (text before "@") and @ RPM (text after "@")
auto_df_clean.insert(
    4,
    "Horsepower",
    [rating.split("@", 1)[0] for rating in auto_df_clean["SAE Net Horsepower @ RPM"]],
)
auto_df_clean.insert(
    5,
    "@ RPM (HP)",
    [rating.partition("@")[2] for rating in auto_df_clean["SAE Net Horsepower @ RPM"]],
)

In [99]:
# Split SAE Torque into Torque (text before "@") and @ RPM (text after "@")
auto_df_clean.insert(
    6,
    "Torque",
    [rating.split("@", 1)[0] for rating in auto_df_clean["SAE Net Torque @ RPM"]],
)
auto_df_clean.insert(
    7,
    "@ RPM (TQ)",
    [rating.partition("@")[2] for rating in auto_df_clean["SAE Net Torque @ RPM"]],
)

In [100]:
# Clean and Cast Power Values (Horsepower / Torque)
pow_rpm_ratings = ["Horsepower", "Torque", "@ RPM (HP)", "@ RPM (TQ)"]
# Notes: Some Power columns need to be sep with (@), or (-) delimiters
# Only the first two entries (Horsepower, Torque) are cast to numeric;
# the two RPM columns only get their "TBD" entries replaced.
numeric_power_cols = pow_rpm_ratings[:2]
for col in pow_rpm_ratings:
    # Replace entries containing "TBD" with NaN
    auto_df_clean.loc[auto_df_clean[col].str.contains("TBD", na=False), col] = np.nan
    if col in numeric_power_cols:
        # errors="coerce" turns anything that is not a number into NaN
        auto_df_clean[col] = pd.to_numeric(auto_df_clean[col], downcast="float", errors="coerce")

In [101]:
# Remove the original SAE Horsepower and SAE Torque columns now that they are split
auto_df_clean.drop(
    columns=["SAE Net Horsepower @ RPM", "SAE Net Torque @ RPM"],
    inplace=True,
)

In [119]:
# Convert Base Curb Weight to numeric (downcast to integer where possible);
# entries that cannot be parsed become NaN
auto_df_clean["Base Curb Weight (lbs)"] = pd.to_numeric(
    auto_df_clean["Base Curb Weight (lbs)"],
    errors="coerce",
    downcast="integer",
)

In [128]:
# Keep the text before the first "L" of every string Displacement entry and
# convert the column to float; entries that cannot be parsed become NaN
auto_df_clean["Displacement"] = pd.to_numeric(
    list(
        map(
            lambda entry: entry.split("L", 1)[0] if type(entry) == str else entry,
            auto_df_clean["Displacement"],
        )
    ),
    errors="coerce",
    downcast="float",
)
# Rename the column to show the unit
auto_df_clean.rename({"Displacement": "Displacement(L)"}, axis="columns", inplace=True)

In [130]:
# auto_df_clean.info()

In [132]:
# auto_df_clean.head()

In [134]:
# auto_df_clean.describe()

Unnamed: 0,Year,MSRP($),Horsepower,Torque,Gas Mileage (City),Gas Mileage (Hwy),Base Curb Weight (lbs),Displacement(L),First Gear Ratio (:1),Second Gear Ratio (:1),Third Gear Ratio (:1),Fourth Gear Ratio (:1),Fifth Gear Ratio (:1),Sixth Gear Ratio (:1),Seventh Gear Ratio (:1),Eighth Gear Ratio (:1),Reverse Ratio (:1),Final Drive Axle Ratio (:1)
count,26284.0,26261.0,26250.0,26242.0,26284.0,26276.0,18527.0,25947.0,24834.0,23729.0,23743.0,23709.0,19534.0,13070.0,4275.0,3128.0,24336.0,18188.0
mean,2010.382324,38138.281292,250.164688,257.343811,19.555014,26.283148,3607.858261,3.345928,3.727093,2.247718,1.494974,1.098211,0.908716,0.782912,0.784824,0.659625,3.147366,3.702522
std,6.054405,32625.769552,97.641388,102.690323,5.558793,6.245657,689.04925,1.283857,0.729955,0.517186,0.361223,0.300257,0.19789,0.153854,0.159695,0.064355,0.70288,0.658005
min,1996.0,8599.0,65.0,65.0,9.0,11.0,1808.0,1.0,0.4,0.74,0.45,0.36,0.08,0.075,0.0,0.0,0.73,2.05
25%,2006.0,23280.0,173.0,177.0,16.0,21.0,3160.0,2.3,3.06,1.9,1.296,0.95,0.76,0.68,0.73,0.66,2.7,3.21
50%,2011.0,30240.0,240.0,253.0,18.0,26.0,3499.0,3.2,3.75,2.2,1.49,1.03,0.86,0.75,0.839,0.67,3.29,3.65
75%,2015.0,40965.0,302.0,305.0,22.0,30.0,4045.0,4.2,4.21,2.53,1.72,1.28,1.0,0.88,0.84,0.67,3.53,4.11
max,2019.0,548800.0,808.0,3350.0,66.0,70.0,8591.0,7.0,15.95,10.04,6.36,4.62,3.83,3.13,4.33,0.85,14.07,6.47


In [105]:
# Rows with Horsepower above 600, limited to the identifying and power columns,
# grouped by Brand
high_hp_brands = auto_df_clean.loc[
    auto_df_clean["Horsepower"] > 600,
    ["Year", "Brand", "Model", "Style Name", "Horsepower", "Torque"],
].groupby("Brand")

In [106]:
# auto_df_clean[auto_df_clean["Brand"] == "Porsche"]

In [107]:
# auto_df_clean["Engine Type"].unique()

In [108]:
# auto_df_clean["Brand"].unique()

|<br>|<br>|<br>|<br>|<br>|<br>|<br>|<br>|<br>|<br>|<br>|<br>|<br>


In [135]:
# Build the profiling report for the cleaned frame with the full-width HTML style option
profile = ProfileReport(
    auto_df_clean,
    title="theCarConnection Profiling Report",
    html={"style": {"full_width": True}},
)

HBox(children=(FloatProgress(value=0.0, description='variables', max=42.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=361.0, style=ProgressStyl…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=4.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




In [136]:
# Show the profiling report as notebook widgets
profile.to_widgets()

Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(value='Number of va…