In [83]:
#imports
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [84]:
# Locations of the two input CSVs, relative to the current working directory
resources_dir = os.path.join('.', 'resources')
kaggle_file = os.path.join(resources_dir, 'cars_kaggle.csv')
vfacts_file = os.path.join(resources_dir, '2012_JAN_VFACTS_NSW.csv')

In [85]:
# Load each input CSV into its own dataframe (kdf: Kaggle file, vdf: VFACTS file)
kdf = pd.read_csv(filepath_or_buffer=kaggle_file)
vdf = pd.read_csv(filepath_or_buffer=vfacts_file)

In [87]:
# Helper that strips currency formatting so a value can later be cast to a number
def clean_currency(x):
    """Remove '$' and ',' characters from string values.

    Strings are returned as strings with every dollar sign and comma
    deleted. Any non-string value is handed back unchanged.
    """
    if not isinstance(x, str):
        return x
    # Deletion table: both characters map to "remove"
    strip_table = str.maketrans('', '', '$,')
    return x.translate(strip_table)

In [88]:
# Strip the currency formatting from MSRP, then cast the cleaned values to float
msrp_cleaned = kdf['MSRP'].apply(clean_currency)
kdf['MSRP'] = msrp_cleaned.astype(float)

Unnamed: 0,Make,Origin,MSRP
0,Acura,Asia,36945.0
1,Acura,Asia,23820.0
2,Acura,Asia,26990.0
3,Acura,Asia,33195.0
4,Acura,Asia,43755.0
...,...,...,...
423,Volvo,Europe,40565.0
424,Volvo,Europe,42565.0
425,Volvo,Europe,45210.0
426,Volvo,Europe,26135.0


In [89]:
# Keep only the columns used by the rest of the notebook
cols = [
    'Make', 'Origin', 'MSRP',
    'EngineSize', 'Cylinders', 'Horsepower',
    'MPG_City', 'MPG_Highway',
    'Weight', 'Wheelbase', 'Length',
]
kdf = kdf.loc[:, cols]

In [90]:
# Numeric columns that receive MAX_/MIN_/AVG_ aggregates per make
agg_cols = ['MSRP','EngineSize','Cylinders','Horsepower','MPG_City','MPG_Highway','Weight','Wheelbase','Length']
# Group once and reuse the grouping for every aggregation
by_make = kdf.groupby('Make')
# Origin rides along with the max aggregation only, producing a single MAX_Origin column.
# NOTE(review): this assumes each make has exactly one origin - confirm against the data.
kdf_max = by_make[['Origin'] + agg_cols].max().add_prefix("MAX_")
# Select the numeric columns BEFORE aggregating: calling mean() on the whole frame
# raises TypeError on pandas >= 2.0 because the string Origin column cannot be averaged.
kdf_min = by_make[agg_cols].min().add_prefix("MIN_")
kdf_avg = by_make[agg_cols].mean().add_prefix("AVG_")
# Each join returns a new frame indexed by Make, so no extra copy is needed
kdf_agg = kdf_max.join(kdf_min).join(kdf_avg)
# Number of cars per make; size() counts rows, so nulls in any one column cannot skew it
kdf_agg['car_count'] = by_make.size()

In [91]:
# Inner-join the per-make aggregates with the VFACTS data on Make, so only
# makes that appear in both sources (fully populated rows) are kept
final_df = kdf_agg.merge(vdf, how="inner", on="Make")

In [102]:
# Show the first five rows of the final dataframe as a sanity check
final_df.head(n=5)

Unnamed: 0,id,make,origin,max_msrp,max_engine_size,max_cylinders,max_horsepower,max_mpg_city,max_mpg_highway,max_weight,...,avg_horsepower,avg_mpg_city,avg_mpg_highway,avg_weight,avg_wheelbase,avg_length,car_count,passenger_vol,suv_vol,total_vol
0,1,Audi,Europe,84600.0,4.2,8.0,450,23,31,4399,...,250.789474,18.473684,25.789474,3700.631579,105.315789,181.421053,19,240,220,460
1,2,BMW,Europe,73195.0,4.4,8.0,333,21,30,4824,...,241.45,18.7,27.0,3611.35,108.95,180.1,20,239,158,397
2,3,Chrysler,USA,38380.0,3.8,6.0,255,22,30,4675,...,201.133333,19.866667,27.333333,3534.333333,109.2,190.0,15,2,0,2
3,4,Dodge,USA,81795.0,8.3,10.0,500,29,36,4987,...,209.692308,19.384615,26.230769,3638.769231,112.769231,194.230769,13,32,3,35
4,5,Ford,USA,41475.0,6.8,10.0,310,27,36,7190,...,197.869565,19.26087,25.73913,3748.913043,111.434783,191.913043,23,1054,268,1322


In [93]:
# Create the ID column: a 1-based sequence placed as the first column.
# Guarded so re-running the cell is a no-op; DataFrame.insert raises ValueError
# when the column already exists.
if 'id' not in final_df.columns:
    final_df.insert(0, 'id', range(1, len(final_df) + 1))

In [94]:
#dictionary for renaming columns into sql-friendly names
rename_dict = {
    'Make':'make',
    'MAX_Origin':'origin',
    'MAX_MSRP':'max_msrp',
    'MAX_EngineSize':'max_engine_size',
    'MAX_Cylinders':'max_cylinders',
    'MAX_Horsepower':'max_horsepower',
    'MAX_MPG_City':'max_mpg_city',
    'MAX_MPG_Highway':'max_mpg_highway',
    'MAX_Weight':'max_weight',
    'MAX_Wheelbase':'max_wheelbase',
    'MAX_Length':'max_length',
    'MIN_MSRP':'min_msrp',
    'MIN_EngineSize':'min_engine_size',
    'MIN_Cylinders':'min_cylinders',
    'MIN_Horsepower':'min_horsepower',
    'MIN_MPG_City':'min_mpg_city',
    'MIN_MPG_Highway':'min_mpg_highway',
    'MIN_Weight':'min_weight',
    'MIN_Wheelbase':'min_wheelbase',
    'MIN_Length':'min_length',
    'AVG_MSRP':'avg_msrp',
    'AVG_EngineSize':'avg_engine_size',
    'AVG_Cylinders':'avg_cylinders',
    'AVG_Horsepower':'avg_horsepower',
    'AVG_MPG_City':'avg_mpg_city',
    'AVG_MPG_Highway':'avg_mpg_highway',
    'AVG_Weight':'avg_weight',
    'AVG_Wheelbase':'avg_wheelbase',
    'AVG_Length':'avg_length',
    'Passenger_Volume':'passenger_vol',
    'SUV_Volume':'suv_vol'}
# Do the renaming without inplace=True. rename() skips labels that are not present,
# so running this cell a second time leaves the already-renamed frame unchanged
# instead of failing.
final_df = final_df.rename(columns=rename_dict)
# Sum the two volumes into their own column. The total is computed from the renamed
# columns so the cell can be re-run; the new column is still appended last.
final_df['total_vol'] = final_df[['passenger_vol','suv_vol']].sum(axis=1)


In [95]:
# Column lists for each table we want to create. Every spec table holds the id
# plus a min/avg/max triple per measure, so those lists are generated from the
# measure names rather than typed out.
_STATS = ('min', 'avg', 'max')

make_sql_list = ['id', 'make', 'origin', 'car_count']
pricing_spec_list = ['id'] + [f'{stat}_msrp' for stat in _STATS]
engine_spec_list = ['id'] + [
    f'{stat}_{measure}'
    for measure in ('engine_size', 'cylinders', 'horsepower')
    for stat in _STATS
]
fuel_spec_list = ['id'] + [
    f'{stat}_{measure}'
    for measure in ('mpg_city', 'mpg_highway')
    for stat in _STATS
]
chassis_spec_list = ['id'] + [
    f'{stat}_{measure}'
    for measure in ('weight', 'wheelbase', 'length')
    for stat in _STATS
]
sales_data_list = ['id', 'passenger_vol', 'suv_vol', 'total_vol']

In [97]:
# Create the connection into postgresql. The password is read from the PGPASSWORD
# environment variable so that no credential is stored in the notebook and the
# URL does not need to be hand-edited before running.
db_password = os.environ.get('PGPASSWORD')
if db_password is None:
    raise RuntimeError('Set the PGPASSWORD environment variable before running this cell')
# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# break the URL when they appear in the password
conn = f"postgres:{quote_plus(db_password)}@localhost:5432/cars_db"
engine = create_engine(f'postgresql://{conn}')

In [98]:
# Make sure that the database has already been set up properly by listing its tables.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0; the
# inspector API returns the same list of table names.
inspect(engine).get_table_names()

['fuel_spec',
 'chassis_spec',
 'sales_data',
 'makes',
 'pricing_spec',
 'engine_spec']

In [99]:
# Write the makes table first to check that the load works end to end.
# Rows are appended to the existing table and the dataframe index is not written.
makes_df = final_df[make_sql_list]
makes_df.to_sql('makes', engine, index=False, if_exists='append')

In [100]:
# Pull five rows of the makes table back into a dataframe to verify what was written
pd.read_sql_query(sql='select * from makes limit 5', con=engine)

Unnamed: 0,id,make,origin,car_count
0,1,Audi,Europe,19
1,2,BMW,Europe,20
2,3,Chrysler,USA,15
3,4,Dodge,USA,13
4,5,Ford,USA,23


In [101]:
# Load the remaining tables the same way as makes: append rows, no index column.
# The mapping keeps the original write order (dicts preserve insertion order).
remaining_tables = {
    'pricing_spec': pricing_spec_list,
    'engine_spec': engine_spec_list,
    'fuel_spec': fuel_spec_list,
    'chassis_spec': chassis_spec_list,
    'sales_data': sales_data_list,
}
for table_name, table_cols in remaining_tables.items():
    final_df[table_cols].to_sql(name=table_name, con=engine, if_exists='append', index=False)

In [109]:
# Join every table back together on id and show the first rows, to check that
# the stored data matches what we expect.
# Adjacent string literals are concatenated, so the SQL text sent to the
# database is exactly one line.
verification_sql = (
    'SELECT m.id,m.make,m.origin,m.car_count,s.total_vol,'
    'c.min_weight,c.avg_weight,c.max_weight,'
    'c.min_wheelbase,c.avg_wheelbase,c.max_wheelbase,'
    'c.min_length,c.avg_length,c.max_length,'
    'e.min_engine_size,e.avg_engine_size,e.max_engine_size,'
    'e.min_cylinders,e.avg_cylinders,e.max_cylinders,'
    'e.min_horsepower,e.avg_horsepower,e.max_horsepower,'
    'f.min_mpg_city,f.avg_mpg_city,f.max_mpg_city,'
    'f.min_mpg_highway,f.avg_mpg_highway,f.max_mpg_highway,'
    'p.min_msrp,p.avg_msrp,p.max_msrp'
    ' FROM makes as m'
    ' JOIN sales_data as s ON m.id = s.id'
    ' JOIN chassis_spec as c ON m.id = c.id'
    ' JOIN engine_spec as e ON m.id = e.id'
    ' JOIN fuel_spec as f ON m.id=f.id'
    ' JOIN pricing_spec as p ON m.id = p.id'
)
pd.read_sql_query(verification_sql, con=engine).head()

Unnamed: 0,id,make,origin,car_count,total_vol,min_weight,avg_weight,max_weight,min_wheelbase,avg_wheelbase,...,max_horsepower,min_mpg_city,avg_mpg_city,max_mpg_city,min_mpg_highway,avg_mpg_highway,max_mpg_highway,min_msrp,avg_msrp,max_msrp
0,1,Audi,Europe,19,460,2921.0,3700.631579,4399.0,95.0,105.315789,...,450.0,14.0,18.473684,23.0,20.0,25.789474,31.0,25940.0,43307.894737,84600.0
1,2,BMW,Europe,20,397,2932.0,3611.35,4824.0,98.0,108.95,...,333.0,16.0,18.7,21.0,22.0,27.0,30.0,28495.0,43285.25,73195.0
2,3,Chrysler,USA,15,2,3060.0,3534.333333,4675.0,95.0,109.2,...,255.0,17.0,19.866667,22.0,23.0,27.333333,30.0,17985.0,27252.0,38380.0
3,4,Dodge,USA,13,35,2581.0,3638.769231,4987.0,99.0,112.769231,...,500.0,12.0,19.384615,29.0,20.0,26.230769,36.0,13670.0,26253.846154,81795.0
4,5,Ford,USA,23,1322,2606.0,3748.913043,7190.0,101.0,111.434783,...,310.0,10.0,19.26087,27.0,13.0,25.73913,36.0,13270.0,24015.869565,41475.0
