### Set environment variables

In [337]:
!gcloud config list

[ai_platform]
region = global
[compute]
region = us-central1
[core]
account = 147678809820-compute@developer.gserviceaccount.com
disable_usage_reporting = True
project = qwiklabs-gcp-00-0db9b1bc58c6

Your active configuration is: [default]


In [338]:
import os, re, shutil
PROJECT = 'qwiklabs-gcp-00-0db9b1bc58c6'  # Change to your project.
BUCKET = PROJECT  # Change to your project.
os.environ['PROJECT'] = PROJECT
os.environ['BUCKET'] = BUCKET

os.environ['DS_ORIGIN'] = "used_car"

In [339]:
!echo $PROJECT $BUCKET $DS_ORIGIN

qwiklabs-gcp-00-0db9b1bc58c6 qwiklabs-gcp-00-0db9b1bc58c6 used_car


In [340]:
from IPython.core.magic import register_cell_magic
from IPython import get_ipython

@register_cell_magic('with_globals')
def with_globals(line, cell):
    contents = cell.format(**globals())
    if 'print' in line:    # print가 포함되면 cell contents를 출력
        print(contents)
    get_ipython().run_cell(contents)

### download dataset
https://www.kaggle.com/adityadesai13/used-car-dataset-ford-and-mercedes/download

In [341]:
%%bash
rm -rf ./used_car
mkdir -p ./used_car
unzip archive.zip -d ./used_car

Archive:  archive.zip
  inflating: ./used_car/audi.csv     
  inflating: ./used_car/bmw.csv      
  inflating: ./used_car/cclass.csv   
  inflating: ./used_car/focus.csv    
  inflating: ./used_car/ford.csv     
  inflating: ./used_car/hyundi.csv   
  inflating: ./used_car/merc.csv     
  inflating: ./used_car/skoda.csv    
  inflating: ./used_car/toyota.csv   
  inflating: ./used_car/unclean cclass.csv  
  inflating: ./used_car/unclean focus.csv  
  inflating: ./used_car/vauxhall.csv  
  inflating: ./used_car/vw.csv       


In [342]:
!pip install pandas



In [343]:
import pandas as pd
import os

In [344]:
BASE_PATH = './used_car'
dfs = []

for m in os.listdir(BASE_PATH):
    maker, ext = os.path.splitext(m)
    if ext == '.csv':
        path = os.path.join(BASE_PATH, m)
        df = pd.read_csv(path)
        print(maker)
        print(df.head())
        print()
        dfs.append(df)


merc
       model  year  price transmission  mileage fuelType  tax   mpg  \
0        SLK  2005   5200    Automatic    63000   Petrol  325  32.1   
1    S Class  2017  34948    Automatic    27000   Hybrid   20  61.4   
2   SL CLASS  2016  49948    Automatic     6200   Petrol  555  28.0   
3    G Class  2016  61948    Automatic    16000   Petrol  325  30.4   
4    G Class  2016  73948    Automatic     4000   Petrol  325  30.1   

   engineSize  
0         1.8  
1         2.1  
2         5.5  
3         4.0  
4         4.0  

vauxhall
    model  year  price transmission  mileage fuelType  tax   mpg  engineSize
0   Corsa  2018   7885       Manual     9876   Petrol  145  55.4         1.4
1   Corsa  2019  11995       Manual     2500   Petrol  145  54.3         1.4
2   Corsa  2017   9777    Automatic     9625   Petrol  145  47.9         1.4
3   Corsa  2016   8500       Manual    25796   Petrol   30  55.4         1.4
4   Corsa  2019  10000       Manual     3887   Petrol  145  43.5         1.4


컬럼명 정제, empty row 삭제 등 처리 후 GCP dataset에 업로드describe

In [345]:
import re

def sub_col_name(s):
    s = s.strip().replace(' ', '_')
    if s.find('_') == -1:
        s = s[0].upper() + s[1:]
    else:
        s = ''.join(word[0].upper() + word[1:] for word in s.split('_'))
    return re.sub(r'[^\da-zA-Z]','',s)

assert sub_col_name('engine size2') == 'EngineSize2'
assert sub_col_name('tax(£)') == 'Tax'

In [346]:
merged = pd.concat(dfs)
COLUMNS = list(set([sub_col_name(col) for col in merged.columns]))
COLUMNS

['Mpg',
 'Mileage',
 'Price',
 'Year',
 'EngineSize2',
 'Reference',
 'FuelType',
 'Mileage2',
 'EngineSize',
 'Transmission',
 'Tax',
 'Model',
 'FuelType2']

In [347]:
def convert_value_to_numeric(s):
    if isinstance(s, str):
        return re.sub(r'[^\.\d]', '', s)
    else:
        return s

assert 29899 != convert_value_to_numeric('£29,899')
assert 2020 != convert_value_to_numeric('2020.0')

In [348]:
import os, shutil
import numpy as np

BASE_PATH = os.getenv('DS_ORIGIN')
NEW_PATH = os.getenv('DS_MODIFIED')

NUMERIC_COLUMNS = [
 'Mileage',
 'Price',
 'Mpg',
]

CATEGORICAL_COLUMNS = [
 'Transmission',
 'FuelType',
 'Year',
 'EngineSize',
]

DROP_COLS = [
 'Tax',
 'EngineSize2',
 'Mileage2',
 'FuelType2',
 'Reference'
]

def get_new_df(path):
    df = pd.read_csv(path)
    df.rename(columns=sub_col_name, inplace=True)
    df.dropna(how='all')

    df['maker'] = maker

    if 'Mileage2' in df.columns:
        df['Mileage'] = df['Mileage2']

    for col in NUMERIC_COLUMNS:
        if col not in df.columns:
            df[col] = np.nan
        else:
            df[col] = convert_value_to_numeric(df[col])

    for col in DROP_COLS:
        df.drop(col, axis=1, inplace=True, errors='ignore')
        
    return df

In [349]:
df = get_new_df(os.path.join(BASE_PATH, 'unclean cclass.csv'))
df.head()

Unnamed: 0,Model,Year,Price,Transmission,Mileage,FuelType,EngineSize,maker,Mpg
0,C Class,2020.0,"£30,495",Automatic,1200,Diesel,2.0,unclean cclass,
1,C Class,2020.0,"£29,989",Automatic,1000,Petrol,1.5,unclean cclass,
2,C Class,2020.0,"£37,899",Automatic,500,Diesel,2.0,unclean cclass,
3,C Class,2019.0,"£30,399",Automatic,5000,Diesel,2.0,unclean cclass,
4,C Class,2019.0,"£29,899",Automatic,4500,Diesel,2.0,unclean cclass,


In [350]:
dfs = []

for m in os.listdir(BASE_PATH):
    if m.find('unclean ') != -1:
        continue
        
    maker, ext = os.path.splitext(m)
    if ext == '.csv':
        path = os.path.join(BASE_PATH, m)
        df = get_new_df(path)
        dfs.append(df)

merged = pd.concat(dfs)
merged

Unnamed: 0,Model,Year,Price,Transmission,Mileage,FuelType,Mpg,EngineSize,maker
0,SLK,2005,5200,Automatic,63000,Petrol,32.1,1.8,merc
1,S Class,2017,34948,Automatic,27000,Hybrid,61.4,2.1,merc
2,SL CLASS,2016,49948,Automatic,6200,Petrol,28.0,5.5,merc
3,G Class,2016,61948,Automatic,16000,Petrol,30.4,4.0,merc
4,G Class,2016,73948,Automatic,4000,Petrol,30.1,4.0,merc
...,...,...,...,...,...,...,...,...,...
6262,Yeti,2014,11440,Semi-Auto,14569,Petrol,44.8,1.2,skoda
6263,Octavia,2014,10990,Semi-Auto,49999,Petrol,56.5,1.4,skoda
6264,Fabia,2017,9500,Semi-Auto,17131,Petrol,61.4,1.0,skoda
6265,Citigo,2016,5999,Manual,21747,Petrol,62.8,1.0,skoda


In [351]:
merged.isnull().any()

Model           False
Year            False
Price           False
Transmission    False
Mileage         False
FuelType        False
Mpg              True
EngineSize      False
maker           False
dtype: bool

In [352]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108540 entries, 0 to 6266
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Model         108540 non-null  object 
 1   Year          108540 non-null  int64  
 2   Price         108540 non-null  int64  
 3   Transmission  108540 non-null  object 
 4   Mileage       108540 non-null  int64  
 5   FuelType      108540 non-null  object 
 6   Mpg           99187 non-null   float64
 7   EngineSize    108540 non-null  float64
 8   maker         108540 non-null  object 
dtypes: float64(2), int64(3), object(4)
memory usage: 8.3+ MB


In [353]:
merged.describe()

Unnamed: 0,Year,Price,Mileage,Mpg,EngineSize
count,108540.0,108540.0,108540.0,99187.0,108540.0
mean,2017.098028,16890.124046,23025.928469,55.166825,1.661644
std,2.130057,9756.26682,21176.423684,16.138522,0.557058
min,1970.0,450.0,1.0,0.3,0.0
25%,2016.0,10229.5,7491.75,47.1,1.2
50%,2017.0,14698.0,17265.0,54.3,1.6
75%,2019.0,20940.0,32236.0,62.8,2.0
max,2060.0,159999.0,323000.0,470.8,6.6


In [354]:
sorted(merged.EngineSize.unique())

[0.0,
 0.6,
 1.0,
 1.1,
 1.2,
 1.3,
 1.4,
 1.5,
 1.6,
 1.7,
 1.8,
 1.9,
 2.0,
 2.1,
 2.2,
 2.3,
 2.4,
 2.5,
 2.6,
 2.7,
 2.8,
 2.9,
 3.0,
 3.2,
 3.5,
 3.6,
 3.7,
 4.0,
 4.1,
 4.2,
 4.3,
 4.4,
 4.5,
 4.7,
 5.0,
 5.2,
 5.4,
 5.5,
 6.0,
 6.2,
 6.3,
 6.6]

In [355]:
merged.loc[merged.EngineSize<0.1].EngineSize.value_counts()

0.0    286
Name: EngineSize, dtype: int64

In [356]:
df = merged.copy()

## EngineSize가 누락된 컬럼의 값을 찾기 위해서 car spec을 정리

In [357]:
CAR_SPEC_COLS = ['Model', 'Year', 'Transmission', 'FuelType', 'maker', 'Mpg', 'EngineSize']
refined_df = df.loc[df.EngineSize!=0.0]
car_spec_series = refined_df[CAR_SPEC_COLS].groupby(CAR_SPEC_COLS[:-1])['EngineSize'].unique()
car_spec_df = car_spec_series.to_frame().reset_index()
car_spec_df['EngineSize'] = car_spec_df['EngineSize'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
car_spec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12400 entries, 0 to 12399
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Model         12400 non-null  object 
 1   Year          12400 non-null  int64  
 2   Transmission  12400 non-null  object 
 3   FuelType      12400 non-null  object 
 4   maker         12400 non-null  object 
 5   Mpg           12400 non-null  float64
 6   EngineSize    12400 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 678.2+ KB


In [358]:
car_spec_df.describe()

Unnamed: 0,Year,Mpg,EngineSize
count,12400.0,12400.0,12400.0
mean,2016.010726,51.914497,1.959952
std,3.455635,18.358755,0.672582
min,1970.0,0.3,0.6
25%,2015.0,42.2,1.5
50%,2017.0,50.4,2.0
75%,2019.0,58.9,2.0
max,2060.0,470.8,6.6


In [359]:
car_spec_df.Transmission.unique()

array(['Manual', 'Automatic', 'Semi-Auto', 'Other'], dtype=object)

In [360]:
car_spec_df.FuelType.unique()

array(['Diesel', 'Petrol', 'Hybrid', 'Other', 'Electric'], dtype=object)

In [361]:
df.describe()

Unnamed: 0,Year,Price,Mileage,Mpg,EngineSize
count,108540.0,108540.0,108540.0,99187.0,108540.0
mean,2017.098028,16890.124046,23025.928469,55.166825,1.661644
std,2.130057,9756.26682,21176.423684,16.138522,0.557058
min,1970.0,450.0,1.0,0.3,0.0
25%,2016.0,10229.5,7491.75,47.1,1.2
50%,2017.0,14698.0,17265.0,54.3,1.6
75%,2019.0,20940.0,32236.0,62.8,2.0
max,2060.0,159999.0,323000.0,470.8,6.6


In [362]:
df.loc[df.EngineSize==0, 'EngineSize'] = np.nan
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108540 entries, 0 to 6266
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Model         108540 non-null  object 
 1   Year          108540 non-null  int64  
 2   Price         108540 non-null  int64  
 3   Transmission  108540 non-null  object 
 4   Mileage       108540 non-null  int64  
 5   FuelType      108540 non-null  object 
 6   Mpg           99187 non-null   float64
 7   EngineSize    108254 non-null  float64
 8   maker         108540 non-null  object 
dtypes: float64(2), int64(3), object(4)
memory usage: 8.3+ MB


In [363]:
df.loc[np.isnan(df.EngineSize)]

Unnamed: 0,Model,Year,Price,Transmission,Mileage,FuelType,Mpg,EngineSize,maker
9618,A Class,2016,17500,Automatic,29712,Diesel,68.9,,merc
9619,A Class,2018,20500,Automatic,13386,Petrol,53.3,,merc
9621,A Class,2018,18000,Automatic,18347,Diesel,65.7,,merc
9650,GLA Class,2016,18700,Other,30895,Other,56.5,,merc
9654,A Class,2016,17800,Automatic,21913,Diesel,68.9,,merc
...,...,...,...,...,...,...,...,...,...
5188,Octavia,2017,16000,Automatic,16166,Diesel,61.4,,skoda
5276,Octavia,2017,10700,Manual,38552,Petrol,55.4,,skoda
5334,Fabia,2014,7500,Automatic,26008,Petrol,53.3,,skoda
5373,Fabia,2018,11500,Manual,10720,Petrol,64.2,,skoda


In [364]:
row = df.loc[np.isnan(df.EngineSize)].iloc[4]
row

Model             A Class
Year                 2016
Price               17800
Transmission    Automatic
Mileage             21913
FuelType           Diesel
Mpg                  68.9
EngineSize            NaN
maker                merc
Name: 9654, dtype: object

In [367]:
def set_proper_engine_size(row):
    filtered = car_spec_df.loc[
        (car_spec_df['Model'].str.strip() == row['Model'].strip())
        & (car_spec_df['Year'] == row['Year'])
        & (car_spec_df['Transmission'].str.strip() == row['Transmission'].strip())
        & (car_spec_df['FuelType'].str.strip() == row['FuelType'].strip())
        & (car_spec_df['maker'].str.strip() == row['maker'].strip())
    ]
    
    if len(filtered) == 0:
        filtered = car_spec_df.loc[
            (car_spec_df['Model'].str.strip() == row['Model'].strip())
            & (car_spec_df['Year'] == row['Year'])
            & (car_spec_df['Mpg'] == row['Mpg'])
            & (car_spec_df['maker'].str.strip() == row['maker'].strip())
        ]       
        if len(filtered) == 0:
            return np.nan
        return filtered.EngineSize.mean()
    
    return filtered.iloc[0].EngineSize
    
for index, row in df.iterrows():
    if np.isnan(row.EngineSize):
        print('.', end='')
        new_engine_size = set_proper_engine_size(row)
        if np.isnan(new_engine_size):
            continue
        df.at[index, 'EngineSize'] = new_engine_size
        
df.info()

..............................................................................................................................................................................................................................................................................................<class 'pandas.core.frame.DataFrame'>
Int64Index: 108540 entries, 0 to 6266
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Model         108540 non-null  object 
 1   Year          108540 non-null  int64  
 2   Price         108540 non-null  int64  
 3   Transmission  108540 non-null  object 
 4   Mileage       108540 non-null  int64  
 5   FuelType      108540 non-null  object 
 6   Mpg           99187 non-null   float64
 7   EngineSize    108514 non-null  float64
 8   maker         108540 non-null  object 
dtypes: float64(2), int64(3), object(4)
memory usage: 10.3+ MB


In [368]:
df.loc[df['EngineSize'].isnull()]

Unnamed: 0,Model,Year,Price,Transmission,Mileage,FuelType,Mpg,EngineSize,maker
11347,230,2007,4500,Automatic,94000,Petrol,29.4,,merc
12072,M Class,1970,24999,Automatic,14000,Diesel,39.2,,merc
7343,Mokka,2019,14000,Manual,18548,Diesel,47.1,,vauxhall
13315,Ampera,2014,10495,Automatic,50486,Hybrid,235.4,,vauxhall
13316,Ampera,2014,11400,Automatic,64764,Hybrid,235.4,,vauxhall
16326,Ka+,2020,11999,Manual,2000,Petrol,43.5,,ford
3334,C Class,2014,14750,Manual,57062,Diesel,,,cclass
4749,i3,2014,15450,Automatic,42479,Hybrid,470.8,,bmw
4866,i3,2014,14495,Automatic,34539,Hybrid,470.8,,bmw
5790,i3,2014,14182,Automatic,37161,Hybrid,470.8,,bmw


In [371]:
df.dropna(subset=['EngineSize'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 108514 entries, 0 to 6266
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Model         108514 non-null  object 
 1   Year          108514 non-null  int64  
 2   Price         108514 non-null  int64  
 3   Transmission  108514 non-null  object 
 4   Mileage       108514 non-null  int64  
 5   FuelType      108514 non-null  object 
 6   Mpg           99174 non-null   float64
 7   EngineSize    108514 non-null  float64
 8   maker         108514 non-null  object 
dtypes: float64(2), int64(3), object(4)
memory usage: 8.3+ MB


In [None]:
only_year = df['Year'].value_counts().rename_axis('year').reset_index(name='counts').sort_values('year')

In [None]:
only_year.reset_index(drop=True)

## Remove rows which's year are more than 2020 or less than 1990

In [375]:
remove_years = df.loc[(df.Year > 2020)|(df.Year < 1990)]

In [376]:
df.drop(remove_years.index, inplace=True)
df.describe()

Unnamed: 0,Year,Price,Mileage,Mpg,EngineSize
count,108509.0,108509.0,108509.0,99169.0,108509.0
mean,2017.099024,16890.566386,23023.169497,55.151825,1.664736
std,2.115349,9755.921516,21174.058973,15.955942,0.552391
min,1991.0,450.0,1.0,0.3,0.6
25%,2016.0,10228.0,7491.0,47.1,1.2
50%,2017.0,14698.0,17261.0,54.3,1.6
75%,2019.0,20943.0,32235.0,62.8,2.0
max,2020.0,159999.0,323000.0,470.8,6.6


In [372]:
df.to_csv('refined_used_car.csv')

In [384]:
CATEGORIES = {}
for m in CATEGORICAL_COLUMNS:
    CATEGORIES[m] = list(df[m].unique())
    
print(CATEGORIES)

{'Transmission': ['Automatic', 'Manual', 'Semi-Auto', 'Other'], 'FuelType': ['Petrol', 'Hybrid', 'Diesel', 'Other', 'Electric'], 'Year': [2005, 2017, 2016, 2011, 2018, 2012, 2019, 2020, 2014, 2015, 2006, 2010, 2004, 2008, 2013, 2007, 2009, 2003, 2001, 2002, 1998, 2000, 1997, 1999, 1996, 1991, 1995], 'EngineSize': [1.8, 2.1, 5.5, 4.0, 6.2, 3.5, 2.0, 1.5, 3.0, 1.2, 1.6, 1.4, 1.7, 2.5, 4.7, 1.3, 2.2, 2.9, 0.6, 2.3, 0.8, 1.0, 6.0, 3.2, 4.4, 5.0, 2.7, 3.7, 5.4, 2.8, 1.1, 1.9, 2.4, 4.2, 4.5, 3.6, 2.6, 6.6, 5.2, 4.1, 6.3]}


### ---------- Until here, preprocessing dataframe 

In [None]:
%%bash
gsutil -m rm -f gs://${BUCKET}/refined_used_car.csv
gsutil -m cp refined_used_car.csv gs://${BUCKET}

In [None]:
schemes = []
for key, val in dict(merged.dtypes).items():
    t = 'STRING' if val == 'object' else 'FLOAT'
    schemes.append(f'{key}:{t}')
print(f'len(schemes): {len(schemes)}')
SCHEMA = ','.join(schemes)
SCHEMA

gcp dataset에서 dataset 업로드 결과 분석

In [None]:
!bq mk open_project

In [None]:
%%with_globals print
%%bash

TABLE=used_car
                
bq rm -f -t open_project.$TABLE 
bq load --source_format=CSV --skip_leading_rows=1 \
    open_project.$TABLE gs://{BUCKET}/refined_used_car.csv  {SCHEMA}
    