In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pdp
import pprint
import datetime

import pickle

# preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [11]:
# Read the three competition files. The submission template has no header
# row, so its column names are supplied explicitly.
sample_sub = pd.read_csv(
    '../data/input/sample_submit.csv',
    names=['id', 'mpg'],
    header=None,
)
test = pd.read_csv('../data/input/test.csv')
train = pd.read_csv('../data/input/train.csv')

# Stack train on top of test under a fresh 0..n-1 index so the preprocessing
# below is applied to both at once; the column order is left unsorted.
frames = [train, test]
train_test = pd.concat(frames, sort=False, ignore_index=True)

# PreProcessing

## Replace NaN, correct car name & horsepower

In [12]:
# '?' marks a missing horsepower value. Convert it to NaN and cast the whole
# column to float so that means can be computed later.
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on the selected column: an in-place call on a
# column selection is not guaranteed to write through to train_test
# (pandas copy-on-write), which would leave the '?' strings in place.
train_test['horsepower'] = (
    train_test['horsepower']
    .replace('?', np.nan)
    .astype(float)
)

In [13]:
# Normalise misspelled / abbreviated makes and inconsistent model names.
# Entries are applied in order as plain substring replacements.
car_name_fixes = [
    ('vw', 'volkswagen'),
    ('vokswagen', 'volkswagen'),
    ('toyouta', 'toyota'),
    ('chevy', 'chevrolet'),
    ('datsun 200-sx', 'datsun 200sx'),
    ('datsun 210 mpg', 'datsun 210'),
    ('ford gran torino (sw)', 'ford gran torino'),
]

# regex=False is passed explicitly: if the pattern is interpreted as a regular
# expression, the parentheses in 'ford gran torino (sw)' form a capture group
# and the literal "(sw)" text is never matched, so that fix silently does
# nothing. Literal matching behaves the same on every pandas version.
for wrong, right in car_name_fixes:
    train_test['car name'] = train_test['car name'].str.replace(
        wrong, right, regex=False
    )

In [14]:
# The brand is the first space-separated token of the trimmed car name.
train_test['car_brand'] = train_test['car name'].apply(
    lambda name: name.strip().split(' ', 1)[0]
)

In [15]:
# Mean horsepower for each car name (NaN values are excluded from the mean).
name_hp_mean = train_test.groupby('car name')['horsepower'].mean()

# Rows whose horsepower is missing, captured before anything is filled in.
hp_missing = train_test['horsepower'].isnull()
df_hp_isnull = train_test[hp_missing]

# Build the fill value for every row in one vectorised step instead of
# assigning row by row. The first choice is the mean of the same car name.
# If a car name has no known horsepower at all, that mean is NaN, so fall back
# to the mean of the same brand and finally to the overall mean, so that no
# NaN is left in the column.
hp_fill = (
    train_test['car name']
    .map(name_hp_mean)
    .fillna(train_test.groupby('car_brand')['horsepower'].transform('mean'))
    .fillna(train_test['horsepower'].mean())
)

# Replace horsepower only at the positions where it was missing.
train_test.loc[hp_missing, 'horsepower'] = hp_fill[hp_missing]

# Standard Scaler

In [24]:
# Columns that get standardised; every other column (weight included) is
# left untouched.
cols = ['displacement', 'horsepower', 'acceleration']

# Fit on the combined train + test frame, then transform the same rows.
scaler = StandardScaler(with_mean=True, with_std=True, copy=True)
scaler.fit(train_test[cols])

# Write the scaled values into a copy so the unscaled train_test stays
# available to later cells.
train_test_sc = train_test.copy()
train_test_sc[cols] = scaler.transform(train_test[cols])

display(train_test_sc)

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_brand
0,0,23.059782,6,-0.483685,0.143900,2815,1.195071,80,1,dodge aspen,dodge
1,3,17.674521,8,1.521800,1.333045,4456,-0.797646,72,1,dodge rampage,dodge
2,4,17.136353,8,1.063404,1.035759,2774,-0.933663,79,1,mercury cougar brougham,mercury
3,7,22.664666,6,1.999297,-0.599315,2190,-0.046689,71,1,pontiac j2000 se hatchback,pontiac
4,9,17.872018,8,2.276245,3.414047,2245,-2.535961,70,1,ford galaxie 500,ford
...,...,...,...,...,...,...,...,...,...,...,...
995,992,,4,0.328059,-0.004743,3870,0.033651,76,1,chevrolet nova,chevrolet
996,993,,4,-0.483685,0.203358,2720,-0.751965,82,1,buick century limited,buick
997,996,,8,1.216202,0.143900,2774,-0.905656,75,1,ford ltd,ford
998,998,,4,-0.961181,-0.896601,2807,1.112525,77,1,dodge monaco brougham,dodge


# Encoding

## Label Encoding

In [31]:
# Start from the scaled frame; the encoding is written into a copy.
train_test_lab_enc = train_test_sc.copy()

# Replace each string column with integer codes, using a fresh encoder per
# column. The loop variable is `col`, not `cols`: reusing `cols` would
# overwrite the list of scaled columns defined in the scaler cell with the
# string 'car_brand'.
cat_cols = ['car name', 'car_brand']
for col in cat_cols:
    train_test_lab_enc[col] = LabelEncoder().fit_transform(train_test_lab_enc[col])

display(train_test_lab_enc)

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_brand
0,0,23.059782,6,-0.483685,0.143900,2815,1.195071,80,1,33,6
1,3,17.674521,8,1.521800,1.333045,4456,-0.797646,72,1,39,6
2,4,17.136353,8,1.063404,1.035759,2774,-0.933663,79,1,59,11
3,7,22.664666,6,1.999297,-0.599315,2190,-0.046689,71,1,72,16
4,9,17.872018,8,2.276245,3.414047,2245,-2.535961,70,1,47,8
...,...,...,...,...,...,...,...,...,...,...,...
995,992,,4,0.328059,-0.004743,3870,0.033651,76,1,21,4
996,993,,4,-0.483685,0.203358,2720,-0.751965,82,1,9,2
997,996,,8,1.216202,0.143900,2774,-0.905656,75,1,51,8
998,998,,4,-0.961181,-0.896601,2807,1.112525,77,1,37,6


## Save Features (Standard Scaler, Label Encoding)

In [34]:
# Snapshot the scaled + label-encoded frame under the generic name that the
# save cell reads from.
train_test_proc = train_test_lab_enc.copy(deep=True)

In [19]:
# Timestamp embedded in the file names so each run writes new files.
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

# Rows without an mpg value form the test set; rows with one form the
# training set.
mpg_missing = train_test_proc['mpg'].isnull()
test_treated = train_test_proc[mpg_missing].copy()
train_treated = train_test_proc[~mpg_missing].copy()

# train, test
train_path = '../features/feature_train_' + dt + '_treated.pkl'
test_path = '../features/feature_test_' + dt + '_treated.pkl'
train_treated.to_pickle(train_path)
test_treated.to_pickle(test_path)

## One-Hot-Encoding

In [26]:
# Columns treated as categorical for one-hot encoding.
cat_cols = [
    'cylinders',
    'model year',
    'origin',
    'car name',
    'car_brand',
]
# Number of distinct non-null values per column, i.e. how many indicator
# columns each one expands into.
train_test[cat_cols].nunique(axis=0, dropna=True)

cylinders      3
model year    13
origin         3
car name      88
car_brand     23
dtype: int64

In [33]:
# Columns expanded into indicator columns. The list is defined in this cell
# instead of being read from the shared name `cat_cols`: the label-encoding
# cell assigns `cat_cols` too (with only 'car name' and 'car_brand'), so when
# cells are run out of order that shorter list is picked up and cylinders,
# model year and origin are left unencoded.
one_hot_cols = ['cylinders', 'model year', 'origin', 'car name', 'car_brand']

train_test_one_hot = pd.get_dummies(
    train_test_sc,
    columns=one_hot_cols,
    sparse=True,
    # Keep the indicators as 0/1 integers on every pandas version
    # (newer versions default to bool).
    dtype=np.uint8,
)

display(train_test_one_hot)

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name_amc ambassador dpl,...,car_brand_opel,car_brand_peugeot,car_brand_plymouth,car_brand_pontiac,car_brand_renault,car_brand_saab,car_brand_toyota,car_brand_triumph,car_brand_volkswagen,car_brand_volvo
0,0,23.059782,6,-0.483685,0.143900,2815,1.195071,80,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3,17.674521,8,1.521800,1.333045,4456,-0.797646,72,1,0,...,0,0,0,0,0,0,0,0,0,0
2,4,17.136353,8,1.063404,1.035759,2774,-0.933663,79,1,0,...,0,0,0,0,0,0,0,0,0,0
3,7,22.664666,6,1.999297,-0.599315,2190,-0.046689,71,1,0,...,0,0,0,1,0,0,0,0,0,0
4,9,17.872018,8,2.276245,3.414047,2245,-2.535961,70,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,992,,4,0.328059,-0.004743,3870,0.033651,76,1,0,...,0,0,0,0,0,0,0,0,0,0
996,993,,4,-0.483685,0.203358,2720,-0.751965,82,1,0,...,0,0,0,0,0,0,0,0,0,0
997,996,,8,1.216202,0.143900,2774,-0.905656,75,1,0,...,0,0,0,0,0,0,0,0,0,0
998,998,,4,-0.961181,-0.896601,2807,1.112525,77,1,0,...,0,0,0,0,0,0,0,0,0,0


## Save Features (Standard Scaler, One-Hot Encoding)

In [35]:
# Snapshot the scaled + one-hot-encoded frame under the generic name that the
# save cell reads from.
train_test_proc = train_test_one_hot.copy(deep=True)

In [36]:
# Timestamp embedded in the file names so each run writes new files.
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

# Rows without an mpg value form the test set; rows with one form the
# training set.
mpg_missing = train_test_proc['mpg'].isnull()
test_treated = train_test_proc[mpg_missing].copy()
train_treated = train_test_proc[~mpg_missing].copy()

# train, test
train_path = '../features/feature_train_' + dt + '_stdscl_onehot.pkl'
test_path = '../features/feature_test_' + dt + '_stdscl_onehot.pkl'
train_treated.to_pickle(train_path)
test_treated.to_pickle(test_path)