In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pdp
import pprint
import datetime

import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

In [2]:
train = pd.read_csv('../data/input/train.csv')
test = pd.read_csv('../data/input/test.csv')
sample_sub = pd.read_csv('../data/input/sample_submit.csv', header=None, names=['id', 'mpg'])

train_test = pd.concat([train, test], ignore_index=True, sort=False)

## 前処理

In [3]:
# 平均値を計算するために欠損値をNanに変換し、horsepower全体をfloatに変換する
train_test['horsepower'].replace('?', np.nan, inplace=True)
train_test['horsepower'] = train_test['horsepower'].astype(float)

In [4]:
train_test['car name'] = train_test['car name'].str.replace('vw', 'volkswagen')
train_test['car name'] = train_test['car name'].str.replace('vokswagen', 'volkswagen')
train_test['car name'] = train_test['car name'].str.replace('toyouta', 'toyota')
train_test['car name'] = train_test['car name'].str.replace('chevy', 'chevrolet')
train_test['car name'] = train_test['car name'].str.replace('datsun 200-sx', 'datsun 200sx')

train_test['car name'] = train_test['car name'].str.replace('datsun 210 mpg', 'datsun 210')
train_test['car name'] = train_test['car name'].str.replace('ford gran torino (sw)', 'ford gran torino')

In [5]:
train_test['car_brand'] = train_test['car name'].apply(lambda x: x.strip().split(' ')[0])

In [6]:
# car name毎のhorse powerの平均値を計算(nanは計算外)
name_hp_mean = train_test.groupby('car name').horsepower.mean()
df_hp_isnull = train_test[train_test['horsepower'].isnull()]

# nan位置のtrainデータのhorse powerを置換する
for i in df_hp_isnull.index:
    train_test.loc[i, 'horsepower'] = name_hp_mean[df_hp_isnull.loc[i, 'car name']]

In [13]:
tmp = train_test
test_treated = tmp[tmp['mpg'].isnull()].copy()
train_treated = tmp.dropna(subset=['mpg'], axis=0).copy()

In [14]:
display(train_treated)
display(test_treated)

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_brand
0,0,23.059782,6,140,110.0,2815,17.977429,80,1,dodge aspen,dodge
1,3,17.674521,8,350,150.0,4456,13.514535,72,1,dodge rampage,dodge
2,4,17.136353,8,302,140.0,2774,13.209912,79,1,mercury cougar brougham,mercury
3,7,22.664666,6,400,85.0,2190,15.196381,71,1,pontiac j2000 se hatchback,pontiac
4,9,17.872018,8,429,220.0,2245,9.621400,70,1,ford galaxie 500,ford
...,...,...,...,...,...,...,...,...,...,...,...
495,981,22.798447,4,140,148.0,2835,13.477573,82,1,datsun 200sx,datsun
496,983,35.173640,4,97,67.0,2234,17.542681,80,3,plymouth valiant,plymouth
497,994,17.825448,8,302,220.0,2774,15.177189,76,1,triumph tr7 coupe,triumph
498,995,28.545147,4,97,150.0,2130,13.324669,70,1,datsun pl510,datsun


Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_brand
500,1,,4,98,67.0,2000,15.049795,81,1,peugeot 504,peugeot
501,2,,4,97,90.0,2720,15.339172,79,1,dodge aspen,dodge
502,5,,4,90,75.0,2807,17.821599,77,1,dodge monaco brougham,dodge
503,6,,4,140,110.0,2807,13.780354,82,2,buick century limited,buick
504,8,,4,121,85.0,3070,13.688921,73,2,saab 99le,saab
...,...,...,...,...,...,...,...,...,...,...,...
995,992,,4,225,105.0,3870,15.376311,76,1,chevrolet nova,chevrolet
996,993,,4,140,112.0,2720,13.616843,82,1,buick century limited,buick
997,996,,8,318,110.0,2774,13.272636,75,1,ford ltd,ford
998,998,,4,90,75.0,2807,17.792560,77,1,dodge monaco brougham,dodge


## train, testの違いを確認する

### car name, car_brand

In [9]:
for brand in np.sort(train_test['car_brand'].unique()):
    print(brand)
    print('train:')
    print(np.unique(train_treated.loc[train_treated['car_brand']==brand, 'car name'].values))
    print("test:")
    print(np.unique(test_treated.loc[test_treated['car_brand']==brand, 'car name'].values))
    print("\n")

amc
train:
['amc ambassador dpl' 'amc gremlin' 'amc hornet' 'amc matador'
 'amc pacer d/l' 'amc spirit dl']
test:
['amc concord d/l' 'amc gremlin' 'amc hornet' 'amc matador'
 'amc pacer d/l' 'amc spirit dl']


audi
train:
['audi 5000s (diesel)']
test:
['audi 5000s (diesel)']


buick
train:
['buick century' 'buick century limited' 'buick lesabre custom'
 'buick regal sport coupe (turbo)' 'buick skyhawk']
test:
['buick century' 'buick century limited' 'buick lesabre custom'
 'buick regal sport coupe (turbo)' 'buick skyhawk' 'buick skylark limited']


capri
train:
['capri ii']
test:
['capri ii']


chevrolet
train:
['chevrolet caprice classic' 'chevrolet impala' 'chevrolet monza 2+2'
 'chevrolet nova' 'chevrolet vega' 'chevrolet woody']
test:
['chevrolet c20' 'chevrolet caprice classic'
 'chevrolet chevelle malibu classic' 'chevrolet impala'
 'chevrolet monte carlo s' 'chevrolet monza 2+2' 'chevrolet nova'
 'chevrolet vega' 'chevrolet woody']


datsun
train:
['datsun 1200' 'datsun 200sx' '

In [15]:
train_treated['pwr'] = train_treated['weight'] / train_treated['horsepower']
test_treated['pwr'] = test_treated['weight'] / test_treated['horsepower']
display(train_treated.head())
display(test_treated.head())

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_brand,pwr
0,0,23.059782,6,140,110.0,2815,17.977429,80,1,dodge aspen,dodge,25.590909
1,3,17.674521,8,350,150.0,4456,13.514535,72,1,dodge rampage,dodge,29.706667
2,4,17.136353,8,302,140.0,2774,13.209912,79,1,mercury cougar brougham,mercury,19.814286
3,7,22.664666,6,400,85.0,2190,15.196381,71,1,pontiac j2000 se hatchback,pontiac,25.764706
4,9,17.872018,8,429,220.0,2245,9.6214,70,1,ford galaxie 500,ford,10.204545


Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_brand,pwr
500,1,,4,98,67.0,2000,15.049795,81,1,peugeot 504,peugeot,29.850746
501,2,,4,97,90.0,2720,15.339172,79,1,dodge aspen,dodge,30.222222
502,5,,4,90,75.0,2807,17.821599,77,1,dodge monaco brougham,dodge,37.426667
503,6,,4,140,110.0,2807,13.780354,82,2,buick century limited,buick,25.518182
504,8,,4,121,85.0,3070,13.688921,73,2,saab 99le,saab,36.117647


In [16]:
train_treated['car_name_code'] = LabelEncoder().fit_transform(train_treated['car name'])
train_treated['car_brand_code'] = LabelEncoder().fit_transform(train_treated['car_brand'])

test_treated['car_name_code'] = LabelEncoder().fit_transform(test_treated['car name'])
test_treated['car_brand_code'] = LabelEncoder().fit_transform(test_treated['car_brand'])

display(train_treated.head())
display(test_treated.head())

Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_brand,pwr,car_name_code,car_brand_code
0,0,23.059782,6,140,110.0,2815,17.977429,80,1,dodge aspen,dodge,25.590909,27,6
1,3,17.674521,8,350,150.0,4456,13.514535,72,1,dodge rampage,dodge,29.706667,33,6
2,4,17.136353,8,302,140.0,2774,13.209912,79,1,mercury cougar brougham,mercury,19.814286,49,11
3,7,22.664666,6,400,85.0,2190,15.196381,71,1,pontiac j2000 se hatchback,pontiac,25.764706,60,16
4,9,17.872018,8,429,220.0,2245,9.6214,70,1,ford galaxie 500,ford,10.204545,38,8


Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car_brand,pwr,car_name_code,car_brand_code
500,1,,4,98,67.0,2000,15.049795,81,1,peugeot 504,peugeot,29.850746,58,14
501,2,,4,97,90.0,2720,15.339172,79,1,dodge aspen,dodge,30.222222,31,6
502,5,,4,90,75.0,2807,17.821599,77,1,dodge monaco brougham,dodge,37.426667,34,6
503,6,,4,140,110.0,2807,13.780354,82,2,buick century limited,buick,25.518182,8,2
504,8,,4,121,85.0,3070,13.688921,73,2,saab 99le,saab,36.117647,68,18


In [18]:
train_treated.drop(['car name', 'car_brand'], axis=1, inplace=True)
test_treated.drop(['car name', 'car_brand'], axis=1, inplace=True)

In [19]:
dt = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

# train, test
train_treated.to_pickle(
    '../features/feature_train_' + dt + '_treated.pkl'
)
test_treated.to_pickle(
    '../features/feature_test_' + dt + '_treated.pkl'
)