## Make features for models

In [26]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import pandas as pd
from src.paths import CLEANED_DATA_DIR, TRANSFORMED_DATA_DIR

In [28]:
# Read the cleaned dataset from the cleaned-data directory.
cleaned_csv_path = CLEANED_DATA_DIR / 'cleaned_data.csv'
data = pd.read_csv(cleaned_csv_path)

In [29]:
# Display (rows, columns) of the freshly loaded frame.
data.shape

(4817, 19)

#### Add date-related features

In [30]:
from src.data import get_season_us_by_month

In [31]:
# Parse the sale dates, then keep only the calendar month number as a feature.
sold_at_dates = pd.to_datetime(data['sold_at'])
data['month_sold_at'] = sold_at_dates.dt.month

In [32]:
# Translate each sale month into its season label (e.g. 1 -> 'winter',
# 4 -> 'spring') using the project helper, one element at a time.
sale_months = data['month_sold_at']
data['season_sold_at'] = sale_months.map(get_season_us_by_month)

In [33]:
# Preview the first rows to check the new month and season columns.
data.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,...,feature_4,feature_5,feature_6,feature_7,feature_8,price,sold_at,age_in_months_when_sold,month_sold_at,season_sold_at
0,BMW,118,140411,100,2012-02-01,diesel,black,convertible,True,True,...,False,True,True,True,False,11300,2018-01-01,71,1,winter
1,BMW,M4,13929,317,2016-04-01,petrol,grey,convertible,True,True,...,False,False,True,True,True,69700,2018-02-01,22,2,winter
2,BMW,320,183297,120,2012-04-01,diesel,white,convertible,False,False,...,False,True,False,True,False,10200,2018-02-01,70,2,winter
3,BMW,420,128035,135,2014-07-01,diesel,red,convertible,True,True,...,False,True,True,True,True,25100,2018-02-01,43,2,winter
4,BMW,425,97097,160,2014-12-01,diesel,silver,convertible,True,True,...,False,False,True,True,True,33400,2018-04-01,40,4,spring


#### Add model-related features

In [34]:
# Keep only the first character of each model key
# (e.g. '118' -> '1', 'M4' -> 'M').
model_keys = data['model_key']
data['model_initial'] = model_keys.str.get(0)

In [35]:
# Preview the first rows to check the new model_initial column.
data.head()

Unnamed: 0,maker_key,model_key,mileage,engine_power,registration_date,fuel,paint_color,car_type,feature_1,feature_2,...,feature_5,feature_6,feature_7,feature_8,price,sold_at,age_in_months_when_sold,month_sold_at,season_sold_at,model_initial
0,BMW,118,140411,100,2012-02-01,diesel,black,convertible,True,True,...,True,True,True,False,11300,2018-01-01,71,1,winter,1
1,BMW,M4,13929,317,2016-04-01,petrol,grey,convertible,True,True,...,False,True,True,True,69700,2018-02-01,22,2,winter,M
2,BMW,320,183297,120,2012-04-01,diesel,white,convertible,False,False,...,True,False,True,False,10200,2018-02-01,70,2,winter,3
3,BMW,420,128035,135,2014-07-01,diesel,red,convertible,True,True,...,True,True,True,True,25100,2018-02-01,43,2,winter,4
4,BMW,425,97097,160,2014-12-01,diesel,silver,convertible,True,True,...,False,True,True,True,33400,2018-04-01,40,4,spring,4


In [36]:
# Count how many rows fall under each model initial.
data['model_initial'].value_counts()

model_initial
3    1785
5    1133
X    1059
1     562
4     103
7      54
2      51
6      30
M      26
i       7
Z       6
A       1
Name: count, dtype: int64

In [37]:
# Write the frame with the added features to the transformed data folder,
# leaving the row index out of the file.
transformed_csv_path = TRANSFORMED_DATA_DIR / 'transformed_data.csv'
data.to_csv(transformed_csv_path, index=False)

#### Prepare for linear regression models that don't handle categorical features

In [38]:
# List the available columns before choosing which ones to one-hot encode.
data.columns

Index(['maker_key', 'model_key', 'mileage', 'engine_power',
       'registration_date', 'fuel', 'paint_color', 'car_type', 'feature_1',
       'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6',
       'feature_7', 'feature_8', 'price', 'sold_at', 'age_in_months_when_sold',
       'month_sold_at', 'season_sold_at', 'model_initial'],
      dtype='object')

In [39]:
# Columns to expand into one indicator column per distinct value.
categorical_columns = [
    'model_key',
    'fuel',
    'paint_color',
    'car_type',
    'month_sold_at',
    'season_sold_at',
    'model_initial',
]

# NOTE(review): every level is kept (no drop_first), season_sold_at is derived
# from month_sold_at, and model_initial is derived from model_key, so the
# indicator columns are linearly dependent. Confirm the downstream linear
# model tolerates this (e.g. regularisation or column selection).
# NOTE(review): maker_key, registration_date and sold_at are not in the list
# above, so they are passed through unencoded. Presumably they are dropped
# before fitting; verify in the modelling notebook.
data_dummies = pd.get_dummies(data, columns=categorical_columns)

In [40]:
# Display (rows, columns) of the one-hot encoded frame.
data_dummies.shape

(4817, 137)

In [41]:
# Write the one-hot encoded frame to the transformed data folder,
# leaving the row index out of the file.
lr_csv_path = TRANSFORMED_DATA_DIR / 'transformed_data_for_lr.csv'
data_dummies.to_csv(lr_csv_path, index=False)