In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import logging
import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, Normalizer, LabelEncoder
from sklearn.feature_selection import SelectKBest,chi2
from xgboost import XGBClassifier, XGBRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from mlxtend.preprocessing import DenseTransformer
from mlxtend.feature_selection import ColumnSelector

from scipy import stats

sns.set(color_codes=True)

In [2]:
logger = logging.getLogger('ai')
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s  %(filename)s : %(levelname)s  %(message)s')

# Features Engineering

## outliers

In [3]:
dateparse = lambda x: pd.datetime.strptime(x, '%d.%m.%Y')
sales_df = pd.read_csv('../input/sales_train.csv', parse_dates = ['date'], date_parser=dateparse)

In [4]:
item_df = pd.read_csv('../input/items.csv')
shop_df = pd.read_csv('../input/shops.csv')
category_df = pd.read_csv('../input/item_categories.csv')

In [5]:
test_df = pd.read_csv('../input/test.csv').set_index('ID')

### Clear the data out of range

In [6]:
logger.info("item_price more than 100000 is %d" % sales_df[sales_df.item_price > 100000].shape[0])
logger.info("item_cnt_day more than 1000 is %d" % sales_df[sales_df.item_cnt_day > 1000].shape[0])

2019-02-13 21:41:01,602  <ipython-input-6-ea685e07fb41> : INFO  item_price more than 100000 is 1
2019-02-13 21:41:01,609  <ipython-input-6-ea685e07fb41> : INFO  item_cnt_day more than 1000 is 1


In [7]:
sales_df = sales_df[sales_df.item_price<100000]
sales_df = sales_df[sales_df.item_cnt_day<1000]

### Correct the negative item price

In [8]:
sales_df[sales_df.item_price < 0]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
484683,2013-05-15,4,32,2973,-1.0,1.0


In [9]:
sales_df.at[484683, 'item_price'] = sales_df[(sales_df.item_id == 2973) & (sales_df.item_price > 0)].item_price.mean()

### Adjust duplicates by shop name

According to the reference, some shops are duplicates. This feature found is based on the language background.

In [10]:
shop_df[shop_df.shop_id == 0]

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0


In [11]:
shop_df[shop_df.shop_id == 57]

Unnamed: 0,shop_name,shop_id
57,"Якутск Орджоникидзе, 56",57


The shop_id 0 and 57 are the same shop. And other pairs are 1 and 58, 10 and 11.

In [12]:
# Якутск Орджоникидзе, 56
sales_df.at[sales_df.shop_id == 0, 'shop_id'] = 57
# Якутск ТЦ "Центральный"
sales_df.at[sales_df.shop_id == 1, 'shop_id'] = 58
# Жуковский ул. Чкалова 39м²
sales_df.at[sales_df.shop_id == 10, 'shop_id'] = 11

## Merge Data into one entity

In [48]:
sales_detail_df = sales_df.groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day'].sum().reset_index(name='item_cnt_month')
sales_price_df = sales_df.groupby(['shop_id', 'item_id', 'date_block_num'])['item_price'].mean().reset_index(name='item_avg_price')

sales_detail_df = pd.merge(sales_detail_df, sales_price_df, on=['shop_id', 'item_id', 'date_block_num'], how='left')
sales_detail_df = pd.merge(sales_detail_df, item_df, on=['item_id'], how='left')
sales_detail_df = pd.merge(sales_detail_df, shop_df, on=['shop_id'], how='left')
sales_detail_df = pd.merge(sales_detail_df, category_df, on=['item_category_id'], how='left')

## Basic Features

+ **Feature 'city'**: Each shop_name starts with the city name.

In [49]:
sales_detail_df.loc[sales_detail_df.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
sales_detail_df['city'] = sales_detail_df['shop_name'].str.split(' ').map(lambda x: x[0])
sales_detail_df.loc[sales_detail_df.city == '!Якутск', 'city'] = 'Якутск'

encoder = LabelEncoder()
encoder.fit(sales_detail_df['city'])
sales_detail_df['city_code'] = encoder.transform(sales_detail_df['city'])

+ **Feature 'type'**: Each category contains type and subtype in its name.
+ **Feature 'subtype'**: Each category contains type and subtype in its name.

In [50]:
sales_detail_df['type'] = sales_detail_df['item_category_name'].map(lambda x: x.split('-')[0].strip())

encoder = LabelEncoder()
encoder.fit(sales_detail_df['type'])
sales_detail_df['type_code'] = encoder.transform(sales_detail_df['type'])

In [51]:
sales_detail_df['subtype'] = sales_detail_df['item_category_name'].map(lambda x: x.split('-')[1].strip() if len(x.split('-')) > 1 else x.split('-')[0].strip())

encoder = LabelEncoder()
encoder.fit(sales_detail_df['subtype'])
sales_detail_df['subtype_code'] = encoder.transform(sales_detail_df['subtype'])

In [52]:
sales_detail_df['month'] = sales_detail_df['date_block_num'].apply(lambda x: (x % 12) + 1)

+ **Feature 'p'**

In [None]:
for i in range(1, 13, 1):
    t = sales_feature_df['shop_id', 'item_category_id', 'date_block_num', 'item_cnt_month']
    t['date_block_num'] = t['date_block_num'] + i
    t = t.rename(index=str, columns={'date_block_num': 'p'+str(i)})
    sales_detail_df = pd.merge(sales_detail_df, t, on=['shop_id', 'item_category_id', 'date_block_num'], how='left')

+ **Feature 'sc'**

In [53]:
t = sales_detail_df.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_cnt_month'].mean().reset_index(name='sc1')
sales_detail_df = pd.merge(sales_detail_df, t, on=['shop_id', 'item_category_id', 'date_block_num'], how='left')

In [79]:
t = sales_detail_df.groupby(['shop_id', 'item_category_id', 'date_block_num'])['item_avg_price'].mean().reset_index(name='sc2')
sales_detail_df = pd.merge(sales_detail_df, t, on=['shop_id', 'item_category_id', 'date_block_num'], how='left')

+ **Feature 'st1'**

In [70]:
t = sales_detail_df.groupby(['shop_id', 'type_code', 'date_block_num'])['item_cnt_month'].mean().reset_index(name='st1')
sales_detail_df = pd.merge(sales_detail_df, t, on=['shop_id', 'type_code', 'date_block_num'], how='left')

+ **Feature 'c1'**

In [54]:
t = sales_detail_df.groupby(['city', 'date_block_num'])['item_cnt_month'].mean().reset_index(name='c1')
sales_detail_df = pd.merge(sales_detail_df, t, on=['city', 'date_block_num'], how='left')

+ **Feature 'i1'**

In [55]:
t = sales_detail_df.groupby(['item_id', 'date_block_num'])['item_cnt_month'].mean().reset_index(name='i1')
sales_detail_df = pd.merge(sales_detail_df, t, on=['item_id', 'date_block_num'], how='left')

+ **Feature 's1'**

In [62]:
t = sales_detail_df.groupby(['shop_id', 'date_block_num'])['item_cnt_month'].mean().reset_index(name='s1')
sales_detail_df = pd.merge(sales_detail_df, t, on=['shop_id', 'date_block_num'], how='left')

+ **Feature 'cat1'**

In [63]:
t = sales_detail_df.groupby(['item_category_id', 'date_block_num'])['item_cnt_month'].mean().reset_index(name='cat1')
sales_detail_df = pd.merge(sales_detail_df, t, on=['item_category_id', 'date_block_num'], how='left')

## Feature Selection

In [None]:
dataset = sales_detail_df[sales_detail_df.date_block_num > 11]

In [80]:
dataset.columns.values

array(['shop_id', 'item_id', 'date_block_num', 'item_cnt_month',
       'item_avg_price', 'item_name', 'item_category_id', 'shop_name',
       'item_category_name', 'city', 'city_code', 'type', 'type_code',
       'subtype', 'subtype_code', 'month', 'sc1', 'c1', 'i1', 's1',
       'cat1', 'st1', 'sc2'], dtype=object)

In [81]:
continuous = [
    'shop_id', 'item_id', 'date_block_num', 'item_avg_price', 'item_category_id', 'city_code', 'type_code', 'subtype_code', 'month', 'sc1', 'c1', 'i1', 's1', 'cat1', 'st1', 'sc2',
    'p1','p2','p3','p4','p5','p6','p7','p8','p9','p10','p11','p12',
]

fields = [
    
]

label = ['item_cnt_month']

In [None]:
feature_processor = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continuous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ])),
#         ('fields', Pipeline([
#             ('extract', ColumnSelector(fields)),
#             ('imputer', SimpleImputer(missing_values=np.nan,  strategy='most_frequent')),
#             ('one_hot', OneHotEncoder(categories='auto')),
#             ('to_dense', DenseTransformer())
#         ])),
    ])),
])

feature_processor.fit(dataset, dataset[label].values.ravel())
selector_model = XGBRegressor(max_depth=3, n_estimators=20, random_state=0)
selector_model.fit(feature_processor.transform(dataset), dataset[label].values.ravel())

In [None]:
selector_model.feature_importances_

In [None]:
feature_selector = []
for index, value in enumerate(selector_model.feature_importances_):
    if value > 0 and index < len(continuous):
        feature_selector.append((continuous[index], value))

feature_selector

## Train

In [82]:
dataset = dataset.replace([np.inf, -np.inf], np.nan)
dataset = dataset.fillna(0)

In [83]:
dataset_beta = dataset[dataset.date_block_num < 33]
dataset_alpha = dataset[dataset.date_block_num == 33]

In [84]:
features_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('continuous', Pipeline([
            ('extract', ColumnSelector(continuous)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ])),
    ])),
])

features_pipeline.fit(dataset_beta, dataset_beta[label].values.ravel())

train_dataset_x = features_pipeline.transform(dataset_beta)
train_dataset_y = dataset_beta[label].values.ravel()

valid_dataset_x = features_pipeline.transform(dataset_alpha)
valid_dataset_y = dataset_alpha[label].values.ravel()

In [85]:
from xgboost import XGBRegressor

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    reg_lambda=10,
    eta=0.3,    
    seed=2018)

model.fit(
    train_dataset_x, 
    train_dataset_y, 
    eval_metric="rmse", 
    eval_set=[(train_dataset_x, train_dataset_y), (valid_dataset_x, valid_dataset_y)], 
    verbose=True, 
    early_stopping_rounds = 10)

[0]	validation_0-rmse:8.12133	validation_1-rmse:6.82314
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:7.63465	validation_1-rmse:6.30553
[2]	validation_0-rmse:7.21255	validation_1-rmse:5.86196
[3]	validation_0-rmse:6.84975	validation_1-rmse:5.55863
[4]	validation_0-rmse:6.52073	validation_1-rmse:5.2173
[5]	validation_0-rmse:6.23961	validation_1-rmse:5.06724
[6]	validation_0-rmse:6.02318	validation_1-rmse:4.808
[7]	validation_0-rmse:5.79232	validation_1-rmse:4.55196
[8]	validation_0-rmse:5.58479	validation_1-rmse:4.3106
[9]	validation_0-rmse:5.4119	validation_1-rmse:4.11212
[10]	validation_0-rmse:5.26952	validation_1-rmse:3.94863
[11]	validation_0-rmse:5.1512	validation_1-rmse:3.82239
[12]	validation_0-rmse:5.04929	validation_1-rmse:3.72502
[13]	validation_0-rmse:4.96511	validation_1-rmse:3.6394
[14]	validation_0-rmse:4.89638	validation_1-rmse:3.57483
[15

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, eta=0.3, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=8, min_child_weight=300, missing=None,
       n_estimators=1000, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=10, scale_pos_weight=1,
       seed=2018, silent=True, subsample=0.8)

## Save to CSV

In [27]:
dataset.to_csv('../features/' + FILENAME + '.csv')

# Reference

## Normal ML Analysis

+ [Feature engineering, xgboost](https://www.kaggle.com/dlarionov/feature-engineering-xgboost)

## Time Series

对于revenge的预测很有帮助，但是对于单个店的单个商品而言没有实际的意义，这种预测方式的学习非常有益处

+ [AR(I)MA时间序列建模过程——步骤和python代码](https://www.jianshu.com/p/cced6617b423)
+ [python时间序列分析](http://www.cnblogs.com/foley/p/5582358.html)
+ [AR、MA及ARMA模型](https://zhuanlan.zhihu.com/p/22248464)
+ [Time Series with Python (ODSC) STA.ipynb](https://github.com/ultimatist/ODSC17/blob/master/Time%20Series%20with%20Python%20(ODSC)%20STA.ipynb)
+ [Getting Started with Time Series](https://pyflux.readthedocs.io/en/latest/getting_started.html)
+ [Welcome to Statsmodels’s Documentation](http://www.statsmodels.org/devel/index.html)