In [1]:
import datetime
import gc
import logging
from itertools import product
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats

from mlxtend.feature_selection import ColumnSelector
from mlxtend.preprocessing import DenseTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest,chi2,SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler, Normalizer, LabelEncoder
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance

# Switch on seaborn's default plot theme for the rest of the notebook;
# color_codes=True is passed straight through to seaborn.
sns.set(color_codes=True)

In [2]:
def _transfer_type(df, cols, dtype):
    for v in cols:
        df[v] = df[v].astype(dtype)
    
    return df

# Feature Engineering

## outliers

从外部获取所有数据集合，做类型整理，方便后面进行使用，使得pandas的dataframe所占用的内存变小

In [3]:
# Load the raw competition tables.
# The `date` column of sales_train.csv is written as DD.MM.YYYY, so it is
# parsed with an explicit format after loading. This replaces the former
# `pd.datetime` / `date_parser` combination, which newer pandas releases no
# longer provide, and works on older releases as well.
sales_df = pd.read_csv('../input/sales_train.csv')
sales_df['date'] = pd.to_datetime(sales_df['date'], format='%d.%m.%Y')

item_df = pd.read_csv('../input/items.csv')
shop_df = pd.read_csv('../input/shops.csv')
category_df = pd.read_csv('../input/item_categories.csv')

# The test file is indexed by its `ID` column; that index is reused as the
# submission ID at the end of the notebook.
test_df = pd.read_csv('../input/test.csv').set_index('ID')

数据类型整理，减少空间使用

In [4]:
# Shrink the integer id/count columns to 16 bits to reduce memory use.
sales_df = _transfer_type(sales_df, ['date_block_num', 'shop_id', 'item_id', 'item_cnt_day'], np.int16)
# Prices stay in float32: float16 cannot represent anything above 65504, so
# the `item_price < 100000` outlier filter that follows would be comparing
# against overflowed (inf) values, and ordinary prices would be rounded.
sales_df = _transfer_type(sales_df, ['item_price'], np.float32)

item_df = _transfer_type(item_df, ['item_id', 'item_category_id'], np.int16)
item_df['item_name'] = item_df['item_name'].astype(str)

shop_df['shop_name'] = shop_df['shop_name'].astype(str)
shop_df['shop_id'] = shop_df['shop_id'].astype(np.int16)

category_df['item_category_name'] = category_df['item_category_name'].astype(str)
category_df['item_category_id'] = category_df['item_category_id'].astype(np.int16)

### Clear the data out of range

In [5]:
# Keep only rows whose price and daily count are both below the outlier
# thresholds.
sales_df = sales_df[(sales_df['item_price'] < 100000) & (sales_df['item_cnt_day'] < 1000)]

### Correct the negative item price

In [6]:
# Display the sales rows recorded with a negative price.
sales_df.loc[sales_df['item_price'] < 0]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
484683,2013-05-15,4,32,2973,-1.0,1


In [7]:
# Replace every negative price with the mean of the positive prices recorded
# for the same item. Rows are located with a mask rather than a hard-coded
# row label, so the fix still targets the right rows if the index changes
# (assigning to a missing label with `.at` would silently append a new row).
negative_price = sales_df.item_price < 0
for bad_item in sales_df.loc[negative_price, 'item_id'].unique():
    same_item = sales_df.item_id == bad_item
    typical_price = sales_df.loc[same_item & (sales_df.item_price > 0), 'item_price'].mean()
    sales_df.loc[same_item & negative_price, 'item_price'] = typical_price

### Adjust duplicates by shop name

According to the reference, some shops are duplicates of each other. This was found by reading the shop names, so spotting it relies on knowing the language they are written in.

In [8]:
shop_df[shop_df.shop_id == 0]

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0


In [9]:
shop_df[shop_df.shop_id == 57]

Unnamed: 0,shop_name,shop_id
57,"Якутск Орджоникидзе, 56",57


Shop IDs 0 and 57 refer to the same shop. The other duplicate pairs are 1 and 58, and 10 and 11.

In [10]:
# Fold each duplicated shop id into its twin. `.loc` is used because the rows
# are selected with a boolean mask; `.at` only accepts a single row label.
# NOTE(review): test_df is not remapped here. If test.csv contains any of the
# old ids (0, 1, 10), those rows will not line up with the merged history --
# confirm against the test file.

# Yakutsk, Ordzhonikidze 56 (Якутск Орджоникидзе, 56)
sales_df.loc[sales_df.shop_id == 0, 'shop_id'] = 57
# Yakutsk, "Tsentralny" shopping centre (Якутск ТЦ "Центральный")
sales_df.loc[sales_df.shop_id == 1, 'shop_id'] = 58
# Zhukovsky, Chkalova St. 39m² (Жуковский ул. Чкалова 39м²)
sales_df.loc[sales_df.shop_id == 10, 'shop_id'] = 11

## Extract Features

+ **Feature 'r' series**: Sales record

In [11]:
def _rename(prefix):
    cols = ['2013-01', '2013-02', '2013-03', '2013-04','2013-05','2013-06','2013-07','2013-08','2013-09','2013-10','2013-11','2013-12','2014-01','2014-02','2014-03','2014-04','2014-05','2014-06','2014-07','2014-08','2014-09','2014-10','2014-11','2014-12','2015-01','2015-02','2015-03','2015-04','2015-05','2015-06','2015-07','2015-08', '2015-09','2015-10']
    
    result = {}
    for i in range(1, len(cols) + 1, 1):
        result[cols[i-1]] = prefix + str(i)
    
    return result

In [12]:
# Monthly sales per (item, shop): sum the daily counts inside each calendar
# month, then pivot so that every month becomes its own column.
month_key = sales_df.date.apply(lambda day: day.strftime('%Y-%m'))
monthly_sales = (
    sales_df
    .groupby([month_key, 'item_id', 'shop_id'])
    .agg({'item_cnt_day': 'sum'})
    .reset_index()
)
monthly_sales = monthly_sales[['date', 'item_id', 'shop_id', 'item_cnt_day']]
monthly_sales = (
    monthly_sales
    .pivot_table(index=['item_id', 'shop_id'], columns='date', values='item_cnt_day', fill_value=0)
    .reset_index()
    # 'YYYY-MM' column labels become r1..r34 (see _rename).
    .rename(index=str, columns=_rename('r'))
)
sales_detail_df = monthly_sales.copy()

# Release the intermediates before the dtype conversions below.
del monthly_sales, month_key
gc.collect()

sales_detail_df = _transfer_type(sales_detail_df, ['item_id', 'shop_id'], np.int16)

lag_columns = ['r' + str(month_no) for month_no in range(1, 35)]
sales_detail_df = _transfer_type(sales_detail_df, lag_columns, np.float16)
sales_detail_df = sales_detail_df.drop_duplicates(['item_id', 'shop_id'])

In [16]:
# Add every (item, shop) pair that appears in the test set; pairs that have no
# sales history get 0 in all monthly columns via the fillna.
leak_df = test_df.loc[:, ['item_id', 'shop_id']].drop_duplicates()
sales_detail_df = (
    sales_detail_df
    .merge(leak_df, how='outer', on=['item_id', 'shop_id'])
    .fillna(0)
)
sales_detail_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 523789 entries, 0 to 523788
Data columns (total 36 columns):
item_id    523789 non-null int64
shop_id    523789 non-null int64
r1         523789 non-null float16
r2         523789 non-null float16
r3         523789 non-null float16
r4         523789 non-null float16
r5         523789 non-null float16
r6         523789 non-null float16
r7         523789 non-null float16
r8         523789 non-null float16
r9         523789 non-null float16
r10        523789 non-null float16
r11        523789 non-null float16
r12        523789 non-null float16
r13        523789 non-null float16
r14        523789 non-null float16
r15        523789 non-null float16
r16        523789 non-null float16
r17        523789 non-null float16
r18        523789 non-null float16
r19        523789 non-null float16
r20        523789 non-null float16
r21        523789 non-null float16
r22        523789 non-null float16
r23        523789 non-null float16
r24        523789 

In [17]:
def _extract(df, i, is_test=False):
    arr = ['item_id','shop_id']
    for j in range(1, 25, 1):
        arr = arr + ['r' + str(i-j)]
    
    if not is_test:
        arr = arr + ['r' + str(i)]
    
    tdf = df[arr]
    tdf['date_block_num'] = i - 1
    
    return tdf

In [18]:
# Training table: one block of rows per target month column r25..r34.
# _extract returns item_id, shop_id, the 24 preceding months (most recent
# first), the target month and date_block_num, in exactly the order of `cols`,
# so each block is relabelled positionally: r1 is the month just before the
# target and item_cnt_month is the target itself.
cols = ['item_id', 'shop_id'] + ['r' + str(k) for k in range(1, 25)] + ['item_cnt_month', 'date_block_num']

# Collect the blocks and concatenate once. Growing a frame with
# DataFrame.append inside the loop copies it on every pass, and that method is
# not available in newer pandas. Renaming the columns (rather than rebuilding
# each block from `.values`) also keeps the compact dtypes.
monthly_frames = []
for i in range(25, 35, 1):
    tdf = _extract(sales_detail_df, i)
    monthly_frames.append(tdf.rename(columns=dict(zip(tdf.columns, cols))))
sales_record_df = pd.concat(monthly_frames, ignore_index=True)

# Calendar month (1-12) derived from the month block number.
sales_record_df['month'] = sales_record_df['date_block_num'] % 12 + 1

sales_record_df = _transfer_type(sales_record_df, ['r' + str(k) for k in range(1, 25)], np.float16)
sales_record_df = _transfer_type(sales_record_df, ['item_id', 'shop_id', 'date_block_num', 'month'], np.int16)
sales_record_df = _transfer_type(sales_record_df, ['item_cnt_month'], np.float16)
sales_record_df = sales_record_df.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


+ **Feature 'city'**: Each shop_name starts with the city name.

In [19]:
# Join the one two-word city name so that the first space-separated token of
# every shop name can be taken as the city.
shop_df.loc[shop_df['shop_name'] == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
shop_df['shop_name'] = shop_df['shop_name'].astype(str)
shop_df['city'] = shop_df['shop_name'].str.split(' ').str.get(0)
# Strip the stray leading '!' from this one city token.
shop_df.loc[shop_df['city'] == '!Якутск', 'city'] = 'Якутск'

# Integer code per distinct city.
encoder = LabelEncoder()
shop_df['city_code'] = encoder.fit_transform(shop_df['city'])

+ **Feature 'type'**: Each category contains type and subtype in its name.
+ **Feature 'subtype'**: Each category contains type and subtype in its name.

In [20]:
# The category type is the text before the first '-' in the category name.
category_df['item_category_name'] = category_df['item_category_name'].astype(str)
category_df['type'] = (
    category_df['item_category_name']
    .str.split('-')
    .str.get(0)
    .str.strip()
)

# Integer code per distinct type.
encoder = LabelEncoder()
category_df['type_code'] = encoder.fit_transform(category_df['type'])

In [21]:
# The subtype is the second '-'-separated piece of the category name; names
# without a '-' fall back to their first (only) piece.
category_df['subtype'] = (
    category_df['item_category_name']
    .str.split('-')
    .map(lambda pieces: (pieces[1] if len(pieces) > 1 else pieces[0]).strip())
)

# Integer code per distinct subtype.
encoder = LabelEncoder()
category_df['subtype_code'] = encoder.fit_transform(category_df['subtype'])

## Merge Data into one entity

In [22]:
# Attach the item, shop and category attributes, fill anything unmatched with
# 0, and keep only the numeric codes (the text columns are dropped).
sales_record_df = (
    sales_record_df
    .merge(item_df, on=['item_id'], how='left')
    .merge(shop_df, on=['shop_id'], how='left')
    .merge(category_df, on=['item_category_id'], how='left')
    .fillna(0)
    .drop(['city', 'type', 'subtype', 'item_name', 'shop_name', 'item_category_name'], axis=1)
)
sales_record_df = _transfer_type(
    sales_record_df,
    ['item_category_id', 'city_code', 'type_code', 'subtype_code'],
    np.int16,
)

sales_record_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5237890 entries, 0 to 5237889
Data columns (total 33 columns):
item_id             int16
shop_id             int16
r1                  float16
r2                  float16
r3                  float16
r4                  float16
r5                  float16
r6                  float16
r7                  float16
r8                  float16
r9                  float16
r10                 float16
r11                 float16
r12                 float16
r13                 float16
r14                 float16
r15                 float16
r16                 float16
r17                 float16
r18                 float16
r19                 float16
r20                 float16
r21                 float16
r22                 float16
r23                 float16
r24                 float16
item_cnt_month      float16
date_block_num      int16
month               int16
item_category_id    int16
city_code           int16
type_code           int16
subty

In [23]:
def _mean(df, prefix):
    
    df[prefix + '25'] = 0
    for i in range(1, 25, 1):
        df[prefix + '25'] = df[prefix + '25'] + df[prefix + str(i)]
    
    df[prefix + '25'] = df[prefix + '25'] /24
    df[prefix + '25'] = df[prefix + '25'].fillna(0).astype(np.float16)
    return df

In [24]:
# Append r25: the per-row mean of the 24 lagged monthly counts r1..r24.
sales_record_df = _mean(df=sales_record_df, prefix='r')

In [25]:
def _agg(df, cols, prefix):
    """Build per-group means of the lag columns ``r1`` .. ``r24``.

    For every distinct combination of ``cols`` the result holds the group mean
    of each ``r<n>`` column, named ``prefix + '<n>'`` and stored as float16
    (NaN means become 0), plus ``prefix + '25'`` as added by ``_mean``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``cols`` and ``r1`` .. ``r24``.
    cols : list of str
        Grouping key columns.
    prefix : str
        Prefix for the generated column names.

    Returns
    -------
    pandas.DataFrame
        One row per distinct key combination, in order of first appearance.
    """
    grouped = df.groupby(cols)
    per_lag_means = [
        grouped['r' + str(lag)].mean().fillna(0).astype(np.float16).rename(prefix + str(lag))
        for lag in range(1, 25)
    ]
    # All 24 series share the group-key index, so they line up column-wise.
    group_means = pd.concat(per_lag_means, axis=1).reset_index()

    distinct_keys = df[cols].drop_duplicates()
    result = pd.merge(distinct_keys, group_means, on=cols, how='left')

    return _mean(result, prefix)

In [26]:
# Group-level lag means: shop x category (sc), shop x type (st), item (i),
# category (it), shop (s) and city (c). All six are computed from the table
# before any of them is merged in.
sc_df = _agg(sales_record_df, cols=['shop_id', 'item_category_id'], prefix='sc')
st_df = _agg(sales_record_df, cols=['shop_id', 'type_code'], prefix='st')
i_df = _agg(sales_record_df, cols=['item_id'], prefix='i')
it_df = _agg(sales_record_df, cols=['item_category_id'], prefix='it')
s_df = _agg(sales_record_df, cols=['shop_id'], prefix='s')
c_df = _agg(sales_record_df, cols=['city_code'], prefix='c')

sales_record_df = (
    sales_record_df
    .merge(sc_df, on=['shop_id', 'item_category_id'], how='left')
    .merge(st_df, on=['shop_id', 'type_code'], how='left')
    .merge(i_df, on=['item_id'], how='left')
    .merge(it_df, on=['item_category_id'], how='left')
    .merge(s_df, on=['shop_id'], how='left')
    .merge(c_df, on=['city_code'], how='left')
)

In [28]:
# Any value still missing after the merges becomes 0; then show the schema.
sales_record_df = sales_record_df.fillna(value=0)
sales_record_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5237890 entries, 0 to 5237889
Columns: 184 entries, item_id to c25
dtypes: float16(176), int16(8)
memory usage: 1.8 GB


## Feature Selection

In [29]:
# `dataset` is another name for the same frame (no copy is made); list its columns.
dataset = sales_record_df
np.asarray(dataset.columns)

array(['item_id', 'shop_id', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7',
       'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15', 'r16', 'r17',
       'r18', 'r19', 'r20', 'r21', 'r22', 'r23', 'r24', 'item_cnt_month',
       'date_block_num', 'month', 'item_category_id', 'city_code',
       'type_code', 'subtype_code', 'r25', 'sc1', 'sc2', 'sc3', 'sc4',
       'sc5', 'sc6', 'sc7', 'sc8', 'sc9', 'sc10', 'sc11', 'sc12', 'sc13',
       'sc14', 'sc15', 'sc16', 'sc17', 'sc18', 'sc19', 'sc20', 'sc21',
       'sc22', 'sc23', 'sc24', 'sc25', 'st1', 'st2', 'st3', 'st4', 'st5',
       'st6', 'st7', 'st8', 'st9', 'st10', 'st11', 'st12', 'st13', 'st14',
       'st15', 'st16', 'st17', 'st18', 'st19', 'st20', 'st21', 'st22',
       'st23', 'st24', 'st25', 'i1', 'i2', 'i3', 'i4', 'i5', 'i6', 'i7',
       'i8', 'i9', 'i10', 'i11', 'i12', 'i13', 'i14', 'i15', 'i16', 'i17',
       'i18', 'i19', 'i20', 'i21', 'i22', 'i23', 'i24', 'i25', 'it1',
       'it2', 'it3', 'it4', 'it5', 'it6', 'it7', 'it8', 'it

划分测试和训练集合

In [30]:
# dataset_beta: month blocks before 33 (used for fitting and validation).
# dataset_alpha: month block 33 (used for the hold-out score).
dataset_beta = dataset.loc[dataset['date_block_num'] < 33]
dataset_alpha = dataset.loc[dataset['date_block_num'] == 33]

In [31]:
# Model inputs, in the same order as the dataset columns (the target column is
# left out): ids, the 24 lags, calendar/category codes, the lag mean r25, and
# then 25 columns for each aggregate prefix.
features = (
    ['item_id', 'shop_id']
    + ['r' + str(lag) for lag in range(1, 25)]
    + ['date_block_num', 'month', 'item_category_id', 'city_code',
       'type_code', 'subtype_code', 'r25']
    + [
        group_prefix + str(lag)
        for group_prefix in ('sc', 'st', 'i', 'it', 's', 'c')
        for lag in range(1, 26)
    ]
)
# Target column.
label = 'item_cnt_month'

## Train

### 全量数据进行训练

In [32]:
# Random 70/30 split of the pre-block-33 rows into fitting and validation
# parts, wrapped as DMatrix objects. `xgb` and `train_test_split` come from
# the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_beta[features],
    dataset_beta[label],
    test_size=0.3,
    random_state=2019,
)
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
valid_dmatrix = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Booster settings.
# NOTE(review): newer XGBoost releases spell 'reg:linear' as
# 'reg:squarederror' and replace 'silent' with 'verbosity' -- switch once the
# installed version is confirmed.
# The former 'max_evals' entry was dropped: it is not an XGBoost parameter.
params = {
  'booster': 'gbtree',
  'objective': 'reg:linear',
  'eta': 0.1,
  'gamma': 0.7000000000000001,
  'min_child_weight': 7,
  'max_depth': 4,
  'subsample': 0.5,
  'colsample_bytree': 0.1,
  'nthread': 2,
  'silent': 0,
  'seed': 2019,
}

# Both sets are scored every round; early stopping is applied after 10 rounds
# without improvement, with at most 190 rounds in total.
watchlist = [(train_dmatrix, 'train'), (valid_dmatrix, 'validate')]
bst = xgb.train(params, train_dmatrix, evals=watchlist, early_stopping_rounds=10, num_boost_round=190)

In [None]:
# Refit with the number of rounds found by early stopping. best_iteration is a
# 0-based round index, so the round count is best_iteration + 1 (without the
# +1 the model is one tree short, and empty when round 0 is the best).
model = xgb.train(params, train_dmatrix, num_boost_round=bst.best_iteration + 1)

In [None]:
# Hold-out score: RMSE on the block-33 rows of the (shop, item) pairs listed
# in the test file. `mean_squared_error` and `sqrt` come from the notebook's
# top import cell.
t = pd.merge(test_df, dataset_alpha, on=['shop_id', 'item_id'], how='left')
test_dmatrix = xgb.DMatrix(t[features], label=t[label])

pred = model.predict(test_dmatrix)
sqrt(mean_squared_error(t[label], pred))

## Predict

In [36]:
# Lag window for the month to predict: _extract(..., 35, is_test=True) selects
# r34..r11 and sets date_block_num to 34; the columns are then relabelled
# positionally as r1..r24.
tdf = _extract(sales_detail_df, 35, is_test=True)
t_cols = ['item_id', 'shop_id'] + ['r' + str(k) for k in range(1, 25)] + ['date_block_num']
tdf = pd.DataFrame(columns=t_cols, data=tdf.values).drop_duplicates(['item_id', 'shop_id'])

# One row per test row, in test order, with lags and lookup attributes.
dataset_predict = (
    test_df
    .merge(tdf, on=['item_id', 'shop_id'], how='left')
    .fillna(0)
    .merge(item_df, on=['item_id'], how='left')
    .merge(shop_df, on=['shop_id'], how='left')
    .merge(category_df, on=['item_category_id'], how='left')
)

# Calendar month (1-12) derived from the month block number.
dataset_predict['month'] = dataset_predict['date_block_num'] % 12 + 1

dataset_predict = dataset_predict.drop(
    ['city', 'type', 'subtype', 'item_name', 'shop_name', 'item_category_name'],
    axis=1,
)
dataset_predict = _transfer_type(
    dataset_predict,
    ['item_category_id', 'city_code', 'type_code', 'subtype_code', 'month'],
    np.int16,
)

dataset_predict = _mean(df=dataset_predict, prefix='r')

# NOTE(review): these group means are computed from the test rows only,
# whereas the training cell computes them from sales_record_df -- confirm the
# difference is intended.
sc_df = _agg(dataset_predict, cols=['shop_id', 'item_category_id'], prefix='sc')
st_df = _agg(dataset_predict, cols=['shop_id', 'type_code'], prefix='st')
i_df = _agg(dataset_predict, cols=['item_id'], prefix='i')
it_df = _agg(dataset_predict, cols=['item_category_id'], prefix='it')
s_df = _agg(dataset_predict, cols=['shop_id'], prefix='s')
c_df = _agg(dataset_predict, cols=['city_code'], prefix='c')

dataset_predict = (
    dataset_predict
    .merge(sc_df, on=['shop_id', 'item_category_id'], how='left')
    .merge(st_df, on=['shop_id', 'type_code'], how='left')
    .merge(i_df, on=['item_id'], how='left')
    .merge(it_df, on=['item_category_id'], how='left')
    .merge(s_df, on=['shop_id'], how='left')
    .merge(c_df, on=['city_code'], how='left')
    .fillna(0)
)
dataset_predict.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 214200 entries, 0 to 214199
Columns: 183 entries, shop_id to c25
dtypes: float16(151), float64(25), int16(5), int64(2)
memory usage: 109.5 MB


In [37]:
# Score the prediction rows with the fitted model.
predict_dmatrix = xgb.DMatrix(data=dataset_predict[features])
pred = model.predict(predict_dmatrix)

In [38]:
# Submission file: one row per test ID, predictions limited to [0, 20].
submission = pd.DataFrame({
    "ID": test_df.index,
    "item_cnt_month": np.clip(pred, 0, 20),
})

submission.to_csv('submission.csv', index=False)

# Reference

## Normal ML Analysis

+ [Feature engineering, xgboost](https://www.kaggle.com/dlarionov/feature-engineering-xgboost)
+ [Predicting sales using Lightgbm](https://www.kaggle.com/sanket30/predicting-sales-using-lightgbm)

## Time Series

对于revenue的预测很有帮助，但是对于单个店的单个商品而言没有实际的意义，这种预测方式的学习非常有益处

+ [AR(I)MA时间序列建模过程——步骤和python代码](https://www.jianshu.com/p/cced6617b423)
+ [python时间序列分析](http://www.cnblogs.com/foley/p/5582358.html)
+ [AR、MA及ARMA模型](https://zhuanlan.zhihu.com/p/22248464)
+ [Time Series with Python (ODSC) STA.ipynb](https://github.com/ultimatist/ODSC17/blob/master/Time%20Series%20with%20Python%20(ODSC)%20STA.ipynb)
+ [Getting Started with Time Series](https://pyflux.readthedocs.io/en/latest/getting_started.html)
+ [Welcome to Statsmodels’s Documentation](http://www.statsmodels.org/devel/index.html)