In [3]:
import pandas as pd
import numpy as np
import gc
%matplotlib inline 
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [4]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [5]:
## load data
item_categories = downcast_dtypes(pd.read_csv('data/item_categories.csv'))
items = downcast_dtypes(pd.read_csv('data/items.csv'))
shops = downcast_dtypes(pd.read_csv('data/shops.csv'))
sales_train = downcast_dtypes(pd.read_csv('data/sales_train.csv'))

In [6]:
test = downcast_dtypes(pd.read_csv('data/test.csv'))
test.shape

(214200, 3)

In [7]:
item_categories.head()

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [8]:
test_train = test.copy()
test_train['date_block_num'] = 34
test_train.head()

Unnamed: 0,ID,shop_id,item_id,date_block_num
0,0,5,5037,34
1,1,5,5320,34
2,2,5,5233,34
3,3,5,5232,34
4,4,5,5268,34


In [9]:
from itertools import product
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales_train['date_block_num'].unique():
    cur_shops = sales_train[sales_train['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales_train[sales_train['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

#get aggregated values for (shop_id, item_id, month)
gb = sales_train.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}}) #, 'item_price' : {'max_price':'max', 'min_price':'min', 'avg_price': 'mean'}})

#fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
#join aggregated data to the grid
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
all_data = pd.concat([all_data, test_train.drop('ID', axis=1)], axis=0)

# # Groupby data to get shop-item-month aggregates
# gb = sales_train.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
# # Fix column names
# gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
# # Join it to the grid
# all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates
gb = sales_train.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales_train.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

## create future using item category
all_data = all_data.merge(items[['item_id','item_category_id']], how='left', on = ['item_id'])
sales_train_dummy = sales_train.merge(items, how='left', on=['item_id'])
gb = sales_train_dummy.groupby(['item_category_id', 'date_block_num'], as_index=False).agg({'item_cnt_day': {'target_item_category' : 'sum'}})
gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['item_category_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
del grid, gb, sales_train_dummy
gc.collect();

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [10]:
all_data[all_data.date_block_num ==34].head(10)

Unnamed: 0,date_block_num,item_id,shop_id,target,target_shop,target_item,item_category_id,target_item_category
10913850,34,5037,5,0.0,0.0,0.0,19,0.0
10913851,34,5320,5,0.0,0.0,0.0,55,0.0
10913852,34,5233,5,0.0,0.0,0.0,19,0.0
10913853,34,5232,5,0.0,0.0,0.0,23,0.0
10913854,34,5268,5,0.0,0.0,0.0,20,0.0
10913855,34,5039,5,0.0,0.0,0.0,23,0.0
10913856,34,5041,5,0.0,0.0,0.0,20,0.0
10913857,34,5046,5,0.0,0.0,0.0,55,0.0
10913858,34,5319,5,0.0,0.0,0.0,55,0.0
10913859,34,5003,5,0.0,0.0,0.0,20,0.0


In [11]:
# List of columns that we will use to create lags
index_cols.append('item_category_id')
cols_to_rename = list(all_data.columns.difference(index_cols)) 

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12] 

# List of all lagged features
fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 
# We will drop these at fitting stage
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

# Category for each item
# item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

# all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();




In [12]:
all_data[all_data.date_block_num ==34].head(10)

Unnamed: 0,date_block_num,item_id,shop_id,target,target_shop,target_item,item_category_id,target_item_category,target_lag_1,target_item_lag_1,...,target_item_category_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_item_category_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_item_category_lag_12,target_shop_lag_12
10913850,34,5037,5,0.0,0.0,0.0,19,0.0,0.0,25.0,...,3178.0,991.0,1.0,105.0,3487.0,954.0,1.0,65.0,6134.0,1445.0
10913851,34,5320,5,0.0,0.0,0.0,55,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10913852,34,5233,5,0.0,0.0,0.0,19,0.0,1.0,42.0,...,3178.0,991.0,2.0,119.0,3487.0,954.0,0.0,0.0,0.0,0.0
10913853,34,5232,5,0.0,0.0,0.0,23,0.0,0.0,28.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10913854,34,5268,5,0.0,0.0,0.0,20,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10913855,34,5039,5,0.0,0.0,0.0,23,0.0,1.0,29.0,...,2759.0,991.0,0.0,84.0,2984.0,954.0,0.0,45.0,5275.0,1445.0
10913856,34,5041,5,0.0,0.0,0.0,20,0.0,2.0,62.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10913857,34,5046,5,0.0,0.0,0.0,55,0.0,0.0,12.0,...,6474.0,991.0,1.0,21.0,6017.0,954.0,1.0,29.0,9809.0,1445.0
10913858,34,5319,5,0.0,0.0,0.0,55,0.0,0.0,26.0,...,6474.0,991.0,4.0,56.0,6017.0,954.0,5.0,270.0,9809.0,1445.0
10913859,34,5003,5,0.0,0.0,0.0,20,0.0,0.0,95.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
# item_category_vect = CountVectorizer()
# item_name_vect = CountVectorizer()
# shop_name_vect = CountVectorizer()

In [14]:
# item_category_vect.fit(item_categories.item_category_name)
# item_name_vect.fit(items.item_name)
# shop_name_vect.fit(shops.shop_name)

# item_categories = pd.concat([item_categories,pd.DataFrame(item_category_vect.transform(item_categories.item_category_name).todense(), columns=item_category_vect.get_feature_names(),dtype=np.int32)], axis=1)
# items = pd.concat([items, pd.DataFrame(item_name_vect.transform(items.item_name).todense(), columns=item_name_vect.get_feature_names(), dtype=np.int32)], axis=1)
# shops = pd.concat([shops, pd.DataFrame(shop_name_vect.transform(shops.shop_name).todense(), columns=shop_name_vect.get_feature_names(), dtype=np.int32)], axis=1)
# del item_category_vect, item_name_vect, shop_name_vect
# gc.collect();
# print('item_categories shape: ', item_categories.shape)
# print('items shape: ', items.shape)
# print('shops shape: ', shops.shape)

In [15]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = all_data['date_block_num']

last_block = dates.max()
val_block = dates.max() -1
print('Test `date_block_num` is %d' % last_block)
print('Val `date_block_num` is %d' % val_block)

Test `date_block_num` is 34
Val `date_block_num` is 33


In [16]:
dates_train = dates[dates < val_block]
dates_val = dates[dates == val_block]
dates_test = dates[dates == last_block]

X_train = all_data.loc[dates <  val_block]
X_val = all_data.loc[dates == val_block]
X_test =  all_data.loc[dates == last_block]

y_train = all_data.loc[dates <  val_block, 'target'].values
y_val =  all_data.loc[dates == val_block, 'target'].values

In [17]:
#mean encoding

# item_id encoding
from sklearn.model_selection import KFold
target = X_train.target.values
kf = KFold(n_splits = 5, shuffle = False)
X_train['item_target_enc'] = np.nan
for tr_inds, val_inds in kf.split(target):
    tr_fold, val_fold = X_train.iloc[tr_inds], X_train.iloc[val_inds]
    val_fold['item_target_enc'] = val_fold['item_id'].map(tr_fold.groupby('item_id').target.mean())
    X_train.iloc[val_inds, :] = val_fold

global_mean = X_train.target.mean()
X_train.fillna(global_mean, inplace= True)

#shopid_id encoding
X_train['shop_target_enc'] = np.nan
for tr_inds, val_inds in kf.split(target):
    tr_fold, val_fold = X_train.iloc[tr_inds], X_train.iloc[val_inds]
    val_fold['shop_target_enc'] = val_fold['shop_id'].map(tr_fold.groupby('shop_id').target.mean())
    X_train.iloc[val_inds, :] = val_fold

global_mean = X_train.target.mean()
X_train.fillna(global_mean, inplace= True)

# caategory_id encoding
X_train['category_target_enc'] = np.nan
for tr_inds, val_inds in kf.split(target):
    tr_fold, val_fold = X_train.iloc[tr_inds], X_train.iloc[val_inds]
    val_fold['category_target_enc'] = val_fold['item_category_id'].map(tr_fold.groupby('item_category_id').target.mean())
    X_train.iloc[val_inds, :] = val_fold

global_mean = X_train.target.mean()
X_train.fillna(global_mean, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-vi

In [18]:
X_train = downcast_dtypes(X_train)
X_train.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


Unnamed: 0,date_block_num,item_id,shop_id,target,target_shop,target_item,item_category_id,target_item_category,target_lag_1,target_item_lag_1,...,target_item_lag_5,target_item_category_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_item_category_lag_12,target_shop_lag_12,item_target_enc,shop_target_enc,category_target_enc
4488756,12,27,2,0.0,890.0,1.0,19,9282.0,0.0,4.0,...,1.0,10488.0,875.0,1.0,7.0,8983.0,1146.0,0.040816,0.161184,0.585405
4488757,12,30,2,0.0,890.0,58.0,40,22065.0,0.0,47.0,...,19.0,24130.0,875.0,0.0,0.0,0.0,0.0,0.205596,0.161184,0.242273
4488758,12,31,2,0.0,890.0,15.0,37,7511.0,0.0,25.0,...,25.0,8680.0,875.0,0.0,0.0,0.0,0.0,0.324818,0.161184,0.159289
4488759,12,32,2,1.0,890.0,84.0,40,22065.0,0.0,89.0,...,72.0,24130.0,875.0,0.0,299.0,33489.0,1146.0,0.603406,0.161184,0.242273
4488760,12,33,2,1.0,890.0,42.0,37,7511.0,1.0,42.0,...,35.0,8680.0,875.0,1.0,61.0,6094.0,1146.0,0.350365,0.161184,0.159289
4488761,12,34,2,0.0,890.0,6.0,40,22065.0,0.0,5.0,...,12.0,24130.0,875.0,0.0,9.0,33489.0,1146.0,0.033784,0.161184,0.242273
4488762,12,36,2,0.0,890.0,2.0,37,7511.0,0.0,6.0,...,3.0,8680.0,875.0,0.0,0.0,0.0,0.0,0.031311,0.161184,0.159289
4488763,12,37,2,0.0,890.0,5.0,40,22065.0,0.0,8.0,...,15.0,24130.0,875.0,0.0,0.0,0.0,0.0,0.053459,0.161184,0.242273
4488764,12,39,2,0.0,890.0,1.0,41,1257.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022472,0.161184,0.141051
4488765,12,40,2,0.0,890.0,2.0,57,984.0,0.0,5.0,...,3.0,1316.0,875.0,0.0,4.0,1539.0,1146.0,0.037433,0.161184,0.093589


In [19]:
#mean encoding for test data
X_val['item_target_enc'] = X_val['item_id'].map(X_train.groupby('item_id').target.mean())
X_val['shop_target_enc'] = X_val['shop_id'].map(X_train.groupby('shop_id').target.mean())
X_val['category_target_enc'] = X_val['item_category_id'].map(X_train.groupby('item_category_id').target.mean())
X_val.fillna(global_mean, inplace=True)
X_val = downcast_dtypes(X_val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-

In [20]:
## mean encoding for test data
X_test['item_target_enc'] = X_test['item_id'].map(all_data[all_data.date_block_num < last_block].groupby('item_id').target.mean())
X_test['shop_target_enc'] = X_test['shop_id'].map(all_data[all_data.date_block_num < last_block].groupby('shop_id').target.mean())
X_test['category_target_enc'] = X_test['item_category_id'].map(all_data[all_data.date_block_num < last_block].groupby('item_category_id').target.mean())
X_test.fillna(global_mean, inplace=True)
X_test = downcast_dtypes(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

In [34]:
class DropItems(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict.drop(self.key, axis=1)

In [35]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('unionCount', FeatureUnion(
        transformer_list=[
            ('item_count', Pipeline([
                ('item_name_select', ItemSelector('item_name')),
                ('countVector', CountVectorizer())])),
            ('item_cat_count', Pipeline([
                ('item_cat_name_select', ItemSelector('item_category_name')),
                ('countVector', CountVectorizer())])),
            ('shop_count', Pipeline([
                ('shop_name_select', ItemSelector('shop_name')),
                ('countVector', CountVectorizer())])),
            ('droped_columns', DropItems(['item_name', 'item_category_name','shop_name']))
        ])),
    ('lr', LinearRegression())
])

In [21]:
X_train = pd.merge(X_train, item_categories, how='left', on=['item_category_id']).fillna(0)
X_train = pd.merge(X_train, shops, how='left', on=['shop_id']).fillna(0)
X_train = pd.merge(X_train, items, how='left', on=['item_id']).fillna(0)
X_train.shape

(6186922, 39)

In [22]:
X_val = pd.merge(X_val, item_categories, how='left', on=['item_category_id']).fillna(0)
X_val = pd.merge(X_val, shops, how='left', on=['shop_id']).fillna(0)
X_val = pd.merge(X_val, items, how='left', on=['item_id']).fillna(0)
X_val.shape

(238172, 39)

In [23]:
X_test = pd.merge(X_test, item_categories, how='left', on=['item_category_id']).fillna(0)
X_test = pd.merge(X_test, shops, how='left', on=['shop_id']).fillna(0)
X_test = pd.merge(X_test, items, how='left', on=['item_id']).fillna(0)
X_test.shape

(214200, 39)

In [27]:
X_train = X_train.drop( to_drop_cols + ['shop_id', 'item_id', 'item_category_id_x', 'item_category_id_y'], axis=1)
X_val = X_val.drop(to_drop_cols + ['shop_id', 'item_id', 'item_category_id_x', 'item_category_id_y'], axis=1)
X_val.head()

Unnamed: 0,target_lag_1,target_item_lag_1,target_item_category_lag_1,target_shop_lag_1,target_lag_2,target_item_lag_2,target_item_category_lag_2,target_shop_lag_2,target_lag_3,target_item_lag_3,...,target_lag_12,target_item_lag_12,target_item_category_lag_12,target_shop_lag_12,item_target_enc,shop_target_enc,category_target_enc,item_category_name,shop_name,item_name
0,0.0,3.0,6779.0,822.0,0.0,6.0,8513.0,942.0,0.0,4.0,...,0.0,13.0,13639.0,945.0,0.301508,0.153278,0.244925,Кино - DVD,"Адыгея ТЦ ""Мега""",007: КООРДИНАТЫ «СКАЙФОЛЛ»
1,0.0,9.0,2989.0,822.0,0.0,53.0,3426.0,942.0,0.0,6.0,...,0.0,8.0,5074.0,945.0,0.324623,0.153278,0.162352,Кино - Blu-Ray,"Адыгея ТЦ ""Мега""",007: КООРДИНАТЫ «СКАЙФОЛЛ» (BD)
2,0.0,19.0,6779.0,822.0,1.0,30.0,8513.0,942.0,0.0,21.0,...,2.0,37.0,13639.0,945.0,0.700503,0.153278,0.244925,Кино - DVD,"Адыгея ТЦ ""Мега""",1+1
3,1.0,16.0,2989.0,822.0,0.0,14.0,3426.0,942.0,1.0,15.0,...,0.0,14.0,5074.0,945.0,0.40402,0.153278,0.162352,Кино - Blu-Ray,"Адыгея ТЦ ""Мега""",1+1 (BD)
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,767.0,945.0,0.046322,0.153278,0.095487,Музыка - MP3,"Адыгея ТЦ ""Мега""",100 Best classical melodies (mp3-CD) (Digipack)


In [36]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('unionCount', FeatureUnion(n_jobs=1,
       transformer_list=[('item_count', Pipeline(steps=[('item_name_select', ItemSelector(key='item_name')), ('countVector', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input...eights=None)), ('lr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [37]:
pred_lr = pipeline.predict(X_val)

print('train r2 score ', r2_score(y_train, pipeline.predict(X_train)))
print('val r2 score ', r2_score(y_val, pred_lr))

train r2 score  0.480534456893
val r2 score  0.25471871847


In [40]:
from sklearn.ensemble import RandomForestRegressor
pipeline_rf = Pipeline([
    ('unionCount', FeatureUnion(
        transformer_list=[
            ('item_count', Pipeline([
                ('item_name_select', ItemSelector('item_name')),
                ('countVector', CountVectorizer())])),
            ('item_cat_count', Pipeline([
                ('item_cat_name_select', ItemSelector('item_category_name')),
                ('countVector', CountVectorizer())])),
            ('shop_count', Pipeline([
                ('shop_name_select', ItemSelector('shop_name')),
                ('countVector', CountVectorizer())])),
            ('droped_columns', DropItems(['item_name', 'item_category_name','shop_name']))
        ])),
    ('lr', RandomForestRegressor(n_estimators=50, n_jobs=-1, max_depth=5, max_features='auto'))
])

In [41]:
pipeline_rf.fit(X_train, y_train)

Pipeline(steps=[('unionCount', FeatureUnion(n_jobs=1,
       transformer_list=[('item_count', Pipeline(steps=[('item_name_select', ItemSelector(key='item_name')), ('countVector', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input...imators=50, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))])

In [44]:
pred_rf = pipeline.predict(X_val)

# print('train r2 score ', r2_score(y_train, pipeline_rf.predict(X_train)))
# print('val r2 score ', r2_score(y_val, pred_rf))

NotFittedError: CountVectorizer - Vocabulary wasn't fitted.