# Objective
This notebook concatenates the numeric features with the text-generated features.
Because the data was getting large, a separate notebook was needed to combine the text-generated features with all the other features; doing everything in the same notebook caused out-of-memory issues.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['addingzerototrain', 'fork-of-treebasedapproachdata', 'nn-features', 'competitive-data-science-predict-future-sales', 'textfeatures']


In [2]:
# Output HDF5 file: the cells below write the target under key 'y', one
# 'X_<year>' key per year, and the test features under 'X_test'.
COMBINED_DATA_FPATH='DATA_WITH_TXT.h5'

In [3]:
# Root folders shared by the paths below.
INPUT_DIR = '../input'
COMPETITION_DIR = INPUT_DIR + '/competitive-data-science-predict-future-sales'

# Pre-computed inputs (presumably produced by upstream notebooks -- verify).
DATA_FPATH = INPUT_DIR + '/nn-features/NNDATA.hdf'
TEST_LIKE_SALES_FPATH = INPUT_DIR + '/addingzerototrain/train_with_zero.hdf'

# Files of the competition dataset.
SALES_FPATH = COMPETITION_DIR + '/sales_train.csv'
ITEMS_FPATH = COMPETITION_DIR + '/items.csv'
SHOPS_FPATH = COMPETITION_DIR + '/shops.csv'
TEST_SALES_FPATH = COMPETITION_DIR + '/test.csv'
SAMPLE_SUBMISSION_FPATH = COMPETITION_DIR + '/sample_submission.csv'

# Relative to the working directory.
TRAINED_MODEL_FPATH = 'trained_model.bin'

# HDF5 store holding the item / shop / category text features.
TEXT_FEATURE_FPATH = INPUT_DIR + '/textfeatures/text_features.h5'

In [4]:
# Read every input frame up front, grouped by the file it comes from.

# Numeric feature matrix and its target.
X_df, y_df = (pd.read_hdf(DATA_FPATH, key) for key in ('X', 'y'))

# NOTE(review): sales_df is not referenced by any other cell visible in this
# notebook -- confirm it is still needed before keeping it in memory.
sales_df = pd.read_hdf(TEST_LIKE_SALES_FPATH, 'df')

# Text features for items, shops and item categories.
item_text, shop_text, category_text = (
    pd.read_hdf(TEXT_FEATURE_FPATH, key)
    for key in ('item_500', 'shop_50', 'category_60')
)

In [5]:
# Collect the float64 columns and store them as float32 instead (half the bytes
# per value). float64_cols stays at notebook level: the test-set cell reuses it.
float64_cols = [
    col_name
    for col_name, col_dtype in X_df.dtypes.items()
    if col_dtype == np.float64
]
if len(float64_cols) > 0:
    X_df[float64_cols] = X_df[float64_cols].astype(np.float32)

In [6]:
# One boolean per column: True when the column holds at least one NaN.
# Sorting places the False entries before the True ones.
nan_df = X_df.isna().any().sort_values()

In [7]:
# Distinct flag values; a lone False means no column contains NaN.
nan_df.unique()

array([False])

In [8]:
# Display the names of the feature columns.
X_df.columns

Index(['item_cnt_day_1M_sum', 'item_cnt_day_1M_min', 'item_cnt_day_1M_max',
       'item_cnt_day_1M_0.25_q', 'item_cnt_day_1M_0.5_q',
       'item_cnt_day_1M_0.75_q', 'item_cnt_day_1M_0.9_q',
       'item_cnt_day_2M_sum', 'item_cnt_day_2M_min', 'item_cnt_day_2M_max',
       'item_cnt_day_2M_0.25_q', 'item_cnt_day_2M_0.5_q',
       'item_cnt_day_2M_0.75_q', 'item_cnt_day_2M_0.9_q',
       'item_cnt_day_4M_sum', 'item_cnt_day_4M_min', 'item_cnt_day_4M_max',
       'item_cnt_day_4M_0.25_q', 'item_cnt_day_4M_0.5_q',
       'item_cnt_day_4M_0.75_q', 'item_cnt_day_4M_0.9_q', 'month', 'year',
       'shop_id', 'item_id', 'item_category_id', 'avg_item_price',
       'last_item_price', 'std_item_price', 'category_item_price',
       'sub_category_item_price', 'avg_category_item_price',
       'avg_sub_category_item_price', 'avg_dollar_value', 'last_dollar_value',
       'std_dollar_value', 'category_dollar_value',
       'sub_category_dollar_value', 'avg_category_dollar_value',
       'avg_sub_

## Target value cleaning

In [9]:
# Clamp the target into [0, 20]: anything above 20 becomes 20 and anything
# negative becomes 0.
y_df = y_df.clip(lower=0, upper=20)


## Handling data types for memory efficiency

In [10]:
def _select_text_features(text_df, id_col, prefix, n_components):
    """Return ``id_col`` plus the first ``n_components`` text columns as float32.

    The text columns are named ``<prefix>_<i>`` for ``i`` in
    ``range(n_components)``. The result is a new frame built with ``astype``
    rather than by assigning into a column selection, so pandas does not emit
    SettingWithCopyWarning and ``text_df`` itself is left unchanged.

    Raises:
        KeyError: if ``id_col`` or one of the expected text columns is missing.
    """
    feature_cols = ['{}_{}'.format(prefix, i) for i in range(n_components)]
    return text_df[[id_col] + feature_cols].astype(
        {col: np.float32 for col in feature_cols}
    )


# Keep only the leading text components of each table (50 for items, 8 for
# shops, 5 for categories) and store them as float32 to limit memory use.
item_text = _select_text_features(item_text, 'item_id', 'item_name_text', 50)
shop_text = _select_text_features(shop_text, 'shop_id', 'shop_name_text', 8)
category_text = _select_text_features(
    category_text, 'item_category_id', 'item_category_name_text', 5
)

In [11]:
def add_text_features(df, shop_features=None, category_features=None,
                      item_features=None):
    """Left-join the shop, category and item text features onto ``df``.

    The original index of ``df`` is carried through the merges and restored on
    the result, including its name(s). ``df`` itself is not modified.

    Args:
        df: frame with ``shop_id``, ``item_category_id`` and ``item_id``
            columns.
        shop_features: frame keyed by ``shop_id``; defaults to the
            notebook-level ``shop_text``.
        category_features: frame keyed by ``item_category_id``; defaults to
            the notebook-level ``category_text``.
        item_features: frame keyed by ``item_id``; defaults to the
            notebook-level ``item_text``.

    Returns:
        A new frame holding the columns of ``df`` followed by the shop,
        category and item text columns, indexed like ``df``.
    """
    if shop_features is None:
        shop_features = shop_text
    if category_features is None:
        category_features = category_text
    if item_features is None:
        item_features = item_text

    # Park the index in placeholder columns whose names cannot collide with a
    # real column, so a named index or an existing 'index' column both work.
    index_names = list(df.index.names)
    holder_cols = [
        '__orig_index_{}__'.format(level) for level in range(len(index_names))
    ]
    # Shallow copy: lets us rename the index without touching the caller's frame.
    merged = df.copy(deep=False)
    merged.index = merged.index.set_names(holder_cols)
    merged = merged.reset_index()

    merge_plan = (
        ('Shop', shop_features, 'shop_id'),
        ('Category', category_features, 'item_category_id'),
        ('Item', item_features, 'item_id'),
    )
    for label, features, key in merge_plan:
        merged = pd.merge(merged, features, how='left', on=key)
        print('{} text added'.format(label))
        # Release the previous intermediate frame before the next merge.
        gc.collect()

    merged = merged.set_index(holder_cols)
    merged.index = merged.index.set_names(index_names)
    return merged


# Feature concatenation
We split the data by year and write each year under its own key (`X_<year>`) in the same output HDF5 file. This is done only to work around out-of-memory issues.

In [12]:
# The first write uses mode='w' so the output file starts empty: with the
# default append mode a re-run would keep stale keys from an earlier run and
# the file would keep growing. Every later write appends to this file.
y_df.to_hdf(COMBINED_DATA_FPATH, key='y', mode='w')
del y_df

# Process one year at a time so only a single year's merged frame is held in
# memory alongside X_df.
for year in X_df['year'].unique():
    X_train_df = X_df[X_df['year'] == year].copy()
    X_train_df = add_text_features(X_train_df)
    gc.collect()
    # Remember the column order; the test-set cell reuses it to align columns.
    train_columns = X_train_df.columns.tolist()
    X_train_df.to_hdf(COMBINED_DATA_FPATH, key='X_{}'.format(year))
    del X_train_df
    gc.collect()

del X_df
gc.collect()

Shop text added
Category text added
Item text added
Shop text added
Category text added
Item text added
Shop text added
Category text added
Item text added


0

In [13]:
# Number of columns in the combined frame of the last processed year.
len(train_columns)

143

## Feature concatenation for test data.

In [14]:
# Apply the same text-feature join to the test rows.
test_X_df = pd.read_hdf(DATA_FPATH, key='test_X')
test_X_df = add_text_features(test_X_df)

# Align to the training column order and downcast the same columns that were
# downcast in the training frame. astype on the selection builds a new frame,
# which avoids assigning into a column-selected frame (SettingWithCopyWarning).
# An empty float64_cols simply yields an unchanged copy.
test_X_df = test_X_df[train_columns].astype(
    {col: np.float32 for col in float64_cols}
)
test_X_df.to_hdf(COMBINED_DATA_FPATH, key='X_test')


Shop text added
Category text added
Item text added
