In [1]:
import numpy as np
import pandas as pd

import operator
import pickle
from datetime import datetime

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, Normalizer

from scipy import sparse

In [2]:
# Folder holding the raw CSV files read by this notebook; everything the notebook
# writes goes into its `processed/` sub-folder.
PATH_TO_DATA = './../data/kaggle_receipts/'
PATH_TO_PROCESSED_DATA = '{}processed/'.format(PATH_TO_DATA)

## load train data

In [3]:
# Item-level training table (one row per receipt line); joined with the
# receipt-level table on `check_id` further down.
train_df = pd.read_csv(PATH_TO_DATA + 'train.csv')
#train_df.head(15)

In [4]:
# (rows, columns) of the item-level training table.
train_df.shape

(13682, 5)

In [5]:
# Receipt-level training table; carries the `check_id` key used for the merge.
train_checks_df = pd.read_csv(PATH_TO_DATA + 'train_checks.csv')
#train_checks_df.head()

In [6]:
# (rows, columns) of the receipt-level training table.
train_checks_df.shape

(2042, 4)

In [7]:
# Attach the receipt-level columns to every item row.
# `validate='many_to_one'` makes pandas raise if a check_id is duplicated in the
# receipts table, which would otherwise silently multiply item rows.
train_full_df = pd.merge(train_df, train_checks_df, on='check_id', validate='many_to_one')

# pd.merge defaults to an inner join: fail loudly if items without a matching
# receipt were dropped instead of continuing with fewer rows.
assert len(train_full_df) == len(train_df), 'some train items have no matching receipt'

train_full_df.head(15)

Unnamed: 0,check_id,name,category,price,count,shop_name,datetime,sum
0,0,*3479755 ТRUF.Конф.кр.корп.гл.вк.шок180г,Чай и сладкое,49.0,2.0,Не известно,2017-12-30 19:15:00,520.1
1,0,3408392 ECONTA Мешки д/мусора 30л 30шт,Для дома,21.0,1.0,Не известно,2017-12-30 19:15:00,520.1
2,0,3260497 ЯШКИНО Рулет С ВАР.СГУЩ. 200г,Чай и сладкое,39.0,1.0,Не известно,2017-12-30 19:15:00,520.1
3,0,3300573 Пакет ПЯТЕРОЧКА 65х40см,Упаковка,4.0,1.0,Не известно,2017-12-30 19:15:00,520.1
4,0,3413607 ЗЕР/СЕЛ.Сухари с изюмом 250г,Чай и сладкое,35.0,1.0,Не известно,2017-12-30 19:15:00,520.1
5,0,3221388 ШАРЛ.Печенье вафел.рассыпч.225г,Чай и сладкое,38.0,1.0,Не известно,2017-12-30 19:15:00,520.1
6,0,"*97452 ПРОСТ.Кефир 3,2% 930г",Молочка,55.0,1.0,Не известно,2017-12-30 19:15:00,520.1
7,0,57575 MILFORD Зам.сахара доз. 650таб,Бакалея,119.0,1.0,Не известно,2017-12-30 19:15:00,520.1
8,0,29880 ПИСК.Ацидоб.2.2%сл.пюр-пак0.5л,Молочка,34.0,1.0,Не известно,2017-12-30 19:15:00,520.1
9,1,ШОКОЛАДНОЕ ЯЙЦО 20Г КИНДЕР СЮРПРИЗ ФЕРРЕ,Дети,49.0,1.0,ЕВРОПА,2018-01-03 18:01:29,188.0


In [8]:
# Row count should equal train_df's if the merge neither dropped nor duplicated items.
train_full_df.shape

(13682, 8)

## load test data

In [9]:
# Test counterparts of the two training tables (items and receipts).
#test_df = pd.read_csv('./data/test.csv', index_col=0)
test_df = pd.read_csv(PATH_TO_DATA + 'test.csv')
test_checks_df = pd.read_csv(PATH_TO_DATA + 'test_checks.csv')

In [10]:
# (rows, columns) of the test item table and the test receipt table.
test_df.shape, test_checks_df.shape

((3000, 5), (933, 4))

In [11]:
# Attach the receipt-level columns to every test item row.
# `validate='many_to_one'` makes pandas raise if a check_id is duplicated in the
# receipts table, which would otherwise silently multiply item rows.
test_full_df = pd.merge(test_df, test_checks_df, on='check_id', validate='many_to_one')

# pd.merge defaults to an inner join: every test item must survive it, because
# the features built from test_full_df have to stay aligned with the rows of test_df.
assert len(test_full_df) == len(test_df), 'some test items have no matching receipt'

test_full_df.head(15)

Unnamed: 0,id,check_id,name,price,count,shop_name,datetime,sum
0,0,3947,П/Ф д/чахохбили и шашлыка охл. 1,119.9,0.778,Тандер,2018-02-03 12:40:00,222.0
1,1,3947,АПЕЛЬСИНЫ свеж. (цена за 1кг),48.9,1.054,Тандер,2018-02-03 12:40:00,222.0
2,2,3947,"Масло подсолнечное раф/ дез 0,9л",49.4,1.0,Тандер,2018-02-03 12:40:00,222.0
3,3,3947,Макаронные изделия Рожки 400гр п,13.9,2.0,Тандер,2018-02-03 12:40:00,222.0
4,4,3948,*стм помада гигиеническая 4.5г яблоко,37.0,1.0,Нытва-Фарм,2018-02-02 10:59:00,137.0
5,5,3948,*страна детства помада детская гигиеническая 4...,37.0,1.0,Нытва-Фарм,2018-02-02 10:59:00,137.0
6,6,3948,hartmann omniplast пластырь фиксирующий тексти...,63.0,1.0,Нытва-Фарм,2018-02-02 10:59:00,137.0
7,7,3949,3219524 КР/ЦЕНА Хлеб БЕЛЫЙ в/с 380г,12.54,2.0,Агроторг,2018-02-03 12:33:00,160.0
8,8,3949,3074383 Картофель 1кг,20.8,1.405,Агроторг,2018-02-03 12:33:00,160.0
9,9,3949,197 Лук репчатый 1кг,21.75,0.6,Агроторг,2018-02-03 12:33:00,160.0


In [12]:
# Row count should equal test_df's if the merge neither dropped nor duplicated items.
test_full_df.shape

(3000, 8)

## feature engineering

In [13]:
# Registries mapping a feature name to its train / test representation;
# they are filled section by section and written to disk at the end.
features_train, features_test = {}, {}

### name

In [14]:
# Missing item names become empty strings so that the text processing below
# always receives str input.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form on `df[col]` is chained assignment, which
# warns in pandas 2.x and no longer updates the frame under copy-on-write.
train_full_df['name'] = train_full_df['name'].fillna('')
test_full_df['name'] = test_full_df['name'].fillna('')

In [15]:
def contain_digits(name):
    """Return True when `name` contains at least one of the characters '0'..'9'."""
    ascii_digits = [str(digit) for digit in range(10)]
    return any(symbol in ascii_digits for symbol in name)

def name_processing(name):
    """Clean a raw item name before vectorisation.

    Every '*' character is removed, then whitespace-separated tokens made up
    solely of digits are dropped; the remaining tokens are joined with single
    spaces.
    """
    without_stars = name.replace('*', '')

    kept_tokens = []
    for token in without_stars.split():
        if str.isdigit(token):
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [16]:
# Cleaned item names for both splits (element-wise name_processing).
train_name = train_full_df['name'].map(name_processing)
test_name = test_full_df['name'].map(name_processing)

#### tf-idf

In [17]:
# TF-IDF over word uni- and bi-grams, at most 10000 features; the vocabulary
# and idf weights are learned from the train names only.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2)).fit(train_name)

train_name_tfidf = tfidf_vectorizer.transform(train_name)
test_name_tfidf = tfidf_vectorizer.transform(test_name)

print(train_name_tfidf.shape, test_name_tfidf.shape)

(13682, 10000) (3000, 10000)


In [18]:
# Register the TF-IDF matrices under one key in both registries.
features_train['name_tfidf'], features_test['name_tfidf'] = train_name_tfidf, test_name_tfidf

#### count vectorizer 

In [19]:
# Bag-of-words counts with default CountVectorizer settings, vocabulary learned
# from the train names only.
# (Variant tried earlier: ngram_range=(1, 3), max_features=10000.)
count_vectorizer = CountVectorizer().fit(train_name)

train_name_cvect = count_vectorizer.transform(train_name)
test_name_cvect = count_vectorizer.transform(test_name)

In [20]:
# Register the raw count matrices.
features_train['name_cvect'], features_test['name_cvect'] = train_name_cvect, test_name_cvect

#### count vectorizer with MaxAbsScaler (stored under the `_mms` suffix)

In [21]:
# Scale the count matrices with a MaxAbsScaler fitted on the train counts.
# The `_mms` suffix is historical; the scaler is MaxAbsScaler, not MinMaxScaler.
max_abs_scaler = MaxAbsScaler().fit(train_name_cvect)

train_name_cvect_mms = max_abs_scaler.transform(train_name_cvect)
test_name_cvect_mms = max_abs_scaler.transform(test_name_cvect)

In [22]:
# Register the max-abs scaled count matrices.
features_train['name_cvect_mms'], features_test['name_cvect_mms'] = train_name_cvect_mms, test_name_cvect_mms

#### count vectorizer with normalizer

In [23]:
# Row-wise normalisation of the count matrices with sklearn's Normalizer
# (default settings), fitted on the train counts.
norm = Normalizer().fit(train_name_cvect)

train_name_cvect_norm = norm.transform(train_name_cvect)
test_name_cvect_norm = norm.transform(test_name_cvect)

In [24]:
# Register the normalised count matrices.
features_train['name_cvect_norm'], features_test['name_cvect_norm'] = train_name_cvect_norm, test_name_cvect_norm

#### count vectorizer with custom dictionary

In [25]:
# Build a custom vocabulary: the union, over all categories, of the
# `top_words_count` most frequent uni-/bi-grams among that category's item names.
top_words_count = 100

categories = np.unique(train_full_df['category'])
top_words_all_categories = set()

for category in categories:
    category_names = train_full_df.loc[train_full_df['category'] == category, 'name'].apply(name_processing)

    # A dedicated vectorizer per category: the name `count_vectorizer` is left
    # alone so the vectorizer fitted on the full train set is not overwritten.
    category_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=top_words_count * 10)
    category_counts = category_vectorizer.fit_transform(category_names)

    # Total number of occurrences of each term, indexed by vocabulary column.
    term_totals = np.asarray(category_counts.sum(axis=0)).ravel()
    terms_by_column = sorted(category_vectorizer.vocabulary_, key=category_vectorizer.vocabulary_.get)

    # Stable descending sort: terms with equal totals keep their column order.
    ranked_terms = sorted(zip(terms_by_column, term_totals), key=lambda pair: pair[1], reverse=True)
    top_words = {term for term, _ in ranked_terms[:top_words_count]}

    top_words_all_categories |= top_words

print('top words for {} categories size: {} '.format(len(categories),
                                                     len(top_words_all_categories)))

top words for 25 categories size: 2016 


In [26]:
# Count only the terms of the custom vocabulary.
# The vocabulary was collected with ngram_range=(1, 2), so it contains bigrams.
# The vectorizer must generate bigrams too; with the default ngram_range=(1, 1)
# every bigram column would stay all-zero for every row.
# The set is sorted explicitly so the column order is deterministic and visible.
count_vectorizer_topwords = CountVectorizer(vocabulary=sorted(top_words_all_categories),
                                            ngram_range=(1, 2))

# With a fixed vocabulary nothing is learned here; the call is kept for symmetry
# with the other vectorizers.
count_vectorizer_topwords.fit(train_name)

train_name_cvect_tw = count_vectorizer_topwords.transform(train_name)
test_name_cvect_tw = count_vectorizer_topwords.transform(test_name)

In [27]:
# Max-abs scale the top-words counts with a scaler fitted on train.
# NOTE: this rebinds `max_abs_scaler` and overwrites the unscaled *_cvect_tw matrices.
max_abs_scaler = MaxAbsScaler().fit(train_name_cvect_tw)

train_name_cvect_tw = max_abs_scaler.transform(train_name_cvect_tw)
test_name_cvect_tw = max_abs_scaler.transform(test_name_cvect_tw)

In [28]:
# Register the scaled top-words count matrices.
features_train['name_cvect_tw'], features_test['name_cvect_tw'] = train_name_cvect_tw, test_name_cvect_tw

### price

In [29]:
# Percentile of the price distribution used as the per-category cap.
PERCENTILE_VALUE = 95

# category -> PERCENTILE_VALUE-th percentile of the train prices in that category
# (sort=False keeps the categories in order of first appearance).
category_percentiles = {
    cat: np.percentile(prices, PERCENTILE_VALUE)
    for cat, prices in train_full_df.groupby('category', sort=False)['price']
}

def cut_percentile(row):
    """Return the row's price capped at the percentile threshold of the row's category."""
    price = row['price']
    price_cap = category_percentiles[row['category']]
    return min(price, price_cap)

In [30]:
# Train prices are capped per category (row-wise apply of cut_percentile).
train_price = train_full_df.apply(cut_percentile, axis=1)

# Test prices are used as-is: cut_percentile looks up row['category'], so it
# cannot be applied to the test frame.
# NOTE(review): train is clipped while test is not, so scaled test prices can
# fall outside the range seen in train - confirm this asymmetry is intended.
#test_price = test_full_df.apply(cut_percentile, axis=1)
test_price = test_full_df['price']

In [31]:
# Register the unscaled prices (pandas Series).
features_train['price'], features_test['price'] = train_price, test_price

In [32]:
# Fit a [0, 1] min-max scaler on the capped train prices; reshape(-1, 1) turns
# the 1-D values into the single-column 2-D array the scaler is fitted on.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(train_price.values.reshape(-1, 1))

MinMaxScaler(copy=True, feature_range=(0, 1))

In [33]:
# Apply the train-fitted scaler to both splits and wrap each result in a
# one-column DataFrame.
# NOTE: train_price / test_price are rebound from raw Series to scaled frames,
# so this cell must not be run twice without re-running the cells above it.
scaled_price_columns = ['percentile_filtered_price']

train_price = pd.DataFrame(min_max_scaler.transform(train_price.values.reshape(-1, 1)),
                           columns=scaled_price_columns)
test_price = pd.DataFrame(min_max_scaler.transform(test_price.values.reshape(-1, 1)),
                          columns=scaled_price_columns)

In [34]:
# Register the min-max scaled prices (one-column DataFrames).
features_train['price_mms'], features_test['price_mms'] = train_price, test_price

### count

In [35]:
# Raw `count` column of each split.
train_count, test_count = train_full_df['count'], test_full_df['count']

In [36]:
# Fit a fresh [0, 1] min-max scaler on the train counts.
# NOTE: this rebinds `min_max_scaler`; the scaler fitted on prices is replaced.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(train_count.values.reshape(-1, 1))

MinMaxScaler(copy=True, feature_range=(0, 1))

In [37]:
# Apply the train-fitted scaler to both splits and wrap each result in a
# one-column DataFrame.
# NOTE: train_count / test_count are rebound from raw Series to scaled frames,
# so this cell must not be run twice without re-running the cells above it.
scaled_count_columns = ['scaled_count']

train_count = pd.DataFrame(min_max_scaler.transform(train_count.values.reshape(-1, 1)),
                           columns=scaled_count_columns)
test_count = pd.DataFrame(min_max_scaler.transform(test_count.values.reshape(-1, 1)),
                          columns=scaled_count_columns)

In [38]:
# Register the min-max scaled counts (one-column DataFrames).
features_train['count'], features_test['count'] = train_count, test_count

### shop_name

In [39]:
# Missing shop names become empty strings so they are encoded like any other value.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form on `df[col]` is chained assignment, which
# warns in pandas 2.x and no longer updates the frame under copy-on-write.
train_full_df['shop_name'] = train_full_df['shop_name'].fillna('')
test_full_df['shop_name'] = test_full_df['shop_name'].fillna('')

In [40]:
# Shop-name column of each split.
train_shop_name, test_shop_name = train_full_df['shop_name'], test_full_df['shop_name']

In [41]:
# Placeholder label for test shops that never occur in train.
na_string = 'unknown'

# The placeholder must not collide with a real shop name in either split.
placeholder_in_train = na_string in train_shop_name.values
placeholder_in_test = na_string in test_shop_name.values
print('train and test contains na_string:', placeholder_in_train, placeholder_in_test)

# How many distinct shops each split has, and how many of them are shared.
train_unique_count = len(train_shop_name.unique())
test_unique_count = len(test_shop_name.unique())
print('unique nameis in train and test:', train_unique_count, test_unique_count)

shared_shops = set(train_shop_name.values) & set(test_shop_name.values)
print('intersection of train and test:', len(shared_shops))

train and test contains na_string: False False
unique nameis in train and test: 525 129
intersection of train and test: 74


In [42]:
def analyse_shop_name(name):
    """Return `name` if it occurs among the train shop names, otherwise `na_string`."""
    return name if name in train_shop_name.values else na_string

In [43]:
# Add the placeholder as one extra trailing row, so encoders fitted on this
# Series also reserve a class for shops that are unknown at train time.
# `Series.append` was removed in pandas 2.0; `pd.concat` gives the same result
# and works on old and new pandas versions alike.
train_shop_name_with_na = pd.concat([train_shop_name, pd.Series([na_string])], ignore_index=True)

# Test shops that never occur in train collapse to the placeholder.
test_shop_name_with_na = test_shop_name.apply(analyse_shop_name)

In [44]:
# Distinct shop labels per split after adding / substituting the placeholder.
print('unique nameis in train and test after processing:', 
      len(train_shop_name_with_na.unique()), 
      len(test_shop_name_with_na.unique()))

unique nameis in train and test after processing: 526 75


In [45]:
# Maps shop-name strings to integer codes before one-hot encoding.
shop_name_labeler = LabelEncoder()

In [46]:
%%time

# Learn the integer codes from the train shop names plus the placeholder row.
shop_name_labeler.fit(train_shop_name_with_na)

# Encode both splits; each result is a one-column frame named 'shop_name'.
train_shop_name_le = pd.DataFrame({'shop_name': shop_name_labeler.transform(train_shop_name_with_na)})
test_shop_name_le = pd.DataFrame({'shop_name': shop_name_labeler.transform(test_shop_name_with_na)})

Wall time: 30 ms


In [47]:
# One-hot encoder for the integer shop codes.
shop_name_ohe = OneHotEncoder()

In [48]:
%%time

# Fit on every train code, including the trailing placeholder row, so the
# placeholder gets its own output column.
shop_name_ohe.fit(train_shop_name_le)

# remember to remove na_string from train!
# The placeholder is the last row of the train frame, so it is sliced off
# before transforming.
train_rows_without_placeholder = train_shop_name_le.iloc[:-1, :]
train_shop_name_ohe = shop_name_ohe.transform(train_rows_without_placeholder)
test_shop_name_ohe = shop_name_ohe.transform(test_shop_name_le)

Wall time: 11 ms


In [49]:
# Both splits must have the same number of one-hot columns.
print('train_shop_name_ohe:', train_shop_name_ohe.shape)
print('test_shop_name_ohe:', test_shop_name_ohe.shape)

train_shop_name_ohe: (13682, 526)
test_shop_name_ohe: (3000, 526)


In [50]:
# Register the one-hot encoded shop names.
features_train['shop_name'], features_test['shop_name'] = train_shop_name_ohe, test_shop_name_ohe

### datetime

In [51]:
# Shared encoder for the datetime parts (re-fitted for hour, then for day of week).
# handle_unknown='ignore' encodes a value that never occurred in train (e.g. an
# hour of day with no train receipts) as an all-zero row instead of failing
# when the test split is transformed.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

In [52]:
# Parse the timestamp column with one vectorised call. Applying pd.to_datetime
# element by element yields the same datetime Series but parses every row in a
# Python-level loop.
train_datetime = pd.to_datetime(train_full_df['datetime'])
test_datetime = pd.to_datetime(test_full_df['datetime'])

In [53]:
# Hour of purchase as an (n, 1) column, read through the vectorised `.dt`
# accessor instead of a per-row lambda.
train_hour = train_datetime.dt.hour.values.reshape(-1, 1)
test_hour = test_datetime.dt.hour.values.reshape(-1, 1)

# The set of hour categories is learned from train only.
one_hot_encoder.fit(train_hour)

train_hour = one_hot_encoder.transform(train_hour)
test_hour = one_hot_encoder.transform(test_hour)

In [54]:
# Register the one-hot encoded hour of purchase.
features_train['hour'], features_test['hour'] = train_hour, test_hour

In [55]:
# Day of week as an (n, 1) column, read through the vectorised `.dt` accessor
# instead of a per-row lambda.
train_dayofweek = train_datetime.dt.dayofweek.values.reshape(-1, 1)
test_dayofweek = test_datetime.dt.dayofweek.values.reshape(-1, 1)

# Re-fit the shared encoder: from here on it encodes weekdays, not hours.
one_hot_encoder.fit(train_dayofweek)

train_dayofweek = one_hot_encoder.transform(train_dayofweek)
test_dayofweek = one_hot_encoder.transform(test_dayofweek)

In [56]:
# Register the one-hot encoded day of week.
features_train['dayofweek'], features_test['dayofweek'] = train_dayofweek, test_dayofweek

#### polynomial (hour + dayofweek)

In [57]:
# Degree-2 polynomial expansion, applied below to the stacked hour + day-of-week one-hot block.
poly_transformer = PolynomialFeatures(degree=2)

In [58]:
# Stack the hour and day-of-week one-hot blocks side by side and densify them
# as input for the polynomial expansion.
train_hour_dayofweek = sparse.hstack((train_hour, train_dayofweek)).toarray()
test_hour_dayofweek = sparse.hstack((test_hour, test_dayofweek)).toarray()

In [59]:
# Shapes of the hour block, the day-of-week block and their horizontal stack, per split.
print(train_hour.shape, train_dayofweek.shape, train_hour_dayofweek.shape)
print(test_hour.shape, test_dayofweek.shape, test_hour_dayofweek.shape)

(13682, 23) (13682, 7) (13682, 30)
(3000, 23) (3000, 7) (3000, 30)


In [60]:
# Fit the expansion on the train block, then apply it to both splits.
poly_transformer.fit(train_hour_dayofweek)

train_hour_dayofweek_poly = poly_transformer.transform(train_hour_dayofweek)

# Both frames share the same generated column names: h_dow_poly0, h_dow_poly1, ...
poly_columns = ['h_dow_poly{}'.format(i) for i in range(train_hour_dayofweek_poly.shape[1])]

train_hour_dayofweek_poly = pd.DataFrame(train_hour_dayofweek_poly, columns=poly_columns)
test_hour_dayofweek_poly = pd.DataFrame(poly_transformer.transform(test_hour_dayofweek),
                                        columns=poly_columns)

In [61]:
# Shapes of the expanded frames; the column counts must match between splits.
train_hour_dayofweek_poly.shape, test_hour_dayofweek_poly.shape

((13682, 496), (3000, 496))

In [62]:
# Register the polynomial hour x day-of-week features (dense DataFrames).
features_train['hour_dayofweek_poly'], features_test['hour_dayofweek_poly'] = train_hour_dayofweek_poly, test_hour_dayofweek_poly

### category

In [63]:
# Encodes the category strings as integer class ids; pickled at the end of the notebook.
category_labeler = LabelEncoder()

In [64]:
%%time

# Fit the label encoder on the train categories and encode them in one step.
train_category = category_labeler.fit_transform(train_full_df['category'])
#train_category = pd.DataFrame(train_category, columns=['category'])

Wall time: 12 ms


In [65]:
# Distinct encoded class ids present in train.
np.unique(train_category)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24], dtype=int64)

In [66]:
# Target vector: the integer-encoded category of every train item.
y_train = train_category

## save features

In [67]:
# Overview of every registered train feature: key, container type and shape.
for name in features_train:
    feat = features_train[name]
    print('{} : {}, {}'.format(name, type(feat), feat.shape))

name_tfidf : <class 'scipy.sparse.csr.csr_matrix'>, (13682, 10000)
name_cvect : <class 'scipy.sparse.csr.csr_matrix'>, (13682, 13905)
name_cvect_mms : <class 'scipy.sparse.csr.csr_matrix'>, (13682, 13905)
name_cvect_norm : <class 'scipy.sparse.csr.csr_matrix'>, (13682, 13905)
name_cvect_tw : <class 'scipy.sparse.csr.csr_matrix'>, (13682, 2016)
price : <class 'pandas.core.series.Series'>, (13682,)
price_mms : <class 'pandas.core.frame.DataFrame'>, (13682, 1)
count : <class 'pandas.core.frame.DataFrame'>, (13682, 1)
shop_name : <class 'scipy.sparse.csr.csr_matrix'>, (13682, 526)
hour : <class 'scipy.sparse.csr.csr_matrix'>, (13682, 23)
dayofweek : <class 'scipy.sparse.csr.csr_matrix'>, (13682, 7)
hour_dayofweek_poly : <class 'pandas.core.frame.DataFrame'>, (13682, 496)


In [68]:
# Peek at the unscaled train prices (the only feature stored as a Series).
features_train['price'].head()

0    49.0
1    21.0
2    39.0
3     4.0
4    35.0
dtype: float64

In [69]:
# Persist every train feature into the processed-data folder:
#   * scipy sparse matrices     -> <name>_train.npz
#   * pandas DataFrame / Series -> <name>_train.csv
# Types are checked explicitly. The previous substring match on str(type(feat))
# matched neither branch for a Series, so the raw 'price' feature was silently
# never written.
for name, feat in features_train.items():
    target_prefix = PATH_TO_PROCESSED_DATA + name + '_train'
    if sparse.issparse(feat):
        sparse.save_npz(target_prefix + '.npz', feat)
    elif isinstance(feat, pd.DataFrame):
        feat.to_csv(target_prefix + '.csv')
    elif isinstance(feat, pd.Series):
        # Name the single column after the feature so the CSV has a stable header.
        feat.to_frame(name=name).to_csv(target_prefix + '.csv')
    else:
        # Never drop a feature without a trace.
        print('WARNING: feature {!r} of type {} was not saved'.format(name, type(feat)))

In [70]:
# Overview of every registered test feature: key, container type and shape.
for name in features_test:
    feat = features_test[name]
    print('{} : {}, {}'.format(name, type(feat), feat.shape))

name_tfidf : <class 'scipy.sparse.csr.csr_matrix'>, (3000, 10000)
name_cvect : <class 'scipy.sparse.csr.csr_matrix'>, (3000, 13905)
name_cvect_mms : <class 'scipy.sparse.csr.csr_matrix'>, (3000, 13905)
name_cvect_norm : <class 'scipy.sparse.csr.csr_matrix'>, (3000, 13905)
name_cvect_tw : <class 'scipy.sparse.csr.csr_matrix'>, (3000, 2016)
price : <class 'pandas.core.series.Series'>, (3000,)
price_mms : <class 'pandas.core.frame.DataFrame'>, (3000, 1)
count : <class 'pandas.core.frame.DataFrame'>, (3000, 1)
shop_name : <class 'scipy.sparse.csr.csr_matrix'>, (3000, 526)
hour : <class 'scipy.sparse.csr.csr_matrix'>, (3000, 23)
dayofweek : <class 'scipy.sparse.csr.csr_matrix'>, (3000, 7)
hour_dayofweek_poly : <class 'pandas.core.frame.DataFrame'>, (3000, 496)


In [71]:
# Persist every test feature into the processed-data folder:
#   * scipy sparse matrices     -> <name>_test.npz
#   * pandas DataFrame / Series -> <name>_test.csv
# Types are checked explicitly. The previous substring match on str(type(feat))
# matched neither branch for a Series, so the raw 'price' feature was silently
# never written.
for name, feat in features_test.items():
    target_prefix = PATH_TO_PROCESSED_DATA + name + '_test'
    if sparse.issparse(feat):
        sparse.save_npz(target_prefix + '.npz', feat)
    elif isinstance(feat, pd.DataFrame):
        feat.to_csv(target_prefix + '.csv')
    elif isinstance(feat, pd.Series):
        # Name the single column after the feature so the CSV has a stable header.
        feat.to_frame(name=name).to_csv(target_prefix + '.csv')
    else:
        # Never drop a feature without a trace.
        print('WARNING: feature {!r} of type {} was not saved'.format(name, type(feat)))

In [72]:
# Write the encoded targets as a one-column CSV (the index is written as well).
y_train_df = pd.DataFrame({'category': y_train})
y_train_df.to_csv('{}categories_train.csv'.format(PATH_TO_PROCESSED_DATA))

In [73]:
# Receipt id of every train row, taken from the merged frame the features were built on.
check_id_train_df = train_full_df.loc[:, ['check_id']]
check_id_train_df.to_csv('{}check_id_train.csv'.format(PATH_TO_PROCESSED_DATA))

In [74]:
# Item id of every test row.
# The ids are read from the merged frame (test_full_df), the same frame every
# test feature was computed from, so the saved ids are row-aligned with the
# saved features by construction. This mirrors how the train check ids are taken
# from train_full_df. Reading them from the pre-merge test_df is only correct
# while the merge keeps every row in its original position.
id_test_df = test_full_df[['id']]
id_test_df.to_csv(PATH_TO_PROCESSED_DATA + 'id_test.csv')

In [75]:
# Pickle the fitted label encoder next to the processed features so the integer
# class ids can be mapped back to category names later.
with open(PATH_TO_PROCESSED_DATA + 'category_labeler.pkl', 'wb') as category_labeler_file:
    pickle.dump(category_labeler, category_labeler_file)