In [1]:
import pandas as pd

In [2]:
data_train = pd.read_csv('train.csv', sep=';', nrows=10)

In [3]:
data_train.head()

Unnamed: 0,﻿id,start_time,title,price,item_id,owner_type,category,subcategory,param1,param2,param3,region,item_views
0,0,2016-12-27 10:38:04,Сандали фирмы Crocs,800,1301822498390501359,Private,Личные вещи,Детская одежда и обувь,Для мальчиков,Обувь,> 36,Москва,27
1,1,2016-12-27 15:23:55,Бутсы футбольные Reebok,2000,4439620035274845039,Private,Личные вещи,Детская одежда и обувь,Для мальчиков,Обувь,> 36,Омская область,9
2,2,2016-12-28 19:34:15,Nike hypervenom Бутсы,600,4860577743813309218,Private,Личные вещи,Детская одежда и обувь,Для мальчиков,Обувь,> 36,Санкт-Петербург,105
3,3,2016-12-26 10:26:02,Сапоги,150,3492530336858889466,Private,Личные вещи,Детская одежда и обувь,Для мальчиков,Обувь,> 36,Тульская область,28
4,4,2016-12-26 17:09:19,Кеды 38,500,3559049054931858928,Private,Личные вещи,Детская одежда и обувь,Для мальчиков,Обувь,> 36,Самарская область,9


_____

In [1]:
import re
import gc
from  datetime import datetime
from collections import defaultdict

import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from scipy.sparse import csc_matrix, vstack, hstack
import numpy as np
import pandas as pd

from nltk import SnowballStemmer
import nltk.corpus

import matplotlib.pyplot as plt
import seaborn

import random
random.seed(666)

%matplotlib inline
pd.options.display.max_columns = 150



_____

##### Plots

___________


#### features

In [2]:
def _fillNa(dt, fill_na_train, is_train=True):
    """ 1
    Fillig nan values for next columns:
    'title', 'param1', 'param2', 'param3'.
    """
    if not is_train and (fill_na_train.__len__() == 0):
        raise Exception('Run on train data before!')
    
    if is_train:
        fill_na_train = {}
        for col in ['title', 'param1', 'param2', 'param3']:
            fill_na_train[col] = 'nan'
    dt.fillna(fill_na_train, inplace=True)
    return fill_na_train

In [3]:
def _working_with_time(dt):
    """ 2
    Parse datetime column. Extract hours (slided by -3.5), 
    week_day, relative time with minute (from 0 to 1).
    """
    dt.start_time = dt.start_time.astype(datetime)
    dt['rel_times'] = np.array(list(map(lambda dt: (dt.hour*60 + dt.minute)/(24.*60), 
                                                dt.start_time.values))).astype(float)
    dt['week_day'] = np.array(list(map( lambda x: x.isoweekday() ,
                                                dt.start_time.values))).astype(int)
    dt['slided_hours'] = np.array(list(map( lambda x: ((x.hour - 3.5) % 24) * 1. / 24.,
                                                dt.start_time.values))).astype(float)
    dt.drop(['start_time'],axis=1, inplace=True)

In [4]:
def _get_new_str_features(dt):
    """ 3
    Create new features for gender, mar_status,
    liv_region and other string columns. 
    ------------
    Return
    
    str_cols : list of str
        Columns for encoding and deleting.
    """
#     #one_hot_encoding
#     dt['owner_type_Private'] = (dt.owner_type=='Private').astype(int)
#     dt['owner_type_Company'] = (dt.owner_type=='Company').astype(int)
#     dt['owner_type_Shop'] = (dt.owner_type=='Shop').astype(int)

    # конкатенация строк
    dt['all_params'] = dt.param1 + ' ' + dt.param2 + ' ' + dt.param3
    dt['params_12'] = dt.param1 + ' ' + dt.param2
    dt['params_23'] = dt.param2 + ' ' + dt.param3
    dt['category_and_owner_type'] = dt.owner_type + ' ' + dt.category
    dt['category_and_region'] = dt.region + ' ' + dt.category
    dt['owner_type_and_region'] = dt.owner_type + ' ' + dt.category
    
    str_cols = ['owner_type', 
                'param1', 'param2', 'param3',
                'region', 'category' , 'subcategory', 
                'params_12', 'params_23', 'all_params',
                'category_and_owner_type',
                'category_and_region', 'owner_type_and_region'
               ]
    return str_cols

In [6]:
def _counter_encoder(dt, str_cols, counter_Encoders, is_train=True):
    """ 4
    Counter Encoder. 
    Set to the categories from 'str_cols' columns some numbers 
    (frequencies in train). 
    """
    if is_train:
        counter_Encoders = {col:dt[col].value_counts().to_dict()
                                            for col in str_cols}
    for column in str_cols:
        dt[column+'_enc_by_count'] = dt[column].apply(lambda x: 
                                counter_Encoders[column].get(x, 0)) 
                                # TODO ровнее бы
    return counter_Encoders

In [7]:
def _target_encoder(dt, cols_for_encoding, targ_encoders, is_train=True, target_mean = 60.1):
    """ 5
    Encode cat values by the mean in target.
    
    Params
    dt : DataFrame
        Data.
    cols_for_encoding : list
        List of columns for encode.
    targ_encoders : dict
        Encoder for test dataset.
    is_train : bool
        Flag for train/test.
    
    ------
    Return
    targ_encoders : dict
        Values of mean target for each category 
        in columns from cols_for_encoding.
    """
    if is_train:
        targ_encoders = {}
        targ_means = {}
        for col in cols_for_encoding:
            targ_means[col] = dt.groupby(col).item_views.\
                            mean().sort_values().index.values
            
        for col in cols_for_encoding:
            targ_encoders[col] = {v:i for i,v in enumerate(targ_means[col])}
            
    for col in cols_for_encoding:
        dt[col+'_by_mean_target'] = dt[col].apply(lambda x:
                                    targ_encoders[col].get(x, target_mean))
        
    return targ_encoders

#### working with title

In [8]:
# from https://www.kaggle.com/c/avito-prohibited-content/data 

stopwords= frozenset(word for word in nltk.corpus.stopwords.words("russian") \
                     if word!="не")  #(word.decode('utf-8')
stemmer = SnowballStemmer('russian')
engChars = [ord(char) for char in u"cCyoOBaAKpPeE"]
rusChars = [ord(char) for char in u"сСуоОВаАКрРеЕ"]
eng_rusTranslateTable = dict(zip(engChars, rusChars))
rus_engTranslateTable = dict(zip(rusChars, engChars))


def correctWord (w):
    """ 
    Corrects word by replacing characters with
    written similarly depending on which language the word. 
    Fraudsters use this technique to avoid 
    detection by anti-fraud algorithms.
    """

    if len(re.findall('[а-я]',w))>len(re.findall('[a-z]',w)):
        return w.translate(eng_rusTranslateTable)
    else:
        return w.translate(rus_engTranslateTable)

    
def getWords(text, stemmRequired = False, correctWordRequired = False):
    """ Splits the text into words, discards stop words and applies stemmer. 
    Parameters
    ----------
    text : str - initial string
    stemmRequired : bool - flag whether stemming required
    correctWordRequired : bool - flag whether correction of words required     
    """

    cleanText = re.sub(u'[^a-zа-я]', ' ', text.lower()) ###'[^a-zа-я0-9]'
    if correctWordRequired:
        words = [correctWord(w) if not stemmRequired or re.search('[a-z]', w) \
                 else stemmer.stem(correctWord(w)) for w in cleanText.split() \
                 if len(w)>1 and w not in stopwords]
    else:
        words = [w if not stemmRequired or re.search("[a-z]", w) else stemmer.stem(w) \
                 for w in cleanText.split() if len(w)>1 and w not in stopwords]
    
    return words
    

In [9]:
' '.join(['ghbdt', 'asd', 'rgrg', 'ptp'])

'ghbdt asd rgrg ptp'

In [10]:
def _title_normalize(dtr, dts):
    """ 2
    # prepr title - stemming
    """
    prepr_words = lambda title: ' '.join(getWords(title + ' qwe qwe ', 
                                                  stemmRequired=True,
                                                  correctWordRequired=False))
    dtr['title1'] = dtr.title.apply(prepr_words)
    
    dts['title1']= dts.title.apply(prepr_words)

In [11]:
def cv_pipline(dt_tr, dt_ts, cv=True):
    """
    Pipline for cv. and for train and test data.
    """
    col_not_used = ['item_id', 'id']
    
    data_train = dt_tr#.copy()
    data_test = dt_ts#.copy()
    y_tr = data_train.item_views
    if cv:
        y_ts = data_test.item_views
    print('filling na')
    
    # filling na
    fill_na_train = _fillNa(data_train, {}, is_train=True)
    _ = _fillNa(data_test, fill_na_train, is_train=False)
    print('time processing')
    
    # time features
    _working_with_time(data_train)
    _working_with_time(data_test)
    
    print('str features')
    # string features
    str_cols = _get_new_str_features(data_train)
    _ = _get_new_str_features(data_test)
    
    print('counter encoder')
    # encoding string features
    counter_Encoders = _counter_encoder(data_train,
                                        str_cols, {}, is_train=True)
    _ = _counter_encoder(data_test, str_cols,
                         counter_Encoders, is_train=False)
    
    print ('target_mean_ecoder')
    
    # Very Caution! target_mean_ecncoder
    cols_for_encoding = str_cols
    targ_encoders = _target_encoder(data_train,
                        cols_for_encoding, {},
                        is_train=True,
                        target_mean = int(data_train.item_views.mean()))
    
    _ = _target_encoder(data_test, cols_for_encoding,
                        targ_encoders, is_train=False,
                        target_mean = int(data_train.item_views.mean()))
    
    
    print ('title_prepr')
    
    all_words = data_train.title.values
    wordCounts = defaultdict(lambda: 0)
    for item in all_words:
        for word in getWords(item, stemmRequired=True, 
                             correctWordRequired=False):
            wordCounts[word] += 1
    top_words = np.array(sorted(wordCounts, key=wordCounts.get, 
                                reverse=True)[:11000])
    
    _title_normalize(data_train, data_test)
    
    print ('title_top')
    
    # map
    
    data_train.drop(str_cols, axis=1, inplace=True)
    data_test.drop(str_cols, axis=1, inplace=True)
    data_train.drop(col_not_used, axis=1, inplace=True)
    data_test.drop(col_not_used, axis=1, inplace=True)
    
    some_vals1 = data_train.title1
    some_vals2 = data_test.title1
    
    data_train.drop(['item_views', 'title', 'title1'], axis=1, inplace=True)
    data_test.drop(['title', 'title1'], axis=1, inplace=True)
    
    data_tr_val = csc_matrix(data_train.values)
    del data_train
    data_ts_val = csc_matrix(data_test.values)
    del data_test
    
    
    vectorizer = CountVectorizer(vocabulary=top_words)
    vals_tr = vectorizer.transform(some_vals1)
    vals_ts = vectorizer.transform(some_vals2)
        
    tr_data = csc_matrix(hstack([data_tr_val, vals_tr]))
    ts_data = csc_matrix(hstack([data_ts_val, vals_ts]))
       
    return tr_data, ts_data, np.log(y_tr.values+1.)

    
#     if cv:
#         print ('run!')
#         data_test.drop(['item_views', 'title', 'title1'], axis=1, inplace=True)
#         return data_train, data_test, np.log(y_tr.values+1.), np.log(y_ts.values+1.)
    
#     else:
#         #data_test.drop([ 'title', 'title1'], axis=1, inplace=True)
#         return data_train, data_test, np.log(y_tr.values+1.)

### Предсказываем сдвинутый логарифм np.log(y_tr.values+1.)

__________

_______

# локальная проверка на одном сплите  - коррелирует с лидербордом

In [12]:
data_train = pd.read_csv('train.csv', sep=';', parse_dates=['start_time'])
data_test = pd.read_csv('test.csv', sep=';', parse_dates=['start_time'])
dt_tr, dt_ts, y_tr = cv_pipline(data_train, data_test, False)

filling na
time processing
str features
counter encoder
target_mean_ecoder
title_prepr
title_top


In [13]:
dt_tr.shape, dt_ts.shape

((423772, 11030), (418991, 11030))

In [14]:
cv = KFold(n_splits=5, shuffle=True, random_state=846)

In [15]:
for i,j in cv.split(y_tr):
    tr, ts = i,j

In [16]:
import datetime

In [17]:
print ('run')

run


In [55]:
xgb_params = {
    'objective': 'reg:linear',
    'max_depth': 15,
    'eta': 0.029,
    'gamma' : 0,
    'booster': 'gbtree',
    'seed': 1,
    'alpha': 2.0,
    'lambda': 0.1,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 1.0,
    'subsample': 0.95,
    'min_child_weight': 1,
    'silent': 1,
    'nthread': 10,
    'eval_metric':'rmse'
}

num_rounds = 10000

In [None]:
print (datetime.datetime.now())
dtrain = xgb.DMatrix(dt_tr[tr], label=y_tr[tr])
dtest = xgb.DMatrix(dt_tr[ts], label=y_tr[ts])

watchlist = [(dtrain, 'train'), (dtest, 'eval')]
eval_res ={}
gbdt = xgb.train(xgb_params, dtrain,
                 num_rounds, watchlist,
                 early_stopping_rounds=500,
                 verbose_eval=50,
                 evals_result=eval_res)
print (datetime.datetime.now())

In [59]:
eval_res['eval']['rmse'][-3:]

[0.547439, 0.547439, 0.547442]

In [60]:
eval_res['eval']['rmse'].__len__()

7017

_____

In [None]:
! блендинг наше все!

#  Генерация сабмишиона :)

In [61]:
dtrain = xgb.DMatrix(dt_tr, label=y_tr)
dtest = xgb.DMatrix(dt_ts)

In [None]:
%%time

xgb_params = {
  'objective': 'reg:linear',
    'max_depth': 15,
    'eta': 0.03,
    'booster': 'gbtree',
    'seed': 1,
    'alpha': 0.5,
    'lambda': 1.4,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 1.0,
    'subsample': 0.9,
    'min_child_weight': 9,
    'silent': 1,
    #'nthread':11,
    'eval_metric':'rmse'
}

num_rounds = 9910

gbdt = xgb.train(xgb_params, dtrain, num_rounds)
ans = gbdt.predict(dtest)
real_ans1 = np.exp(ans)-1.
gbdt.save_model('xgb.blend1')

CPU times: user 9h 1min 13s, sys: 10.6 s, total: 9h 1min 24s
Wall time: 1h 1min 41s


In [None]:
del gbdt

In [None]:
%%time

xgb_params = {
  'objective': 'reg:linear',
    'max_depth': 13,
    'eta': 0.028,
    'booster': 'gbtree',
    'seed': 1,
    'alpha': 0.1,
    'lambda': 1.1,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 1.0,
    'subsample': 0.99,
    'min_child_weight': 2,
    'silent': 1,
    #'nthread':11,
    'eval_metric':'rmse'
}

num_rounds = 10000

gbdt1 = xgb.train(xgb_params, dtrain, num_rounds)
ans1 = gbdt1.predict(dtest)
real_ans2 = np.exp(ans1)-1.

gbdt1.save_model('xgb.blend2')

CPU times: user 7h 49min 43s, sys: 9.14 s, total: 7h 49min 52s
Wall time: 53min 26s


In [None]:
del gbdt1

In [None]:
%%time

xgb_params = {
  'objective': 'reg:linear',
    'max_depth': 19,
    'eta': 0.28,
    'booster': 'gbtree',
    'seed': 1,
    'alpha': 0.0,
    'lambda': 1.3,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'subsample': 0.8,
    'min_child_weight': 9,
    'silent': 1,
    #'nthread':11,
    'eval_metric':'rmse'
}

num_rounds = 8000

gbdt2 = xgb.train(xgb_params, dtrain, num_rounds)

ans = gbdt2.predict(dtest)
real_ans3 = np.exp(ans)-1.

gbdt2.save_model('xgb.blend3')

In [None]:
del gbdt2

In [None]:
xgb_params = {
    'objective': 'reg:linear',
    'max_depth': 17,
    'eta': 0.029,
    'gamma' : 0,
    'booster': 'gbtree',
    'seed': 1,
    'alpha': 0.8,
    'lambda': 1.7,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 1.0,
    'subsample': 0.7,
    'min_child_weight': 1,
    'silent': 1,
    'nthread': 10,
    'eval_metric':'rmse'
}

num_rounds = 5585

gbdt4 = xgb.train(xgb_params, dtrain, num_rounds)

ans = gbdt4.predict(dtest)
real_ans4 = np.exp(ans)-1.

gbdt4.save_model('xgb.blend4')

In [None]:
del gbdt4

In [62]:
xgb_params = {
    'objective': 'reg:linear',
    'max_depth': 15,
    'eta': 0.029,
    'gamma' : 0,
    'booster': 'gbtree',
    'seed': 1,
    'alpha': 2.0,
    'lambda': 0.1,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 1.0,
    'subsample': 0.95,
    'min_child_weight': 1,
    'silent': 1,
    'nthread': 11,
    'eval_metric':'rmse'
}

num_rounds = 7000


gbdt5 = xgb.train(xgb_params, dtrain, num_rounds)

ans = gbdt5.predict(dtest)
real_ans5 = np.exp(ans)-1.

gbdt5.save_model('xgb.blend5')

In [63]:
del gbdt5

In [64]:
real_ans2.shape

(418991,)

In [66]:
tmp_data = pd.read_csv('test.csv', sep=';' , usecols=['id'])
tmp_data['item_views'] =  0.35 * real_ans1 + 0.10 * real_ans2 + \
                          0.10 * real_ans3 + 0.10 * real_ans4 + \
                          0.35 * real_ans5
tmp_data.to_csv('ans_amir_10+.csv', sep=';', index=False)

#  - третье место на лидерборде

метрика

$$ \sum_{i=1}^{n} \sqrt{\frac{1}{n} \big{(} log(p_i+1) - log(y_i+1) \big{)} } $$