# Тестовое задание для Avito. Проверка гипотез и выбор лучшей модели.

(Можно сразу перейти к **заключению** внизу)

Импортируем необходимые библиотеки

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

Считываем предоставленные данные

In [2]:
# Load the category reference table (category_id -> category name).
category = pd.read_csv('data/category.csv')

In [3]:
# Preview the first rows of the category table.
category.head()

Unnamed: 0,category_id,name
0,0,Бытовая электроника|Телефоны|iPhone
1,1,Бытовая электроника|Ноутбуки
2,2,Бытовая электроника|Телефоны|Samsung
3,3,Бытовая электроника|Планшеты и электронные кни...
4,4,"Бытовая электроника|Игры, приставки и программ..."


In [4]:
# Load the labelled training set; item_id becomes the DataFrame index.
data_train = pd.read_csv('data/train.csv', index_col='item_id')

In [5]:
# Preview the first rows of the training set.
data_train.head()

Unnamed: 0_level_0,title,description,price,category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Картина,Гобелен. Размеры 139х84см.,1000.0,19
1,Стулья из прессованной кожи,Продам недорого 4 стула из светлой прессованно...,1250.0,22
2,Домашняя мини баня,"Мини баня МБ-1(мини сауна), предназначена для ...",13000.0,37
3,"Эксклюзивная коллекция книг ""Трансаэро"" + подарок","Продам эксклюзивную коллекцию книг, выпущенную...",4000.0,43
4,Ноутбук aser,Продаётся ноутбук ACER e5-511C2TA. Куплен в ко...,19000.0,1


In [6]:
# Load the test set (no category_id column); item_id becomes the DataFrame index.
data_test = pd.read_csv('data/test.csv', index_col='item_id')

In [7]:
# Preview the first rows of the test set.
data_test.head()

Unnamed: 0_level_0,title,description,price
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
489517,Стоик журнальный сталь,продам журнальный столик изготавливаю столы из...,10000.0
489518,iPhone 5 64Gb,"Телефон в хорошем состоянии. Комплект, гаранти...",12500.0
489519,Утеплитель,ТЕПЛОПЕЛЕН-ЛИДЕР ТЕПЛА!!! Толщина утеплителя :...,250.0
489520,Пальто демисезонное,Продам пальто женское (букле) в отличном состо...,1700.0
489521,Samsung syncmaster T200N,"Условно рабочий, проблема в панели настройки м...",1000.0


# EDA

## Выделение признаков из текста
- считаем количество латинских букв
- считаем длину
- избавляемся от знаков препинания
- Все встречающиеся слова приводим к нормальной форме
- кодируем в вектора через CountVectorizer

In [8]:
import re

# Matches a single ASCII Latin letter; compiled once and reused for every text.
english_check = re.compile(r'[a-zA-Z]')


def count_eng(text):
    """Return the number of ASCII Latin letters (a-z, A-Z) in *text*.

    One ``findall`` pass over the whole string replaces the per-character
    ``match`` loop; the result is the same count.

    Args:
        text: String to inspect.

    Returns:
        int: How many characters of *text* are ASCII Latin letters.
    """
    return len(english_check.findall(text))

In [9]:
# Count Latin letters in titles and descriptions.
# The columns are added to data_test as well, because the numeric feature
# columns are later selected from both the training and the test frame.
for frame in (data_train, data_test):
    frame['title_eng_count'] = frame['title'].apply(count_eng)
    frame['descr_eng_count'] = frame['description'].apply(count_eng)

In [10]:
# Character length of titles and descriptions.
# The columns are added to data_test as well, because the numeric feature
# columns are later selected from both the training and the test frame.
for frame in (data_train, data_test):
    frame['title_len'] = frame['title'].apply(len)
    frame['descr_len'] = frame['description'].apply(len)

In [11]:
import pymorphy2 as morphy
import string

# Single shared pymorphy2 analyzer used to reduce words to their normal form.
morpher = morphy.MorphAnalyzer()

In [12]:
from functools import lru_cache

@lru_cache(maxsize=100000)
def get_normal_form(word):
    """Return the first normal form that ``morpher`` reports for *word*.

    Results are memoized (up to 100000 distinct words), so a repeated word
    is analyzed only once per process.
    """
    candidates = morpher.normal_forms(word)
    return candidates[0]

In [13]:
# Translation table mapping every ASCII punctuation character to a space;
# built once instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def text_normalizer(text):
    """Return *text* lower-cased, punctuation-free and with every word normalized.

    ASCII punctuation is replaced by spaces, the text is lower-cased and split
    on whitespace, and each word is passed through ``get_normal_form``.

    Args:
        text: Raw title or description string.

    Returns:
        str: The normalized words joined by single spaces ('' for empty input).
    """
    words = text.translate(_PUNCT_TO_SPACE).lower().split()
    # rstrip() is kept for parity with the previous trailing-space trimming.
    return ' '.join(get_normal_form(word) for word in words).rstrip()

In [14]:
from multiprocessing import Pool

In [15]:
%%time
# Normalize the titles and descriptions of both datasets using a pool of
# 4 worker processes; each pool.map result is stored as a new column.
with Pool(processes=4) as pool:
    data_train['title_norm'] = pool.map(text_normalizer, data_train.title)
    data_train['desct_norm'] = pool.map(text_normalizer, data_train.description)
    
    data_test['title_norm'] = pool.map(text_normalizer, data_test.title)
    data_test['desct_norm'] = pool.map(text_normalizer, data_test.description)
    # Stop the workers explicitly once all four maps have returned.
    pool.terminate()

CPU times: user 6.64 s, sys: 4.27 s, total: 10.9 s
Wall time: 5min 3s


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
%%time
# Bag-of-words encoding with binary=True. Separate vocabularies are fitted on
# the training titles and descriptions and then reused to encode the test set.
# NOTE(review): the vectorizers read the raw `title` / `description` columns,
# not the normalized `title_norm` / `desct_norm` columns computed earlier --
# confirm whether the normalized text was meant to be used here.
title_vectorizer = CountVectorizer(binary=True)
title_features_train = title_vectorizer.fit_transform(data_train.title)
title_features_test = title_vectorizer.transform(data_test.title)

descr_vectorizer = CountVectorizer(binary=True)
description_features_train = descr_vectorizer.fit_transform(data_train.description)
description_features_test = descr_vectorizer.transform(data_test.description)

CPU times: user 44 s, sys: 2.09 s, total: 46.1 s
Wall time: 46.7 s


## Числовые признаки
Отшкалируем числовые признаки для использования в метрических алгоритмах

In [18]:
# Numeric columns: the price plus the engineered Latin-letter counts and text lengths.
num_features_columns = ['price', 'title_eng_count', 'descr_eng_count', 'title_len', 'descr_len']

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# Fit the scaler on the training numeric columns only and reuse it for the
# test set (data_test must carry the same columns).
scaler = StandardScaler()

num_features_scaled_train = scaler.fit_transform(data_train.loc[:, num_features_columns])
num_features_scaled_test = scaler.transform(data_test.loc[:, num_features_columns])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


## Выбор модели и обучение
Протестируем несколько моделей и выберем лучшую.
#### Модели:
- Logistic regression
- Decision tree
- Random forest

#### Метрики:
- Accuracy
- LogLoss

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, log_loss

from sklearn.model_selection import train_test_split

In [22]:
# `features`: title + description word features with the raw numeric columns.
# `features_scaled`: the same word features with the standardized numeric columns.
features = sp.hstack((title_features_train, description_features_train, data_train.loc[:, num_features_columns]))
features_scaled = sp.hstack((title_features_train, description_features_train, num_features_scaled_train))

In [23]:
# Hold out 25% of the rows for validation. Both feature variants are split
# with the same seed and settings.
split_kwargs = dict(random_state=648, test_size=0.25, shuffle=True)
target = data_train['category_id']

X_train, X_val, y_train, y_val = train_test_split(features, target, **split_kwargs)

X_s_train, X_s_val, y_s_train, y_s_val = train_test_split(features_scaled, target, **split_kwargs)

In [26]:
%%time

logreg = LogisticRegression()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

for clf in (tree, forest):
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_val)
    y_hat_proba = clf.predict_proba(X_val)
    print(str(clf))
    print('Accuracy: ' + str(accuracy_score(y_val, y_hat)))
    print('LogLoss: ' + str(log_loss(y_val, y_hat_proba)))

#То же самое для logreg только с отшкалированными данными
logreg.fit(X_s_train, y_s_train)
y_hat = logreg.predict(X_s_val)
y_hat_proba = logreg.predict_proba(X_s_val)
print(str(logreg))
print('Accuracy: ' + str(accuracy_score(y_s_val, y_hat)))
print('LogLoss: ' + str(log_loss(y_s_val, y_hat_proba))) 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Accuracy: 0.719578362477529
LogLoss: 9.684881752096686




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy: 0.7671596666121915
LogLoss: 3.2510848761097653




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Accuracy: 0.8806095767282236
LogLoss: 0.4759405998050163
CPU times: user 2h 36min 3s, sys: 34.5 s, total: 2h 36min 38s
Wall time: 2h 37min 10s


Видно, что с большим отрывом лучший результат показала логистическая регрессия. Попробуем улучшить результат, изменив некоторые параметры модели:

1) Изменим параметр CountVectorizer binary на False. Раньше признаком являлось просто вхождение слова в текст, теперь попробуем считать количество вхождений.

In [29]:
%%time
title_vectorizer = CountVectorizer(binary=False)
title_features_train = title_vectorizer.fit_transform(data_train.title)
title_features_test = title_vectorizer.transform(data_test.title)

descr_vectorizer = CountVectorizer(binary=False)
description_features_train = descr_vectorizer.fit_transform(data_train.description)
description_features_test = descr_vectorizer.transform(data_test.description)

CPU times: user 42.1 s, sys: 1.33 s, total: 43.4 s
Wall time: 43.9 s


In [34]:
# Standardize the numeric columns; statistics come from the training set only.
scaler = StandardScaler()

num_features_train = scaler.fit_transform(data_train.loc[:, num_features_columns])
num_features_test = scaler.transform(data_test.loc[:, num_features_columns])

# with_mean=False scales without centering, which is required for sparse input.
scaler = StandardScaler(with_mean=False)

title_features_train = scaler.fit_transform(title_features_train)
# Fixed: scale the *test* titles. The previous code passed
# title_features_train here, so the "test" matrix was built from training data.
title_features_test = scaler.transform(title_features_test)

# The same scaler object is refitted on the description features.
description_features_train = scaler.fit_transform(description_features_train)
description_features_test = scaler.transform(description_features_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [36]:
# Combine the scaled title, description and numeric training features column-wise.
features = sp.hstack((title_features_train, description_features_train, num_features_train))

In [37]:
# Hold out 25% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(features, data_train['category_id'], 
                                              random_state=648, test_size=0.25, shuffle=True)

In [38]:
%%time

# Refit the existing `logreg` estimator on the current split and report
# accuracy and log loss on the validation part.
logreg.fit(X_train, y_train)
y_hat = logreg.predict(X_val)
y_hat_proba = logreg.predict_proba(X_val)
print(str(logreg))
print('Accuracy: ' + str(accuracy_score(y_val, y_hat)))
print('LogLoss: ' + str(log_loss(y_val, y_hat_proba))) 



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Accuracy: 0.790496813204772
LogLoss: 1.8242081826090177
CPU times: user 2h 52min 56s, sys: 39.4 s, total: 2h 53min 35s
Wall time: 2h 55min 15s


In [39]:
%%time
title_vectorizer = CountVectorizer(binary=False)
title_features_train = title_vectorizer.fit_transform(data_train.title)
title_features_test = title_vectorizer.transform(data_test.title)

descr_vectorizer = CountVectorizer(binary=False)
description_features_train = descr_vectorizer.fit_transform(data_train.description)
description_features_test = descr_vectorizer.transform(data_test.description)

CPU times: user 47.3 s, sys: 3.9 s, total: 51.2 s
Wall time: 54.4 s


In [40]:
# Count features combined with the raw (unscaled) numeric columns.
features_unscaled = sp.hstack((title_features_train, description_features_train, data_train.loc[:, num_features_columns]))

In [41]:
# Hold out 25% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(features_unscaled, data_train['category_id'], 
                                              random_state=648, test_size=0.25, shuffle=True)

In [42]:
%%time

# Refit `logreg` on the fully unscaled feature set and report validation metrics.
logreg.fit(X_train, y_train)
y_hat = logreg.predict(X_val)
y_hat_proba = logreg.predict_proba(X_val)
print(str(logreg))
print('Accuracy: ' + str(accuracy_score(y_val, y_hat)))
print('LogLoss: ' + str(log_loss(y_val, y_hat_proba))) 



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Accuracy: 0.6341640790978919
LogLoss: 1.7574700414389088
CPU times: user 56min 29s, sys: 15 s, total: 56min 45s
Wall time: 57min


In [53]:
# Numeric columns without 'price'; the price is handled separately below.
num_features_columns = ['title_eng_count', 'descr_eng_count', 'title_len', 'descr_len']

In [64]:
# Standardize the price by hand (subtract the mean, divide by np.std) and
# reshape it into a single column so it can be stacked next to the other features.
price_scaled = np.array((data_train['price'] - np.mean(data_train['price'])) / np.std(data_train['price'])).reshape(-1, 1)

In [66]:
# Sanity check: one row per training item, one column.
price_scaled.shape

(489517, 1)

In [67]:
# Text features + unscaled engineered numeric columns + standardized price.
features = sp.hstack((title_features_train, description_features_train,
                               data_train.loc[:, num_features_columns], price_scaled))

In [68]:
# Hold out 25% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(features, data_train['category_id'], 
                                              random_state=648, test_size=0.25, shuffle=True)

In [69]:
%%time

# Refit `logreg` on the feature set where only the price is standardized and
# report validation metrics.
logreg.fit(X_train, y_train)
y_hat = logreg.predict(X_val)
y_hat_proba = logreg.predict_proba(X_val)
print(str(logreg))
print('Accuracy: ' + str(accuracy_score(y_val, y_hat)))
print('LogLoss: ' + str(log_loss(y_val, y_hat_proba))) 



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Accuracy: 0.790496813204772
LogLoss: 1.8242081826090177
CPU times: user 2h 59min 52s, sys: 1min 27s, total: 3h 1min 19s
Wall time: 3h 6min 53s


In [70]:
treeime

tree.fit(X_train, y_train)
y_hat = tree.predict(X_val)
y_hat_proba = tree.predict_proba(X_val)
print(str(logreg))
print('Accuracy: ' + str(accuracy_score(y_val, y_hat)))
print('LogLoss: ' + str(log_loss(y_val, y_hat_proba))) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
Accuracy: 0.7198398431116195
LogLoss: 9.676127092712614
CPU times: user 57min 24s, sys: 18.8 s, total: 57min 43s
Wall time: 58min 44s


Таким образом, лучший возможный результат без бустинга получен на линейной модели. Теперь попробуем воспользоваться xgboost.

In [72]:
import xgboost as xgb

In [77]:
%%time
# Back to the binary (presence/absence) word features for the boosting
# experiments; only the training matrices are built here.
title_vectorizer = CountVectorizer(binary=True)
title_features_train = title_vectorizer.fit_transform(data_train.title)

descr_vectorizer = CountVectorizer(binary=True)
description_features_train = descr_vectorizer.fit_transform(data_train.description)

CPU times: user 33.8 s, sys: 1.48 s, total: 35.3 s
Wall time: 37 s


In [75]:
# Restore the full numeric column list, including 'price'.
num_features_columns = ['price', 'title_eng_count', 'descr_eng_count', 'title_len', 'descr_len']

In [76]:
# Standardize the training numeric columns with a freshly fitted scaler.
scaler = StandardScaler()

num_features_scaled_train = scaler.fit_transform(data_train.loc[:, num_features_columns])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [80]:
# Binary word features combined with the standardized numeric columns.
features = sp.hstack((title_features_train, description_features_train, num_features_scaled_train))

In [81]:
# Hold out 25% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(features, data_train['category_id'], 
                                              random_state=648, test_size=0.25, shuffle=True)

Попробуем два варианта бустера xgboost:
- линейный бустер `gblinear` — аналог логистической регрессии (с фичами, показавшими лучший результат)
- бустер на деревьях `gbtree` — аналог random forest (с лучшими для линейной модели фичами и с другим набором (субъективно более информативным))

In [84]:
# xgboost configuration for the linear booster.
xgb_params = {
    'objective': 'multi:softmax',
    # One class per row of the category table.
    'num_class': category.shape[0],
    'booster': 'gblinear',
    
    'alpha': 0.5,
    
    'seed': 648,
    'nthread': 2,
    'eval_metric':'merror'
}

# Upper bound on boosting rounds; early stopping may end training sooner.
num_rounds = 150

In [85]:
# Hold out 25% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(features, data_train['category_id'], 
                                              random_state=648, test_size=0.25, shuffle=True)

In [87]:
%%time

# Wrap the splits in DMatrix objects; `dtest` holds the validation split,
# not the test set loaded from data/test.csv.
dtrain = xgb.DMatrix( X_train, label=y_train)
dtest = xgb.DMatrix( X_val, label=y_val,)

# Both sets are evaluated each round; per-round metrics are collected in eval_res.
watchlist = [(dtrain, 'train'), (dtest, 'eval')]
eval_res ={}
gbdt = xgb.train(xgb_params, dtrain,
                 num_rounds, watchlist,
                 early_stopping_rounds=5,
                 verbose_eval=10,
                 evals_result=eval_res)

  if getattr(data, 'base', None) is not None and \


[0]	train-merror:0.979574	eval-merror:0.97958
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.

Will train until eval-merror hasn't improved in 5 rounds.
Stopping. Best iteration:
[0]	train-merror:0.979574	eval-merror:0.97958

CPU times: user 3min 10s, sys: 1.73 s, total: 3min 12s
Wall time: 3min 16s


In [94]:
# xgboost configuration for the tree booster.
xgb_params = {
    'objective': 'multi:softmax',
    # One class per row of the category table.
    'num_class': category.shape[0],
    'max_depth': 7,
    'eta': 0.12,
    'booster': 'gbtree',
    
    'alpha': 2.0,
    'lambda': 0.1,
    
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 1.0,
    
    'seed': 648,
    'nthread': 2,
    'eval_metric':'merror'
}

# Upper bound on boosting rounds; early stopping may end training sooner.
num_rounds = 150

In [97]:
%%time

# Wrap the splits in DMatrix objects; `dtest` holds the validation split,
# not the test set loaded from data/test.csv.
dtrain = xgb.DMatrix( X_train, label=y_train)
dtest = xgb.DMatrix( X_val, label=y_val,)

# Both sets are evaluated each round; per-round metrics are collected in eval_res.
watchlist = [(dtrain, 'train'), (dtest, 'eval')]
eval_res ={}
gbdt = xgb.train(xgb_params, dtrain,
                 num_rounds, watchlist,
                 early_stopping_rounds=5,
                 verbose_eval=5,
                 evals_result=eval_res)

[0]	train-merror:0.37711	eval-merror:0.380176
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.

Will train until eval-merror hasn't improved in 5 rounds.
[5]	train-merror:0.221182	eval-merror:0.231778
[10]	train-merror:0.202401	eval-merror:0.215852
[15]	train-merror:0.191825	eval-merror:0.207599
[20]	train-merror:0.184487	eval-merror:0.202002
[25]	train-merror:0.1781	eval-merror:0.197238
[30]	train-merror:0.17247	eval-merror:0.193063
[35]	train-merror:0.166875	eval-merror:0.18914
[40]	train-merror:0.162332	eval-merror:0.18548
[45]	train-merror:0.158186	eval-merror:0.182799
[50]	train-merror:0.15409	eval-merror:0.179449
[55]	train-merror:0.150489	eval-merror:0.177186
[60]	train-merror:0.147008	eval-merror:0.174832


KeyboardInterrupt: 

# Заключение:
- Лучшая модель - Логистическая регрессия
- Лучший способ кодирования признаков из текста - CountVectorizer(binary=True) -> вхождение слова в название/описание важнее количества его вхождений
- бустинг не дал улучшения результата (по крайней мере, в условиях ограниченного времени и мощностей)