In [None]:
import re
import numpy as np
import pandas as pd
import cufflinks as cf
import xgboost as xgb
import lightgbm as lgb
import pymorphy2
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, pyll
from hyperopt.pyll import scope

cf.go_offline()
%matplotlib inline
RS = 42

In [1]:
load = pd.read_csv('val.csv')
with open("stop_words_ru.txt", encoding='utf-8') as f:
    stop_words = [x.replace("\n", "") for x in f] 
load.info()

NameError: name 'pd' is not defined

In [5]:
# добавляем категориальные переменные 
y = load.is_bad
data = pd.DataFrame(load.loc[:, ('price')])
load.datetime_submitted = pd.to_datetime(load.datetime_submitted)
load['weekday_submitted'] = load.datetime_submitted.dt.weekday
load['month_submitted'] =  load.datetime_submitted.dt.month
load['year_submitted'] =  load.datetime_submitted.dt.year
d = pd.get_dummies(load[['subcategory', 'category', 'region', 'city', 'weekday_submitted', 'month_submitted', 
                         'year_submitted']].astype('str'))
data = data.join(d)

In [6]:
morph = pymorphy2.MorphAnalyzer()

def lemmatize(text):
    words = text.split()
    res = list()
    int_list = []
    for word in words:
        if any(i.isdigit() for i in word) and len(word) > 3:
            int_word = ''
            for w in word:
                if w.isdigit():
                    int_word = int_word + w 
            int_list.append(int_word)
        p = morph.parse(word)[0]
        res.append(p.normal_form)
    res = res + int_list
    res = [i for i in res if i not in stop_words]
    res = ' '.join(res)
    return res        

def del_s(_text):
    punctuation = "!#$%^&*()_+<>?:.,;/-"    
    for c in _text:
        if c in punctuation:
            _text = _text.replace(c, "")
    return _text

data['new_description'] = load.description.apply(lambda x: del_s(x))
data['new_description'] = data['new_description'].apply(lambda x: lemmatize(x))

In [7]:
def find_(string, rule):
    if re.search(rule, string):
        return 1
    else:
        return 0

def count_int(text):
    counter = 0
    for t in text:
        if t.isdigit():
            counter += 1
    return counter
    
tel = r'[\+]?[(-]?[0-9]{3}[-)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}'
sabaka = '@'
data['telephone'] = data['new_description'].apply(lambda x: find_(x, tel))
data['@'] = data['new_description'].apply(lambda x: find_(x, sabaka))
data['count_int'] = data['new_description'].apply(lambda x: count_int(x))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.2, stratify=y, random_state=RS)
X_test, X_control, y_test, y_control = train_test_split(X_test, y_test, test_size=.5, stratify=y_test, random_state=RS)

tfidf = TfidfVectorizer(max_features=5000)
values = tfidf.fit_transform(X_train.new_description)
feature_names = tfidf.get_feature_names()
X_train = X_train.join(pd.DataFrame(values.toarray(), columns=feature_names))

values_test = tfidf.transform(X_test.new_description)
X_test = X_test.join(pd.DataFrame(values_test.toarray(), columns=feature_names))
values_control = tfidf.transform(X_control.new_description)
X_control = X_control.join(pd.DataFrame(values_control.toarray(), columns=feature_names))
train = X_train.drop('new_description', axis=1).fillna(0)
test =  X_test.drop('new_description', axis=1).fillna(0)
control = X_control.drop('new_description', axis=1).fillna(0)

# нормализуем для линейных методов
scaler = StandardScaler()
norm_train = pd.DataFrame(scaler.fit_transform(train), columns=train.columns) 
norm_test = pd.DataFrame(scaler.transform(test), columns=train.columns)
norm_control = pd.DataFrame(scaler.transform(control), columns=train.columns)

In [None]:
not_distance = [lgb.LGBMClassifier().__class__.__name__, xgb.XGBClassifier().__class__.__name__,]

# подбираем гиперпараметры и модель
methods = [
# 'LR',
# 'SGDC',
# 'LGB',
'XGB',
# 'MLPC'
]
for i in methods:
    if __name__ == '__main__':
        def hyperopt_train_test(params):
            if i == 'LR':
                X = norm_train
                model = LogisticRegression(n_jobs=5,  **params)     
            elif i == 'SGDC':
                X = norm_train
                model = SGDClassifier(n_jobs=5, early_stopping=True, **params)
            elif i == 'LGB':
                p = params.pop('eval_metric')
                X = train
                model = lgb.LGBMClassifier(n_jobs=5, objective='binary', **params)
            elif i == 'XGB':
                p = params.pop('eval_metric')
                X = train
                model = xgb.XGBClassifier(n_jobs=5, **params)
            elif i == 'MLPC':
                X = norm_train
                model = MLPClassifier(early_stopping=True, **params)
            
            if i in ('LGB', 'XGB'):
                eval_set = [(test.values, y_test)]
                model.fit(X.values, y_train, eval_set=eval_set, eval_metric=p, early_stopping_rounds=20, verbose=False)
            else:
                model.fit(X.values, y_train)
            
            if  model.__class__.__name__ in not_distance:
                y_pred = model.predict_proba(test.values)
                y_pred_control = model.predict_proba(control.values)                    
            else:
                y_pred = model.predict_proba(norm_test.values)
                y_pred_control = model.predict_proba(norm_control.values)
                
            y_pred_1d = [x[1] for x in y_pred]
            y_pred_control_1d = [x[1] for x in y_pred_control]
            
            f1_t, f1_c = roc_auc_score(y_test, y_pred_1d), roc_auc_score(y_control, y_pred_control_1d)
            return f1_t, f1_c
        
        n_features = train.shape[1]
        min_features = 1 / n_features
        
        if i == 'LR':
            space = {
                'class_weight': hp.choice('class_weight', (None, 'balanced')), 
                'penalty': hp.choice('penalty', ('l2', 'none')),
                'C': 10 ** hp.quniform('C', -3, 3, 0.5),
                'solver': hp.choice('metric', ('lbfgs', 'sag', 'saga')),
                'max_iter': hp.quniform('max_iter', 50, 1000, 50),
            }
            step = 300
            
#         elif i == 'KNN':
#             space = {
#                 'n_neighbors': scope.int(hp.quniform('n_neighbors', 2, 30, 1)),
#                 'weights': hp.choice('weights', ('uniform', 'distance')),
#                 'p': hp.quniform('p', 1, 20, 1),
#                 'metric': hp.choice('metric', ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 
#                                                'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 
#                                                'hamming', 'jaccard', 'kulsinski', 'minkowski', 'rogerstanimoto', 
#                                                'russellrao', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']),
#             }
#             step = 300            
        
        elif i == 'SGDC':
            space = {
                'loss': hp.choice('loss', ('log', 'modified_huber', 
#                                            'hinge', 'squared_hinge', 'perceptron'
                                          )),
                'alpha': 10 ** hp.quniform('alpha', -6, -1, 1),
                'penalty': hp.choice('penalty', ('none', 'l2', 'l1', 'elasticnet')),
                'l1_ratio': hp.quniform('l1_ratio', 0.05, 0.95, 0.05),
                'max_iter': scope.int(hp.quniform('n_estimators', 500, 2000, 50)),
                'learning_rate': hp.choice('learning_rate', ('constant', 'optimal', 'invscaling')),
                'eta0': 10 ** hp.quniform('eta0', -6, -1, 1),
                'power_t': hp.quniform('power_t', 0.05, 0.95, 0.05),
                'class_weight': hp.choice('class_weight', ['balanced', None]),
            }
            step = 500

#         elif i == 'RF':
#             space = {
#                 'n_estimators': scope.int(hp.quniform('n_estimators', 50, 1200, 50)),
#                 'max_depth': hp.choice('max_depth', [x for x in range(1, 51)] + [None]),
#                 'min_samples_split': hp.quniform('min_samples_split', 0.05, 1, 0.05),
#                 'min_samples_leaf': hp.quniform('min_samples_leaf', 0.05, 0.5, 0.05),
#                 'min_weight_fraction_leaf': hp.quniform('min_weight_fraction_leaf', 0.05, 0.5, 0.05),
#                 'max_features': hp.quniform('max_features', min_features, 1, min_features),
#                 'class_weight': hp.choice('class_weight', ['balanced', 'balanced_subsample', None])
#             }
#             step = 300
            
        elif i == 'LGB':
            space = {
                'boosting_type': hp.choice('boosting_type', ('gbdt', 'dart')),
                'num_leaves': scope.int(hp.quniform('num_leaves', 2, 100, 1)),
                'max_depth': hp.choice('max_depth', [x for x in range(1, 51)] + [-1]),
                'learning_rate': hp.choice('learning_rate', np.logspace(-3, -0.5, 49)),
                'n_estimators': scope.int(hp.quniform('n_estimators', 100, 2000, 50)),
#                 'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 400000, 20000),
#                 'objective': hp.choice('objective', ['multiclass', 'multiclassova']),
                'class_weight': hp.choice('class_weight', ['balanced', None]),
                'min_child_samples': scope.int(hp.quniform('min_child_samples', 5, 200, 5)),
                'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
                'subsample_freq': scope.int(hp.quniform('subsample_freq', 1, 100, 1)),
                'colsample_bytree': hp.quniform('colsample_bytree', min_features, 1, min_features),
                'reg_alpha': hp.choice('reg_alpha', np.logspace(-3, 2, 20)),
                'reg_lambda': hp.choice('reg_lambda', np.logspace(-3, 2, 20)),
                'eval_metric': hp.choice('eval_metric', ['None', 'auc', 'binary_logloss', 'binary_error'])
            }
            step = 700
        elif i == 'XGB':
            space = {
                'max_depth': scope.int(hp.quniform('max_depth', 1, 50, 1)),
                'learning_rate': hp.loguniform('learning_rate', -3, -0.3),
                'n_estimators': scope.int(hp.quniform('n_estimators', 100, 2000, 50)),
                'objective': hp.choice('objective', ['binary:logistic', 'binary:logitraw', 'binary:hinge']),
                'booster': hp.choice('booster', ['gbtree', 
#                                                  'gblinear', # Check failed: ntree_limit == 0U (1 vs. 0) 
                                                 'dart']),
                'gamma': hp.quniform('gamma', 0, 10, 0.05),
                'min_child_weight': scope.int(hp.quniform('min_child_weight', 0, 50, 1)),
                'max_delta_step': scope.int(hp.quniform('max_delta_step', 0, 50, 1)),
                'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
                'colsample_bytree': hp.quniform('colsample_bytree', min_features, 1, min_features),
                'colsample_bylevel': hp.quniform('colsample_bylevel', min_features, 1, min_features),
                'reg_alpha': hp.choice('reg_alpha', np.logspace(-4, 2, 31)),
                'reg_lambda': hp.choice('reg_lambda', np.logspace(-4, 2, 31)),
                'eval_metric': hp.choice('eval_metric', ['logloss',  'error', 'auc', 'map'])
            }
            step = 500
            
        elif i == 'MLPC':
            space = {
                'activation': hp.choice('activation', ['relu', 'logistic', 'tanh']),
                'solver': hp.choice('solver', ['sgd', 'adam']),
                'alpha': hp.loguniform('alpha', 1e-6, 1e-1),
                'batch_size': hp.choice('batch_size', [x for x in range(50, 700, 10)] + ['auto']),
                'learning_rate': hp.choice('learning_rate', ['constant', 'invscaling', 'adaptive']),
                'learning_rate_init': hp.loguniform('learning_rate_init', 1e-4, 5e-1),
                'power_t': hp.quniform('power_t', 0.05, 0.95, 0.05),
                'max_iter': hp.quniform('max_iter', 50, 700, 50),
                'momentum': hp.quniform('momentum', 0.05, 0.95, 0.05),
                'beta_1': hp.quniform('beta_1', 0.05, 0.99, 0.05),
                'beta_2': hp.quniform('beta_2', 0.05, 0.99, 0.05),
                'hidden_layer_sizes': hp.quniform('hidden_layer_sizes', 
                                                  (hp.uniform('size', 10, 500, 10),) * hp.uniform('layer', 1, 10, 1))
            }
            step = 500            
            
        count = 0
        best = 0


        def f(params):
            global best, count
            count += 1
            t = hyperopt_train_test(params.copy())[0]
            cont = hyperopt_train_test(params.copy())[1]
            acc = np.mean((t, cont))
            if acc > best:
                print('new best: {},'.format(str(round(acc * 100, 5))), 
                      'test {}, control {}'.format(str(round(t * 100, 5)), str(round(cont * 100, 5))), 'using {}'.format(i), params, sep='  ')
                best = acc
#             if count % 30 == 0:
#                 end_time = datetime.now()
#                 print('iters:', count, 'duration: {}'.format(end_time - start_time), sep=' ')
            return {'loss': -acc, 'status': STATUS_OK}


        trials = Trials()
        best = fmin(f, space, algo=tpe.suggest, max_evals=step, trials=trials)
        print('BEST', best)
        print(min(trials.losses()))

In [25]:
newsgroups = datasets.fetch_20newsgroups(subset='all', categories=['alt.atheism', 'sci.space'])
values = tfidf.fit_transform(newsgroups.data)
feature_names = tfidf.get_feature_names()
pd.DataFrame(values.toarray(), columns = feature_names)
len(newsgroups.data)

1786

In [195]:
hp.quniform('boosting_type', (hp.uniform('size', 10, 500, 10),) * hp.uniform('layer', 1, 10, 1))

<hyperopt.pyll.base.Apply at 0x1b439ad1788>

In [16]:
m = SGDClassifier(n_jobs=5, loss='log')
m.fit(norm_train, y_train)
roc_auc_score(y_test, [x[1] for x in m.predict_proba(norm_test)])

0.7524746489380016

new best: 93.16894,                                                                                                    
test 92.69943, control 93.63846                                                                                        
using LGB                                                                                                              
{'boosting_type': 'dart', 'class_weight': None, 'colsample_bytree': 0.7524080214748146, 'eval_metric': 'auc', 'learning_rate': 0.01778279410038923, 'max_depth': 49, 'min_child_samples': 5, 'n_estimators': 1150, 'num_leaves': 85, 'reg_alpha': 0.003359818286283781, 'reg_lambda': 0.12742749857031335, 'subsample': 0.9500000000000001, 'subsample_freq': 6}

In [9]:
params = {'boosting_type': 'dart', 'class_weight': None, 'colsample_bytree': 0.7524080214748146, 'eval_metric': 'auc', 
          'learning_rate': 0.01778279410038923, 'max_depth': 49, 'min_child_samples': 5, 'n_estimators': 1150, 
          'num_leaves': 85, 'reg_alpha': 0.003359818286283781, 'reg_lambda': 0.12742749857031335, 
          'subsample': 0.95, 'subsample_freq': 6}
model = lgb.LGBMClassifier(n_jobs=4, objective='binary', **params)
model.fit(train.values, y_train)
roc_auc_score(y_test, [x[1] for x in model.predict_proba(test.values)])

NameError: name 'm' is not defined

In [12]:
roc_auc_score(y_control, [x[1] for x in model.predict_proba(control.values)])

0.9358538204890685

In [15]:
model.booster_.save_model('lgb_classifier.txt')

<lightgbm.basic.Booster at 0x21e9d7b5c88>

In [17]:
clf = lgb.Booster(model_file='lgb_classifier.txt')
roc_auc_score(y_control, clf_fs.predict(control.values))

0.9358538204890685