In [9]:
# !/usr/bin/python
# -*- coding:utf-8 -*-
# Author: yadi Lao
import pickle
import logging
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


def train_LR(x_train, y_train):
    """
    Train a logistic-regression classifier.

    Standardizes the features, runs a randomized search (3-fold CV, 'f1'
    scoring) over the penalty type and the regularization strength ``C``,
    refits a model with the best configuration on the whole training set and
    pickles it to ``./save_model/pointwise/LR_<feature_name>.pkl``.

    ``feature_name`` is not a parameter: it is read from the enclosing
    (notebook) scope and must be defined before this function is called.

    :param x_train: training feature matrix
    :param y_train: training labels
    :return: the fitted LogisticRegression
    """
    # NOTE(review): the scaler is fitted here and then discarded, so the
    # returned model expects inputs that were standardized the same way --
    # confirm that callers scale their data before predicting.
    x_train = StandardScaler().fit_transform(x_train)
    parameters = {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 2, 5, 10, 20, 50, 100],
    }
    lr_model = LogisticRegression(C=1e5, solver='liblinear', multi_class='ovr', class_weight='balanced')
    gsearch = RandomizedSearchCV(lr_model, param_distributions=parameters, scoring='f1', cv=3)
    print('gridsearchcv fit begin...')
    gsearch.fit(x_train, y_train)
    best_params = gsearch.best_estimator_.get_params()
    print('Best score: {}'.format(gsearch.best_score_))
    print('Best parameters set: {}'.format(best_params))

    # Rebuild the final model from *all* parameters of the best estimator.
    # Passing only penalty/C/class_weight silently dropped solver='liblinear',
    # and the default solver does not support the 'l1' penalty, so the refit
    # crashed whenever 'l1' won the search.
    lr_classifier = LogisticRegression(**best_params)
    lr_classifier.fit(x_train, y_train)

    # Context manager so the file handle is flushed and closed deterministically.
    with open('./save_model/pointwise/LR_{}.pkl'.format(feature_name), 'wb') as fout:
        pickle.dump(lr_classifier, fout)

    return lr_classifier

In [10]:
def train_GBDT(x_train, y_train, sample_weight=None):
    """
    Train a gradient-boosted decision tree classifier.

    Runs an exhaustive grid search (3-fold CV, 'f1' scoring) over
    ``n_estimators``, ``subsample`` and ``max_depth``, then fits a fresh
    classifier with the best combination on the whole training set and
    pickles it to ``./save_model/pointwise/gbdt_<feature_name>.pkl``.

    ``feature_name`` is not a parameter: it is read from the enclosing
    (notebook) scope and must be defined before this function is called.

    :param x_train: training feature matrix
    :param y_train: training labels
    :param sample_weight: optional per-sample weights; applied only to the
        final fit, not to the grid search
    :return: the fitted GradientBoostingClassifier
    """
    param_test1 = {'n_estimators': range(20, 81, 10),
                   'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
                   'max_depth': [4, 6, 8, 10, 15],
                   }
    # Settings shared by the searched estimator and the final model. Keeping
    # them in one place guarantees the final model really is the configuration
    # that was evaluated (the final model used to drop random_state).
    fixed_params = {
        'learning_rate': 0.1,
        'min_samples_split': 300,
        'min_samples_leaf': 20,
        'max_features': 'sqrt',
        'random_state': 10,
    }
    # `iid` is intentionally not passed: it was deprecated in scikit-learn 0.22
    # (where False became the default behaviour) and removed in 0.24, so
    # passing it raises a TypeError on current releases.
    gsearch = GridSearchCV(
        estimator=GradientBoostingClassifier(**fixed_params),
        param_grid=param_test1, scoring='f1', cv=3)
    gsearch.fit(x_train, y_train)
    print('Best score: {}'.format(gsearch.best_score_))
    print('Best parameters set: {}'.format(gsearch.best_params_))

    final_params = dict(fixed_params)
    final_params.update(gsearch.best_params_)
    gbdt_classifier = GradientBoostingClassifier(**final_params)

    gbdt_classifier.fit(x_train, y_train, sample_weight=sample_weight)

    # Context manager so the file handle is flushed and closed deterministically.
    with open('./save_model/pointwise/gbdt_{}.pkl'.format(feature_name), 'wb') as fout:
        pickle.dump(gbdt_classifier, fout)

    return gbdt_classifier

In [11]:
def train_xgboost(x_train, y_train):
    """
    Train an XGBoost binary classifier.

    Runs a randomized search (3-fold CV, 'f1' scoring) over the tree and
    sampling hyper-parameters, then fits a fresh classifier with the best
    configuration using early stopping and pickles it to
    ``./save_model/pointwise/xgboost_<feature_name>.pkl``.

    ``feature_name``, ``x_test`` and ``y_test`` are not parameters: they are
    read from the enclosing (notebook) scope and must be defined before this
    function is called.

    :param x_train: training feature matrix
    :param y_train: training labels
    :return: the fitted XGBClassifier
    """
    # The XGBoost parameter is called `learning_rate`. The former key
    # `learn_rate` is not a recognised parameter, so the searched values never
    # influenced training.
    parameters = {
        'max_depth': [4, 6, 8, 10, 15],
        'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
        'n_estimators': [100, 300, 500, 1000],
        'min_child_weight': [0, 2, 5, 10, 20],
        'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
        'scale_pos_weight': [2, 4, 6, 8, 10],
    }
    model = xgb.sklearn.XGBClassifier(
        nthread=5,
        silent=False,
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=3,
        subsample=0.7,
        colsample_bytree=0.7,
        objective='binary:logistic',
        n_estimators=10)
    gsearch = RandomizedSearchCV(model, param_distributions=parameters, scoring='f1', cv=3)
    print('gridsearchcv fit begin...')
    gsearch.fit(x_train, y_train)
    # Fetch the parameter dict once instead of once per keyword below.
    best_params = gsearch.best_estimator_.get_params()
    print('Best score: {}'.format(gsearch.best_score_))
    print('Best parameters set: {}'.format(best_params))

    # Parameters carried over from the best estimator into the final model.
    carried_over = (
        'nthread',
        'learning_rate',
        'max_depth',
        'min_child_weight',
        'subsample',
        'colsample_bytree',
        'objective',
        'n_estimators',
        'gamma',
        'reg_alpha',
        'reg_lambda',
        'max_delta_step',
        'scale_pos_weight',
    )
    xgb_classifier = xgb.sklearn.XGBClassifier(
        **{name: best_params[name] for name in carried_over})

    # NOTE(review): x_test / y_test come from the notebook's global scope and
    # drive early stopping -- confirm that using the test split here is intended.
    watchlist = [(x_train, y_train), (x_test, y_test)]
    xgb_classifier.fit(x_train, y_train, eval_set=watchlist, eval_metric='ndcg', early_stopping_rounds=10)

    # Context manager so the file handle is flushed and closed deterministically.
    with open('./save_model/pointwise/xgboost_{}.pkl'.format(feature_name), 'wb') as fout:
        pickle.dump(xgb_classifier, fout)

    return xgb_classifier

In [5]:
from libsvm.python.svmutil import*
from libsvm.python.svm import*


def generate_format_file(x, y, output_file):
    """
    Write samples to a text file, one sample per line, in the form
    ``<label> 1:<v1> 2:<v2> ... <dim>:<v_dim>``.

    The label is written as ``+1`` when ``y[i] == 1`` and as ``-1`` for any
    other value. Feature indices start at 1.

    :param x: 2-D array of feature vectors (``x.shape[1]`` is the feature count)
    :param y: labels, indexable by the row position in ``x``
    :param output_file: path of the file to create or overwrite
    """
    dim = x.shape[1]
    # Builtin open() replaces codecs.open(): without an explicit encoding the
    # two are equivalent, and `codecs` is not imported in this notebook.
    with open(output_file, 'w') as fout:
        for i, vec in enumerate(x):
            label = '+1' if y[i] == 1 else '-1'
            # zip() keeps the original pairing of indices 1..dim with values.
            fields = [
                str(index) + ':' + str(value)
                for index, value in zip(range(1, dim + 1), vec)
            ]
            fout.write(' '.join([label, ' '.join(fields)]) + '\n')
            
            
# y_train, x_train = svm_read_problem(train_file)
# y_test, x_test = svm_read_problem(test_file)
# print('train={}, test={}'.format(len(x_train), len(x_test)))

# rankSVM = svm_train(y_train, x_train, '-t 2 -c 4')
# p_label, acc, val = svm_predict(y_test, x_test, rankSVM)

In [2]:
def _to_list(x):
    if isinstance(x, list):
        returnx
    return[x]


def ndcg(y_true, y_pred, k=20, rel_threshold=0):
    """
    Compute NDCG@k for one list of (label, score) pairs.

    Labels and scores are squeezed and converted to lists, paired up and
    shuffled, then ranked once by label (ideal order) and once by score
    (predicted order). Only items whose label exceeds ``rel_threshold``
    contribute gain ``2**label - 1``, discounted by ``log(2 + rank)``.

    :param y_true: ground-truth relevance labels
    :param y_pred: predicted scores, aligned with ``y_true``
    :param k: number of top-ranked items taken into account
    :param rel_threshold: labels must be strictly greater than this to count
    :return: 0 if ``k <= 0`` or the ideal DCG is 0, otherwise DCG / ideal DCG
    """
    if k <= 0:
        return 0

    labels = _to_list(np.squeeze(y_true).tolist())
    scores = _to_list(np.squeeze(y_pred).tolist())
    pairs = list(zip(labels, scores))
    # Shuffle before the (stable) sorts so equal keys end up in random order.
    random.shuffle(pairs)
    by_label = sorted(pairs, key=lambda pair: pair[0], reverse=True)
    by_score = sorted(pairs, key=lambda pair: pair[1], reverse=True)

    def _dcg(ranked):
        # Discounted cumulative gain over the first k entries of `ranked`.
        total = 0
        for rank, (gain, _score) in enumerate(ranked):
            if rank >= k:
                break
            if gain > rel_threshold:
                total += (math.pow(2, gain) - 1) / math.log(2 + rank)
        return total

    ideal_dcg = _dcg(by_label)
    predicted_dcg = _dcg(by_score)
    if ideal_dcg == 0:
        return 0
    return predicted_dcg / ideal_dcg


def grid_search(x_train, y_train):
    """
    Randomized hyper-parameter search for an XGBoost model with the
    ``rank:pairwise`` objective, scored with the notebook's ``ndcg`` function
    wrapped by ``make_scorer`` (3-fold CV).

    :param x_train: training feature matrix
    :param y_train: training labels
    :return: the fitted RandomizedSearchCV object
    """
    ndcg_score = make_scorer(ndcg)
    # The XGBoost parameter is called `learning_rate`. The former key
    # `learn_rate` is not a recognised parameter, so the searched values never
    # influenced training.
    parameters = {
        'max_depth': [4, 6, 8, 10, 15],
        'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
        'n_estimators': [100, 300, 500, 1000],
        'min_child_weight': [0, 2, 5, 10, 20],
        'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9]
    }
    model = xgb.sklearn.XGBClassifier(
        nthread=5,
        silent=False,
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=3,
        subsample=0.7,
        colsample_bytree=0.7,
        objective='rank:pairwise',
        n_estimators=10)
    gsearch = RandomizedSearchCV(model, param_distributions=parameters, scoring=ndcg_score, cv=3)
    print('gridsearchcv fit begin...')
    gsearch.fit(x_train, y_train)
    print('Best score: {}'.format(gsearch.best_score_))
    print('Best parameters set: {}'.format(gsearch.best_estimator_.get_params()))

    return gsearch

In [24]:
def run():
    """
    Train a RankNet model, or load a saved one, and predict on the test split.

    Everything is driven by notebook-level names rather than parameters:
    ``train_data_path``, ``is_train``, ``feature_name``, ``FC_DIM_1``,
    ``FC_DIM_2`` and, when loading, ``model_id``.
    """
    # load train and test data
    # NOTE: pickle.load can execute arbitrary code; only point
    # train_data_path at files produced by this project.
    with open(train_data_path, 'rb') as fin:
        x_train, x_test, y_train, y_test = pickle.load(fin)

    model_dir = './save_model/rankNet/{}'.format(feature_name)

    if is_train:
        # The next id is the number of entries already in the model directory.
        new_model_id = len(os.listdir(model_dir))
        rankNet = RankNet.RankNet(in_dim=len(x_train[0]), n_units1=FC_DIM_1, n_units2=FC_DIM_2)

        # optimizer: Adam, AdaGrad, SGD
        rankNet.fit(
            x_train, y_train, optimizerAlgorithm='SGD',
            savemodelName='{}/model_{}'.format(model_dir, new_model_id),
            savefigName='{}/fig_{}'.format(model_dir, new_model_id)
        )
        p_label = rankNet.predict(x_test)
    else:
        print('Load model')
        # `model_id` was previously assigned in the training branch, which made
        # it a local name and turned this lookup into an UnboundLocalError on
        # every run. The training branch now uses its own name, so this reads
        # the notebook-level `model_id`.
        # NOTE(review): assumes a global `model_id` selects the checkpoint to
        # load -- confirm it is defined in the notebook's configuration.
        model_name = '{}/model_{}'.format(model_dir, model_id)
        # model_name = './save_model/rankNet/model_5'
        rankNet = RankNet.RankNet(
            in_dim=len(x_train[0]), n_units1=FC_DIM_1, n_units2=FC_DIM_2, resumemodelName=model_name)
        p_label = rankNet.predict(x_test)