In [1]:
import json
import pandas as pd
import pymongo
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Integer columns (signed or unsigned) are converted to the smallest signed
    integer type among int8/int16/int32/int64 that holds their observed
    min/max, but only when that type is narrower than the current one.
    Float columns wider than 32 bits are converted to float32.
    Every other column (object, bool, datetime, pandas extension dtypes) is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        # pandas extension dtypes (category, nullable ints, strings, ...) are
        # not plain numpy dtypes; casting them blindly can raise, so skip them.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            if len(df[col]) == 0:
                # min()/max() of an empty column is NaN; nothing to downcast.
                continue
            # Python ints make the range comparisons exact for every width.
            c_min = int(df[col].min())
            c_max = int(df[col].max())
            for candidate in signed_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    # Never widen a column that is already this small.
                    if np.dtype(candidate).itemsize < col_type.itemsize:
                        df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f':
            # Only narrow; float16/float32 columns are already small enough.
            if col_type.itemsize > 4:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [3]:
def getGoalMid(masterGoal, guestGoal, masterMidGoal, guestMidGoal):
    """Return the two final goal counts minus the two mid goal counts.

    Every argument is converted with ``int`` first, so numeric strings and
    floats are accepted.
    """
    final_total = int(masterGoal) + int(guestGoal)
    mid_total = int(masterMidGoal) + int(guestMidGoal)
    return final_total - mid_total


def removeSub(pankou):
    """Remove every "升" and "降" marker from ``pankou`` and strip whitespace."""
    cleaned = pankou
    for marker in ("升", "降"):
        cleaned = cleaned.replace(marker, "")
    return cleaned.strip()


def getResult(masterGoal, guestGoal, pankou):
    """Compare the combined goals with the averaged ``pankou`` line.

    ``pankou`` is cleaned with ``removeSub`` and split on "/"; the pieces are
    parsed as floats and averaged. Returns 1 when
    ``masterGoal + guestGoal`` is above that average and 0 when it is equal or
    below. If neither comparison holds (e.g. NaN) the function returns None.
    """
    parts = removeSub(pankou).split("/")
    line_total = 0
    for part in parts:
        line_total += float(part)
    line = line_total / len(parts)

    margin = masterGoal + guestGoal - line
    if margin > 0:
        return 1
    if margin <= 0:
        return 0
    return None


def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    """Build the label "<get18 code>_<yapanPankouStart>" as a string."""
    odds_code = get18(yapanMasterStartOdd, yapanGuestStartOdd)
    return f"{odds_code!s}_{yapanPankouStart!s}"


def get18(master, guest):
    """Encode how ``master`` compares with ``guest``.

    Returns 18 when master is greater, 81 when it is smaller and 99 when the
    two are equal. If no comparison holds (e.g. NaN) the result is None.
    """
    if master > guest:
        code = 18
    elif master < guest:
        code = 81
    elif master == guest:
        code = 99
    else:
        code = None
    return code


def daxiao_num(x):
    """Return the mean of the "/"-separated numbers in the string ``x``."""
    pieces = x.split("/")
    total = 0
    for value in map(float, pieces):
        total += value
    return float(total) / len(pieces)


def realDaxiao(x, master, guest):
    """Return ``float(x)`` reduced by ``int(master)`` and then ``int(guest)``."""
    remaining = float(x)
    remaining -= int(master)
    remaining -= int(guest)
    return remaining


def shengjiang(start, end):
    """Return the change from ``start`` to ``end`` (``end - start``)."""
    change = end - start
    return change

def round2(x):
    """Round ``x`` to two decimal places with the built-in ``round``."""
    return round(x, ndigits=2)

In [4]:
def getShuiPing(x):
    """Map the number ``x`` to a bucket index.

    Buckets: below 0.75 -> 0, [0.75, 0.85] -> 1, (0.85, 0.90] -> 2,
    (0.90, 0.95] -> 3, (0.95, 1.00] -> 4, (1.00, 1.08] -> 5, above 1.08 -> 6.
    A value that satisfies none of the comparisons (e.g. NaN) yields 11.
    """
    if x < 0.75:
        return 0
    # Upper bounds are inclusive and checked in ascending order, so the first
    # match identifies the bucket.
    for level, upper in enumerate((0.85, 0.90, 0.95, 1.00, 1.08), start=1):
        if x <= upper:
            return level
    if x > 1.08:
        return 6
    return 11

def num_fea_dis(df,features):
    """Add a "<feature>_shuiPing" column for each name in ``features``.

    Each new column holds ``getShuiPing`` applied to the source column.
    ``df`` is modified in place and also returned.
    """
    for feature in features:
        bucket_col = '_'.join([feature, 'shuiPing'])
        df[bucket_col] = df[feature].map(getShuiPing)
    return df

In [5]:
def fillNa(x, value):
    """Return ``value`` when ``pd.isnull(x)`` is true, otherwise ``x``."""
    return value if pd.isnull(x) else x


def preF(test):
    """Derive the target and helper columns, then drop the raw goal columns.

    Steps:
      1. Drop rows missing either ``pankou_Start_Zao_Crown`` or
         ``pankouOdd_End_Zao_Crown``.
      2. ``result``: ``getResult`` on masterGoal, guestGoal and
         ``pankouOdd_End_Zao_Crown``.
      3. ``goalMid``: ``getGoalMid`` on the final and mid goal columns; rows
         with a negative value are discarded.
      4. ``zhongbifengNew``: "<midMasterGoal>_<midGuestGoal>" as text.
      5. Drop masterGoal, guestGoal, goalMid, midGuestGoal and midMasterGoal.

    Parameters
    ----------
    test : pandas.DataFrame
        Raw frame containing the columns named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``result`` and ``zhongbifengNew`` added.
    """
    # getResult parses the closing line as text, so a missing value there
    # would raise inside removeSub; filter on it as well as the opening line.
    # .copy() makes the later column assignments act on an independent frame.
    test = test.dropna(
        axis=0,
        subset=["pankou_Start_Zao_Crown", "pankouOdd_End_Zao_Crown"]).copy()

    test['result'] = test.apply(lambda x: getResult(
        x['masterGoal'], x['guestGoal'], x['pankouOdd_End_Zao_Crown']), axis=1)

    # Arguments follow getGoalMid's declared order (master mid, guest mid);
    # the value is the same either way because both are subtracted.
    test['goalMid'] = test.apply(lambda x: getGoalMid(
        x['masterGoal'], x['guestGoal'], x['midMasterGoal'], x['midGuestGoal']), axis=1)

    # A negative value means the mid goals exceed the final goals.
    test = test[test['goalMid'] >= 0].copy()

    test['zhongbifengNew'] = test['midMasterGoal'].astype(
        str) + "_" + test['midGuestGoal'].astype(str)

    test = test.drop(columns=['masterGoal', 'guestGoal', "goalMid"])
    test = test.drop(columns=['midGuestGoal', 'midMasterGoal'])

    return test

In [None]:
# Read every document of the "win007daxiaoZao" collection from the local
# MongoDB instance into a DataFrame.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["soccerData"]
mycol = mydb["win007daxiaoZao"]
data_mid = pd.DataFrame(list(mycol.find()))
# Remove the '_id', 'place' and 'time' columns before feature preparation.
data_mid = data_mid.drop(['_id','place','time'], axis=1)
# Downcast numeric columns, then derive the target and helper columns.
data_mid = reduce_mem_usage(data_mid)
data_mid = preF(data_mid)

In [None]:
# Remove the column limit so wide frames are displayed in full.
pd.set_option('display.max_columns', None)

# Summary of the prepared training data.
data_mid.info()

In [None]:
import gc

def encode_CB(col1,col2,df1,name=""):
    """Add a column combining two columns as "<col1 value>_<col2 value>".

    Parameters
    ----------
    col1, col2 : str
        Names of the source columns; both are cast to ``str`` before joining.
    df1 : pandas.DataFrame
        Frame to extend. It is modified in place.
    name : str, optional
        Name of the new column. When empty, "<col1>_<col2>" is used.

    Returns
    -------
    None
    """
    # Honour an explicit name; previously it was always overwritten by the
    # default "<col1>_<col2>".
    nm = name if name != "" else col1+'_'+col2
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)

def encode_CB3(col1,col2,col3,df1,name=""):
    """Add a column combining three columns, joined with underscores.

    The new column is called ``name``, or "<col1>_<col2>_<col3>" when ``name``
    is empty. ``df1`` is modified in place; nothing is returned.
    """
    target = name
    if target == "":
        target = '_'.join([col1, col2, col3])
    first, second, third = (df1[c].astype(str) for c in (col1, col2, col3))
    df1[target] = first + '_' + second + '_' + third

In [None]:
# data_mid = data_mid.sort_values('time')
# train = data_mid[:230000]
# test = data_mid[230000:]

# idMap = data_mid[["place", "result"]].set_index("place").to_dict()["result"]
# test_y = test['result']
# test = test.drop(['result'], axis=1)
# data_mid = pd.concat([train,test])

# fes = ['daxiaoMasterStartOdd','daxiaoGuestStartOdd',
#        'daxiaoMasterOdd','daxiaoGuestOdd',
#        'daxiaoMasterStartOddMid','daxiaoGuestStartOddMid',
#        'daxiaoMasterOddMid','daxiaoGuestOddMid']

# fes = []

# listOdd = ['typeLinStartReal', 'typeLinEndReal','typeLinStart'] + fes

# for day in [7,30,140]:
#     for odd in listOdd:
#         data_mid.sort_values([odd, 'time'], ascending=[True, True], inplace=True)
#         def f_mean(x): return x.rolling(window=day, min_periods=1).mean()
#         def f_std(x): return x.rolling(window=day, min_periods=1).std()
#         function_list = [f_mean, f_std]
#         function_name = ['mean', 'std']
#         for i in range(len(function_list)):
#             data_mid['stat_' + odd + "_" + function_name[i] + "_" + str(day)] = data_mid.sort_values(
#                 'time').groupby([odd])['result'].apply(function_list[i])
# data_mid['result'] = data_mid['place'].map(idMap)
# data_mid = data_mid.sort_values('time')

In [None]:
# Alias, not a copy: train and data_mid refer to the same DataFrame object,
# so in-place edits made through one name are visible through the other.
train = data_mid

In [None]:
# Read the "win007daxiaoZaotest" collection and prepare it with the same
# steps as the training data: drop id/place/time, downcast, then preF.
mycol = mydb["win007daxiaoZaotest"]
test = pd.DataFrame(list(mycol.find()))
test = test.drop(['_id','place','time'], axis=1)
test = reduce_mem_usage(test)
test = preF(test)

In [None]:
# Summary of the prepared test data.
test.info()

In [None]:
from sklearn.model_selection import  StratifiedKFold,KFold
import category_encoders as ce

def _oof_encode(encoder_factory, train, test, target, col, splits, n_splits):
    """Out-of-fold encode ``train[col]`` and fold-average the encoding of ``test[col]``."""
    oof = np.zeros(train.shape[0])
    test_avg = np.zeros(test.shape[0]).reshape(-1, 1)

    for train_index, valid_index in splits:
        X_train = train[col].iloc[train_index]
        X_valid = train[col].iloc[valid_index]
        y_train = target.iloc[train_index]

        encoder = encoder_factory()
        encoder.fit(X_train.values, y_train.values)

        # Each training row is encoded by a model that never saw its target.
        oof[valid_index] = encoder.transform(X_valid.values).values.ravel()

        # Every fold contributes 1/n_splits of the final test encoding.
        fold_test = (encoder.transform(test[col].values) / (n_splits * 1.0)).values
        test_avg += fold_test.reshape(-1, 1)

    return oof, test_avg


def mean_woe_target_encoder(train,test,target,col,n_splits=10):
    """Encode one categorical column with target (mean) and WOE encoders.

    Both encoders are fitted fold by fold with ``StratifiedKFold``. The
    training encoding is out-of-fold; the test encoding is the average of the
    per-fold encodings.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``col``.
    target : pandas.Series
        Target values aligned positionally with ``train``.
    col : str
        Name of the column to encode.
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    tuple
        ``(y_oof, y_oof_2, y_test_oof, y_test_oof2)``: target-encoded train
        values (1-D), WOE-encoded train values (1-D), target-encoded test
        values (column vector) and WOE-encoded test values (column vector).
    """
    folds = StratifiedKFold(n_splits)

    # split() returns a one-shot generator. Materialise it so the WOE pass
    # iterates over the same folds instead of an already exhausted iterator.
    splits = list(folds.split(train, target))

    y_oof, y_test_oof = _oof_encode(
        ce.target_encoder.TargetEncoder, train, test, target, col, splits, n_splits)
    y_oof_2, y_test_oof2 = _oof_encode(
        ce.woe.WOEEncoder, train, test, target, col, splits, n_splits)

    return y_oof,y_oof_2,y_test_oof,y_test_oof2

In [None]:
# Preview the first rows of the prepared test frame.
test.head()

In [None]:
import gc
# Names of the object-dtype columns currently present in train.
cat_features = train.select_dtypes(include='object').columns

In [None]:
# Separate the 'result' target from the feature columns for both frames.
train_x =  train.drop(columns=['result'])
train_y =  train['result']

test_y = test['result']
test_x =  test.drop(columns=['result'])

In [None]:
# Replace every object-dtype column with its encoded values.
# Of the four arrays returned by mean_woe_target_encoder only the first
# (train) and third (test) are used; the second and fourth are discarded.
for col in train.select_dtypes(include='object').columns:
  y_oof,y_oof_2,y_test_oof,y_test_oof2 = mean_woe_target_encoder(train,test,train_y,col,n_splits=10)
  # The same values are written to the feature frames and to train/test.
  train_x[col] = y_oof
  test_x[col] = y_test_oof
  train[col] = y_oof
  test[col] = y_test_oof

In [None]:
# Summary of the training features after encoding.
train_x.info()

In [None]:
# import gc
# cat_features = test_x.select_dtypes(include='object').columns
# def encode_LE(col,train,test):
#     df_comb = pd.concat([train[col],test[col]],axis=0)
#     df_comb,_ = df_comb.factorize(sort=True)
#     nm = col
#     if df_comb.max()>32000: 
#         train[nm] = df_comb[:len(train)].astype('int32')
#         test[nm] = df_comb[len(train):].astype('int32')
#     else:
#         train[nm] = df_comb[:len(train)].astype('int16')
#         test[nm] = df_comb[len(train):].astype('int16')
#     del df_comb; x=gc.collect()

# for col in cat_features:
#     encode_LE(col,train_x,test_x)
#     encode_LE(col,train,test)

In [None]:
# Display the column names stored in cat_features.
cat_features

In [None]:
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

lgb_train = lgb.Dataset(train_x, train_y)

# Parameter settings.
# NOTE(review): 'min_child_samples' is documented by LightGBM as an alias of
# 'min_data_in_leaf'; both are set here with different values - confirm which
# one is intended.
params = {'num_leaves': 240,  # strong effect on the result; larger fits better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction": 0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularisation
          'lambda_l2': 5.985747612243422e-07,  # L2 regularisation
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 uses all of them
          'metric': {'binary_logloss', 'auc'},  # evaluation metrics
          "random_state": 7,  # random seed, overwritten per run in the loop below
          'min_child_samples': 67,
          }

# Directory prefix for the saved boosters.
# NOTE(review): machine-specific absolute path - replace with a configurable
# location before running elsewhere.
MODEL_DIR = 'C:\\Users\\24525\\code\\spider\\test\\'

y_pred = np.zeros(test_x.shape[0])

# Train `folds` boosters on the full training set, each with a different
# random_state, and average their test predictions.
folds = 16
for i in range(folds):
    print('Starting training...',i)
    params['random_state'] = i + 2021

    # Model training.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=100)

    model = MODEL_DIR + 'test_' + str(i) + ".txt"
    gbm.save_model(model)

    # Model prediction; every run contributes 1/folds of the average.
    y_pred += gbm.predict(test_x, num_iteration=gbm.best_iteration) / folds

# roc_auc_score returns ROC AUC, so label it as AUC rather than accuracy.
accuracy = roc_auc_score(test_y, y_pred)
print("AUC: %.2f%%" % (accuracy*100.0))

In [None]:
from sklearn.metrics import roc_auc_score,accuracy_score
# Pair each test target with its averaged prediction.
test_new = pd.DataFrame({'result':test_y,'predict':y_pred})
# Keep only the rows predicted at 0.66 or above.
a = test_new[(test_new["predict"] >= 0.66)]
# Kept predictions in [0.66, 1] all round to 1, so the accuracy below is the
# share of kept rows whose result is 1.
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
# Number of rows that passed the threshold.
print(a.shape)

In [None]:
# test_new_2 = test_x
# test_new_2['result'] = test_y
# test_new_2['predict'] = y_pred
# a = test_new_2[(test_new_2["predict"] <= 0.90)]
# pred_labels = np.rint(a['predict'])
# accuracy = accuracy_score(a['result'], pred_labels)
# print("accuarcy: %.2f%%" % (accuracy*100.0))
# print(a.shape)

In [None]:
# Plot up to 60 feature importances of gbm, the booster most recently
# assigned to that name.
fig, ax = plt.subplots(figsize=(10,10))
lgb.plot_importance(gbm, max_num_features=60, height=0.5, ax=ax)
plt.show()

In [None]:
# Summary of the test features.
test_x.info()

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import StratifiedKFold
import gc
X = train_x
y = train_y

# NOTE(review): 'min_child_samples' is documented by LightGBM as an alias of
# 'min_data_in_leaf'; both are set here with different values - confirm which
# one is intended.
params = {'num_leaves': 240,  # strong effect on the result; larger fits better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction": 0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularisation
          'lambda_l2': 5.985747612243422e-07,  # L2 regularisation
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 uses all of them
          'metric': {'auc'},  # evaluation metric
          "random_state": 2019,  # random seed for reproducible runs
          # 'device': 'gpu'  # can speed up training with a GPU build of lightgbm
          'min_child_samples': 67
          }

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(test_x.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments
    # of older LightGBM releases; newer releases expect the equivalent
    # callbacks instead - confirm against the installed version.
    clf = lgb.train(params, dtrain, 1000, valid_sets=[
                    dtrain, dvalid], verbose_eval=200, early_stopping_rounds=300)

    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()

    y_pred_valid = clf.predict(X_valid)
    print(y_pred_valid)
    # Score the fold once and reuse the value for the log line and the mean.
    fold_auc = roc_auc_score(y_valid, y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {fold_auc}")

    score += fold_auc / NFOLDS
    # Every fold model contributes 1/NFOLDS of the test prediction.
    y_preds += clf.predict(test_x) / NFOLDS
    del X_train, X_valid, y_train, y_valid
    gc.collect()

# This AUC is measured on the separate test frame with the fold-averaged
# predictions, so it is labelled as the test AUC rather than out-of-fold.
test_auc = roc_auc_score(test_y, y_preds)
print(f"\nMean AUC = {score}")
print(f"Test AUC = {test_auc}")
# Gap between the mean validation AUC and the test AUC.
print("sub",score - test_auc)

In [None]:
from sklearn.metrics import roc_auc_score,accuracy_score
# Pair each test target with its fold-averaged prediction.
test_new = pd.DataFrame({'result':test_y,'predict':y_preds})
# Keep only the rows predicted at 0.66 or above.
a = test_new[(test_new["predict"] >= 0.66)]
# Kept predictions in [0.66, 1] all round to 1, so the accuracy below is the
# share of kept rows whose result is 1.
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
# Number of rows that passed the threshold.
print(a.shape)

In [None]:
# Preview the first rows of the test features. (The trailing run of dashes
# was a syntax error that stopped the notebook from running top to bottom.)
test_x.head()

In [None]:
# Hyper-parameters for the three base learners, one dict per library.

# LightGBM settings (scikit-learn style parameter names).
lgb_params = {
    'objective': 'binary',
    'n_estimators': 20000,
    'random_state': 42,
    'learning_rate': 8e-3,
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 20,
}


# XGBoost settings.
# NOTE(review): 'gpu_id' and 'use_label_encoder' depend on the installed
# XGBoost version - confirm they are still accepted.
xgb_params = {'n_estimators': 10000,
               'learning_rate': 0.03689407512484644,
               'max_depth': 8,
               'colsample_bytree': 0.3723914688159835,
               'subsample': 0.780714581166012,
               'eval_metric': 'auc',
               'use_label_encoder': False,
               'gamma': 0,
               'reg_lambda': 50.0,
               'gpu_id': 0,
               'objective': 'binary:logistic',
               'random_state': 42}


# CatBoost settings.
# NOTE(review): task_type "GPU" requires a GPU at run time, and no random
# seed is set here unlike the other two dicts - confirm both are intended.
cat_params = {'iterations': 17298,
               'learning_rate': 0.03429054860458741,
               'reg_lambda': 0.3242286463210283,
               'subsample': 0.9433911589913944,
               'random_strength': 22.4849972385133,
               'depth': 8,
               'min_data_in_leaf': 4,
               'leaf_estimation_iterations': 8,
               'task_type':"GPU",
               'bootstrap_type':'Poisson',
               'verbose' : 500,
               'early_stopping_rounds' : 200,
               'eval_metric' : 'AUC'}

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Instantiate the three base classifiers from their parameter dicts.
lgbm = LGBMClassifier(**lgb_params)

xgb = XGBClassifier(**xgb_params)

cat = CatBoostClassifier(**cat_params)

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier

# Logistic regression combines the base models' outputs.
lr = LogisticRegression()

# Stack the three boosted models; use_probas=True feeds their predicted
# probabilities (not class labels) to the meta classifier.
sclf = StackingCVClassifier(classifiers=[lgbm, xgb, cat],
                            use_probas=True,
                            meta_classifier=lr,
                            random_state=42)

statcker = sclf.fit(train_x,train_y)

# Probability of class 1 for every test row.
y_pred = statcker.predict_proba(test_x)[:, 1]

# roc_auc_score returns ROC AUC, so label it as AUC rather than accuracy.
accuracy = roc_auc_score(test_y, y_pred)
print("AUC: %.2f%%" % (accuracy*100.0))

from sklearn.metrics import roc_auc_score,accuracy_score
# Pair each test target with the stacked model's class-1 probability.
test_new = pd.DataFrame({'result':test_y,'predict':y_pred})
# Keep only the rows predicted at 0.66 or above.
a = test_new[(test_new["predict"] >= 0.66)]
# Kept predictions in [0.66, 1] all round to 1, so the accuracy below is the
# share of kept rows whose result is 1.
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
# Number of rows that passed the threshold.
print(a.shape)

In [None]:
# roc_auc_score returns ROC AUC, so label it as AUC rather than accuracy.
accuracy = roc_auc_score(test_y, y_pred)
print("AUC: %.2f%%" % (accuracy*100.0))

In [None]:
# import joblib

# joblib.dump(statcker, 'clf.pkl')

#读取Model
# statcker = joblib.load('clf.pkl')