In [None]:
import json
import pandas as pd
import pymongo
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Signed-integer columns are narrowed to the smallest of int8/int16/int32
    whose range contains the column's min and max (bounds inclusive).
    Floating-point columns wider than 32 bits are cast to ``float32``, which
    loses precision beyond roughly 7 significant digits.
    Every other dtype (object/string, bool, unsigned ints, datetime, category
    and other extension dtypes) is left untouched.

    Args:
        df: pandas DataFrame; its columns are reassigned in place.

    Returns:
        The same DataFrame object, after downcasting.

    Side effects:
        Prints the memory usage before and after, and the percentage saved.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (category, nullable integers, strings, ...) are not
        # plain numpy dtypes; min()/astype() on them can fail, so skip them.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.signedinteger):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                # Only ever narrow; never widen an already-small column.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(col_type, np.floating) and col_type.itemsize > 4:
            df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Connect to the MongoDB server on localhost and select the database and
# collection to read from.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["soccerData"]
mycol = mydb["rawData007yapanV2"]

# Materialise every document returned by find() into a list and build a
# DataFrame from it (the whole collection is held in memory).
data = pd.DataFrame(list(mycol.find()))

In [None]:
# Save a CSV snapshot of the frame loaded above. `to_csv` is a DataFrame
# method (there is no `pd.to_csv`) and returns None when given a path, so its
# result must not be assigned back to `data`.
data.to_csv("rawData007V2.csv")

# Remove the '_id' and 'place' columns before any further processing.
data = data.drop(['_id','place'], axis=1)

In [None]:
# data = pd.read_csv("rawData007.csv")

In [None]:
# Downcast numeric columns to smaller dtypes; reduce_mem_usage prints the
# before/after memory usage and returns the frame.
data = reduce_mem_usage(data)

In [None]:
# Show the first rows of the frame.
data.head()

In [None]:
# masters = ["明陞", "10B", "金宝博","易胜","利记"]

# features = []
# for master in masters:
#     data = data.drop(['masterOdd_Start_{}_yapan'.format(master), 'pankou_Start_{}_yapan'.format(master),
#                       'guestOdd_Start_{}_yapan'.format(master),'masterOdd_End_{}_yapan'.format(master),
#                       'pankouOdd_End_{}_yapan'.format(master),'guestOdd_End_{}_yapan'].format(master), axis=1)

In [None]:
# Keep only the rows whose 'pankouOdd_End_365_yapan' value is one of the
# quarter steps between -0.75 and 0.75. `.copy()` makes data_new an independent
# frame, so columns added to it later do not write into a filtered slice of
# `data` (which triggers SettingWithCopyWarning).
data_new = data[data['pankouOdd_End_365_yapan'].isin(
    [0, 0.25, 0.5, 0.75, -0.25, -0.5, -0.75])].copy()

# (data['pankouOdd_End_365_yapan']=="平手")
# data_new =  data_new[data_new['masterOdd_End_365_yapan'] >=0.80]

In [None]:
# Print the column dtypes and non-null counts of the filtered frame.
data_new.info()

In [None]:
# data_new = data_new.drop(columns=['Unnamed: 0'])

# data_new = data_new.dropna(subset = ["masterOdd_Start_365_yapan","masterOdd_Start_Crown_yapan"])

In [None]:
# Print the column dtypes and non-null counts again (the cell between the two
# info() calls currently contains only commented-out code).
data_new.info()

In [None]:
def getResult(masterGoal, guestGoal,pankou):
    """Return 1 when ``masterGoal - guestGoal - pankou`` is positive, else 0.

    If the margin compares false in both branches (e.g. it is NaN), the
    function returns None.
    """
    margin = masterGoal - guestGoal - pankou
    if margin > 0:
        return 1
    if margin <= 0:
        return 0
    return None

In [None]:
def get18(master,guest):
    """Encode how ``master`` compares with ``guest``.

    Returns 18 when master is larger, 81 when it is smaller and 99 when the
    two are equal. Returns None if no comparison holds (e.g. a NaN input).
    """
    if master == guest:
        return 99
    if master > guest:
        return 18
    if master < guest:
        return 81
    return None

In [None]:
def getShuiPingFine(x):
    """Bucket ``x`` into levels 0-9 (fine-grained variant).

    This was the first of two functions both named ``getShuiPing``; the second
    definition silently replaced it, so it is kept here under its own name.

    Returns:
        0 for x <= 0.75, 1-8 for the half-open ranges listed below, 9 for
        x > 1.20, and the sentinel 9999 when x matches no bucket (including
        NaN).

    NOTE(review): no bucket covers 0.90 < x <= 0.95, so such values return
    9999 — confirm whether that gap is intended.
    """
    if x <= 0.75:
        return 0
    if x > 1.20:
        return 9
    # (exclusive lower bound, inclusive upper bound, level)
    for low, high, level in (
        (0.75, 0.80, 1),
        (0.80, 0.85, 2),
        (0.85, 0.90, 3),
        (0.95, 1.00, 4),
        (1.00, 1.08, 5),
        (1.08, 1.10, 6),
        (1.10, 1.15, 7),
        (1.15, 1.20, 8),
    ):
        if low < x <= high:
            return level
    return 9999


def getShuiPing(x):
    """Bucket ``x`` into three coarse levels.

    Returns:
        0 for x <= 0.75, 1 for 0.75 < x <= 0.85, 2 for x > 0.95, and the
        sentinel 9999 otherwise (0.85 < x <= 0.95, or NaN).

    NOTE(review): values in (0.85, 0.95] fall through to 9999 — confirm that
    this is the intended "unclassified" band.
    """
    if x <= 0.75:
        return 0
    if 0.75 < x <= 0.85:
        return 1
    if x > 0.95:
        return 2
    return 9999

In [None]:
def getOriginFlow(x):
    """Encode the sign of ``x``: positive -> 3, zero -> 1, negative -> 0.

    Returns None when ``x`` compares false against 0 in every branch (NaN).
    """
    if x == 0:
        return 1
    if x > 0:
        return 3
    if x < 0:
        return 0
    return None

In [None]:
def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    """Build the label "<get18 code>_<yapanPankouStart>".

    The first part is get18() applied to the two odds; both parts are
    converted with str() before being joined by an underscore.
    """
    parts = (
        str(get18(yapanMasterStartOdd, yapanGuestStartOdd)),
        str(yapanPankouStart),
    )
    return "_".join(parts)

In [None]:
# Label every row with getResult(masterGoal, guestGoal, pankouOdd_End_365_yapan).
data_new['result'] = data_new.apply(
    lambda row: getResult(
        row['masterGoal'], row['guestGoal'], row['pankouOdd_End_365_yapan']),
    axis=1)

# Remove the goal columns (full-time and 'mid*') now that the label exists.
data_new = data_new.drop(
    columns=['masterGoal', 'guestGoal', 'midMasterGoal', 'midGuestGoal'])

In [None]:
# data_new["masterOdd_Start" + "_" + "max" + "_yapan" ] = data_new[masterStartCol].max(axis = 1, skipna = True)
# data_new["pankou_Start" + "_" + "max" + "_yapan" ] = data_new[pankouStartCol].max(axis = 1, skipna = True)
# data_new["guestOdd_Start" + "_" + "max" + "_yapan"] = data_new[guestStartCol].max(axis = 1, skipna = True)

# data_new["masterOdd_Start_EX" + "_" + "max" + "_yapan" ] = data_new[masterStartCol].max(axis = 1, skipna = True)
# data_new["pankou_Start_EX" + "_" + "max" + "_yapan" ] = data_new[pankouStartCol].max(axis = 1, skipna = True)
# data_new["guestOdd_Start_EX" + "_" + "max" + "_yapan"] = data_new[guestStartCol].max(axis = 1, skipna = True)

# data_new["masterOdd_End" + "_" + "max" + "_yapan"] = data_new[masterEndCol].max(axis = 1, skipna = True)
# data_new["pankouOdd_End" + "_" + "max" + "_yapan"] = data_new[pankouEndCol].max(axis = 1, skipna = True)
# data_new["guestOdd_End" + "_" + "max" + "_yapan"] = data_new[guestEndCol].max(axis = 1, skipna = True)

# data_new["masterOdd_Start_EX" + "_" + "min" + "_yapan"] = data_new[masterStartCol].min(axis = 1, skipna = True)
# data_new["pankou_Start_EX" + "_" + "min" + "_yapan"] = data_new[pankouStartCol].min(axis = 1, skipna = True)
# data_new["guestOdd_Start_EX" + "_" + "min" + "_yapan"] = data_new[guestStartCol].min(axis = 1, skipna = True)

# data_new["masterOdd_Start" + "_" + "min" + "_yapan"] = data_new[masterStartCol].min(axis = 1, skipna = True)
# data_new["pankou_Start" + "_" + "min" + "_yapan"] = data_new[pankouStartCol].min(axis = 1, skipna = True)
# data_new["guestOdd_Start" + "_" + "min" + "_yapan"] = data_new[guestStartCol].min(axis = 1, skipna = True)


# data_new["masterOdd_End" + "_" + "min" + "_yapan"] = data_new[masterEndCol].min(axis = 1, skipna = True)
# data_new["pankouOdd_End" + "_" + "min" + "_yapan"] = data_new[pankouEndCol].min(axis = 1, skipna = True)
# data_new["guestOdd_End" + "_" + "min" + "_yapan"] = data_new[guestEndCol].min(axis = 1, skipna = True)

In [None]:
# masters = ["max","min"]

# features = []
# for master in masters:
#     data_new['yapanOddFlow'+master] = data_new["masterOdd_Start" + "_" + master + "_yapan"] - data_new["masterOdd_End" + "_" + master + "_yapan"]
#     data_new['yapanGuestOddFlow'+master] = data_new["guestOdd_Start" + "_" + master + "_yapan"] - data_new["guestOdd_End" + "_" + master + "_yapan"]
    
#     data_new['yapanOddOriginFlow'+master] =  data_new['yapanOddFlow'+master].map(getOriginFlow)
#     data_new['yapanGuestOddOriginFlow'+master] = data_new['yapanGuestOddFlow'+master].map(getOriginFlow)
    
#     data_new["masterOdd_Start" + "_" + master + "_yapan" + "_SP"] = data_new["masterOdd_Start" + "_" + master + "_yapan"].map(getShuiPing)
#     data_new["guestOdd_Start" + "_" + master + "_yapan" + "_SP"] = data_new["guestOdd_Start" + "_" + master + "_yapan"].map(getShuiPing)
    
#     data_new["masterOdd_Start_EX" + "_" + master + "_yapan" + "_SP"] = data_new["masterOdd_Start" + "_" + master + "_yapan"].map(getShuiPing)
#     data_new["guestOdd_Start_EX" + "_" + master + "_yapan" + "_SP"] = data_new["guestOdd_Start_EX" + "_" + master + "_yapan"].map(getShuiPing)
    
#     data_new["masterOdd_End" + "_" + master + "_yapan" + "_SP"] = data_new["masterOdd_Start" + "_" + master + "_yapan"].map(getShuiPing)
#     data_new["guestOdd_End" + "_" + master + "_yapan" + "_SP"] = data_new["guestOdd_End" + "_" + master + "_yapan"].map(getShuiPing)
    
#     data_new['yapanTypeStart'+master] = data_new.apply(lambda x: getType(
#             x["masterOdd_Start" + "_" + master + "_yapan"], x["guestOdd_Start" + "_" + master + "_yapan"],
#         x["pankou_Start" + "_" + master + "_yapan"]), axis=1)
        
#     data_new['yapanTypeStartEX'+master] = data_new.apply(lambda x: getType(
#             x["masterOdd_Start_EX" + "_" + master + "_yapan"], x["guestOdd_Start_EX" + "_" + master + "_yapan"],
#         x["pankou_Start_EX" + "_" + master + "_yapan"]), axis=1)
    
#     data_new['yapanTypeEnd'+master] = data_new.apply(lambda x: getType(
#             x["masterOdd_End" + "_" + master + "_yapan"], x["guestOdd_End" + "_" + master + "_yapan"],
#         x["pankouOdd_End" + "_" + master + "_yapan"]), axis=1)
    
#     data_new['yapanTypeStartAll'+master] = data_new['yapanTypeStart'+master].astype(str) + "_" + data_new["masterOdd_Start" + "_" + master + "_yapan" + "_SP"].astype(str)
#     data_new['yapanTypeEndAll'+master] = data_new['yapanTypeEnd'+master].astype(str) + "_" + data_new["masterOdd_End" + "_" + master + "_yapan" + "_SP"].astype(str)
#     data_new['yapanTypeAll'+master] = data_new["pankouOdd_End_365_yapan"].astype(str)  + "_" + data_new['yapanOddOriginFlow'+master].astype(str) +"_"+ data_new['yapanTypeStartAll'+master] +"_"+ data_new['yapanTypeEndAll'+master]

In [None]:
# data_new['yapanTypeAllMaxMin'] = data_new['yapanTypeAllmax'] + data_new['yapanTypeAllmin']

In [None]:
# Lift the column-display limit (a global pandas option, so it also affects
# every later cell) and preview the first rows.
pd.set_option('display.max_columns', None)
data_new.head()

In [None]:
# masters = ["澳门","立博","平博","香港马会","Interwetten"]

# for master in masters:
#     data_new = data_new.drop(columns=["pankou_Start" + "_" + master + "_yapan"])
#     data_new = data_new.drop(columns=["pankou_Start_EX" + "_" + master + "_yapan"])
#     data_new = data_new.drop(columns=["pankouOdd_End" + "_" + master + "_yapan"])

#     data_new = data_new.drop(columns=["masterOdd_Start" + "_" + master + "_yapan"])
#     data_new = data_new.drop(columns=["masterOdd_Start_EX" + "_" + master + "_yapan"])
#     data_new = data_new.drop(columns=["masterOdd_End" + "_" + master + "_yapan"])
    
#     data_new = data_new.drop(columns=["guestOdd_Start" + "_" + master + "_yapan"])
#     data_new = data_new.drop(columns=["guestOdd_Start_EX" + "_" + master + "_yapan"])
#     data_new = data_new.drop(columns=["guestOdd_End" + "_" + master + "_yapan"])

In [None]:
# Positional split: the first 150000 rows form the training set and all
# remaining rows form the test set (empty if data_new has fewer rows).
train = data_new.iloc[:150000]

test = data_new.iloc[150000:]

In [None]:
# Display the filtered frame (pandas truncates the rendered rows).
data_new

In [None]:
from sklearn.model_selection import  StratifiedKFold,KFold
import category_encoders as ce

def _oof_encode(make_encoder, train, test, target, col, splits, n_splits):
    """Fit a fresh encoder per fold; return (out-of-fold train values, fold-averaged test values)."""
    import gc  # local import: the notebook only imports gc in a later cell

    oof = np.zeros(train.shape[0])
    test_avg = np.zeros(test.shape[0]).reshape(-1, 1)

    for train_index, valid_index in splits:
        X_train, X_valid = train[col].iloc[train_index], train[col].iloc[valid_index]
        y_train = target.iloc[train_index]

        clf = make_encoder()
        clf.fit(X_train.values, y_train.values)

        # Each validation row is encoded by a model that never saw its label.
        oof[valid_index] = clf.transform(X_valid.values).values.ravel()

        # Every fold contributes 1/n_splits of the final test encoding.
        test_avg += (clf.transform(test[col].values) / (n_splits * 1.0)).values.reshape(-1, 1)

        del X_train, X_valid, y_train
        gc.collect()

    return oof, test_avg


def mean_woe_target_encoder(train,test,target,col,n_splits=10):
    """Out-of-fold target (mean) encoding and WOE encoding of one column.

    Args:
        train: training DataFrame containing ``col``.
        test: test DataFrame containing ``col``.
        target: label Series aligned with ``train`` (used for stratification
            and for fitting the encoders).
        col: name of the column to encode.
        n_splits: number of stratified folds.

    Returns:
        Tuple ``(y_oof, y_oof_2, y_test_oof, y_test_oof2)``:
        out-of-fold target encoding of train (1-D), out-of-fold WOE encoding
        of train (1-D), fold-averaged target encoding of test (column vector),
        fold-averaged WOE encoding of test (column vector).
    """
    folds = StratifiedKFold(n_splits)

    # split() returns a one-shot generator. Materialise it so that both
    # encoders iterate over the same folds; previously the second (WOE) pass
    # saw an exhausted generator and its outputs stayed all-zero.
    splits = list(folds.split(train, target))

    y_oof, y_test_oof = _oof_encode(
        ce.target_encoder.TargetEncoder, train, test, target, col, splits, n_splits)
    y_oof_2, y_test_oof2 = _oof_encode(
        ce.woe.WOEEncoder, train, test, target, col, splits, n_splits)

    return y_oof,y_oof_2,y_test_oof,y_test_oof2

In [None]:
import gc

# Separate the feature columns from the 'result' label for both partitions.
train_x, train_y = train.drop(columns=['result']), train['result']

test_x, test_y = test.drop(columns=['result']), test['result']

In [None]:
# cat_features = ['yapanTypeAllmin','yapanTypeAllmax','yapanTypeAllMaxMin']
# for col in cat_features:
#   y_oof,y_oof_2,y_test_oof,y_test_oof2 = mean_woe_target_encoder(train_x,test_x,train_y,col,n_splits=10)
#   nm = '_count'
#   train_x[col+nm] = y_oof
#   test_x[col+nm] = y_test_oof
#   train[col+nm] = y_oof
#   test[col+nm] = y_test_oof

In [None]:
# Names of all object-dtype columns in train; these are label-encoded below.
cat_features = train.select_dtypes(include='object').columns

In [None]:
def encode_LE(col,train,test):
    """Label-encode ``col`` consistently across ``train`` and ``test``, in place.

    The two columns are concatenated and factorized together (sorted uniques),
    so the same value gets the same integer code in both frames. Codes are
    stored as int32 when the largest code exceeds 32000, otherwise as int16.
    Both frames are modified; nothing is returned.
    """
    combined = pd.concat([train[col], test[col]], axis=0)
    codes, _ = combined.factorize(sort=True)

    n_train = len(train)
    code_dtype = 'int32' if codes.max() > 32000 else 'int16'

    # The first n_train codes belong to train, the remainder to test.
    train[col] = codes[:n_train].astype(code_dtype)
    test[col] = codes[n_train:].astype(code_dtype)

    del combined, codes
    gc.collect()

# Replace every object-dtype column of train_x / test_x with integer codes
# (encode_LE modifies both frames in place).
for col in cat_features:
    encode_LE(col,train_x,test_x)

In [None]:
# Display data_new again (encode_LE was applied to train_x / test_x above).
data_new

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import StratifiedKFold
import gc
# Train LightGBM with stratified K-fold cross-validation on (train_x, train_y),
# then once more on the full training set, and score the blended predictions
# against test_y.
X = train_x
y = train_y
# del train_x,train_y
# gc.collect()


params = {'num_leaves': 240,  # strongly affects the final result; larger is generally better, but too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction": 0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularization
          'lambda_l2': 5.985747612243422e-07,  # L2 regularization
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 means all threads; more threads run faster
          'metric': {'auc'},  # evaluation metric
          "random_state": 2019,  # random seed; keeps results consistent between runs
          # 'device': 'gpu'  ## speeds up training if the GPU build of lightgbm is installed
          # NOTE(review): LightGBM documents min_child_samples as an alias of
          # min_data_in_leaf, which is also set above (30) — confirm which
          # value is intended.
          'min_child_samples': 67
          }

# params = {'lambda_l1': 1.311484608995952,
#           'lambda_l2': 0.008865158312952497,
#           'num_leaves': 132,
#           'feature_fraction': 0.991110639036875,
#           'bagging_fraction': 0.806081033664652,
#           'bagging_freq': 2,
#           'min_child_samples': 191,
#           'cat_smooth': 100,
#           'max_depth': 50,
#           'learning_rate': 0.014,
#           'subsample': 1.0,
#           'reg_alpha': 0.001686292659297514,
#           'reg_lambda': 0.7879699485443895,
#           'colsample_bytree': 0.5,
#           'metric': {'auc'}}

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)

columns = X.columns
# split() returns a one-shot generator; it is consumed by the loop below.
splits = folds.split(X, y)
# Accumulators: averaged test predictions and mean validation AUC.
y_preds = np.zeros(test_x.shape[0])
score = 0

# One row per feature; one 'fold_<k>' importance column is added per fold.
feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments
    # were removed from lgb.train in LightGBM 4.0 in favour of callbacks —
    # confirm the installed version.
    clf = lgb.train(params, dtrain, 1000, valid_sets=[
                    dtrain, dvalid], verbose_eval=200, early_stopping_rounds=300)

    # feature_importance() is called with its default importance type here.
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()

    y_pred_valid = clf.predict(X_valid)
    print(y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")

    # Each fold contributes 1/NFOLDS to the mean AUC and to the test predictions.
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(test_x) / NFOLDS
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
# Refit on the whole training set (no validation set, fixed 1000 rounds).
# `clf` now refers to this model, which is what later cells see.
dtrain = lgb.Dataset(train_x, label=train_y)
clf = lgb.train(params, dtrain, 1000, verbose_eval=200)
y_preds_new = clf.predict(test_x)
# Final prediction: equal-weight blend of the fold average and the full-data model.
y_preds = (y_preds+y_preds_new)/2

print(f"\nMean AUC = {score}")
# Despite the label, this AUC is computed on the held-out test set (test_y).
print(f"Out of folds AUC = {roc_auc_score(test_y, y_preds)}")

In [None]:
# Round the predicted probabilities to the nearest integer label with np.rint
# and report the accuracy against test_y.
pred_labels = np.rint(y_preds)
accuracy = accuracy_score(test_y, pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))

In [None]:
# Accuracy restricted to "confident" predictions: rows whose predicted
# probability is >= 0.65 or <= 0.35.
test_new = pd.DataFrame({'result':test_y,'predict':y_preds})
a = test_new[(test_new["predict"] >= 0.65) | (test_new["predict"] <= 0.35)]
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
# The shape shows how many test rows passed the confidence filter.
print(a.shape)

In [None]:
import seaborn as sns
# Bar chart of the 50 features with the highest importance, averaged over all
# cross-validation folds recorded in `feature_importances`.
sns.set(font='LiSu')
feature_importance_gain=pd.DataFrame()
feature_importance_gain['feature']=columns
# Gain-based importance of whichever model `clf` currently refers to (kept for
# inspection; the chart itself uses the 'average' column).
feature_importance_gain['fold_1']=clf.feature_importance(importance_type='gain')
# Average across every fold column (fold_1 .. fold_<n_splits>), matching the
# plot title; previously only fold_1 was included.
feature_importance_gain['average'] = feature_importances[
    [f'fold_{fold_n + 1}' for fold_n in range(folds.n_splits)]].mean(axis=1)
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importance_gain.sort_values(by='average', ascending=False).head(50), x='average', y='feature');
# The title now states the number of features actually plotted (50).
plt.title('50 TOP feature importance over {} folds average'.format(folds.n_splits));

In [None]:
# import sweetviz as sv
# # 可以选择目标特征
# my_report = sv.analyze(train, target_feat ='result')
# my_report.show_html()

In [None]:
# import lightgbm as lgb
# from sklearn.metrics import roc_auc_score,accuracy_score
# from sklearn.model_selection import StratifiedKFold
# import gc
# X = train_x
# y = train_y
# # del train_x,train_y
# # gc.collect()

# param = {'learning_rate': 0.04624866821131782,
#         'gamma': 0.4521452209872597, 
#         'reg_alpha': 2,
#         'reg_lambda': 7, 
#         'n_estimators': 410,
#         'colsample_bynode': 0.2158053139547304,
#         'colsample_bylevel': 0.747234611546242,
#         'subsample': 0.9462638943432846, 
#         'min_child_weight': 117,
#         'colsample_bytree': 0.29748741231156306,
#         'max_depth': 15}


# NFOLDS = 5
# folds = StratifiedKFold(n_splits=NFOLDS)

# columns = X.columns
# splits = folds.split(X, y)
# y_preds_xg = np.zeros(test_x.shape[0])


# for fold_n, (train_index, valid_index) in enumerate(splits):
#     X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
#     y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

#     model = XGBClassifier(objective='binary:logistic', eval_metric="auc",
#                               tree_method='gpu_hist', **param)

    
#     model.fit(X_train, y_train)
#     y_preds_xg += model.predict_proba(test_x)[:,1] / NFOLDS
#     del X_train, X_valid, y_train, y_valid
#     gc.collect()
    
# model = XGBClassifier(objective='binary:logistic', eval_metric="auc",
#                           tree_method='gpu_hist', **param)
# model.fit(train_x, train_y)

# # 对测试集进行预测
# y_pred_new = model.predict_proba(test_x)    
# y_preds_xg = (y_preds_xg + y_pred_new[:,1])/2
# print(f"Out of folds AUC = {roc_auc_score(test_y, y_preds)}")

In [None]:
# print(f"Out of folds AUC = {roc_auc_score(test_y, y_preds)}")