In [80]:
import json
import pandas as pd

import pymongo
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

In [81]:
import warnings
# Suppress every warning for the rest of the session. This also hides library
# deprecation notices and pandas chained-assignment warnings.
warnings.filterwarnings('ignore')

In [82]:
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    Integer columns (int16/int32/int64) are converted to the narrowest signed
    integer type whose range strictly contains the column's min and max.
    Float columns keep their current width when their values fit it; a float
    column is only widened to float64 when its values do not fit its own dtype
    range (e.g. it contains infinities). Columns of any other dtype are left
    untouched.

    :param df: DataFrame to shrink; it is modified in place.
    :param verbose: when True, print the final memory footprint and the
        percentage saved.
    :return: the same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # Try the integer types from narrowest to widest; stop at the first fit.
                for candidate in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                # Precision is a property of the column dtype, so read it once from
                # the dtype instead of inspecting every element in a Python loop.
                c_prec = np.finfo(col_type).precision
                for candidate in (np.float16, np.float32):
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max and c_prec == limits.precision:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [83]:
def getGoalMid(masterGoal, guestGoal, masterMidGoal, guestMidGoal):
    """Return the two final goal counts minus the two mid-game goal counts.

    Every argument is converted with ``int`` before the arithmetic.
    """
    final_total = int(masterGoal) + int(guestGoal)
    return final_total - int(masterMidGoal) - int(guestMidGoal)


def removeSub(pankou):
    """Strip the "升" and "降" markers and surrounding whitespace from ``pankou``."""
    for marker in ("升", "降"):
        pankou = pankou.replace(marker, "")
    return pankou.strip()


def getResultOne(goalMid, pankou):
    """Return 1 when ``goalMid`` exceeds ``float(pankou)``, otherwise 0.

    When the difference is NaN neither comparison holds and the function
    returns ``None``.
    """
    margin = goalMid - float(pankou)
    if margin > 0:
        return 1
    if margin <= 0:
        return 0
    
def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    """Build a "<odds code>_<pankou>" label, where the code comes from ``get18``."""
    odds_code = get18(yapanMasterStartOdd, yapanGuestStartOdd)
    return "_".join((str(odds_code), str(yapanPankouStart)))


def get18(master, guest):
    """Encode how ``master`` compares with ``guest``.

    Returns 18 when master is greater, 81 when it is smaller, 99 when both are
    equal, and ``None`` when no comparison holds (e.g. a NaN operand).
    """
    return (
        18 if master > guest
        else 81 if master < guest
        else 99 if master == guest
        else None
    )


def daxiao_num(x):
    """Return the mean of the "/"-separated numbers in the string ``x``."""
    parts = x.split("/")
    total = sum(float(part) for part in parts)
    return float(total) / len(parts)


def realDaxiao(x, master, guest):
    """Return ``x`` minus ``master`` minus ``guest``, each converted with ``float``."""
    after_master = float(x) - float(master)
    return after_master - float(guest)


def shengjiang(start, end):
    """Return the change from ``start`` to ``end`` (``end - start``)."""
    delta = end - start
    return delta

def round2(x):
    """Round ``x`` to two decimal places with the built-in ``round``."""
    decimals = 2
    return round(x, decimals)

In [84]:
def getShuiPing(x):
    """Map the value ``x`` to a discrete level.

    Levels: 0 for x < 0.75; 1 for [0.75, 0.85]; 2 for (0.85, 0.90];
    3 for (0.90, 0.95]; 4 for (0.95, 1.00]; 5 for (1.00, 1.08]; 6 above 1.08.
    A value that satisfies no comparison (NaN) yields 11.
    """
    if x < 0.75:
        return 0
    if x >= 0.75:
        # Upper bounds are inclusive; the first band that contains x wins.
        for upper, level in ((0.85, 1), (0.90, 2), (0.95, 3), (1.00, 4), (1.08, 5)):
            if x <= upper:
                return level
        return 6
    return 11

def num_fea_dis(df,features):
    """Add a ``<feature>_shuiPing`` column to ``df`` for each listed feature.

    Each new column holds ``getShuiPing`` applied to the source column. ``df``
    is modified in place and also returned.
    """
    for feature in features:
        df[feature + '_' + 'shuiPing'] = df[feature].map(getShuiPing)
    return df

In [85]:
def fillNa(x, value):
    """Return ``value`` when ``pd.isnull(x)`` is true, otherwise return ``x``."""
    return value if pd.isnull(x) else x

    
def preF(test):
    """Filter rows and derive the model features plus the binary ``result`` target.

    Steps, in order:
      1. keep rows whose four ``masterOdd_*`` columns lie in [0.50, 2.00);
      2. compute ``goalMid``, the ``zhongbifengNew`` score label and ``water_type_mid``;
      3. turn the four pankou columns into averaged numeric strings and add
         a ``<col>Real`` variant (line minus both mid-game goal counts);
      4. compute ``result`` from ``goalMid`` and ``pankouOdd_End_Zhong_3Real``;
      5. build the concatenated ``daxiao*`` string features and the numeric
         ``MasterOddFlow*`` differences;
      6. drop the temporary level columns, the goal columns and ``time``.

    :param test: DataFrame containing the odds, pankou, goal and ``time`` columns
        referenced below.
    :return: the transformed DataFrame.
    """
    # Keep only rows whose four master-odds columns are within [0.50, 2.00).
    test = test[(test['masterOdd_Start_Ji_3'] >= 0.50) & (test['masterOdd_Start_Ji_3'] < 2.00)]
    test = test[(test['masterOdd_End_Ji_3'] >= 0.50) & (test['masterOdd_End_Ji_3'] < 2.00)]
    test = test[(test['masterOdd_Start_Zhong_3'] >= 0.50) & (test['masterOdd_Start_Zhong_3'] < 2.00)]
    test = test[(test['masterOdd_End_Zhong_3'] >= 0.50) & (test['masterOdd_End_Zhong_3'] < 2.00)]

    # NOTE(review): the two mid-goal arguments are passed in guest/master order while
    # getGoalMid names them master/guest; both are subtracted, so the result is the same.
    test['goalMid'] = test.apply(lambda x: getGoalMid(
        x['masterGoal'], x['guestGoal'], x['midGuestGoal'], x['midMasterGoal']), axis=1)

    # Score label of the form "<midMasterGoal>_<midGuestGoal>".
    test['zhongbifengNew'] = test['midMasterGoal'].astype(
        str) + "_" + test['midGuestGoal'].astype(str)

    # Sum of the two End_Zhong odds, rounded to 2 decimals and stored as a string.
    test["water_type_mid"] =  test['masterOdd_End_Zhong_3'] + test['guestOdd_End_Zhong_3']
    test["water_type_mid"] =  test["water_type_mid"].map(round2)
    test["water_type_mid"] =  test["water_type_mid"].astype(str)

    pankou = ["pankouOdd_End_Zhong_3","pankou_Start_Zhong_3"] + ["pankou_Start_Ji_3","pankouOdd_End_Ji_3"]

    for col in pankou:
        # Average the "/"-separated line, then keep it as a string.
        test[col] = test[col].map(daxiao_num)
        test[col] = test[col].astype(str)
        # "<col>Real": the line minus both mid-game goal counts, also stored as a string.
        nm = col + "Real"
        test[nm] = test.apply(lambda x: realDaxiao(x[col],x['midGuestGoal'],x['midMasterGoal']), axis=1)
        test[nm] = test[nm].astype(str)

    # Target: 1 when goalMid exceeds the End_Zhong "Real" line, else 0.
    test['result'] = test.apply(lambda x: getResultOne(
        x['goalMid'], x['pankouOdd_End_Zhong_3Real']), axis=1)

    fes = ['masterOdd_Start_Ji_3', 'masterOdd_End_Ji_3',
           'masterOdd_Start_Zhong_3', 'masterOdd_End_Zhong_3'] + ['guestOdd_Start_Ji_3', 'guestOdd_End_Ji_3',
           'guestOdd_Start_Zhong_3', 'guestOdd_End_Zhong_3']

    # Adds the "<feature>_shuiPing" level columns in place; the return value is not needed.
    num_fea_dis(test, fes)

    # Master-side string features: odds level + pankou line + score label.
    test['daxiaoTypeStart'] = test['masterOdd_Start_Ji_3_shuiPing'].astype(str) + test['pankou_Start_Ji_3'] + test['zhongbifengNew']
    test['daxiaoType'] = test['masterOdd_End_Ji_3_shuiPing'].astype(str) + test['pankouOdd_End_Ji_3'] + test['zhongbifengNew']
    test['daxiaoTypeStartMid'] = test['masterOdd_Start_Zhong_3_shuiPing'].astype(str) + test['pankou_Start_Zhong_3Real'] + test['zhongbifengNew']
    test['daxiaoTypeMid'] = test['masterOdd_End_Zhong_3_shuiPing'].astype(str) + test['pankouOdd_End_Zhong_3Real'] + test['zhongbifengNew']

    test['daxiaoTypeALL'] = test['daxiaoTypeStart'] + test['daxiaoType']
    test['daxiaoTypeMidALL'] = test['daxiaoTypeStartMid'] + test['daxiaoTypeMid']

    # Guest-side counterparts of the string features above.
    test['daxiaoTypeStartGuest'] = test['guestOdd_Start_Ji_3_shuiPing'].astype(str) + test['pankou_Start_Ji_3'] + test['zhongbifengNew']
    test['daxiaoTypeGuest'] = test['guestOdd_End_Ji_3_shuiPing'].astype(str) + test['pankouOdd_End_Ji_3'] + test['zhongbifengNew']
    test['daxiaoTypeStartMidGuest'] = test['guestOdd_Start_Zhong_3_shuiPing'].astype(str) + test['pankou_Start_Zhong_3Real'] + test['zhongbifengNew']
    test['daxiaoTypeMidGuest'] = test['guestOdd_End_Zhong_3_shuiPing'].astype(str) + test['pankouOdd_End_Zhong_3Real'] + test['zhongbifengNew']

    test['daxiaoTypeALLGuest'] = test['daxiaoTypeStartGuest'] + test['daxiaoTypeGuest']
    test['daxiaoTypeMidALLGuest'] = test['daxiaoTypeStartMidGuest'] + test['daxiaoTypeMidGuest']


    # String concatenations of the pankou lines (the columns are strings at this point).
    test['daxiaoPankou'] = test['pankou_Start_Ji_3'] + test['pankouOdd_End_Ji_3']
    test['daxiaoPankouMid'] = test['pankou_Start_Zhong_3Real'] + test['pankouOdd_End_Zhong_3Real']
    test['daxiaoPankouALL'] = test['daxiaoPankou'] + test['daxiaoPankouMid'] + test['zhongbifengNew']


    # NOTE(review): the Ji flow is End - Start while the Zhong flow is Start - End;
    # confirm the opposite sign is intended.
    test['MasterOddFlowPankou'] = test["pankouOdd_End_Ji_3"].astype(float) - test["pankou_Start_Ji_3"].astype(float)
    test['MasterOddFlowPankouMid'] = test["pankou_Start_Zhong_3"].astype(float) - test["pankouOdd_End_Zhong_3"].astype(float)

    test['MasterOddFlow'] = test["masterOdd_Start_Ji_3"] - test["masterOdd_End_Ji_3"]
    test['MasterOddFlowMid'] = test["masterOdd_Start_Zhong_3"] - test["masterOdd_End_Zhong_3"]
    test['MasterOddFlowALL'] = test["MasterOddFlow"] + test["MasterOddFlowPankou"]
    test['MasterOddFlowALLMid'] = test["MasterOddFlowMid"] + test["MasterOddFlowPankouMid"]

    # Drop the intermediate level columns, the goal columns used to build the target, and 'time'.
    test = test.drop(columns=['masterOdd_Start_Ji_3_shuiPing', 'masterOdd_End_Ji_3_shuiPing', "masterOdd_Start_Zhong_3_shuiPing", "masterOdd_End_Zhong_3_shuiPing"])
    test = test.drop(columns=['guestOdd_Start_Ji_3_shuiPing', 'guestOdd_End_Ji_3_shuiPing', "guestOdd_Start_Zhong_3_shuiPing", "guestOdd_End_Zhong_3_shuiPing"])
    test = test.drop(columns=['masterGoal', 'guestGoal', "goalMid"])
    test = test.drop(columns=['midGuestGoal', 'midMasterGoal'])
    test = test.drop(columns=['time'])

    return test

In [86]:
# Load the training collection from the local MongoDB instance into a DataFrame.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["soccerData"]
mycol = mydb["midDaxiao007v3"]
data_mid = pd.DataFrame(list(mycol.find()))
# Drop the Mongo document id and 'place', then every row that has a missing value.
data_mid = data_mid.drop(['_id','place'], axis=1)
data_mid = data_mid.dropna()
data_mid = reduce_mem_usage(data_mid)
# Build the engineered features and the 'result' target.
data_mid = preF(data_mid)
data_mid.info()

# data_mid = pd.read_csv('train.txt')
# data_mid = data_mid.drop(['Unnamed: 0'], axis=1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 347032 entries, 0 to 348916
Data columns (total 41 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   lianShai                   347032 non-null  object 
 1   guestOdd_End_Ji_3          347032 non-null  float64
 2   guestOdd_End_Zhong_3       347032 non-null  float64
 3   guestOdd_Start_Ji_3        347032 non-null  float64
 4   guestOdd_Start_Zhong_3     347032 non-null  float64
 5   masterOdd_End_Ji_3         347032 non-null  float64
 6   masterOdd_End_Zhong_3      347032 non-null  float64
 7   masterOdd_Start_Ji_3       347032 non-null  float64
 8   masterOdd_Start_Zhong_3    347032 non-null  float64
 9   pankouOdd_End_Ji_3         347032 non-null  object 
 10  pankouOdd_End_Zhong_3      347032 non-null  object 
 11  pankou_Start_Ji_3          347032 non-null  object 
 12  pankou_Start_Zhong_3       347032 non-null  object 
 13  zhongbifengNew             34

In [87]:
# `train` is a second name for the same DataFrame object as `data_mid` (no copy is made).
train = data_mid

In [88]:
# Load the hold-out collection from the local MongoDB instance.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
# Bind the database from the client created here, so this cell actually uses that
# client and does not depend on `mydb` left over from the training cell.
mydb = myclient["soccerData"]
mycol = mydb["win007daxiaotest11"]
test = pd.DataFrame(list(mycol.find()))
# Same cleaning and feature pipeline as the training data.
test = test.drop(['_id','place'], axis=1)
test = test.dropna()
test = reduce_mem_usage(test)
test = preF(test)

# test.to_csv('test_202111.txt')
# test_da = pd.read_csv('test.txt')
# test_da = test_da.drop(['Unnamed: 0'], axis=1)
# mycol = mydb["win007yapantest"]
# test = pd.DataFrame(list(mycol.find()))
# test = test.dropna()
# test = test[['place']]

In [89]:
# from sklearn.model_selection import  StratifiedKFold,KFold
# import category_encoders as ce
# import gc
# def mean_woe_target_encoder(train,test,target,col,n_splits=10):
#     folds = StratifiedKFold(n_splits)

#     y_oof = np.zeros(train.shape[0])
#     y_oof_2= np.zeros(train.shape[0])
#     y_test_oof = np.zeros(test.shape[0]).reshape(-1,1)
#     y_test_oof2 = np.zeros(test.shape[0]).reshape(-1,1)

#     splits = folds.split(train, target)
    
#     for fold_n, (train_index, valid_index) in enumerate(splits):
#         X_train, X_valid = train[col].iloc[train_index], train[col].iloc[valid_index]
#         y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
#         clf=ce.target_encoder.TargetEncoder()
    
#         clf.fit(X_train.values,y_train.values)    
#         y_pred_valid = clf.transform(X_valid.values)

#         y_oof[valid_index] = y_pred_valid.values.reshape(1,-1)

#         tp=(clf.transform(test[col].values)/(n_splits*1.0)).values
#         tp=tp.reshape(-1,1)
#         y_test_oof+=tp    
    
#         del X_train, X_valid, y_train, y_valid
#         gc.collect()    
#     return y_oof,y_test_oof

In [90]:
# for col in cat_features:
#   y_oof,y_test_oof = mean_woe_target_encoder(train_x,test_x,train_y,col,n_splits=10)
#   train_x[col] = y_oof
#   test_x[col] = y_test_oof

In [91]:
# Separate the feature columns from the 'result' target for both data sets.
train_x =  train.drop(columns=['result'])
train_y =  train['result']

test_x =  test.drop(columns=['result'])
test_y = test['result']

In [92]:
# Downcast the numeric feature columns created by preF.
train_x = reduce_mem_usage(train_x)
test_x = reduce_mem_usage(test_x)

In [93]:
# Object-dtype columns of the test features; these are treated as categorical below.
cat_features = test_x.select_dtypes(include='object').columns

In [94]:
# Display the categorical column names.
cat_features

Index(['lianShai', 'pankouOdd_End_Ji_3', 'pankouOdd_End_Zhong_3',
       'pankou_Start_Ji_3', 'pankou_Start_Zhong_3', 'zhongbifengNew',
       'water_type_mid', 'pankouOdd_End_Zhong_3Real',
       'pankou_Start_Zhong_3Real', 'pankou_Start_Ji_3Real',
       'pankouOdd_End_Ji_3Real', 'daxiaoTypeStart', 'daxiaoType',
       'daxiaoTypeStartMid', 'daxiaoTypeMid', 'daxiaoTypeALL',
       'daxiaoTypeMidALL', 'daxiaoTypeStartGuest', 'daxiaoTypeGuest',
       'daxiaoTypeStartMidGuest', 'daxiaoTypeMidGuest', 'daxiaoTypeALLGuest',
       'daxiaoTypeMidALLGuest', 'daxiaoPankou', 'daxiaoPankouMid',
       'daxiaoPankouALL'],
      dtype='object')

In [None]:
import category_encoders as ce
import joblib
# Fit a target encoder for the categorical columns on the full training frame.
# NOTE(review): the encoder is fitted with every training label, including rows that
# later serve as validation folds in the cross-validation cells — possible target
# leakage in the fold scores; confirm this is acceptable.
target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(train[cat_features], train_y)

# Persist the fitted encoder and immediately load it back from disk.
joblib.dump(target_enc,"target_enc.txt")
target_enc = joblib.load("target_enc.txt")
# Replace the categorical columns with their encoded values in both feature frames.
train_x[cat_features] = target_enc.transform(train_x[cat_features])
test_x[cat_features] = target_enc.transform(test_x[cat_features])

In [None]:
# Check the column dtypes of the test features after encoding.
test_x.info()

In [None]:
# from sklearn.preprocessing import LabelEncoder

# import gc

# cat_features = train.select_dtypes(include='object').columns
# def encode_LE(col,train,test):
#         le = LabelEncoder()
#         # le.fit(list(train[col]))
#         # le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
#         # train[col] = train[col].apply(lambda x: le_dict.get(x, np.nan))
#         # test[col] = test[col].apply(lambda x: le_dict.get(x, np.nan))
#         le.fit(list(train[col])+list(test[col]))
#         train[col] = le.transform(train[col])
#         test[col]  = le.transform(test[col])
# for col in cat_features:
#     encode_LE(col,train_x,test_x)

In [None]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import StratifiedKFold
import gc
X = train_x
y = train_y
# del train_x,train_y
# gc.collect()


# NOTE(review): 'min_data_in_leaf' and 'min_child_samples' are documented LightGBM
# aliases of the same parameter but carry different values here (30 vs 67);
# confirm which one is intended.
params = {'num_leaves': 240,  # strongly affects the final result; larger is usually better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction": 0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularization
          'lambda_l2': 5.985747612243422e-07,  # L2 regularization
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 uses all of them
          'metric': {'average_precision','auc'},  # evaluation metrics
          "random_state": 2019,  # random seed, keeps results stable between runs
          # 'device': 'gpu' ## speeds up training when the GPU build of lightgbm is installed
          'min_child_samples': 67
          }

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
# Test predictions averaged over the folds, and out-of-fold predictions for the training rows.
y_preds = np.zeros(test_x.shape[0])
y_preds_train  = np.zeros(train_x.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword arguments of
    # older lightgbm releases; confirm the installed version still accepts them.
    clf = lgb.train(params, dtrain, 1000, valid_sets=[
                    dtrain, dvalid], verbose_eval=200, early_stopping_rounds=300)

    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    model = "gbm_" + str(fold_n) + ".txt"
    joblib.dump(clf,model)

    # Predict the validation fold once and reuse it for the printout, the mean
    # score and the out-of-fold vector.
    y_pred_valid = clf.predict(X_valid)
    fold_auc = roc_auc_score(y_valid, y_pred_valid)
    print(y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {fold_auc}")

    score += fold_auc / NFOLDS
    y_preds += clf.predict(test_x) / NFOLDS
    y_preds_train[valid_index] = y_pred_valid

    del X_train, X_valid, y_train, y_valid
    gc.collect()

print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(train_y, y_preds_train)}")

In [None]:
from sklearn.metrics import roc_auc_score,accuracy_score
# Keep only the test rows predicted at 0.70 or above and score them.
test_new = pd.DataFrame({'result':test_y,'predict':y_preds})
a = test_new[(test_new["predict"] >= 0.70)]
# Rounding turns each kept prediction into a hard label before scoring.
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)

In [None]:
test2 = test.copy(deep=True)
test2["result"] = test_y
test2['predict'] = y_preds

# The confidence filter does not depend on the line value, so apply it once.
confident = test2[(test2["predict"] >= 0.80)]

# Report the accuracy of the confident predictions for each line value,
# printing only the lines that score above 70%.
for i in ["1.0","1.25","1.5","1.75","2.0","2.25","2.5"]:
    a = confident[(confident["pankouOdd_End_Zhong_3Real"] == i)]
    if (a.shape[0] != 0):
        pred_labels = np.rint(a['predict'])
        accuracy = accuracy_score(a['result'], pred_labels)
        if (accuracy > 0.70):
            print("accuarcy: %.2f%%" % (accuracy*100.0),i)
            print(a.shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
 
class Performance:
    """
    Threshold-based performance metrics for a binary classifier.

    ``labels`` and ``scores`` are iterated in parallel by position and are
    expected to have the same length.
    """
    def __init__(self, labels, scores, threshold=0.5):
        """
        :param labels: array-like of true labels (1 = positive, 0 = negative)
        :param scores: array-like of classifier scores
        :param threshold: a score at or above this value counts as a positive prediction
        """
        self.labels = labels
        self.scores = scores
        self.threshold = threshold
        self.db = self.get_db()
        self.TP, self.FP, self.FN, self.TN = self.get_confusion_matrix()

    def accuracy(self):
        """
        :return: (TP + TN) / total, or 0 when there are no samples
        """
        total = self.TP + self.FN + self.FP + self.TN
        if total == 0:
            # Same convention as presision(): an empty denominator yields 0
            # instead of raising ZeroDivisionError.
            return 0
        return (self.TP + self.TN) / total

    def presision(self):
        """
        :return: TP / (TP + FP), or 0 when nothing was predicted positive
        """
        if (self.TP + self.FP) == 0:
            return 0
        return self.TP / (self.TP + self.FP)

    def get_db(self):
        """
        :return: list of [label, score] pairs sorted by score, highest first
        """
        pairs = [[label, score] for label, score in zip(self.labels, self.scores)]
        return sorted(pairs, key=lambda pair: pair[1], reverse=True)

    def get_confusion_matrix(self):
        """
        Count the confusion-matrix cells at ``self.threshold``.

        Any sample matching none of the first three cases (for example a label
        other than 0/1, or a NaN score) is counted as a true negative.

        :return: [tp, fp, fn, tn] as floats
        """
        tp, fp, fn, tn = 0., 0., 0., 0.
        for label, score in zip(self.labels, self.scores):
            if label == 1 and score >= self.threshold:
                tp += 1
            elif label == 0 and score >= self.threshold:
                fp += 1
            elif label == 1 and score < self.threshold:
                fn += 1
            else:
                tn += 1
        return [tp, fp, fn, tn]

In [None]:
from sklearn.metrics import precision_score
# Scan thresholds 0.50..0.99 for each line value and print the settings whose
# precision exceeds 55% with more than 10 positive predictions.
# The line values are str(float) strings, so the third key must be "1.5" ("1.50" never matches).
for i in ["1.0", "1.25", "1.5"]:
    # The row subset does not depend on the threshold: filter once per line value.
    a = test2[(test2["pankouOdd_End_Zhong_3Real"] == i)]
    if a.shape[0] == 0:
        continue
    labels = np.array(a['result'].to_list())
    scores = np.array(a['predict'].to_list())
    for per in range(50, 100):
        p = Performance(labels, scores, per/100)
        pre = p.presision()
        # Number of positive predictions at this threshold: TP + FP.
        lenth = p.TP + p.FP
        if (pre > 0.55 and lenth > 10):
            print(i, "pre: %.2f%%" % (pre*100.0), per, a.shape[0], lenth)

In [None]:
# Display predictions next to the true results for the subset currently bound to `a`.
a[["predict","result"]]

In [None]:
# Display every column of the subset currently bound to `a`.
a

In [None]:
import seaborn as sns
feature_importance_gain=pd.DataFrame()
feature_importance_gain['feature']=columns
# NOTE(review): `clf` is whichever model was trained last, so this gain column is not
# necessarily fold 1 despite its name — confirm.
feature_importance_gain['fold_1']=clf.feature_importance(importance_type='gain')
# Average the per-fold importances over every fold, matching the plot title.
# NOTE(review): `feature_importances` holds the default importance type, not gain — confirm.
fold_columns = [f'fold_{fold_n + 1}' for fold_n in range(folds.n_splits)]
feature_importance_gain['average'] = feature_importances[fold_columns].mean(axis=1)
top_n = 50
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importance_gain.sort_values(by='average', ascending=False).head(top_n), x='average', y='feature')
plt.title('{} TOP feature importance over {} folds average'.format(top_n, folds.n_splits))

In [None]:
# Re-check the test feature dtypes before the experiments below.
test_x.info()

In [None]:
# NOTE(review): `stop` is not defined anywhere in the visible notebook, so this cell
# raises NameError — presumably a deliberate way to halt "Run All" here; confirm.
stop

In [None]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score,accuracy_score,accuracy_score

# Hyper-parameters for the three base learners of the stacking experiment.
# NOTE(review): 'min_data_in_leaf' and 'min_child_samples' are documented LightGBM
# aliases of the same parameter but carry different values here (30 vs 67); confirm.
lgb_params = {'num_leaves': 240,  # strongly affects the final result; larger is usually better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction": 0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularization
          'lambda_l2': 5.985747612243422e-07,  # L2 regularization
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 uses all of them
          "random_state": 2019,  # random seed, keeps results stable between runs
          # 'device': 'gpu' ## speeds up training when the GPU build of lightgbm is installed
          'min_child_samples': 67
          }


xgb_params = {'n_estimators': 10000,
               'learning_rate': 0.03689407512484644,
               'max_depth': 8,
               'colsample_bytree': 0.3723914688159835,
               'subsample': 0.780714581166012,
               'eval_metric': 'auc',
               'use_label_encoder': False,
               'gamma': 0,
               'reg_lambda': 50.0,
               'gpu_id': 0,
               'objective': 'binary:logistic',
               'random_state': 42}


# CatBoost is configured for GPU training (task_type="GPU").
cat_params = {'iterations': 17298,
               'learning_rate': 0.03429054860458741,
               'reg_lambda': 0.3242286463210283,
               'subsample': 0.9433911589913944,
               'random_strength': 22.4849972385133,
               'depth': 8,
               'min_data_in_leaf': 4,
               'leaf_estimation_iterations': 8,
               'task_type':"GPU",
               'bootstrap_type':'Poisson',
               'verbose' : 1,
               'early_stopping_rounds' : 200,
               'eval_metric' : 'AUC'}


# Instantiate the (unfitted) base models.
lgbm = LGBMClassifier(**lgb_params)

xgb = XGBClassifier(**xgb_params)

cat = CatBoostClassifier(**cat_params)

In [None]:
import six
import sys
sys.modules['sklearn.externals.six'] = six

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.pipeline import make_pipeline

# Logistic regression is the meta-learner on top of the three boosted models.
lr = LogisticRegression()

# use_probas=True feeds the base models' class probabilities to the meta-learner.
sclf = StackingCVClassifier(classifiers=[lgbm,xgb,cat],
                            use_probas=True,
                            meta_classifier=lr)

In [None]:
# Fit the stacked ensemble on plain arrays.
clf = sclf.fit(train_x.values, train_y.values)

# Predict on an array as well, so the input type matches what the models were fitted on.
y_pred = clf.predict_proba(test_x.values)[:, 1]

# `accuracy` holds the ROC AUC on the hold-out set; the printed label says so.
accuracy = roc_auc_score(test_y, y_pred)
print("AUC: %.2f%%" % (accuracy*100.0))

In [None]:
# clf = cat.fit(train_x, train_y)

# y_pred = clf.predict_proba(test_x)[:, 1]

# accuracy = roc_auc_score(test_y, y_pred)
# print("accuarcy: %.2f%%" % (accuracy*100.0))

In [None]:
# Keep only the test rows the stacked model predicts at 0.82 or above and score them.
test_new = pd.DataFrame({'result':test_y,'predict':y_pred})
a = test_new[(test_new["predict"] >= 0.82)]
# Rounding turns each kept prediction into a hard label before scoring.
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)

In [None]:
# Work on a deep copy of the engineered test frame, with the target and the
# stacked-model predictions attached.
test2 = test.copy(deep=True)
test2["result"] = test_y
test2['predict'] = y_pred

In [None]:
# Distinct 'zhongbifengNew' score labels present in the test set.
zhongList = test['zhongbifengNew'].unique()

In [None]:
# Count, per line value, the rows predicted at 0.82 or above.
test2[(test2["predict"] >= 0.82)]['pankouOdd_End_Zhong_3Real'].value_counts()

In [None]:
# For each (line value, score label) pair, score the rows predicted at 0.50 or
# above and print the pairs whose accuracy exceeds 70%.
for i in ["1.0"]:
    for j in zhongList:
        a = test2[
            (test2["predict"] >= 0.50)
            & (test2["pankouOdd_End_Zhong_3Real"] == i)
            & (test2["zhongbifengNew"] == j)
        ]
        if a.shape[0] == 0:
            continue
        pred_labels = np.rint(a['predict'])
        accuracy = accuracy_score(a['result'], pred_labels)
        if accuracy > 0.70:
            print("accuarcy: %.2f%%" % (accuracy*100.0),i,j)
            print(a.shape)

In [None]:
# NOTE(review): `statcker` is not defined anywhere in the visible notebook; on a fresh
# kernel this cell raises NameError. Confirm which fitted model it should refer to.
fea_ = statcker.feature_importances_
fea_name = statcker.feature_names_
featrue_im = pd.DataFrame({'per':fea_,'name':fea_name})
# Sort ascending so the largest importance ends up at the top of the horizontal bar chart.
featrue_im = featrue_im.sort_values(by='per', ascending=True)
plt.figure(figsize=(10, 10))
plt.barh(featrue_im["name"],featrue_im["per"],height =0.5)

In [None]:
import gc
# 5-fold stratified cross-validation of the CatBoost model; test predictions are
# averaged over the folds into `y_preds`.
X = train_x
y = train_y
NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)
columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(test_x.shape[0])
score = 0


for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # The same `cat` object is fitted again on every fold, with the fold's validation split as eval_set.
    clf = cat.fit(X_train,y_train,eval_set=(X_valid,y_valid))

    y_pred_valid = clf.predict_proba(X_valid)[:, 1]
    print(y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")

    # Accumulate the mean fold AUC and the fold-averaged test probabilities.
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict_proba(test_x)[:, 1]/ NFOLDS

    del X_train, X_valid, y_train, y_valid
    gc.collect()

print(f"\nMean AUC = {score}")

In [None]:
# `accuracy` holds the ROC AUC of the fold-averaged predictions; the printed label says so.
accuracy = roc_auc_score(test_y, y_preds)
print("AUC: %.2f%%" % (accuracy*100.0))

In [None]:
from sklearn.metrics import roc_auc_score,accuracy_score
# Score the fold-averaged CatBoost predictions (`y_preds`), as the neighbouring cells do,
# rather than the stacked-model predictions (`y_pred`) from the earlier section.
test_new = pd.DataFrame({'result':test_y,'predict':y_preds})
# Keep only the rows predicted at 0.80 or above.
a = test_new[(test_new["predict"] >= 0.80)]
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)

In [None]:
# Deep copy of the engineered test frame with the target and the fold-averaged
# CatBoost predictions attached.
test2 = test.copy(deep=True)
test2["result"] = test_y
test2['predict'] = y_preds

In [None]:
# Count, per line value, the rows predicted at 0.70 or above.
test2[(test2["predict"] >= 0.70)]['pankouOdd_End_Zhong_3Real'].value_counts()

In [None]:
# For the "1.25" line, score the rows predicted at 0.75 or above per score label
# and print the labels whose accuracy exceeds 85%.
for i in ["1.25"]:
    for j in zhongList:
        a = test2[
            (test2["predict"] >= 0.75)
            & (test2["pankouOdd_End_Zhong_3Real"] == i)
            & (test2["zhongbifengNew"] == j)
        ]
        if a.shape[0] == 0:
            continue
        pred_labels = np.rint(a['predict'])
        accuracy = accuracy_score(a['result'], pred_labels)
        if accuracy > 0.85:
            print("accuarcy: %.2f%%" % (accuracy*100.0),i,j)
            print(a.shape)

In [None]:
# For the "1.5" line, score the rows predicted at 0.80 or above per score label
# and print the labels whose accuracy exceeds 85%.
for i in ["1.5"]:
    for j in zhongList:
        a = test2[
            (test2["predict"] >= 0.80)
            & (test2["pankouOdd_End_Zhong_3Real"] == i)
            & (test2["zhongbifengNew"] == j)
        ]
        if a.shape[0] == 0:
            continue
        pred_labels = np.rint(a['predict'])
        accuracy = accuracy_score(a['result'], pred_labels)
        if accuracy > 0.85:
            print("accuarcy: %.2f%%" % (accuracy*100.0),i,j)
            print(a.shape)

In [None]:
import joblib
# NOTE(review): `clf` is whatever model was bound last; in cell order that is the CatBoost
# model from the fold loop, not a stacker as the file name suggests — confirm.
joblib.dump(clf, 'statcker_daxiao.model')

In [None]:
# Display the test feature frame.
test_x

In [None]:
# %pip installs into the environment of the running kernel, which `!pip` does not guarantee.
%pip install hypergbm

In [None]:
# Put features and target back into one frame; make_experiment below takes the
# target by column name.
train_data = pd.concat([train_x,train_y],axis=1)

In [None]:
from sklearn.metrics import classification_report
from hypergbm import make_experiment
from hypernets.core.trial import TrialHistory
from hypernets.searchers import PlaybackSearcher
from hypergbm.search_space import GeneralSearchSpaceGenerator
from hypernets.searchers import EvolutionSearcher
from hypernets.experiment.cfg import ExperimentCfg as cfg
# Override the experiment configuration's discriminator setting before building the experiment.
cfg.experiment_discriminator=None

# Search for a model of the 'result' column, rewarding precision, and run the experiment.
experiment = make_experiment(train_data,target='result',reward_metric='precision')
estimator = experiment.run()

In [None]:
# Hard class predictions on the hold-out set, summarised per class.
y_pred=estimator.predict(test_x)
print(classification_report(test_y, y_pred, digits=5))

In [None]:
# Rebind `y_pred` to the second column of predict_proba; the cells below threshold on it.
y_pred=estimator.predict_proba(test_x)[:,1]

In [None]:
from sklearn.metrics import roc_auc_score,accuracy_score
# Keep only the test rows predicted at 0.53 or above and score them.
test_new = pd.DataFrame({'result':test_y,'predict':y_pred})
a = test_new[(test_new["predict"] >= 0.53)]
# Rounding turns each kept prediction into a hard label before scoring.
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)

In [None]:
# Deep copy of the engineered test frame with the target and the current `y_pred` attached.
test2 = test.copy(deep=True)
test2["result"] = test_y
test2['predict'] = y_pred

In [None]:
# Distinct 'zhongbifengNew' score labels present in `test2`.
zhongList = test2['zhongbifengNew'].unique()

In [None]:
# For each (line value, score label) pair, score the rows predicted above 0.50
# and print the pairs whose accuracy exceeds 70%.
for i in ["1.0","1.25","1.5"]:
    for j in zhongList:
        a = test2[
            (test2["predict"] > 0.50)
            & (test2["pankouOdd_End_Zhong_3Real"] == i)
            & (test2["zhongbifengNew"] == j)
        ]
        if a.shape[0] == 0:
            continue
        pred_labels = np.rint(a['predict'])
        accuracy = accuracy_score(a['result'], pred_labels)
        if accuracy > 0.70:
            print("accuarcy: %.2f%%" % (accuracy*100.0),i,j)
            print(a.shape)

In [None]:
# %pip installs into the environment of the running kernel, which `!pip` does not guarantee.
%pip install xlearn

In [None]:
import xlearn as xl

# Training task
# Training task
# NOTE(review): the train/validate files are absolute Google Drive mount paths while the
# test file below is a relative path; confirm these files exist in the target environment.
ffm_model = xl.create_ffm()                # Use field-aware factorization machine (ffm)
ffm_model.setTrain("/content/drive/MyDrive/train.txt")    # Set the path of training dataset
ffm_model.setValidate("/content/drive/MyDrive/test.txt")  # Set the path of validation dataset

# Parameters:
#  0. task: binary classification
#  1. learning rate: 0.2
#  2. regular lambda: 0.002
#  3. evaluation metric: accuracy
param = {'task':'binary', 'lr':0.2, 'lambda':0.002, 'metric':'acc'}

# Start to train
# The trained model will be stored in model.out
ffm_model.fit(param, './model.out')

# Prediction task
ffm_model.setTest("./small_test.txt")  # Set the path of test dataset
ffm_model.setSigmoid()                 # Convert output to 0-1

# Start to predict
# The output result will be stored in output.txt
ffm_model.predict("./model.out", "./output.txt")