In [1]:
import json
import pandas as pd
import pymongo
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
def getGoalMid(masterGoal, guestGoal, masterMidGoal,guestMidGoal):
    if masterGoal + guestGoal - masterMidGoal - guestMidGoal >= 4:
        return 4
    else:
        return masterGoal + guestGoal - masterMidGoal - guestMidGoal
        
def removeSub(pankou):
    pankou = pankou.replace("升","")
    pankou = pankou.replace("降","")
    return pankou.strip()

def getResult(masterGoal, guestGoal, masterMidGoal,guestMidGoal,pankou):
    pankouList = removeSub(pankou).split("/")
    row = 0
    for p in pankouList:
        row += float(p)
    row = row / len(pankouList)
    
    if masterGoal + guestGoal - row >= 0:
        return 1
    if masterGoal + guestGoal - row < 0:
        return 0

def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    linTypeStart = get18(yapanMasterStartOdd, yapanGuestStartOdd)
    return str(linTypeStart) + "_" + yapanPankouStart

def get18(master,guest):
    if master > guest:
        return 18
    if master < guest:
        return 81
    if master == guest:
        return 99

def daxiao_num(x):
    x_list = x.split("/")
    num = 0
    for i in x_list:
        num += float(i)
    return str(float(num)/len(x_list))

def realDaxiao(x,master,guest):
    return str(float(x) - master - guest)

def shengjiang(start,end):
    return str(float(end) - float(start))

In [5]:
def preF(test):
    test = test.drop(['daxiaoMasterStartOddZao', 'daxiaoGuestStartOddZao','daxiaoPankouStartZao',
                      'daxiaoMasterOddZao', 'daxiaoGuestOddZao','daxiaoPankouZao'], axis=1)
    test['daxiaoPankouStart'] = test.apply(
        lambda x: removeSub(x['daxiaoPankouStart']), axis=1)
    test['daxiaoPankou'] = test.apply(
        lambda x: removeSub(x['daxiaoPankou']), axis=1)
    test['daxiaoPankouStartMid'] = test.apply(
        lambda x: removeSub(x['daxiaoPankouStartMid']), axis=1)
    test['daxiaoPankouMid'] = test.apply(
        lambda x: removeSub(x['daxiaoPankouMid']), axis=1)
    
    test['result'] = test.apply(lambda x: getResult(
        x['masterGoal'], x['guestGoal'], x['masterMidGoal'], x['guestMidGoal'], x['daxiaoPankouMid']), axis=1)
    
    test['goalMid'] = test.apply(lambda x: getGoalMid(
        x['masterGoal'], x['guestGoal'], x['masterMidGoal'], x['guestMidGoal']), axis=1)
    
    pankou = ["daxiaoPankouStart","daxiaoPankou","daxiaoPankouStartMid","daxiaoPankouMid"]
    for col in pankou:
        test[col] = test[col].map(daxiao_num)
        nm = col+"_"+"real"
        test[nm] = test.apply(lambda x: realDaxiao(x[col],x['masterMidGoal'],x['guestMidGoal']), axis=1)
    
    test['zhongbifeng'] = test['masterMidGoal'] + test['guestMidGoal']
    
    test['daxiaoPankouShengJiang'] = test.apply(lambda x: shengjiang(x['daxiaoPankouStart'],x['daxiaoPankou']), axis=1)
    
    test['daxiaoPankouMidShengJiang'] = test.apply(lambda x: shengjiang(x['daxiaoPankouStartMid'],x['daxiaoPankouMid']), axis=1)

    test['daxiaoTypeStart'] = test.apply(lambda x: getType(
        x['daxiaoMasterStartOdd'], x['daxiaoGuestStartOdd'], x['daxiaoPankouStart']), axis=1)
    test['daxiaoType'] = test.apply(lambda x: getType(
        x['daxiaoMasterOdd'], x['daxiaoGuestOdd'], x['daxiaoPankou']), axis=1)
    test['daxiaoTypeStartMid'] = test.apply(lambda x: getType(
        x['daxiaoMasterStartOddMid'], x['daxiaoGuestStartOddMid'], x['daxiaoPankouStartMid']), axis=1)
    test['daxiaoTypeMid'] = test.apply(lambda x: getType(
        x['daxiaoMasterOddMid'], x['daxiaoGuestOddMid'], x['daxiaoPankouMid']), axis=1)

    test['daxiaoTypeStartReal'] = test.apply(lambda x: getType(
        x['daxiaoMasterStartOdd'], x['daxiaoGuestStartOdd'], x['daxiaoPankouStart_real']), axis=1)
    test['daxiaoTypeReal'] = test.apply(lambda x: getType(
        x['daxiaoMasterOdd'], x['daxiaoGuestOdd'], x['daxiaoPankou_real']), axis=1)
    test['daxiaoTypeStartMidReal'] = test.apply(lambda x: getType(
        x['daxiaoMasterStartOddMid'], x['daxiaoGuestStartOddMid'], x['daxiaoPankouStartMid_real']), axis=1)
    test['daxiaoTypeMidReal'] = test.apply(lambda x: getType(
        x['daxiaoMasterOddMid'], x['daxiaoGuestOddMid'], x['daxiaoPankouMid_real']), axis=1)
    
    test = test[test['goalMid'] >= 0]
    test = test[(test['daxiaoMasterStartOdd'] >= 0.75) & (test['daxiaoMasterStartOdd'] < 1.20)]
    test = test[(test['daxiaoMasterOdd'] >= 0.75) & (test['daxiaoMasterOdd'] < 1.20)]
    test = test[(test['daxiaoGuestOddMid'] >= 0.75) & (test['daxiaoGuestOddMid'] < 1.20)]
    test = test[test['zhongbifeng'].astype(float) <= test['daxiaoPankou'].astype(float)]
    test = test.drop(columns=['masterGoal', 'guestGoal', "goalMid"])
    return test

In [5]:
def getShuiPing(x):
    result = 9
    if x <= 0.75:
        result = 0
    if 0.75< x and x<=0.85:
        result = 1
    if 0.85<x and x<=0.90:
        result = 2
    if 0.90<x and x<=0.95:
        result = 3
    if 0.95<x and x<=1.00:
        result = 4
    if 1.00<x and x<=1.08:
        result = 5
    if 1.08<x and x<=1.20:
        result = 6
    if x > 1.20:
        result = 7
    return result
        
def num_fea_dis(df,features):
    for f in features:
        nm = f+'_'+'shuiPing'
        df[nm] = df[f].map(getShuiPing)
    return df

In [6]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["soccerData"]
mycol = mydb["midRawData007daxiao"]

test = pd.DataFrame(list(mycol.find({"time":{"$gte":"2021-00-00 00:00:00"}})))
test = test.drop(['_id', 'time','place'], axis=1)

In [None]:
test = reduce_mem_usage(test)

In [None]:
fes = ['daxiaoMasterStartOdd','daxiaoGuestStartOdd',
       'daxiaoMasterOdd','daxiaoGuestOdd',
       'daxiaoMasterStartOddMid','daxiaoGuestStartOddMid',
       'daxiaoMasterOddMid','daxiaoGuestOddMid']
test = preF(test)

In [None]:
test= num_fea_dis(test,fes)

In [None]:
# myclient = pymongo.MongoClient("mongodb://localhost:27017/")
# mydb = myclient["soccerData"]
# mycol = mydb["midRawData007"]

# train = pd.DataFrame(list(mycol.find({"time":{"$lt":"2021-00-00 00:00:00",}})))
# train = train.drop(['_id', 'time','place'], axis=1)

In [None]:
train = pd.read_csv('train.csv')

In [None]:
train = reduce_mem_usage(train)
train = preF(train)
train= num_fea_dis(train,fes)

In [None]:
train.head()

In [None]:
def encode_FE(df1, df2, cols):
    for col in cols:
        df = pd.concat([df1[col],df2[col]])
        vc = df.value_counts(dropna=True, normalize=True).to_dict()
        vc[-1] = -1
        nm = col+'_FE'
        df1[nm] = df1[col].map(vc)
        df1[nm] = df1[nm].astype('float32')
        df2[nm] = df2[col].map(vc)
        df2[nm] = df2[nm].astype('float32')

In [None]:
fe_features = ['daxiaoTypeMidReal','daxiaoTypeReal','daxiaoType','daxiaoTypeStart','daxiaoPankouStart_real','daxiaoPankou_real']
encode_FE(train,test,fe_features)

In [None]:
fe_features = ['daxiaoTypeMidReal','daxiaoTypeReal','daxiaoType','daxiaoTypeStart','daxiaoPankouStart_real','daxiaoPankou_real']

def encode_Count(df1, df2, cols):
    for col in cols:
        df = pd.concat([df1[[col,'result']],df2[[col,'result']]])
        nm = col+'_COUNT'
        dicts=df.groupby(col)['result'].sum().to_dict()
        train[nm]=train[col].map(dicts)
        test[nm]=test[col].map(dicts)
        
        nm = col+'_ALL'
        dicts=df.groupby(col)['result'].count().to_dict()
        train[nm]=train[col].map(dicts)
        test[nm]=test[col].map(dicts)
        
        nm = col+'_CT'
        train[nm]= train[col+'_COUNT'] / train[col+'_ALL']
        test[nm]=train[col+'_COUNT'] / train[col+'_ALL']
        train.drop([col+"_COUNT",col+"_ALL"],axis=1)
        test.drop([col+"_COUNT",col+"_ALL"],axis=1)

        
encode_Count(train,test,fe_features)

In [None]:
train.head()

In [None]:
import gc

def encode_LE(col,train,test):
    df_comb = pd.concat([train[col],test[col]],axis=0)
    df_comb,_ = df_comb.factorize(sort=True)
    nm = col
    if df_comb.max()>32000: 
        train[nm] = df_comb[:len(train)].astype('int32')
        test[nm] = df_comb[len(train):].astype('int32')
    else:
        train[nm] = df_comb[:len(train)].astype('int16')
        test[nm] = df_comb[len(train):].astype('int16')
    del df_comb; x=gc.collect()


def encode_CB(col1,col2,df1,df2):
    nm = col1+'_'+col2
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str) 
#     encode_LE(nm,df1,df2)

def encode_CB3(col1,col2,col3,df1,df2):
    nm = col1+'_'+col2+'_'+col3
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)+'_'+df1[col3].astype(str)
    df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str)+'_'+df1[col3].astype(str)
#     encode_LE(nm,df1,df2)

encode_CB("daxiaoPankou","daxiaoPankouMid",train,test)
encode_CB("daxiaoPankouStart","daxiaoPankou",train,test)
encode_CB("daxiaoPankouStartMid","daxiaoPankouMid",train,test)

encode_CB("daxiaoType","daxiaoTypeMid",train,test)
encode_CB("daxiaoTypeStart","daxiaoType",train,test)
encode_CB("daxiaoTypeStartMid","daxiaoTypeMid",train,test)


encode_CB("daxiaoPankou_real","daxiaoPankouMid_real",train,test)
encode_CB("daxiaoPankouStart_real","daxiaoPankou_real",train,test)
encode_CB("daxiaoPankouStartMid_real","daxiaoPankouMid_real",train,test)

encode_CB("daxiaoTypeReal","daxiaoTypeMidReal",train,test)
encode_CB("daxiaoTypeStartReal","daxiaoTypeReal",train,test)
encode_CB("daxiaoTypeStartMidReal","daxiaoTypeMidReal",train,test)

encode_CB("zhongbifeng","daxiaoPankouShengJiang",train,test)
encode_CB("zhongbifeng","daxiaoPankouMidShengJiang",train,test)


encode_CB3("zhongbifeng","daxiaoPankou","daxiaoPankouMid",train,test)
encode_CB3("zhongbifeng","daxiaoPankouStart","daxiaoPankou",train,test)
encode_CB3("zhongbifeng","daxiaoPankouStartMid","daxiaoPankouMid",train,test)

encode_CB3("zhongbifeng","daxiaoType","daxiaoTypeMid",train,test)
encode_CB3("zhongbifeng","daxiaoTypeStart","daxiaoType",train,test)
encode_CB3("zhongbifeng","daxiaoTypeStartMid","daxiaoTypeMid",train,test)


encode_CB3("zhongbifeng","daxiaoPankou_real","daxiaoPankouMid_real",train,test)
encode_CB3("zhongbifeng","daxiaoPankouStart_real","daxiaoPankou_real",train,test)
encode_CB3("zhongbifeng","daxiaoPankouStartMid_real","daxiaoPankouMid_real",train,test)

encode_CB3("zhongbifeng","daxiaoTypeReal","daxiaoTypeMidReal",train,test)
encode_CB3("zhongbifeng","daxiaoTypeStartReal","daxiaoTypeReal",train,test)
encode_CB3("zhongbifeng","daxiaoTypeStartMidReal","daxiaoTypeMidReal",train,test)

    
encode_CB3("daxiaoMasterStartOdd_shuiPing","daxiaoGuestStartOdd_shuiPing","daxiaoPankouStart",train,test)
encode_CB3("daxiaoMasterOdd_shuiPing","daxiaoGuestOdd_shuiPing","daxiaoPankou",train,test)
encode_CB3("daxiaoMasterStartOddMid_shuiPing","daxiaoGuestStartOddMid_shuiPing","daxiaoPankouStartMid",train,test)
encode_CB3("daxiaoMasterOddMid_shuiPing","daxiaoGuestOddMid_shuiPing","daxiaoPankouMid",train,test)


def encode_AG(main_columns, uids, aggregations, train_df, test_df,fillna=True, usena=False):
    # AGGREGATION OF MAIN WITH UID FOR GIVEN STATISTICS
    for main_column in main_columns:  
        for col in uids:
            for agg_type in aggregations:
                print()
                new_col_name = main_column+'_'+col+'_'+agg_type
                temp_df = pd.concat([train_df[[col, main_column]], test_df[[col,main_column]]])
                if usena: temp_df.loc[temp_df[main_column]==-1,main_column] = np.nan
                temp_df = temp_df.groupby([col])[main_column].agg([agg_type]).reset_index().rename(
                                                        columns={agg_type: new_col_name})

                temp_df.index = list(temp_df[col])
                temp_df = temp_df[new_col_name].to_dict()   

                train_df[new_col_name] = train_df[col].map(temp_df).astype('float32')
                test_df[new_col_name]  = test_df[col].map(temp_df).astype('float32')
                
                if fillna:
                    train_df[new_col_name].fillna(-1,inplace=True)
                    test_df[new_col_name].fillna(-1,inplace=True)
                

def encode_AG2(main_columns, uids, train_df, test_df):
    for main_column in main_columns:  
        for col in uids:
            comb = pd.concat([train_df[[col]+[main_column]],test_df[[col]+[main_column]]],axis=0)
            mp = comb.groupby(col)[main_column].agg(['nunique'])['nunique'].to_dict()
            train_df[col+'_'+main_column+'_ct'] = train_df[col].map(mp).astype('float32')
            test_df[col+'_'+main_column+'_ct'] = test_df[col].map(mp).astype('float32')
            
groby_col = ["zhongbifeng_daxiaoPankouStart_daxiaoPankou","zhongbifeng_daxiaoPankouStartMid_daxiaoPankouMid",
            "zhongbifeng_daxiaoType_daxiaoTypeMid","zhongbifeng_daxiaoTypeStart_daxiaoType",
            "zhongbifeng_daxiaoPankouStartMid_daxiaoPankouMid","zhongbifeng_daxiaoPankou_real_daxiaoPankouMid_real",
            "zhongbifeng_daxiaoPankouStart_real_daxiaoPankou_real","zhongbifeng_daxiaoPankouStartMid_real_daxiaoPankouMid_real",
            "zhongbifeng_daxiaoTypeReal_daxiaoTypeMidReal","zhongbifeng_daxiaoTypeStartReal_daxiaoTypeReal",
            "zhongbifeng_daxiaoPankouStartMid_daxiaoPankouMid","daxiaoMasterStartOdd_shuiPing_daxiaoGuestStartOdd_shuiPing_daxiaoPankouStart",
            "zhongbifeng_daxiaoTypeStartMidReal_daxiaoTypeMidReal","zhongbifeng_daxiaoPankouStartMid_daxiaoPankouMid",
            "daxiaoMasterOdd_shuiPing_daxiaoGuestOdd_shuiPing_daxiaoPankou","daxiaoMasterStartOddMid_shuiPing_daxiaoGuestStartOddMid_shuiPing_daxiaoPankouStartMid",
            "daxiaoMasterOddMid_shuiPing_daxiaoGuestOddMid_shuiPing_daxiaoPankouMid"]

count_col = ["daxiaoMasterStartOdd","daxiaoGuestStartOdd",
            "daxiaoMasterOdd","daxiaoGuestOdd",
            "daxiaoMasterStartOddMid","daxiaoGuestStartOddMid",
            "daxiaoMasterOddMid",'daxiaoMasterOddMid']

In [None]:
group_cross=[[i,j] for i in groby_col for j in count_col]
for item in group_cross:
    temp_df = pd.concat([train[[item[0], item[1]]], test[[item[0],item[1]]]])
    print(item[0],item[1])
    temp_df[item[0]+'_'+item[1]+'_mean']=temp_df.groupby(item[0])[item[1]].transform('mean')
    temp_df[item[0]+'_'+item[1]+'_std']=temp_df.groupby(item[0])[item[1]].transform('std')

    temp_dfstd = temp_df[item[0]+'_'+item[1]+'_std'].to_dict()
    temp_dfmean = temp_df[item[0]+'_'+item[1]+'_mean'].to_dict()
    train[item[0]+'_'+item[1]+'_std'] = train[item[0]].map(temp_dfstd).astype('float32')
    test[item[0]+'_'+item[1]+'_mean']  = test[item[0]].map(temp_dfmean).astype('float32')

In [None]:
train.head()

In [None]:
encode_Count(train,test,groby_col)

In [None]:
import gc
train_x =  train.drop(columns=['result'])
train_y =  train['result']

test_x =  test.drop(columns=['result'])
test_y =  test['result']
del train,test
gc.collect()

In [None]:
test.info()

In [None]:
import category_encoders as ce

cat_features =["daxiaoPankouStart","daxiaoPankou","daxiaoPankouStartMid","daxiaoPankouMid",
      "daxiaoTypeStart", "daxiaoType", "daxiaoTypeStartMid", "daxiaoTypeMid",
      "daxiaoPankouStart_real", "daxiaoPankou_real", "daxiaoPankouStartMid_real", "daxiaoPankouMid_real",
      "daxiaoTypeStartReal", "daxiaoTypeReal", "daxiaoTypeStartMidReal", "daxiaoTypeMidReal","zhongbifeng",
      "daxiaoPankouShengJiang","daxiaoPankouMidShengJiang"
     ]

target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(train_x[cat_features], train_y)

train_x[cat_features] = target_enc.transform(train_x[cat_features])

test_x[cat_features] = target_enc.transform(test_x[cat_features])

In [None]:
from sklearn.datasets import load_iris
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split,cross_val_score	#划分数据 交叉验证

# 训练模型

param = {'learning_rate': 0.04624866821131782,
        'gamma': 0.4521452209872597, 
        'reg_alpha': 2,
        'reg_lambda': 7, 
        'n_estimators': 410,
        'colsample_bynode': 0.2158053139547304,
        'colsample_bylevel': 0.747234611546242,
        'subsample': 0.9462638943432846, 
        'min_child_weight': 117,
        'colsample_bytree': 0.29748741231156306,
        'max_depth': 15}

x_train, x_valid, y_train, y_valid = train_test_split(
    train_x, train_y, test_size=0.2)

model = xgb.model = XGBClassifier(objective='binary:logistic', eval_metric="auc",
                          tree_method='gpu_hist', **param)
model.fit(x_train, y_train)

# 对测试集进行预测
y_pred = model.predict(test_x)

print(type(y_pred))
# 计算准确率
accuracy = accuracy_score(test_y, y_pred)
print("accuarcy: %.2f%%" % (accuracy*100.0))

print(cross_val_score(model, train_x, train_y, cv = 5, scoring = 'roc_auc').mean())

fig, ax = plt.subplots(figsize=(10,10))
xgb.plot_importance(model, max_num_features=60, height=0.5, ax=ax,importance_type='gain')
plt.show()

train_x.info()

cols = train_x.columns.values.tolist()

importanceMap = model.get_booster().get_score(importance_type='gain')

select = []

for col in cols:
    if importanceMap.get(col) <= 5 :
        select.append(col)

test_x = test_x.drop(select, axis=1)
train_x = train_x.drop(select, axis=1)

In [None]:
# import optuna
# from optuna import Trial
# from optuna.samplers import TPESampler
# from sklearn.model_selection import train_test_split,cross_val_score	#划分数据 交叉验证

# X = train_x
# y = train_y

# #define objective function
# def objective(trial: Trial,X,y) -> float:
    
#     #create a data split for validation
#     train_X,test_X,train_y,test_y = train_test_split(X, y, test_size = 0.30, random_state = 101)
    
#     #create a parameter space
#     #Note: I started very general with wide intervals and honed in with repeated studies.
#     #The history dataframe in the next cell is your friend.
#     #Read the table to pare down your intervals after each study.
    
#     param = {                
#                  'learning_rate':trial.suggest_uniform('learning_rate', 0.03, 0.06),
#                  'gamma':trial.suggest_uniform('gamma', .2, .5),
#                  'reg_alpha':trial.suggest_int('reg_alpha', 1, 5),
#                  'reg_lambda':trial.suggest_int('reg_lambda', 1, 7),
#                  'n_estimators':trial.suggest_int('n_estimators', 300, 500),
#                  'colsample_bynode':trial.suggest_uniform('colsample_bynode', .2, .4),
#                  'colsample_bylevel':trial.suggest_uniform('colsample_bylevel', .65, .75),
#                  'subsample':trial.suggest_uniform('subsample', .25, .95),               
#                  'min_child_weight':trial.suggest_int('min_child_weight', 100, 200),
#                  'colsample_bytree':trial.suggest_uniform('colsample_bytree',0.2, .4),
#                  'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20])
#     }
    
#     #set up the baseline model parameters.
#     #be careful about how high you let max_depth go
#     #this can lead to long training times and overfitting
    
#     model = XGBClassifier(objective='binary:logitraw', eval_metric = "auc",tree_method = 'gpu_hist', use_label_encoder=False, **param)
    
#     #fit to training sample
#     model.fit(train_X, train_y)
    
#     #return the cv score
#     return cross_val_score(model, test_X, test_y, cv = 5, scoring = 'roc_auc').mean()

# #run an Optuna study
# study = optuna.create_study(direction='maximize', sampler=TPESampler())
# study.optimize(lambda trial : objective(trial, X, y), n_trials = 25)

# #print our best outcome
# print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

In [None]:
from sklearn.metrics import accuracy_score
import lightgbm as lgb

lgb_train = lgb.Dataset(train_x, train_y)

# 参数设置
params = {'num_leaves': 240, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,	#提取的特征比率
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,				#l1正则
          'lambda_l2': 5.985747612243422e-07,		#l2正则
          "verbosity": -1,
          "nthread": -1,		#线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},	##评价函数选择
          "random_state": 2019,	#随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          'min_child_samples': 67,
          }


print('Starting training...')
# 模型训练
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)


# 模型预测
y_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)

pred_labels = np.rint(y_pred)
accuracy = accuracy_score(test_y, pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))

cv_results = lgb.cv(params, lgb_train, num_boost_round=100, nfold=5, 
                    verbose_eval=20, early_stopping_rounds=40)

np.array(cv_results["auc-mean"]).mean()

In [None]:
fig, ax = plt.subplots(figsize=(10,10))
lgb.plot_importance(gbm, max_num_features=60, height=0.5, ax=ax)
plt.show()

In [None]:
# import optuna
# from optuna import Trial
# from optuna.samplers import TPESampler
# from sklearn.model_selection import train_test_split,cross_val_score	#划分数据 交叉验证
# from sklearn.metrics import accuracy_score
# import lightgbm as lgb
# from optuna import Trial

# data = train_x
# target = train_y

# def objective(trial):
#     train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25)
#     dtrain = lgb.Dataset(train_x, label=train_y)
 
#     param = {
#         'objective': 'binary',
#         'metric': 'binary_logloss',
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
#         'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
#     }
 
#     gbm = lgb.train(param, dtrain)
#     cv_results = lgb.cv(params, lgb_train, num_boost_round=50, nfold=5, 
#                     verbose_eval=20, early_stopping_rounds=40)
#     return np.array(cv_results["auc-mean"]).mean()
 
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=20)
 
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
test_x.info()

In [None]:
train_x.info()

In [None]:
from itertools import combinations, permutations
categories=train_x.select_dtypes(include='object')
categories=categories.astype(str)
cross_features=list(combinations(categories.columns.values.tolist(), 2))
for item in cross_features:
    categories[item[0]+'_'+item[1]]=categories[item[0]].astype(str)+categories[item[1]].astype(str)
categories.shape

columns = train_x.select_dtypes(include='object').columns
continues=train_x.drop(columns,axis=1)

cross_features=list(combinations(continues.columns.values.tolist(), 2))
for item in cross_features:
    continues[item[0]+'+'+item[1]]=continues[item[0]]+continues[item[1]]
    continues[item[0]+'-'+item[1]]=continues[item[0]]-continues[item[1]]
    continues[item[0]+'/'+item[1]]=continues[item[0]]/continues[item[1]]
    continues[item[0]+'*'+item[1]]=continues[item[0]]*continues[item[1]]

continues.shape
    
train_x = total
try:
    train_x = reduce_mem_usage(train_x)
except Exception as ex:
    print("a")
del categories,continues
gc.collect()

In [None]:
train_x.info()

In [None]:
from itertools import combinations, permutations
categories=test_x.select_dtypes(include='object')
categories=categories.astype(str)
cross_features=list(combinations(categories.columns.values.tolist(), 2))
for item in cross_features:
    categories[item[0]+'_'+item[1]]=categories[item[0]].astype(str)+categories[item[1]].astype(str)
categories.shape

columns = test_x.select_dtypes(include='object').columns
continues=test_x.drop(columns,axis=1)

cross_features=list(combinations(continues.columns.values.tolist(), 2))
for item in cross_features:
    continues[item[0]+'+'+item[1]]=continues[item[0]]+continues[item[1]]
    continues[item[0]+'-'+item[1]]=continues[item[0]]-continues[item[1]]
    continues[item[0]+'/'+item[1]]=continues[item[0]]/continues[item[1]]
    continues[item[0]+'*'+item[1]]=continues[item[0]]*continues[item[1]]

continues.shape

test_x = total

try:
    test_x = reduce_mem_usage(test_x)
except Exception as ex:
    print("a")
    
del categories,continues
gc.collect()

In [None]:
test_x.info()

In [None]:
import category_encoders as ce

cat_features = columns

target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(train_x[cat_features], train_y)

train_x[cat_features] = target_enc.transform(train_x[cat_features])

test_x[cat_features] = target_enc.transform(test_x[cat_features])

In [None]:
# import category_encoders as ce

# cat_features=["daxiaoPankouStart","daxiaoPankou","daxiaoPankouStartMid","daxiaoPankouMid",
#       "daxiaoTypeStart", "daxiaoType", "daxiaoTypeStartMid", "daxiaoTypeMid",
#       "daxiaoPankouStart_real", "daxiaoPankou_real", "daxiaoPankouStartMid_real", "daxiaoPankouMid_real",
#       "daxiaoTypeStartReal", "daxiaoTypeReal", "daxiaoTypeStartMidReal", "daxiaoTypeMidReal","zhongbifeng",
#       "daxiaoPankouShengJiang","daxiaoPankouMidShengJiang"
#      ]

# cat_features=["daxiaoTypeStart", "daxiaoType", "daxiaoTypeStartMid", "daxiaoTypeMid",
#       "daxiaoTypeStartReal", "daxiaoTypeReal", "daxiaoTypeStartMidReal", "daxiaoTypeMidReal"
#      ]

# target_enc = ce.TargetEncoder(cols=cat_features)
# target_enc.fit(train[cat_features], train_y)

# train_x[cat_features] = target_enc.transform(train_x[cat_features])

# test_x[cat_features] = target_enc.transform(test_x[cat_features])

# features=["daxiaoPankouStart","daxiaoPankou","daxiaoPankouStartMid","daxiaoPankouMid",
#       "daxiaoPankouStart_real", "daxiaoPankou_real", "daxiaoPankouStartMid_real", "daxiaoPankouMid_real","zhongbifeng",
#       "daxiaoPankouShengJiang","daxiaoPankouMidShengJiang"
#      ]

# for col in features:
#     train_x[col] = train_x[col].astype(np.float32)

#     test_x[col] = test_x[col].astype(np.float32)

In [None]:
try:
    test_x = reduce_mem_usage(test_x)
    train_x = reduce_mem_usage(train_x)
except Exception as ex:
    print("a")
    

In [None]:
import gc
X=train_x
y=train_y
del train_x,train_y
gc.collect()

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
params = {'num_leaves': 240, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,	#提取的特征比率
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,				#l1正则
          'lambda_l2': 5.985747612243422e-07,		#l2正则
          "verbosity": -1,
          "nthread": -1,				#线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},	##评价函数选择
          "random_state": 2019,	#随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          'min_child_samples': 67,
          }

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 1000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=100)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()

print(f"\nMean AUC = {score}")

In [None]:
import seaborn as sns
feature_importance_gain=pd.DataFrame()
feature_importance_gain['feature']=columns
feature_importance_gain['fold_1']=clf.feature_importance(importance_type='gain')
feature_importance_gain['average'] = feature_importances[[f'fold_{fold_n + 1}' for fold_n in range(1)]].mean(axis=1)
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importance_gain.sort_values(by='average', ascending=False).head(50), x='average', y='feature');
plt.title('100 TOP feature importance over {} folds average'.format(folds.n_splits));

In [205]:
import sweetviz as sv
# 可以选择目标特征
my_report = sv.analyze(train, target_feat ='result')
my_report.show_html()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=65.0), HTML(value='')), l…

  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)



Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [None]:
import sweetviz as sv
# 可以选择目标特征
my_report = sv.analyze(train, target_feat ='result')
my_report.show_html()

In [None]:
daxiaoPankouMid_real


In [None]:
pre_test = test[(test['daxiaoMasterStartOdd'] >= 0.75) & (test['daxiaoMasterStartOdd'] < 1.20)]
pre_test = pre_test[(pre_test['daxiaoMasterOdd'] >= 0.75) & (pre_test['daxiaoMasterOdd'] < 1.20)]
pre_test = pre_test[(pre_test['daxiaoGuestOddMid'] >= 0.75) & (pre_test['daxiaoGuestOddMid'] < 1.20)]

In [None]:
train_x

In [283]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["soccerData"]
mycol = mydb["midRawData007daxiao"]

data = pd.DataFrame(list(mycol.find()))
data = data.drop(['_id', 'time','place'], axis=1)

In [284]:
data = reduce_mem_usage(data)

Memory usage of dataframe is 181.39 MB
Memory usage after optimization is: 66.83 MB
Decreased by 63.2%


In [285]:
train = data[:250000]

test =  data[250000:]

In [286]:
test = test.drop(['masterOdd_Start_Interwette_daxiao', 'pankou_Start_Interwette_daxiao','guestOdd_Start_Interwette_daxiao',
                      'masterOdd_End_Interwette_daxiao', 'pankouOdd_End_Interwette_daxiao','guestOdd_End_Interwette_daxiao'], axis=1)

test = test.drop(['masterOdd_Start_平博_daxiao', 'pankou_Start_平博_daxiao','guestOdd_Start_平博_daxiao',
                      'masterOdd_End_平博_daxiao', 'pankouOdd_End_平博_daxiao','guestOdd_End_平博_daxiao'], axis=1)

test = test.drop(['masterOdd_Start_澳门_daxiao', 'pankou_Start_澳门_daxiao','guestOdd_Start_澳门_daxiao',
                      'masterOdd_End_澳门_daxiao', 'pankouOdd_End_澳门_daxiao','guestOdd_End_澳门_daxiao'], axis=1)

test = test.drop(['masterOdd_Start_Crown_daxiao', 'pankou_Start_Crown_daxiao','guestOdd_Start_Crown_daxiao',
                      'masterOdd_End_Crown_daxiao', 'pankouOdd_End_Crown_daxiao','guestOdd_End_Crown_daxiao'], axis=1)

test = test.drop(['masterOdd_Start_365_daxiao', 'pankou_Start_365_daxiao','guestOdd_Start_365_daxiao',
                      'masterOdd_End_365_daxiao', 'pankouOdd_End_365_daxiao','guestOdd_End_365_daxiao'], axis=1)

test = test.drop(['masterOdd_Start_12B_daxiao', 'pankou_Start_12B_daxiao','guestOdd_Start_12B_daxiao',
                      'masterOdd_End_12B_daxiao', 'pankouOdd_End_12B_daxiao','guestOdd_End_12B_daxiao'], axis=1)

test = test.drop(['masterOdd_Start_利记_daxiao', 'pankou_Start_利记_daxiao','guestOdd_Start_利记_daxiao',
                      'masterOdd_End_利记_daxiao', 'pankouOdd_End_利记_daxiao','guestOdd_End_利记_daxiao'], axis=1)

In [287]:
train = train.drop(['masterOdd_Start_Interwette_daxiao', 'pankou_Start_Interwette_daxiao','guestOdd_Start_Interwette_daxiao',
                      'masterOdd_End_Interwette_daxiao', 'pankouOdd_End_Interwette_daxiao','guestOdd_End_Interwette_daxiao'], axis=1)

train = train.drop(['masterOdd_Start_平博_daxiao', 'pankou_Start_平博_daxiao','guestOdd_Start_平博_daxiao',
                      'masterOdd_End_平博_daxiao', 'pankouOdd_End_平博_daxiao','guestOdd_End_平博_daxiao'], axis=1)

train = train.drop(['masterOdd_Start_澳门_daxiao', 'pankou_Start_澳门_daxiao','guestOdd_Start_澳门_daxiao',
                      'masterOdd_End_澳门_daxiao', 'pankouOdd_End_澳门_daxiao','guestOdd_End_澳门_daxiao'], axis=1)

train = train.drop(['masterOdd_Start_Crown_daxiao', 'pankou_Start_Crown_daxiao','guestOdd_Start_Crown_daxiao',
                      'masterOdd_End_Crown_daxiao', 'pankouOdd_End_Crown_daxiao','guestOdd_End_Crown_daxiao'], axis=1)

train = train.drop(['masterOdd_Start_365_daxiao', 'pankou_Start_365_daxiao','guestOdd_Start_365_daxiao',
                      'masterOdd_End_365_daxiao', 'pankouOdd_End_365_daxiao','guestOdd_End_365_daxiao'], axis=1)

train = train.drop(['masterOdd_Start_12B_daxiao', 'pankou_Start_12B_daxiao','guestOdd_Start_12B_daxiao',
                      'masterOdd_End_12B_daxiao', 'pankouOdd_End_12B_daxiao','guestOdd_End_12B_daxiao'], axis=1)

train = train.drop(['masterOdd_Start_利记_daxiao', 'pankou_Start_利记_daxiao','guestOdd_Start_利记_daxiao',
                      'masterOdd_End_利记_daxiao', 'pankouOdd_End_利记_daxiao','guestOdd_End_利记_daxiao'], axis=1)

In [288]:
train = train.dropna()
test = test.dropna()

In [289]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133504 entries, 3 to 249999
Data columns (total 32 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   masterGoal                  133504 non-null  int8    
 1   guestGoal                   133504 non-null  int8    
 2   masterOdd_Start_易胜_daxiao   133504 non-null  float32 
 3   pankou_Start_易胜_daxiao      133504 non-null  category
 4   guestOdd_Start_易胜_daxiao    133504 non-null  float32 
 5   masterOdd_End_易胜_daxiao     133504 non-null  float32 
 6   pankouOdd_End_易胜_daxiao     133504 non-null  category
 7   guestOdd_End_易胜_daxiao      133504 non-null  float32 
 8   masterOdd_Start_韦德_daxiao   133504 non-null  float32 
 9   pankou_Start_韦德_daxiao      133504 non-null  category
 10  guestOdd_Start_韦德_daxiao    133504 non-null  float32 
 11  masterOdd_End_韦德_daxiao     133504 non-null  float32 
 12  pankouOdd_End_韦德_daxiao     133504 non-null  category
 13 

In [290]:
def daxiao_num(x):
    x_list = x.split("/")
    num = 0
    for i in x_list:
        num += float(i)
    return (float(num)/len(x_list))

In [291]:
def getResult(masterGoal, guestGoal,pankou):
    if masterGoal + guestGoal - pankou >= 0:
        return 1
    if masterGoal + guestGoal - pankou < 0:
        return 0

In [292]:
# pankou = train.select_dtypes(include='float').columns
# for col in pankou:
#     test = test[(test[col] >= 0.60) & (test[col] < 1.50)]
#     train = train[(train[col] >= 0.60) & (train[col] < 1.50)]

In [293]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133504 entries, 3 to 249999
Data columns (total 32 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   masterGoal                  133504 non-null  int8    
 1   guestGoal                   133504 non-null  int8    
 2   masterOdd_Start_易胜_daxiao   133504 non-null  float32 
 3   pankou_Start_易胜_daxiao      133504 non-null  category
 4   guestOdd_Start_易胜_daxiao    133504 non-null  float32 
 5   masterOdd_End_易胜_daxiao     133504 non-null  float32 
 6   pankouOdd_End_易胜_daxiao     133504 non-null  category
 7   guestOdd_End_易胜_daxiao      133504 non-null  float32 
 8   masterOdd_Start_韦德_daxiao   133504 non-null  float32 
 9   pankou_Start_韦德_daxiao      133504 non-null  category
 10  guestOdd_Start_韦德_daxiao    133504 non-null  float32 
 11  masterOdd_End_韦德_daxiao     133504 non-null  float32 
 12  pankouOdd_End_韦德_daxiao     133504 non-null  category
 13 

In [294]:
pankou = train.select_dtypes(include='object').columns
for col in pankou:
    test[col] = test[col].map(daxiao_num)
    train[col] = train[col].map(daxiao_num)

In [295]:
def getShuiPing(x):
    result = 9
    if x <= 0.75:
        result = 0
    if 0.75< x and x<=0.85:
        result = 1
    if 0.85<x and x<=0.90:
        result = 2
    if 0.90<x and x<=0.95:
        result = 3
    if 0.95<x and x<=1.00:
        result = 4
    if 1.00<x and x<=1.08:
        result = 5
    if 1.08<x and x<=1.20:
        result = 6
    if x > 1.20:
        result = 7
    return result

In [296]:
pankou = train.select_dtypes(include='float').columns
for col in pankou:
    nm = col+'_'+'shuiPing'
    train[nm] = train[col].map(getShuiPing)
    test[nm] = test[col].map(getShuiPing)

In [297]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133504 entries, 3 to 249999
Data columns (total 32 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   masterGoal                  133504 non-null  int8    
 1   guestGoal                   133504 non-null  int8    
 2   masterOdd_Start_易胜_daxiao   133504 non-null  float32 
 3   pankou_Start_易胜_daxiao      133504 non-null  category
 4   guestOdd_Start_易胜_daxiao    133504 non-null  float32 
 5   masterOdd_End_易胜_daxiao     133504 non-null  float32 
 6   pankouOdd_End_易胜_daxiao     133504 non-null  category
 7   guestOdd_End_易胜_daxiao      133504 non-null  float32 
 8   masterOdd_Start_韦德_daxiao   133504 non-null  float32 
 9   pankou_Start_韦德_daxiao      133504 non-null  category
 10  guestOdd_Start_韦德_daxiao    133504 non-null  float32 
 11  masterOdd_End_韦德_daxiao     133504 non-null  float32 
 12  pankouOdd_End_韦德_daxiao     133504 non-null  category
 13 

In [298]:
test['result'] = test.apply(lambda x: getResult(
        x['masterGoal'], x['guestGoal'], x['pankouOdd_End_金宝博_daxiao']), axis=1)

train['result'] = train.apply(lambda x: getResult(
        x['masterGoal'], x['guestGoal'], x['pankouOdd_End_金宝博_daxiao']), axis=1)

TypeError: unsupported operand type(s) for -: 'int' and 'str'

In [None]:
train = train.drop(columns=['masterGoal', 'guestGoal'])
test = test.drop(columns=['masterGoal', 'guestGoal'])

In [None]:
import gc

def encode_LE(col,train,test):
    df_comb = pd.concat([train[col],test[col]],axis=0)
    df_comb,_ = df_comb.factorize(sort=True)
    nm = col
    if df_comb.max()>32000: 
        train[nm] = df_comb[:len(train)].astype('int32')
        test[nm] = df_comb[len(train):].astype('int32')
    else:
        train[nm] = df_comb[:len(train)].astype('int16')
        test[nm] = df_comb[len(train):].astype('int16')
    del df_comb; x=gc.collect()
    
def encode_CB(col1,col2,df1,df2,name=""):
    if name == "":
        nm = col1+'_'+col2
    else:
        nm = name
    nm = col1+'_'+col2
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str)
#     encode_LE(nm,df1,df2)

def encode_CB3(col1,col2,col3,df1,df2,name=""):
    if name == "":
        nm = col1+'_'+col2+'_'+col3
    else:
        nm = name
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)+'_'+df1[col3].astype(str)
    df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str)+'_'+df2[col3].astype(str)
#     encode_LE(nm,df1,df2)

In [None]:
def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    linTypeStart = get18(yapanMasterStartOdd, yapanGuestStartOdd)
    return str(linTypeStart) + "_" + yapanPankouStart

def get18(master,guest):
    if master > guest:
        return 18
    if master < guest:
        return 81
    if master == guest:
        return 99

In [None]:
train.info()

In [None]:
# masters = ["易胜", "韦德", "明陞", "10B", "金宝博"]

# for master in masters:
#     test['daxiaoOldTypeStart'+master] = test.apply(lambda x: getType(x["masterOdd_Start" + "_" + master + "_daxiao"], x["guestOdd_Start" + "_" + master + "_daxiao"], str(x["pankou_Start" + "_" + master + "_daxiao"])), axis=1)
#     train['daxiaoOldTypeStart'+master] = train.apply(lambda x: getType(x["masterOdd_Start" + "_" + master + "_daxiao"], x["guestOdd_Start" + "_" + master + "_daxiao"], str(x["pankou_Start" + "_" + master + "_daxiao"])), axis=1)
    
#     test['daxiaoOldTypeEnd'+master] = test.apply(lambda x: getType(x["masterOdd_End" + "_" + master + "_daxiao"], x["guestOdd_End" + "_" + master + "_daxiao"], str(x["pankouOdd_End" + "_" + master + "_daxiao"])), axis=1)
#     train['daxiaoOldTypeEnd'+master] = train.apply(lambda x: getType(x["masterOdd_End" + "_" + master + "_daxiao"], x["guestOdd_End" + "_" + master + "_daxiao"], str(x["pankouOdd_End" + "_" + master + "_daxiao"])), axis=1)
   
#     features.append("daxiaoOldTypeStart" + master)
#     features.append("daxiaoOldTypeEnd" + master)

In [None]:
masters = ["易胜", "韦德", "明陞", "10B", "金宝博"]

features = []
for master in masters:
    test['daxiaoTypeStart'+master] = test["masterOdd_Start" + "_" + master + "_daxiao"].astype(str) + test["guestOdd_Start" + "_" + master + "_daxiao"].astype(str) + test["pankou_Start" + "_" + master + "_daxiao"].astype(str)
    train['daxiaoTypeStart'+master]=train["masterOdd_Start" + "_" + master + "_daxiao"].astype(str) + train["guestOdd_Start" + "_" + master + "_daxiao"].astype(str) + train["pankou_Start" + "_" + master + "_daxiao"].astype(str)

    test['daxiaoTypeEnd'+master]=test["masterOdd_End" + "_" + master + "_daxiao"].astype(str) + test["guestOdd_End" + "_" + master + "_daxiao"].astype(str) + test["pankouOdd_End" + "_" + master + "_daxiao"].astype(str)
    train['daxiaoTypeEnd'+master]=train["masterOdd_End" + "_" + master + "_daxiao"].astype(str) + train["guestOdd_End" + "_" + master + "_daxiao"].astype(str) + train["pankouOdd_End" + "_" + master + "_daxiao"].astype(str)

    features.append("daxiaoTypeStart" + master)
    features.append("daxiaoTypeEnd" + master)

In [None]:
pd.set_option('display.max_columns', None)
test.head()

In [140]:
from itertools import combinations

cross_features=list(combinations(features, 3))

for item in cross_features:
    nm = item[0]+'_'+item[1]+"_"+item[2]
    train[nm]=train[item[0]].astype(str)+train[item[1]].astype(str)+train[item[2]].astype(str)
    test[nm]=test[item[0]].astype(str)+test[item[1]].astype(str)+test[item[2]].astype(str)

In [278]:
train_x =  train.drop(columns=['result'])
train_y =  train['result']

test_x =  test.drop(columns=['result'])
test_y =  test['result']

In [279]:
import gc
cat_features = train.select_dtypes(include='object').columns
cat_features

Index(['daxiaoTypeStart易胜', 'daxiaoTypeEnd易胜', 'daxiaoTypeStart韦德',
       'daxiaoTypeEnd韦德', 'daxiaoTypeStart明陞', 'daxiaoTypeEnd明陞',
       'daxiaoTypeStart10B', 'daxiaoTypeEnd10B', 'daxiaoTypeStart金宝博',
       'daxiaoTypeEnd金宝博'],
      dtype='object')

In [280]:
from sklearn.model_selection import  StratifiedKFold,KFold
import category_encoders as ce

def mean_woe_target_encoder(train,test,target,col,n_splits=10):
    folds = StratifiedKFold(n_splits)

    y_oof = np.zeros(train.shape[0])
    y_oof_2= np.zeros(train.shape[0])
    y_test_oof = np.zeros(test.shape[0]).reshape(-1,1)
    y_test_oof2 = np.zeros(test.shape[0]).reshape(-1,1)

    splits = folds.split(train, target)
    
    for fold_n, (train_index, valid_index) in enumerate(splits):
        print(col)
        X_train, X_valid = train[col].iloc[train_index], train[col].iloc[valid_index]
        y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
        clf=ce.target_encoder.TargetEncoder()
    
        clf.fit(X_train.values,y_train.values)    
        y_pred_valid = clf.transform(X_valid.values)

        y_oof[valid_index] = y_pred_valid.values.reshape(1,-1)

        tp=(clf.transform(test[col].values)/(n_splits*1.0)).values
        tp=tp.reshape(-1,1)
        y_test_oof+=tp    
    
        del X_train, X_valid, y_train, y_valid
        gc.collect()    
        
    for fold_n, (train_index, valid_index) in enumerate(splits):
        X_train, X_valid = train[col].iloc[train_index], train[col].iloc[valid_index]
        y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
        clf=ce.woe.WOEEncoder()
    
        clf.fit(X_train.values,y_train.values)    
        y_pred_valid = clf.transform(X_valid.values)
        y_oof2[valid_index] = y_pred_valid.values.reshape(1,-1)
    
        tp=(clf.transform(test[col].values)/(n_splits*1.0)).values
        tp=tp.reshape(-1,1)
        y_test_oof2+=tp    
        del X_train, X_valid, y_train, y_valid
        gc.collect()     
    return y_oof,y_oof_2,y_test_oof,y_test_oof2

In [281]:
for col in cat_features:
  y_oof,y_oof_2,y_test_oof,y_test_oof2 = mean_woe_target_encoder(train_x,test_x,train_y,col,n_splits=10)
  train_x[col] = y_oof
  test_x[col] = y_test_oof

daxiaoTypeStart易胜


  elif pd.api.types.is_categorical(cols):


FloatingPointError: underflow encountered in exp

In [282]:
import category_encoders as ce

target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(train_x[cat_features], train_y)

train_x[cat_features] = target_enc.transform(train_x[cat_features])
test_x[cat_features] = target_enc.transform(test_x[cat_features])

FloatingPointError: underflow encountered in exp

In [None]:
import gc
X=train_x
y=train_y
# del train_x,train_y
# gc.collect()

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
params = {'num_leaves': 240, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,	#提取的特征比率
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,				#l1正则
          'lambda_l2': 5.985747612243422e-07,		#l2正则
          "verbosity": -1,
          "nthread": -1,				#线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},	##评价函数选择
          "random_state": 2019,	#随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          'min_child_samples': 67
          }

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(test_x.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 1000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=200)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    print(y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(test_x) / NFOLDS
    del X_train, X_valid, y_train, y_valid
    gc.collect()

print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(test_y, y_preds)}")

In [None]:
import seaborn as sns
sns.set(font='LiSu')
feature_importance_gain=pd.DataFrame()
feature_importance_gain['feature']=columns
feature_importance_gain['fold_1']=clf.feature_importance(importance_type='gain')
feature_importance_gain['average'] = feature_importances[[f'fold_{fold_n + 1}' for fold_n in range(1)]].mean(axis=1)
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importance_gain.sort_values(by='average', ascending=False).head(50), x='average', y='feature');
plt.title('100 TOP feature importance over {} folds average'.format(folds.n_splits));

In [87]:
test_x.head()

Unnamed: 0,masterOdd_Start_易胜_daxiao,pankou_Start_易胜_daxiao,guestOdd_Start_易胜_daxiao,masterOdd_End_易胜_daxiao,pankouOdd_End_易胜_daxiao,guestOdd_End_易胜_daxiao,masterOdd_Start_韦德_daxiao,pankou_Start_韦德_daxiao,guestOdd_Start_韦德_daxiao,masterOdd_End_韦德_daxiao,pankouOdd_End_韦德_daxiao,guestOdd_End_韦德_daxiao,masterOdd_Start_明陞_daxiao,pankou_Start_明陞_daxiao,guestOdd_Start_明陞_daxiao,masterOdd_End_明陞_daxiao,pankouOdd_End_明陞_daxiao,guestOdd_End_明陞_daxiao,masterOdd_Start_10B_daxiao,pankou_Start_10B_daxiao,guestOdd_Start_10B_daxiao,masterOdd_End_10B_daxiao,pankouOdd_End_10B_daxiao,guestOdd_End_10B_daxiao,masterOdd_Start_金宝博_daxiao,pankou_Start_金宝博_daxiao,guestOdd_Start_金宝博_daxiao,masterOdd_End_金宝博_daxiao,pankouOdd_End_金宝博_daxiao,guestOdd_End_金宝博_daxiao,masterOdd_Start_易胜_daxiao_shuiPing,pankou_Start_易胜_daxiao_shuiPing,guestOdd_Start_易胜_daxiao_shuiPing,masterOdd_End_易胜_daxiao_shuiPing,pankouOdd_End_易胜_daxiao_shuiPing,guestOdd_End_易胜_daxiao_shuiPing,masterOdd_Start_韦德_daxiao_shuiPing,pankou_Start_韦德_daxiao_shuiPing,guestOdd_Start_韦德_daxiao_shuiPing,masterOdd_End_韦德_daxiao_shuiPing,pankouOdd_End_韦德_daxiao_shuiPing,guestOdd_End_韦德_daxiao_shuiPing,masterOdd_Start_明陞_daxiao_shuiPing,pankou_Start_明陞_daxiao_shuiPing,guestOdd_Start_明陞_daxiao_shuiPing,masterOdd_End_明陞_daxiao_shuiPing,pankouOdd_End_明陞_daxiao_shuiPing,guestOdd_End_明陞_daxiao_shuiPing,masterOdd_Start_10B_daxiao_shuiPing,pankou_Start_10B_daxiao_shuiPing,guestOdd_Start_10B_daxiao_shuiPing,masterOdd_End_10B_daxiao_shuiPing,pankouOdd_End_10B_daxiao_shuiPing,guestOdd_End_10B_daxiao_shuiPing,masterOdd_Start_金宝博_daxiao_shuiPing,pankou_Start_金宝博_daxiao_shuiPing,guestOdd_Start_金宝博_daxiao_shuiPing,masterOdd_End_金宝博_daxiao_shuiPing,pankouOdd_End_金宝博_daxiao_shuiPing,guestOdd_End_金宝博_daxiao_shuiPing,daxiaoTypeStart易胜,daxiaoTypeStart韦德,daxiaoTypeStart明陞,daxiaoTypeStart10B,daxiaoTypeStart金宝博
250003,0.66,3.5,1.26,0.86,3.75,1.01,0.92,3.5,0.85,0.8,3.75,1.0,0.65,3.5,1.18,0.71,3.5,1.12,0.86,3.5,0.9,0.95,4.0,0.79,0.77,3.5,0.99,0.81,3.75,1.01,0,7,7,2,7,5,3,7,1,1,7,4,0,7,6,0,7,6,2,7,2,3,7,1,1,7,4,1,7,5,4129.0,1944.0,1992.0,6586.0,2301.0
250007,0.95,2.5,0.85,1.0,2.5,0.8,1.0,2.5,0.833,1.0,2.5,0.833,1.03,2.5,0.87,1.04,2.5,0.86,0.98,2.5,0.83,0.98,2.5,0.79,1.03,2.5,0.87,1.02,2.5,0.88,3,7,1,4,7,1,4,7,1,4,7,1,5,7,2,5,7,2,4,7,1,4,7,1,5,7,2,5,7,2,8152.0,3281.0,5934.0,7179.0,4450.0
250008,0.95,3.0,0.87,0.95,3.0,0.9,0.47,2.5,1.5,0.5,2.5,1.45,0.93,3.0,0.89,0.91,3.0,0.91,0.92,3.0,0.85,0.86,3.0,0.9,0.85,3.0,0.91,0.83,3.0,0.99,3,7,2,3,7,2,0,7,7,0,7,7,3,7,2,3,7,3,3,7,1,2,7,2,1,7,3,1,7,4,5947.0,481.0,4503.0,4731.0,2501.0
250010,0.94,2.0,0.92,0.96,2.0,0.89,0.95,2.0,0.91,1.0,2.0,0.85,0.94,2.0,0.94,1.02,2.0,0.86,0.94,2.0,0.9,0.98,2.0,0.88,0.98,2.0,0.92,1.0,2.0,0.9,3,7,3,4,7,2,3,7,3,4,7,1,3,7,3,5,7,2,3,7,2,4,7,2,4,7,3,4,7,2,6042.0,3295.0,5649.0,7259.0,5157.0
250011,0.87,2.5,1.0,0.68,2.5,1.21,0.87,2.5,0.95,0.88,2.75,0.95,0.86,2.5,1.02,0.76,2.5,1.14,0.87,2.5,0.98,0.87,2.75,0.98,0.85,2.5,1.03,0.89,2.75,0.99,2,7,4,0,7,7,2,7,3,2,7,3,2,7,5,1,7,6,2,7,4,2,7,4,1,7,5,2,7,4,1284.0,1572.0,2656.0,4958.0,3162.0


In [80]:
import sweetviz as sv
# 可以选择目标特征
my_report = sv.analyze(train, target_feat ='result')
my_report.show_html()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=67.0), HTML(value='')), l…

  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0.0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)
  font.set_text(s, 0, flags=flags)



Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
