In [1]:
import json
import pandas as pd

import pymongo
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [2]:
def reduce_mem_usage(df, verbose=False):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                c_prec = df[col].apply(lambda x: np.finfo(x).precision).max()
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max and c_prec == np.finfo(np.float16).precision:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max and c_prec == np.finfo(np.float32).precision:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
import numpy as np
 
class Performance:
    """
    定义一个类，用来分类器的性能度量
    """
    def __init__(self, labels, scores, threshold=0.5):
        """
        :param labels:数组类型，真实的标签
        :param scores:数组类型，分类器的得分
        :param threshold:检测阈值
        """
        self.labels = labels
        self.scores = scores
        self.threshold = threshold
        self.db = self.get_db()
        self.TP, self.FP, self.FN, self.TN = self.get_confusion_matrix()
 
    def accuracy(self):
        """
        :return: 正确率
        """
        return (self.TP + self.TN) / (self.TP + self.FN + self.FP + self.TN)
 
    def presision(self):
        """
        :return: 准确率
        """
        if (self.TP + self.FP) == 0:
            return 0
        return self.TP / (self.TP + self.FP)
    
    def presision2(self):
        """
        :return: 准确率
        """
        if (self.TN + self.FN) == 0:
            return 0
        return self.TN / (self.TN + self.FN)
    
    def get_db(self):
        db = []
        for i in range(len(self.labels)):
            db.append([self.labels[i], self.scores[i]])
        db = sorted(db, key=lambda x: x[1], reverse=True)
        return db

    def get_confusion_matrix(self):
        """
        计算混淆矩阵
        :return:
        """
        tp, fp, fn, tn = 0., 0., 0., 0.
        for i in range(len(self.labels)):
            if self.labels[i] == 1 and self.scores[i] >= self.threshold:
                tp += 1
            elif self.labels[i] == 0 and self.scores[i] >= self.threshold:
                fp += 1
            elif self.labels[i] == 1 and self.scores[i] < self.threshold:
                fn += 1
            else:
                tn += 1
        return [tp, fp, fn, tn]

In [4]:
def getGoalMid(masterGoal, guestGoal, masterMidGoal, guestMidGoal):
    return int(masterGoal) + int(guestGoal) - int(masterMidGoal) - int(guestMidGoal)


def removeSub(pankou):
    pankou = pankou.replace("升", "")
    pankou = pankou.replace("降", "")
    return pankou.strip()


def getResultOne(goalMid, pankou):
    row = pankou
    row = 0
    if goalMid - float(row) > 0:
        return 1
    if goalMid - float(row) <= 0:
        return 0
    
def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    linTypeStart = get18(yapanMasterStartOdd, yapanGuestStartOdd)
    return str(linTypeStart)


def get18(master, guest):
    if master > guest:
        return 18
    if master < guest:
        return 81
    if master == guest:
        return 99


def daxiao_num(x):
    x_list = x.split("/")
    num = 0
    for i in x_list:
        num += float(i)
    return (float(num)/len(x_list))


def realDaxiao(x, master, guest):
    return float(x) - float(master) - float(guest)

def realDaxiao75(x, last):
    master = last.split("-")[0]
    guest = last.split("-")[1]
    return float(x) - float(master) - float(guest)

def shengjiang(start, end):
    return ((end) - (start))

def round2(x):
    return round((x), 2)

In [5]:
def getShuiPing(x):
    result = 11
    if x < 0.75:
        result = 0
    if 0.75<= x and x<=0.85:
        result = 1
    if 0.85<x and x<=0.90:
        result = 2
    if 0.90<x and x<=0.95:
        result = 3
    if 0.95<x and x<=1.00:
        result = 4
    if 1.00<x and x<=1.08:
        result = 5
    if 1.08<x:
        result = 6
    return result

def num_fea_dis(df,features):
    for f in features:
        nm = f+'_'+'shuiPing'
        df[nm] = df[f].map(getShuiPing)
    return df

In [6]:
def getLastTime(x):
    if x >= 45 and x < 50 :
        return 0 
    if x >= 50 and x < 60 :
        return 1 
    if x >= 60 and x <= 75:
        return 2
    return 3

def getResult65(zhong,last):
    sum1 = 0
    for i in zhong.split('-'):
        sum1 += int(i)
    sum2 = 0
    for i in last.split('-'):
        sum2 += int(i)
    return sum2 - sum1

def getResultNew(x):
    if x >= 1:
        return 1
    return 0

def pankouFlow(x,y):
    return float(x) - float(y)

In [7]:
def fillNa(x, value):
    if pd.isnull(x):
        return value
    else:
        return x

    
def preF(test):
    test['goalMid'] = test.apply(lambda x: getGoalMid(
        x['masterGoal'], x['guestGoal'], x['midGuestGoal'], x['midMasterGoal']), axis=1)
    
    test = test[test['goalMid'] >= 0]
    
    test['bifeng'] = test['masterGoal'].astype(
        str) + "-" + test['guestGoal'].astype(str)
    
    test['zhongbifengNew'] = test['midMasterGoal'].astype(
        str) + "-" + test['midGuestGoal'].astype(str)
    
    test['allbifeng'] = test['lastBifeng65'].astype(
        str) + "-" + test['zhongbifengNew'].astype(str)
            
    pankou = ["pankouOdd_End_Zhong_3","pankou_Start_Zhong_3"] + ["pankou_Start_Ji_3","pankouOdd_End_Ji_3"] + ["pankouOdd_End_65_3","pankouOdd_End_75_3","pankouOdd_End_80_3"]

    for col in pankou:
        test[col] = test[col].astype(str)
        test[col] = test[col].map(daxiao_num)
        test[col] = test[col].astype(str)
        nm = col + "Real"
        test[nm] = test.apply(lambda x: realDaxiao(x[col],x['midGuestGoal'],x['midMasterGoal']), axis=1)
        test[nm] = test[nm].astype(str)
        
#     test['allbifengNew'] = test['allbifeng'].astype(
#         str) + "-" + test['pankouOdd_End_Zhong_3Real'].astype(str) + "-" + test['pankouOdd_End_Ji_3'].astype(str)
    
#     fes = ['masterOdd_Start_Ji_3', 'masterOdd_End_Ji_3',
#            'masterOdd_Start_Zhong_3', 'masterOdd_End_Zhong_3'] + ['guestOdd_Start_Ji_3', 'guestOdd_End_Ji_3',
#            'guestOdd_Start_Zhong_3', 'guestOdd_End_Zhong_3']
    
    
#     num_fea_dis(test, fes)
    
#     test['daxiaoTypeStart'] = test.apply(lambda x: getType(x['masterOdd_Start_Ji_3'], x['guestOdd_Start_Ji_3'],x['pankou_Start_Ji_3']), axis=1)
#     test['daxiaoType'] = test.apply(lambda x: getType(x['masterOdd_End_Ji_3'], x['guestOdd_End_Ji_3'],x['pankouOdd_End_Ji_3']), axis=1)
#     test['daxiaoTypeStartMid'] = test.apply(lambda x: getType(x['masterOdd_Start_Zhong_3'], x['guestOdd_Start_Zhong_3'],x['pankou_Start_Zhong_3Real']), axis=1)
#     test['daxiaoTypeMid'] = test.apply(lambda x: getType(x['masterOdd_End_Zhong_3'], x['guestOdd_End_Zhong_3'],x['pankouOdd_End_Zhong_3Real']), axis=1)
    

#     test['daxiaoTypeStart'] = test['masterOdd_Start_Ji_3_shuiPing'].astype(str) + test['daxiaoTypeStart']
#     test['daxiaoType'] = test['masterOdd_End_Ji_3_shuiPing'].astype(str) + test['daxiaoType']
    
#     test['daxiaoTypeStartMid'] = test['masterOdd_Start_Zhong_3_shuiPing'].astype(str) + test['daxiaoTypeStartMid']
#     test['daxiaoTypeMid'] = test['masterOdd_End_Zhong_3_shuiPing'].astype(str) + test['daxiaoTypeMid']
 
#     test['daxiaoTypeALL'] = test['daxiaoTypeStart'] + test['daxiaoType'] + test['zhongbifengNew']
#     test['daxiaoTypeMidALL'] = test['daxiaoTypeStartMid'] + test['daxiaoTypeMid'] + test['zhongbifengNew']
    
#     test['daxiaoPankou'] = test['pankou_Start_Ji_3'] + test['pankouOdd_End_Ji_3']
#     test['daxiaoPankouMid'] = test['pankou_Start_Zhong_3Real'] + test['pankouOdd_End_Zhong_3Real']
#     test['daxiaoPankouALL'] = test['daxiaoPankou'] + test['daxiaoPankouMid'] + test['zhongbifengNew']

#     test['MasterOddFlowPankou'] = test["pankouOdd_End_Ji_3"].astype(float) - test["pankou_Start_Ji_3"].astype(float)
#     test['MasterOddFlowPankouMid'] = test["pankou_Start_Zhong_3"].astype(float) - test["pankouOdd_End_Zhong_3"].astype(float)
#     test['MasterOddFlow'] = test["masterOdd_Start_Ji_3"] - test["masterOdd_End_Ji_3"]
    
#     test['daxiaoTypeStartMidLast'] = test['masterOdd_Start_Zhong_3_shuiPing'].astype(str) + test['daxiaoTypeStartMid'] + test['lastBifeng'].astype(str)
#     test['daxiaoTypeMidLast'] = test['masterOdd_End_Zhong_3_shuiPing'].astype(str) + test['daxiaoTypeMid'] + test['lastBifeng'].astype(str)
#     test['daxiaoTypeStartLast'] = test['masterOdd_Start_Ji_3_shuiPing'].astype(str) + test['daxiaoTypeStart'] + test['lastBifeng'].astype(str)
#     test['daxiaoTypeLast'] = test['masterOdd_End_Ji_3_shuiPing'].astype(str) + test['daxiaoType'] + test['lastBifeng'].astype(str)
#     test['daxiaoPankouLast'] = test['pankou_Start_Ji_3'] + test['pankouOdd_End_Ji_3'] + test['lastBifeng'].astype(str)
#     test['daxiaoPankouMidLast'] = test['pankou_Start_Zhong_3Real'] + test['pankouOdd_End_Zhong_3Real'] + test['lastBifeng'].astype(str)
#     test['daxiaoPankouALLLast'] = test['daxiaoPankou'] + test['daxiaoPankouMid'] + test['zhongbifengNew'] + test['lastBifeng'].astype(str)
        
#     test = test.drop(['masterOdd_Start_Ji_3_shuiPing', 'masterOdd_End_Ji_3_shuiPing',
#                       'masterOdd_Start_Zhong_3_shuiPing','masterOdd_End_Zhong_3_shuiPing'], axis=1)
    
#     test = test.drop(['guestOdd_Start_Ji_3_shuiPing', 'guestOdd_End_Ji_3_shuiPing',
#                       'guestOdd_Start_Zhong_3_shuiPing','guestOdd_End_Zhong_3_shuiPing'], axis=1)
    
    test = test.drop(columns=['masterGoal', 'guestGoal', "goalMid"])
    test = test.drop(columns=['midGuestGoal', 'midMasterGoal',"time"])
    
    test["result4565"] =  test.apply(lambda x: getResult65(x['zhongbifengNew'], x['lastBifeng65']), axis=1)
    test = test[test["result4565"] == 1]
    
    test["result6580"] =  test.apply(lambda x: getResult65(x['lastBifeng65'], x['lastBifeng80']), axis=1)
    test = test[test["result6580"] == 0]
    
    test["result"] =  test.apply(lambda x: getResult65(x['lastBifeng80'], x['bifeng']), axis=1)
    test = test[test["result"] >= 0]
    test["result"] = test["result"].map(getResultNew)
    test = test.drop(["bifeng"], axis=1)
    return test

In [8]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["soccerData"]
mycol = mydb["win007657580"]
data_mid = pd.DataFrame(list(mycol.find({"time":{"$gte":"2015-01-01 00:00:00","$lt":"2021-01-01 00:00:00"}})))
data_mid = data_mid.drop(['_id','place'], axis=1)
data_mid = data_mid.dropna()
data_mid = reduce_mem_usage(data_mid)
data_mid = preF(data_mid)

In [9]:
train = data_mid

# train.to_csv("train_5.txt")

In [10]:
train.head()

Unnamed: 0,lianShai,guestOdd_End_65_3,guestOdd_End_75_3,guestOdd_End_80_3,guestOdd_End_Ji_3,guestOdd_End_Zhong_3,guestOdd_Start_Ji_3,guestOdd_Start_Zhong_3,lastBifeng65,lastBifeng75,...,pankouOdd_End_Zhong_3Real,pankou_Start_Zhong_3Real,pankou_Start_Ji_3Real,pankouOdd_End_Ji_3Real,pankouOdd_End_65_3Real,pankouOdd_End_75_3Real,pankouOdd_End_80_3Real,result4565,result6580,result
4,西青U19,0.78,0.89,0.77,0.96,0.83,0.96,0.83,1-2,1-2,...,2.0,2.0,1.75,1.75,2.25,1.75,1.5,1,0,1
5,西丁,0.79,0.63,0.42,0.8,0.74,0.85,0.99,1-1,1-1,...,1.5,1.25,1.25,1.25,1.75,1.5,1.5,1,0,1
12,球会友谊,0.85,0.71,0.47,0.93,0.83,0.88,0.83,1-1,1-1,...,1.5,1.5,2.0,2.0,1.75,1.5,1.5,1,0,0
17,印度U18,0.9,0.92,0.61,0.8,0.86,0.95,1.05,1-3,1-3,...,1.5,1.25,-0.25,-0.25,1.75,1.5,1.5,1,0,1
21,西丙4,0.98,0.61,0.45,0.96,1.05,0.93,1.09,2-0,2-0,...,1.0,1.0,1.0,1.0,1.5,1.5,1.5,1,0,1


In [11]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["soccerData"]
mycol = mydb["win007657580"]
test = pd.DataFrame(list(mycol.find({"time":{"$gte":"2021-01-01 00:00:00","$lt":"2022-01-01 00:00:00"}})))
test = test.dropna()
test = test.drop(['_id','place'], axis=1)
print(test.shape)
test = preF(test)
print(test.shape)

(34259, 30)
(7543, 37)


In [12]:
pd.set_option('display.max_columns', None)
train.head()

Unnamed: 0,lianShai,guestOdd_End_65_3,guestOdd_End_75_3,guestOdd_End_80_3,guestOdd_End_Ji_3,guestOdd_End_Zhong_3,guestOdd_Start_Ji_3,guestOdd_Start_Zhong_3,lastBifeng65,lastBifeng75,lastBifeng80,masterOdd_End_65_3,masterOdd_End_75_3,masterOdd_End_80_3,masterOdd_End_Ji_3,masterOdd_End_Zhong_3,masterOdd_Start_Ji_3,masterOdd_Start_Zhong_3,pankouOdd_End_65_3,pankouOdd_End_75_3,pankouOdd_End_80_3,pankouOdd_End_Ji_3,pankouOdd_End_Zhong_3,pankou_Start_Ji_3,pankou_Start_Zhong_3,zhongbifengNew,allbifeng,pankouOdd_End_Zhong_3Real,pankou_Start_Zhong_3Real,pankou_Start_Ji_3Real,pankouOdd_End_Ji_3Real,pankouOdd_End_65_3Real,pankouOdd_End_75_3Real,pankouOdd_End_80_3Real,result4565,result6580,result
4,西青U19,0.78,0.89,0.77,0.96,0.83,0.96,0.83,1-2,1-2,1-2,1.02,0.91,1.03,0.8,0.97,0.8,0.97,4.25,3.75,3.5,3.75,4.0,3.75,4.0,0-2,1-2-0-2,2.0,2.0,1.75,1.75,2.25,1.75,1.5,1,0,1
5,西丁,0.79,0.63,0.42,0.8,0.74,0.85,0.99,1-1,1-1,1-1,1.01,1.2,1.61,1.0,1.06,0.95,0.81,2.75,2.5,2.5,2.25,2.5,2.25,2.25,1-0,1-1-1-0,1.5,1.25,1.25,1.25,1.75,1.5,1.5,1,0,1
12,球会友谊,0.85,0.71,0.47,0.93,0.83,0.88,0.83,1-1,1-1,1-1,1.01,1.17,1.63,0.87,1.03,0.92,1.03,2.75,2.5,2.5,3.0,2.5,3.0,2.5,1-0,1-1-1-0,1.5,1.5,2.0,2.0,1.75,1.5,1.5,1,0,0
17,印度U18,0.9,0.92,0.61,0.8,0.86,0.95,1.05,1-3,1-3,1-3,0.9,0.88,1.23,0.9,0.94,0.75,0.75,4.75,4.5,4.5,2.75,4.5,2.75,4.25,1-2,1-3-1-2,1.5,1.25,-0.25,-0.25,1.75,1.5,1.5,1,0,1
21,西丙4,0.98,0.61,0.45,0.96,1.05,0.93,1.09,2-0,2-0,2-0,0.82,1.23,1.53,0.84,0.75,0.87,0.71,2.5,2.5,2.5,2.0,2.0,2.0,2.0,1-0,2-0-1-0,1.0,1.0,1.0,1.0,1.5,1.5,1.5,1,0,1


In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21587 entries, 4 to 98981
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   lianShai                   21587 non-null  object 
 1   guestOdd_End_65_3          21587 non-null  float64
 2   guestOdd_End_75_3          21587 non-null  float64
 3   guestOdd_End_80_3          21587 non-null  float64
 4   guestOdd_End_Ji_3          21587 non-null  float64
 5   guestOdd_End_Zhong_3       21587 non-null  float64
 6   guestOdd_Start_Ji_3        21587 non-null  float64
 7   guestOdd_Start_Zhong_3     21587 non-null  float64
 8   lastBifeng65               21587 non-null  object 
 9   lastBifeng75               21587 non-null  object 
 10  lastBifeng80               21587 non-null  object 
 11  masterOdd_End_65_3         21587 non-null  float64
 12  masterOdd_End_75_3         21587 non-null  float64
 13  masterOdd_End_80_3         21587 non-null  flo

In [14]:
# from itertools import combinations, permutations
# features = ['lianShai','daxiaoPankouALL','daxiaoTypeALL','allbifengNew','daxiaoType']
# cross_features=list(combinations(features, 2))
# a1 = [] 
# for item in cross_features:
#     train[item[0]+'_'+item[1]]=train[item[0]].astype(str)+train[item[1]].astype(str)
#     test[item[0]+'_'+item[1]]=test[item[0]].astype(str)+test[item[1]].astype(str)
#     a1.append(item[0]+'_'+item[1])

In [15]:
# cols = train.select_dtypes(include='object').columns
# from itertools import combinations, permutations
# features = cols
# cross_features=list(combinations(features, 2))
# for item in cross_features:
#     train[item[0]+'_'+item[1]]=train[item[0]].astype(str)+train[item[1]].astype(str)
#     test[item[0]+'_'+item[1]]=test[item[0]].astype(str)+test[item[1]].astype(str)

In [16]:
# from itertools import combinations, permutations
# features = ['lianShai','daxiaoPankouALL','daxiaoTypeALL','allbifengNew','daxiaoType','lastBifeng']
# cross_features=list(combinations(features, 2))
# a1 = [] 
# for item in cross_features:
#     train[item[0]+'_'+item[1]]=train[item[0]].astype(str)+train[item[1]].astype(str)
#     test[item[0]+'_'+item[1]]=test[item[0]].astype(str)+test[item[1]].astype(str)
#     a1.append(item[0]+'_'+item[1])

In [17]:
# features2 = ['masterOdd_End_Ji_3', 'masterOdd_End_Zhong_3', 'masterOdd_End_75_3']
# cross_features=list(combinations(features2, 2))
# a2 = []
# for item in cross_features:
#         train[item[0]+'+'+item[1]]=train[item[0]]+train[item[1]]
#         train[item[0]+'-'+item[1]]=train[item[0]]-train[item[1]]
#         train[item[0]+'/'+item[1]]=train[item[0]]/train[item[1]]
#         train[item[0]+'*'+item[1]]=train[item[0]]*train[item[1]]
        
#         test[item[0]+'+'+item[1]] = test[item[0]]+test[item[1]]
#         test[item[0]+'-'+item[1]] = test[item[0]]-test[item[1]]
#         test[item[0]+'/'+item[1]] = test[item[0]]/test[item[1]]
#         test[item[0]+'*'+item[1]] = test[item[0]]*test[item[1]]
        
#         a1.append(item[0]+'+'+item[1])
#         a1.append(item[0]+'-'+item[1])
#         a1.append(item[0]+'/'+item[1])
#         a1.append(item[0]+'*'+item[1])

In [18]:
# from itertools import combinations, permutations
# features = ['lianShai','daxiaoPankouALL','daxiaoTypeALL','daxiaoTypeMidALL','daxiaoType','pankouOdd_End_Ji_3Real','MasterOddFlow']
# cross_features=list(combinations(features, 2))
# a1 = [] 
# for item in cross_features:
#     train[item[0]+'_'+item[1]]=train[item[0]].astype(str)+train[item[1]].astype(str)
#     test[item[0]+'_'+item[1]]=test[item[0]].astype(str)+test[item[1]].astype(str)
#     a1.append(item[0]+'_'+item[1])

In [19]:
train.head()

Unnamed: 0,lianShai,guestOdd_End_65_3,guestOdd_End_75_3,guestOdd_End_80_3,guestOdd_End_Ji_3,guestOdd_End_Zhong_3,guestOdd_Start_Ji_3,guestOdd_Start_Zhong_3,lastBifeng65,lastBifeng75,lastBifeng80,masterOdd_End_65_3,masterOdd_End_75_3,masterOdd_End_80_3,masterOdd_End_Ji_3,masterOdd_End_Zhong_3,masterOdd_Start_Ji_3,masterOdd_Start_Zhong_3,pankouOdd_End_65_3,pankouOdd_End_75_3,pankouOdd_End_80_3,pankouOdd_End_Ji_3,pankouOdd_End_Zhong_3,pankou_Start_Ji_3,pankou_Start_Zhong_3,zhongbifengNew,allbifeng,pankouOdd_End_Zhong_3Real,pankou_Start_Zhong_3Real,pankou_Start_Ji_3Real,pankouOdd_End_Ji_3Real,pankouOdd_End_65_3Real,pankouOdd_End_75_3Real,pankouOdd_End_80_3Real,result4565,result6580,result
4,西青U19,0.78,0.89,0.77,0.96,0.83,0.96,0.83,1-2,1-2,1-2,1.02,0.91,1.03,0.8,0.97,0.8,0.97,4.25,3.75,3.5,3.75,4.0,3.75,4.0,0-2,1-2-0-2,2.0,2.0,1.75,1.75,2.25,1.75,1.5,1,0,1
5,西丁,0.79,0.63,0.42,0.8,0.74,0.85,0.99,1-1,1-1,1-1,1.01,1.2,1.61,1.0,1.06,0.95,0.81,2.75,2.5,2.5,2.25,2.5,2.25,2.25,1-0,1-1-1-0,1.5,1.25,1.25,1.25,1.75,1.5,1.5,1,0,1
12,球会友谊,0.85,0.71,0.47,0.93,0.83,0.88,0.83,1-1,1-1,1-1,1.01,1.17,1.63,0.87,1.03,0.92,1.03,2.75,2.5,2.5,3.0,2.5,3.0,2.5,1-0,1-1-1-0,1.5,1.5,2.0,2.0,1.75,1.5,1.5,1,0,0
17,印度U18,0.9,0.92,0.61,0.8,0.86,0.95,1.05,1-3,1-3,1-3,0.9,0.88,1.23,0.9,0.94,0.75,0.75,4.75,4.5,4.5,2.75,4.5,2.75,4.25,1-2,1-3-1-2,1.5,1.25,-0.25,-0.25,1.75,1.5,1.5,1,0,1
21,西丙4,0.98,0.61,0.45,0.96,1.05,0.93,1.09,2-0,2-0,2-0,0.82,1.23,1.53,0.84,0.75,0.87,0.71,2.5,2.5,2.5,2.0,2.0,2.0,2.0,1-0,2-0-1-0,1.0,1.0,1.0,1.0,1.5,1.5,1.5,1,0,1


In [20]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7543 entries, 3 to 34257
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   lianShai                   7543 non-null   object 
 1   guestOdd_End_65_3          7543 non-null   float64
 2   guestOdd_End_75_3          7543 non-null   float64
 3   guestOdd_End_80_3          7543 non-null   float64
 4   guestOdd_End_Ji_3          7543 non-null   float64
 5   guestOdd_End_Zhong_3       7543 non-null   float64
 6   guestOdd_Start_Ji_3        7543 non-null   float64
 7   guestOdd_Start_Zhong_3     7543 non-null   float64
 8   lastBifeng65               7543 non-null   object 
 9   lastBifeng75               7543 non-null   object 
 10  lastBifeng80               7543 non-null   object 
 11  masterOdd_End_65_3         7543 non-null   float64
 12  masterOdd_End_75_3         7543 non-null   float64
 13  masterOdd_End_80_3         7543 non-null   floa

In [21]:
from sklearn.model_selection import  StratifiedKFold,KFold
import category_encoders as ce
import gc
def mean_woe_target_encoder(train,test,target,col,n_splits=10):
    folds = StratifiedKFold(n_splits)

    y_oof = np.zeros(train.shape[0])
    y_oof_2= np.zeros(train.shape[0])
    y_test_oof = np.zeros(test.shape[0]).reshape(-1,1)
    y_test_oof2 = np.zeros(test.shape[0]).reshape(-1,1)

    splits = folds.split(train, target)
    
    for fold_n, (train_index, valid_index) in enumerate(splits):
        X_train, X_valid = train[col].iloc[train_index], train[col].iloc[valid_index]
        y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
        clf=ce.target_encoder.TargetEncoder()
    
        clf.fit(X_train.values,y_train.values)    
        y_pred_valid = clf.transform(X_valid.values)

        y_oof[valid_index] = y_pred_valid.values.reshape(1,-1)

        tp=(clf.transform(test[col].values)/(n_splits*1.0)).values
        tp=tp.reshape(-1,1)
        y_test_oof+=tp    
    
        del X_train, X_valid, y_train, y_valid
        gc.collect()    
    return y_oof,y_test_oof

In [22]:
train_x =  train.drop(columns=['result'])
train_y =  train['result']

test_x =  test.drop(columns=['result'])
test_y = test['result']

In [23]:
for col in train.select_dtypes(include='object').columns:
  y_oof,y_test_oof = mean_woe_target_encoder(train_x,test_x,train_y,col,n_splits=10)
  train_x[col] = y_oof
  test_x[col] = y_test_oof

In [24]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21587 entries, 4 to 98981
Data columns (total 36 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   lianShai                   21587 non-null  float64
 1   guestOdd_End_65_3          21587 non-null  float64
 2   guestOdd_End_75_3          21587 non-null  float64
 3   guestOdd_End_80_3          21587 non-null  float64
 4   guestOdd_End_Ji_3          21587 non-null  float64
 5   guestOdd_End_Zhong_3       21587 non-null  float64
 6   guestOdd_Start_Ji_3        21587 non-null  float64
 7   guestOdd_Start_Zhong_3     21587 non-null  float64
 8   lastBifeng65               21587 non-null  float64
 9   lastBifeng75               21587 non-null  float64
 10  lastBifeng80               21587 non-null  float64
 11  masterOdd_End_65_3         21587 non-null  float64
 12  masterOdd_End_75_3         21587 non-null  float64
 13  masterOdd_End_80_3         21587 non-null  flo

In [25]:
cat_feats = train.select_dtypes(include='category').columns

In [26]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import StratifiedKFold
import gc,joblib
X = train_x
y = train_y
# del train_x,train_y
# gc.collect()


params = {'num_leaves': 240,  # 结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary',  # 定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # 提取的特征比率
          "bagging_freq": 4,
          "bagging_fraction": 0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # l1正则
          'lambda_l2': 5.985747612243422e-07,  # l2正则
          "verbosity": -1,
          "nthread": -1,  # 线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'average_precision','auc'},  # 评价函数选择
          "random_state": 2019,  # 随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          'min_child_samples': 67
          }

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(test_x.shape[0])
y_preds_train  = np.zeros(train_x.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 1000, valid_sets=[
                    dtrain, dvalid], verbose_eval=200, early_stopping_rounds=400)

    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()

    y_pred_valid = clf.predict(X_valid)
    print(y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")

    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(test_x) / NFOLDS
    
    model = 'C:\\Users\\24525\\code\\spider\\model\\' + 'gbm_2' + str(fold_n) + "_singe.txt"
#     joblib.dump(clf,model)
    
    y_pred_valid = clf.predict(X_valid)
    y_preds_train[valid_index] = y_pred_valid
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(train_y, y_preds_train)}")

Training until validation scores don't improve for 400 rounds
[200]	training's auc: 0.998728	valid_1's auc: 0.544603
[400]	training's auc: 0.99986	valid_1's auc: 0.54366
Early stopping, best iteration is:
[13]	training's auc: 0.834207	valid_1's auc: 0.557967
[0.4180274  0.39552357 0.40908346 ... 0.38946006 0.40882831 0.41774   ]
Fold 1 | AUC: 0.5579668941117008
Training until validation scores don't improve for 400 rounds
[200]	training's auc: 0.998542	valid_1's auc: 0.554607
[400]	training's auc: 0.999731	valid_1's auc: 0.548382
Early stopping, best iteration is:
[115]	training's auc: 0.987741	valid_1's auc: 0.558895
[0.25513572 0.43246814 0.50207828 ... 0.28886773 0.29669303 0.31074217]
Fold 2 | AUC: 0.5588951767211813
Training until validation scores don't improve for 400 rounds
[200]	training's auc: 0.998742	valid_1's auc: 0.550873
[400]	training's auc: 0.999989	valid_1's auc: 0.543492
[600]	training's auc: 0.999999	valid_1's auc: 0.542689
Early stopping, best iteration is:
[201]	t

In [27]:
from sklearn.metrics import roc_auc_score,accuracy_score
test_new = pd.DataFrame({'result':test_y,'predict':y_preds})
a = test_new[(test_new["predict"] <= 0.50)]
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)

accuarcy: 61.81%
(7298, 2)


In [28]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7543 entries, 3 to 34257
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   lianShai                   7543 non-null   object 
 1   guestOdd_End_65_3          7543 non-null   float64
 2   guestOdd_End_75_3          7543 non-null   float64
 3   guestOdd_End_80_3          7543 non-null   float64
 4   guestOdd_End_Ji_3          7543 non-null   float64
 5   guestOdd_End_Zhong_3       7543 non-null   float64
 6   guestOdd_Start_Ji_3        7543 non-null   float64
 7   guestOdd_Start_Zhong_3     7543 non-null   float64
 8   lastBifeng65               7543 non-null   object 
 9   lastBifeng75               7543 non-null   object 
 10  lastBifeng80               7543 non-null   object 
 11  masterOdd_End_65_3         7543 non-null   float64
 12  masterOdd_End_75_3         7543 non-null   float64
 13  masterOdd_End_80_3         7543 non-null   floa

In [29]:
from sklearn.metrics import roc_auc_score,accuracy_score

for i in [0.50,0.51,0.52,0.53,0.54,0.55,0.56,0.57,0.58,0.59,0.60,0.61,0.62,0.63,0.64,0.65,0.66,0.67,0.68,0.69,0.70]:
    test_new = pd.DataFrame({'result':test_y,'predict':y_preds})
    a = test_new[test_new["predict"] >= i]
    pred_labels = np.rint(a['predict'])
    accuracy = accuracy_score(a['result'], pred_labels)
    print(i,"accuarcy: %.2f%%" % (accuracy*100.0),a.shape[0])
    print((a.shape[0] * accuracy * 0.95) - (a.shape[0] * (1-accuracy)))

0.5 accuarcy: 41.22% 245
-48.05000000000001
0.51 accuarcy: 41.05% 190
-37.90000000000002
0.52 accuarcy: 41.79% 134
-24.800000000000004
0.53 accuarcy: 38.14% 97
-24.85
0.54 accuarcy: 44.83% 58
-7.300000000000001
0.55 accuarcy: 47.50% 40
-2.9499999999999993
0.56 accuarcy: 53.57% 28
1.25
0.57 accuarcy: 47.37% 19
-1.4500000000000028
0.58 accuarcy: 50.00% 14
-0.35000000000000053
0.59 accuarcy: 54.55% 11
0.6999999999999993
0.6 accuarcy: 71.43% 7
2.75
0.61 accuarcy: 50.00% 4
-0.10000000000000009
0.62 accuarcy: 33.33% 3
-1.05
0.63 accuarcy: 0.00% 1
-1.0
0.64 accuarcy: 0.00% 1
-1.0
0.65 accuarcy: nan% 0
nan
0.66 accuarcy: nan% 0
nan
0.67 accuarcy: nan% 0
nan
0.68 accuarcy: nan% 0
nan
0.69 accuarcy: nan% 0
nan
0.7 accuarcy: nan% 0
nan


In [30]:
test2 = test.copy(deep=True)
test2["result"] = test_y
test2['predict'] = y_preds
pred_labels = np.rint(test2['predict'])
test2["perRes"] = pred_labels

# import webbrowser
# for i in test2[(test2["predict"] >= 0.66)].to_dict(orient='records')[10:15]:
#     url = "http://vip.win007.com/OverDown_n.aspx?id=" + i["place"]
#     webbrowser.open(url)

In [31]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21587 entries, 4 to 98981
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   lianShai                   21587 non-null  object 
 1   guestOdd_End_65_3          21587 non-null  float64
 2   guestOdd_End_75_3          21587 non-null  float64
 3   guestOdd_End_80_3          21587 non-null  float64
 4   guestOdd_End_Ji_3          21587 non-null  float64
 5   guestOdd_End_Zhong_3       21587 non-null  float64
 6   guestOdd_Start_Ji_3        21587 non-null  float64
 7   guestOdd_Start_Zhong_3     21587 non-null  float64
 8   lastBifeng65               21587 non-null  object 
 9   lastBifeng75               21587 non-null  object 
 10  lastBifeng80               21587 non-null  object 
 11  masterOdd_End_65_3         21587 non-null  float64
 12  masterOdd_End_75_3         21587 non-null  float64
 13  masterOdd_End_80_3         21587 non-null  flo

In [32]:
moneyList = []
for per in [0.50,0.51,0.52,0.53]:
    money = 4000 
    for i in test2[(test2["predict"] >= per) & (test["masterOdd_End_75_3"] <= 2.00)].to_dict(orient='records'):
        hand = 200
        money = money - hand
        if money < 0:
            break
        if i["result"] == 1 and i["perRes"] == 1:
            money = money + hand * (1+i['masterOdd_End_80_3']+0.1)
#             money = money + hand * (2)

    print(money,per)

-32.0 0.5
-164.0 0.51
1220.0 0.52
-58.0 0.53


In [33]:
moneyList = []
test2 = test2[test2['lastBifeng']==test2['lastBifeng130']]
for per in [0.50,0.51,0.52,0.53,0.54,0.55,0.56,0.57,0.58,0.59,0.60,0.61,0.62,0.63,0.64,0.65,0.66,0.67,0.68,0.69,0.70]:
    money = 2000
    for i in test2[(test2["predict"] >= per) & (test["lastTime"] <= 75) ].to_dict(orient='records'):
        hand = 50
        money = money - hand

        if i["result"] == 1 and i["perRes"] == 1:
            money = money + hand * (1+i['masterOdd_End_130_3'])
#             money = money + hand * (2)
        if money < 0:
            break
    print(money,per)

KeyError: 'lastBifeng'

In [None]:
a = test2[(test2['pankouOdd_End_Zhong_3Real']==1.75) & (test2["predict"]  >=0.50)]
a = test2[(test2["predict"]  >=0.50)]
# a = a[(a['zhongbifengNew'] == '1-1') | (a['zhongbifengNew'] == '0-1') |(a['zhongbifengNew'] == '1-0')| (a['zhongbifengNew'] == '0-0')]
accuracy = accuracy_score(a['result'], np.rint(a["predict"]))
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)
a['pankouOdd_End_Zhong_3Real'].value_counts()

In [None]:
a = test2[(test2['pankouOdd_End_Zhong_3Real']==1.75) & (test2["predict"] >=0.50)]
# a = a[a['zhongbifengNew'] == '0-0']
# a = a[(a['zhongbifengNew'] != '1-1') & (a['zhongbifengNew'] != '0-1') & (a['zhongbifengNew'] != '1-0') & (a['zhongbifengNew'] != '0-0')]
# a = a[(a['zhongbifengNew'] == '1-1') | (a['zhongbifengNew'] == '0-1') |(a['zhongbifengNew'] == '1-0')| (a['zhongbifengNew'] == '0-0')]
# a = a[(a['zhongbifengNew'] == '1-0')]
# a = a[a['lastBifeng'] == '0-2']
# a = a[a['pankouOdd_End_Ji_3'] != "3"]
print(a.shape)
accuracy = accuracy_score(a['result'], np.rint(a["predict"]))
print("accuarcy: %.2f%%" % (accuracy*100.0))
a['lastTime'].value_counts()

In [None]:
a = test2[(test2['pankouOdd_End_Zhong_3Real']==2.0) & (test2["predict"]  >=0.50)]
# a = a[(a['zhongbifengNew'] != '1-1') & (a['zhongbifengNew'] != '0-1') & (a['zhongbifengNew'] != '1-0') & (a['zhongbifengNew'] != '0-0')]
# a = a[(a['zhongbifengNew'] == '1-1') | (a['zhongbifengNew'] == '0-1') |(a['zhongbifengNew'] == '1-0')| (a['zhongbifengNew'] == '0-0')]
accuracy = accuracy_score(a['result'], np.rint(a["predict"]))
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)
a['zhongbifengNew'].value_counts()

In [None]:
a = test2[((test2['pankouOdd_End_Zhong_3Real']==1.5) | (test2['pankouOdd_End_Zhong_3Real']==1.25)) & (test2["predict"] >=0.65)]
accuracy = accuracy_score(a['result'], np.rint(a["predict"]))
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)

In [None]:
zhongList = test2["zhongbifengNew"].unique()

In [None]:
clf.feature_importance(importance_type='gain')

In [None]:
columns

In [None]:
import seaborn as sns
feature_importance_gain=pd.DataFrame()
feature_importance_gain['feature']=columns
feature_importance_gain['fold_1']=clf.feature_importance(importance_type='gain')
feature_importance_gain['average'] = feature_importances[[f'fold_{fold_n + 1}' for fold_n in range(1)]].mean(axis=1)
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importance_gain.sort_values(by='average', ascending=False).head(50), x='average', y='feature');
plt.title('100 TOP feature importance over {} folds average'.format(folds.n_splits));

In [None]:
ss

In [None]:
# import seaborn as sns
# sns.set()

# import sweetviz as sv
# # 可以选择目标特征
# my_report = sv.analyze(train, target_feat ='result')
# my_report.show_html()

In [None]:
lgb_params = {
    'objective': 'binary',
    'n_estimators': 20000,
    'random_state': 42,
    'learning_rate': 8e-3,
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 20,
}


xgb_params = {'n_estimators': 10000,
               'learning_rate': 0.03689407512484644,
               'max_depth': 8,
               'colsample_bytree': 0.3723914688159835,
               'subsample': 0.780714581166012,
               'eval_metric': 'auc',
               'use_label_encoder': False,
               'gamma': 0,
               'reg_lambda': 50.0,
               'gpu_id': 0,
               'objective': 'binary:logistic',
               'random_state': 42}


cat_params = {'iterations': 17298,
               'learning_rate': 0.03429054860458741,
               'reg_lambda': 0.3242286463210283,
               'subsample': 0.9433911589913944,
               'random_strength': 22.4849972385133,
               'depth': 8,
               'min_data_in_leaf': 4,
               'leaf_estimation_iterations': 8,
               'task_type':"GPU",
               'bootstrap_type':'Poisson',
               'verbose' : 500,
               'early_stopping_rounds' : 200,
               'eval_metric' : 'AUC',
               'thread_count':-1}

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

lgbm = LGBMClassifier(**lgb_params)

xgb = XGBClassifier(**xgb_params)

cat = CatBoostClassifier(**cat_params)

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier

lr = LogisticRegression()

sclf = StackingCVClassifier(classifiers=[lgbm, xgb,cat],
                            use_probas=True,
                            meta_classifier=lr,
                            random_state=42)

In [None]:
statcker = sclf.fit(train_x, train_y)

In [None]:
y_pred = statcker.predict_proba(test_x)[:, 1]
accuracy = roc_auc_score(test_y, y_pred)
print("accuarcy: %.2f%%" % (accuracy*100.0))

In [None]:
categorical_feats = train.select_dtypes(include='object').columns

In [None]:
data = pd.concat([train_x, train_y], axis=1)

In [None]:
test2 = test.copy(deep=True)
test2["result"] = test_y
test2['predict'] = y_preds
pred_labels = np.rint(test2['predict'])
test2["perRes"] = pred_labels


In [None]:
from sklearn.metrics import roc_auc_score,accuracy_score
test_new = pd.DataFrame({'result':test_y,'predict':y_pred})
a = test_new[(test_new["predict"] >= 0.66)]
pred_labels = np.rint(a['predict'])
accuracy = accuracy_score(a['result'], pred_labels)
print("accuarcy: %.2f%%" % (accuracy*100.0))
print(a.shape)

In [None]:
from sklearn.metrics import roc_auc_score,accuracy_score

for i in [0.50,0.51,0.52,0.53,0.54,0.55,0.56,0.57,0.58,0.59,0.60,0.61,0.62,0.63,0.64,0.65,0.66,0.67,0.68,0.69,0.70]:
    test_new = pd.DataFrame({'result':test_y,'predict':y_pred})
    a = test_new[test_new["predict"] >= i]
    pred_labels = np.rint(a['predict'])
    accuracy = accuracy_score(a['result'], pred_labels)
    print(i,"accuarcy: %.2f%%" % (accuracy*100.0),a.shape[0])
    print((a.shape[0] * accuracy * 0.95) - (a.shape[0] * (1-accuracy)))

In [None]:
def get_feature_importances(data, shuffle, seed=None):
    # Gather real features
    train_features = [f for f in data if f not in ['result']]
    # Go over fold and keep track of CV score (train and valid) and feature importances
    
    # Shuffle result if required
    y = data['result'].copy()
    if shuffle:
        # Here you could as well use a binomial distribution
        y = data['result'].copy().sample(frac=1.0)
    
    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
    dtrain = lgb.Dataset(data[train_features], y, free_raw_data=False, silent=True)
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 8,
        'seed': seed,
        'bagging_freq': 1,
        'n_jobs': 4
    }
    
    # Fit the model
    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200, categorical_feature=categorical_feats)
    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    imp_df['trn_score'] = roc_auc_score(y, clf.predict(data[train_features]))
    
    return imp_df

In [None]:
# Seed the unexpected randomness of this world
np.random.seed(123)
# Get the actual importance, i.e. without shuffling
actual_imp_df = get_feature_importances(data=data, shuffle=False) 

In [None]:
null_imp_df = pd.DataFrame()
nb_runs = 80
import time
start = time.time()
dsp = ''
for i in range(nb_runs):
    # Get current run importances
    imp_df = get_feature_importances(data=data, shuffle=True)
    imp_df['run'] = i + 1 
    # Concat the latest importances with the old ones
    null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
    # Erase previous message
    for l in range(len(dsp)):
        print('\b', end='', flush=True)
    # Display current run and time used
    spent = (time.time() - start) / 60
    dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
    print(dsp, end='', flush=True)

In [None]:
feature_scores = []
for _f in actual_imp_df['feature'].unique():
    f_null_imps_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_gain'].values
    f_act_imps_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_gain'].mean()
    gain_score = np.log(1e-10 + f_act_imps_gain / (1 + np.percentile(f_null_imps_gain, 75)))  # Avoid didvide by zero
    
    f_null_imps_split = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_split'].values
    f_act_imps_split = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_split'].mean()
    split_score = np.log(1e-10 + f_act_imps_split / (1 + np.percentile(f_null_imps_split, 75)))  # Avoid didvide by zero
    
    feature_scores.append((_f, split_score, gain_score))
 
scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])

In [None]:
correlation_scores = []
for _f in actual_imp_df['feature'].unique():
    f_null_imps = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_gain'].values
    f_act_imps = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_gain'].values
    gain_score = 100 * (f_null_imps < np.percentile(f_act_imps, 25)).sum() / f_null_imps.size
    
    f_null_imps = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_split'].values
    f_act_imps = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_split'].values
    split_score = 100 * (f_null_imps < np.percentile(f_act_imps, 25)).sum() / f_null_imps.size
    correlation_scores.append((_f, split_score, gain_score))
 
corr_scores_df = pd.DataFrame(correlation_scores, columns=['feature', 'split_score', 'gain_score'])

In [None]:
def score_feature_selection(df=None, train_features=None, cat_feats=None, target=None):
    # Fit LightGBM 
    dtrain = lgb.Dataset(df[train_features], target, free_raw_data=False, silent=True)
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'learning_rate': .1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'num_leaves': 31,
        'max_depth': -1,
        'seed': 13,
        'n_jobs': 4,
        'min_split_gain': .00001,
        'reg_alpha': .00001,
        'reg_lambda': .00001,
        'metric': 'auc'
    }
    
    # Fit the model
    hist = lgb.cv(
        params=lgb_params, 
        train_set=dtrain, 
        num_boost_round=2000,
        categorical_feature=cat_feats,
        nfold=5,
        stratified=True,
        shuffle=True,
        early_stopping_rounds=50,
        verbose_eval=0,
        seed=17
    )
    # Return the last mean / std values 
    return hist['auc-mean'][-1], hist['auc-stdv'][-1]
 
# features = [f for f in data.columns if f not in ['SK_ID_CURR', 'TARGET']]
# score_feature_selection(df=data[features], train_features=features, target=data['TARGET'])
 
for threshold in [0, 10, 20, 30 , 40, 50 ,60 , 70, 80 , 90, 95, 99]:
    split_feats     = [_f for _f, _score, _ in correlation_scores if _score >= threshold]
    split_cat_feats = [_f for _f, _score, _ in correlation_scores if (_score >= threshold) & (_f in categorical_feats)]
    
    gain_feats     = [_f for _f, _, _score in correlation_scores if _score >= threshold]
    gain_cat_feats = [_f for _f, _, _score in correlation_scores if (_score >= threshold) & (_f in categorical_feats)]
                                                                                             
    print('Results for threshold %3d' % threshold)
    split_results = score_feature_selection(df=data, train_features=split_feats, cat_feats=split_cat_feats, target=data['result'])
    print('\t SPLIT : %.6f +/- %.6f' % (split_results[0], split_results[1]))
    gain_results = score_feature_selection(df=data, train_features=gain_feats, cat_feats=gain_cat_feats, target=data['result'])
    print('\t GAIN  : %.6f +/- %.6f' % (gain_results[0], gain_results[1]))

In [None]:
col = []
for threshold in [70, 80 , 90, 95, 99]:
    split_feats     = [_f for _f, _score, _ in correlation_scores if _score >= threshold]
    split_cat_feats = [_f for _f, _score, _ in correlation_scores if (_score >= threshold) & (_f in categorical_feats)]
    col.append(split_feats)

In [None]:
for i in [0,1,2,3,4]:
    import lightgbm as lgb
    from sklearn.metrics import roc_auc_score,accuracy_score
    from sklearn.model_selection import StratifiedKFold
    import gc,joblib
    X = train_x[col[i]]
    y = train_y
    # del train_x,train_y
    # gc.collect()


    params = {'num_leaves': 240,  # 结果对最终效果影响较大，越大值越好，太大会出现过拟合
              'min_data_in_leaf': 30,
              'objective': 'binary',  # 定义的目标函数
              'max_depth': -1,
              'learning_rate': 0.03,
              "min_sum_hessian_in_leaf": 6,
              "boosting": "gbdt",
              "feature_fraction":  0.5992677823884304,  # 提取的特征比率
              "bagging_freq": 4,
              "bagging_fraction": 0.7100471696361973,
              "bagging_seed": 11,
              "lambda_l1": 8.545500456265467e-05,  # l1正则
              'lambda_l2': 5.985747612243422e-07,  # l2正则
              "verbosity": -1,
              "nthread": -1,  # 线程数量，-1表示全部线程，线程越多，运行的速度越快
              'metric': {'auc'},  # 评价函数选择
              "random_state": 2019,  # 随机数种子，可以防止每次运行的结果不一致
              # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
              'min_child_samples': 67
              }

    NFOLDS = 5
    folds = StratifiedKFold(n_splits=NFOLDS)

    columns = X.columns
    splits = folds.split(X, y)
    y_preds = np.zeros(test_x.shape[0])
    y_preds_train  = np.zeros(train_x.shape[0])
    score = 0

    feature_importances = pd.DataFrame()
    feature_importances['feature'] = columns

    for fold_n, (train_index, valid_index) in enumerate(splits):
        X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        dtrain = lgb.Dataset(X_train, label=y_train)
        dvalid = lgb.Dataset(X_valid, label=y_valid)

        clf = lgb.train(params, dtrain, 1000, valid_sets=[
                        dtrain, dvalid], verbose_eval=200, early_stopping_rounds=300)

        feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()

        y_pred_valid = clf.predict(X_valid)
        print(y_pred_valid)
        print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")

        score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
        y_preds += clf.predict(test_x[col[i]]) / NFOLDS

    #     model = 'C:\\Users\\24525\\code\\spider\\model\\' + 'gbm_2' + str(fold_n) + "_singe.txt"
    #     joblib.dump(clf,model)

        y_pred_valid = clf.predict(X_valid)
        y_preds_train[valid_index] = y_pred_valid

        del X_train, X_valid, y_train, y_valid
        gc.collect()

    print(f"\nMean AUC = {score}")
    print(f"Out of folds AUC = {roc_auc_score(train_y, y_preds_train)}")
    from sklearn.metrics import roc_auc_score,accuracy_score

    for i in [0.50,0.61,0.62,0.63,0.64,0.65,0.66,0.67,0.68]:
        test_new = pd.DataFrame({'result':test_y,'predict':y_preds})
        a = test_new[test_new["predict"] >= i]
        pred_labels = np.rint(a['predict'])
        accuracy = accuracy_score(a['result'], pred_labels)
        print(i,"accuarcy: %.2f%%" % (accuracy*100.0),a.shape[0])
        print((a.shape[0] * accuracy * 0.95) - (a.shape[0] * (1-accuracy)))