In [1]:
import json
import pandas as pd
import pymongo
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    * Signed-integer columns are converted to the narrowest of
      int8 / int16 / int32 / int64 whose range contains the column's min and
      max (bounds are inclusive, so -128 and 127 still fit in int8).
    * Float columns wider than 32 bits are converted to float32; this trades
      precision for memory.
    * Every other dtype (object, bool, unsigned int, datetime, pandas
      extension dtypes, ...) is left untouched.

    Memory usage before and after is printed. The same DataFrame object that
    was passed in is returned.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy dtypes are handled; extension dtypes are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            # Smallest-first search; an empty column yields NaN bounds, which
            # match no candidate, so the column keeps its dtype.
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df

In [3]:
def getGoalMid(masterGoal, guestGoal, masterMidGoal,guestMidGoal):
    """Return (masterGoal + guestGoal) minus both "Mid" goal counts, capped at 4."""
    remaining_goals = masterGoal + guestGoal - masterMidGoal - guestMidGoal
    return 4 if remaining_goals >= 4 else remaining_goals
        
def removeSub(pankou):
    """Remove every "升" / "降" marker from ``pankou`` and strip outer whitespace."""
    for marker in ("升", "降"):
        pankou = pankou.replace(marker, "")
    return pankou.strip()

def getResult(masterGoal, guestGoal, masterMidGoal,guestMidGoal,pankou):
    """Return 1 if the total goals reach the line encoded in ``pankou``, else 0.

    ``pankou`` is cleaned with removeSub, split on "/", and the parts are
    averaged to obtain the line. ``masterMidGoal`` and ``guestMidGoal`` are
    accepted but not used. If the comparison is neither ``>= 0`` nor ``< 0``
    (e.g. a NaN operand) the function returns None.
    """
    parts = removeSub(pankou).split("/")
    line_total = 0
    for part in parts:
        line_total += float(part)
    line = line_total / len(parts)

    margin = masterGoal + guestGoal - line
    if margin >= 0:
        return 1
    elif margin < 0:
        return 0
    return None

def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    """Build the key "<get18 code>_<yapanPankouStart>" for a pair of odds and a line string."""
    odds_code = str(get18(yapanMasterStartOdd, yapanGuestStartOdd))
    return odds_code + "_" + yapanPankouStart

def get18(master,guest):
    """Encode how two odds compare: 18 if master > guest, 81 if master < guest, 99 if equal.

    Returns None when none of the three comparisons holds (e.g. a NaN operand).
    """
    if master > guest:
        code = 18
    elif master < guest:
        code = 81
    elif master == guest:
        code = 99
    else:
        code = None
    return code

def daxiao_num(x):
    """Average the "/"-separated numbers in ``x`` and return the mean as a string."""
    values = [float(piece) for piece in x.split("/")]
    total = 0
    for value in values:
        total += value
    return str(float(total) / len(values))

def realDaxiao(x,master,guest):
    """Return ``float(x) - master - guest`` formatted as a string."""
    remaining = float(x) - master - guest
    return str(remaining)

def shengjiang(start,end):
    """Return the change from ``start`` to ``end`` (both parsed as floats) as a string."""
    delta = float(end) - float(start)
    return str(delta)

In [4]:
def fillNa(x,value):
    """Return ``value`` when ``x`` is null according to ``pd.isnull``, otherwise ``x``."""
    return value if pd.isnull(x) else x
    
def preF(test):
    """Build the label and the engineered over/under ("daxiao") features.

    Steps, in order:
      1. add a boolean ``<col>_was_missing`` flag for every column that
         contains at least one null;
      2. fill nulls in the two "Zao" line columns with "-9999" and in the
         four "Zao" odds columns with -9999;
      3. derive ``result`` (getResult), ``goalMid`` (getGoalMid) and
         ``zhongbifeng`` (masterMidGoal + guestMidGoal);
      4. clean each of the six line columns with removeSub, replace it by its
         averaged value (daxiao_num) and add ``<col>_real`` (realDaxiao);
      5. add the two line-movement columns (shengjiang) and the twelve
         ``daxiaoType*`` keys (getType);
      6. keep only rows with goalMid >= 0, with daxiaoMasterStartOdd,
         daxiaoMasterOdd and daxiaoGuestOddMid in [0.75, 1.20), and with
         zhongbifeng <= daxiaoPankou; then drop masterGoal, guestGoal and
         goalMid.

    The frame passed in is modified in place (columns added / overwritten);
    the returned frame is the filtered result.
    """
    # Materialise the list first: columns are added while iterating.
    cols_with_missing = [col for col in test.columns
                         if test[col].isnull().any()]
    for col in cols_with_missing:
        test[col + '_was_missing'] = test[col].isnull()

    for col in ['daxiaoPankouZao', 'daxiaoPankouStartZao']:
        test[col] = test.apply(lambda x: fillNa(x[col], "-9999"), axis=1)

    for col in ['daxiaoMasterStartOddZao', 'daxiaoGuestStartOddZao',
                'daxiaoMasterOddZao', 'daxiaoGuestOddZao']:
        test[col] = test.apply(lambda x: fillNa(x[col], -9999), axis=1)

    # The label must be computed before daxiaoPankouMid is rewritten below
    # and before the final-score columns are dropped.
    test['result'] = test.apply(lambda x: getResult(
        x['masterGoal'], x['guestGoal'], x['masterMidGoal'], x['guestMidGoal'], x['daxiaoPankouMid']), axis=1)

    test['goalMid'] = test.apply(lambda x: getGoalMid(
        x['masterGoal'], x['guestGoal'], x['masterMidGoal'], x['guestMidGoal']), axis=1)

    test['zhongbifeng'] = test['masterMidGoal'] + test['guestMidGoal']

    pankou = ["daxiaoPankouStart", "daxiaoPankou",
              "daxiaoPankouStartMid", "daxiaoPankouMid",
              "daxiaoPankouStartZao", "daxiaoPankouZao"]

    # Each line column is cleaned from ITS OWN values. The two "Zao" columns
    # used to be overwritten with the "StartMid" / "Mid" values here, which
    # made every Zao-derived feature a duplicate of the Mid one.
    for col in pankou:
        test[col] = test.apply(lambda x: removeSub(x[col]), axis=1)

    for col in pankou:
        test[col] = test[col].map(daxiao_num)
        nm = col + "_" + "real"
        test[nm] = test.apply(lambda x: realDaxiao(x[col], x['masterMidGoal'], x['guestMidGoal']), axis=1)

    test['daxiaoPankouShengJiang'] = test.apply(
        lambda x: shengjiang(x['daxiaoPankouStart'], x['daxiaoPankou']), axis=1)
    test['daxiaoPankouMidShengJiang'] = test.apply(
        lambda x: shengjiang(x['daxiaoPankouStartMid'], x['daxiaoPankouMid']), axis=1)

    # (new column, master odds column, guest odds column, line column)
    type_specs = [
        ('daxiaoTypeStart', 'daxiaoMasterStartOdd', 'daxiaoGuestStartOdd', 'daxiaoPankouStart'),
        ('daxiaoType', 'daxiaoMasterOdd', 'daxiaoGuestOdd', 'daxiaoPankou'),
        ('daxiaoTypeStartMid', 'daxiaoMasterStartOddMid', 'daxiaoGuestStartOddMid', 'daxiaoPankouStartMid'),
        ('daxiaoTypeMid', 'daxiaoMasterOddMid', 'daxiaoGuestOddMid', 'daxiaoPankouMid'),
        ('daxiaoTypeStartZao', 'daxiaoMasterStartOddZao', 'daxiaoGuestStartOddZao', 'daxiaoPankouStartZao'),
        ('daxiaoTypeZao', 'daxiaoMasterOddZao', 'daxiaoGuestOddZao', 'daxiaoPankouZao'),
        ('daxiaoTypeStartReal', 'daxiaoMasterStartOdd', 'daxiaoGuestStartOdd', 'daxiaoPankouStart_real'),
        ('daxiaoTypeReal', 'daxiaoMasterOdd', 'daxiaoGuestOdd', 'daxiaoPankou_real'),
        ('daxiaoTypeStartMidReal', 'daxiaoMasterStartOddMid', 'daxiaoGuestStartOddMid', 'daxiaoPankouStartMid_real'),
        ('daxiaoTypeMidReal', 'daxiaoMasterOddMid', 'daxiaoGuestOddMid', 'daxiaoPankouMid_real'),
        ('daxiaoTypeStartZaoReal', 'daxiaoMasterStartOddZao', 'daxiaoGuestStartOddZao', 'daxiaoPankouStartZao_real'),
        ('daxiaoTypeZaoReal', 'daxiaoMasterOddZao', 'daxiaoGuestOddZao', 'daxiaoPankouZao_real'),
    ]
    for new_col, master_col, guest_col, pankou_col in type_specs:
        test[new_col] = test.apply(lambda x: getType(
            x[master_col], x[guest_col], x[pankou_col]), axis=1)

    test = test[test['goalMid'] >= 0]
    test = test[(test['daxiaoMasterStartOdd'] >= 0.75) & (test['daxiaoMasterStartOdd'] < 1.20)]
    test = test[(test['daxiaoMasterOdd'] >= 0.75) & (test['daxiaoMasterOdd'] < 1.20)]
    test = test[(test['daxiaoGuestOddMid'] >= 0.75) & (test['daxiaoGuestOddMid'] < 1.20)]
    test = test[test['zhongbifeng'].astype(float) <= test['daxiaoPankou'].astype(float)]
    # Final-score columns would leak the label, so they are removed.
    test = test.drop(columns=['masterGoal', 'guestGoal', "goalMid"])

    return test

In [5]:
def getShuiPing(x):
    """Map an odds value to an integer band code.

    Bands (upper bound inclusive):
        x <= 0.75      -> 0
        (0.75, 0.80]   -> 1
        (0.80, 0.85]   -> 3
        (0.85, 0.90]   -> 4
        (0.90, 1.00]   -> 5
        (1.00, 1.08]   -> 6
        (1.08, 1.10]   -> 7
        (1.10, 1.15]   -> 8
        (1.15, 1.20]   -> 9
        x > 1.20       -> 10
    A value that compares false against every bound (NaN) gets 11.

    The bands are contiguous: band 5 starts at 0.90 so that values in
    (0.90, 0.95] no longer fall through to the NaN code 11. Existing codes
    (including the unused code 2) are kept as they were.
    """
    # (inclusive upper bound, code), in ascending order of the bound.
    bands = (
        (0.75, 0), (0.80, 1), (0.85, 3), (0.90, 4), (1.00, 5),
        (1.08, 6), (1.10, 7), (1.15, 8), (1.20, 9),
    )
    for upper, code in bands:
        if x <= upper:
            return code
    if x > 1.20:
        return 10
    # Only reached when every comparison is false, i.e. x is NaN.
    return 11
        
def num_fea_dis(df,features):
    """Add a ``<feature>_shuiPing`` column (getShuiPing of each value) for every listed feature.

    ``df`` is modified in place and also returned.
    """
    for feature in features:
        band_col = feature + '_' + 'shuiPing'
        df[band_col] = df[feature].map(getShuiPing)
    return df

In [6]:
# myclient = pymongo.MongoClient("mongodb://localhost:27017/")
# mydb = myclient["soccerData"]
# mycol = mydb["midRawData007"]
# test = pd.DataFrame(list(mycol.find({"time":{"$gte":"2021-00-00 00:00:00"}})))
# test = test.drop(['_id','place',"time"], axis=1)

In [9]:
# Load the evaluation set, downcast its numeric dtypes, then build the label
# and engineered features.
test = pd.read_csv('test.csv')
test = reduce_mem_usage(test)
test = preF(test)

# Odds columns that get a "<name>_shuiPing" band-code column via num_fea_dis.
# The same list is reused for the training frame in a later cell.
fes = ['daxiaoMasterStartOdd','daxiaoGuestStartOdd',
       'daxiaoMasterOdd','daxiaoGuestOdd',
       'daxiaoMasterStartOddMid','daxiaoGuestStartOddMid',
       'daxiaoMasterOddMid','daxiaoGuestOddMid',
       "daxiaoMasterStartOddZao","daxiaoGuestStartOddZao",
       "daxiaoMasterOddZao","daxiaoGuestOddZao"]

# Earlier variant of the list without the four "Zao" columns:
# fes = ['daxiaoMasterStartOdd','daxiaoGuestStartOdd',
#        'daxiaoMasterOdd','daxiaoGuestOdd',
#        'daxiaoMasterStartOddMid','daxiaoGuestStartOddMid',
#        'daxiaoMasterOddMid','daxiaoGuestOddMid']

test= num_fea_dis(test,fes)

Memory usage of dataframe is 3.73 MB
Memory usage after optimization is: 2.12 MB
Decreased by 43.2%


In [10]:
# Summarise the processed evaluation frame: row count, columns, dtypes, non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18526 entries, 0 to 22247
Data columns (total 60 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   masterMidGoal                        18526 non-null  int8   
 1   guestMidGoal                         18526 non-null  int8   
 2   daxiaoMasterStartOddZao              18526 non-null  float64
 3   daxiaoGuestStartOddZao               18526 non-null  float64
 4   daxiaoPankouStartZao                 18526 non-null  object 
 5   daxiaoMasterOddZao                   18526 non-null  float64
 6   daxiaoGuestOddZao                    18526 non-null  float64
 7   daxiaoPankouZao                      18526 non-null  object 
 8   daxiaoMasterStartOdd                 18526 non-null  float32
 9   daxiaoGuestStartOdd                  18526 non-null  float32
 10  daxiaoPankouStart                    18526 non-null  object 
 11  daxiaoMasterOdd             

In [11]:
# myclient = pymongo.MongoClient("mongodb://localhost:27017/")
# mydb = myclient["soccerData"]
# mycol = mydb["midRawData007"]

# train = pd.DataFrame(list(mycol.find({"time":{"$lt":"2021-00-00 00:00:00",}})))
# train = train.drop(['_id','place'], axis=1)

In [12]:
# Load the training set and run it through the same pipeline as the
# evaluation set: downcast dtypes, build label/features, add band codes.
train = pd.read_csv('train.csv')
train = reduce_mem_usage(train)
train = preF(train)
# `fes` is the odds-column list defined in the cell that loads test.csv.
train = num_fea_dis(train,fes)

Memory usage of dataframe is 51.56 MB
Memory usage after optimization is: 29.30 MB
Decreased by 43.2%


In [13]:
# Summarise the processed training frame: row count, columns, dtypes, non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244819 entries, 1 to 307211
Data columns (total 60 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   masterMidGoal                        244819 non-null  int8   
 1   guestMidGoal                         244819 non-null  int8   
 2   daxiaoMasterStartOdd                 244819 non-null  float32
 3   daxiaoGuestStartOdd                  244819 non-null  float32
 4   daxiaoPankouStart                    244819 non-null  object 
 5   daxiaoMasterOdd                      244819 non-null  float32
 6   daxiaoGuestOdd                       244819 non-null  float32
 7   daxiaoPankou                         244819 non-null  object 
 8   daxiaoMasterStartOddMid              244819 non-null  float32
 9   daxiaoGuestStartOddMid               244819 non-null  float32
 10  daxiaoPankouStartMid                 244819 non-null  object 
 11  daxiaoMasterO

In [14]:
# Re-display the evaluation-frame summary to compare it with the training frame above.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18526 entries, 0 to 22247
Data columns (total 60 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   masterMidGoal                        18526 non-null  int8   
 1   guestMidGoal                         18526 non-null  int8   
 2   daxiaoMasterStartOddZao              18526 non-null  float64
 3   daxiaoGuestStartOddZao               18526 non-null  float64
 4   daxiaoPankouStartZao                 18526 non-null  object 
 5   daxiaoMasterOddZao                   18526 non-null  float64
 6   daxiaoGuestOddZao                    18526 non-null  float64
 7   daxiaoPankouZao                      18526 non-null  object 
 8   daxiaoMasterStartOdd                 18526 non-null  float32
 9   daxiaoGuestStartOdd                  18526 non-null  float32
 10  daxiaoPankouStart                    18526 non-null  object 
 11  daxiaoMasterOdd             

In [15]:
import gc

def encode_CB(col1,col2,df1,df2):
    """Add the combined column ``<col1>_<col2>`` to both frames, in place.

    Each value is ``str(col1 value) + '_' + str(col2 value)``. Returns None.
    """
    combined_name = '_'.join((col1, col2))
    for frame in (df1, df2):
        left = frame[col1].astype(str)
        right = frame[col2].astype(str)
        frame[combined_name] = left + '_' + right

def encode_CB3(col1,col2,col3,df1,df2,name=""):
    """Add a three-column combined key to both frames, in place.

    Each value is the three source values converted to str and joined with
    '_'. The new column is called ``name``, or ``<col1>_<col2>_<col3>`` when
    ``name`` is the empty string. Returns None.
    """
    combined_name = name if name != "" else col1 + '_' + col2 + '_' + col3
    for frame in (df1, df2):
        first = frame[col1].astype(str)
        second = frame[col2].astype(str)
        third = frame[col3].astype(str)
        frame[combined_name] = first + '_' + second + '_' + third


In [16]:
# Combined categorical keys built with encode_CB3 on both train and test.
# Each entry is (col1, col2, col3, new column name). Order matters: later
# entries use columns created by earlier ones.
shuiping_key_specs = [
    # band code of master odds + band code of guest odds + line
    ("daxiaoMasterStartOdd_shuiPing", "daxiaoGuestStartOdd_shuiPing", "daxiaoPankouStart", "typeLinStart"),
    ("daxiaoMasterOdd_shuiPing", "daxiaoGuestOdd_shuiPing", "daxiaoPankou", "typeLinEnd"),
    ("daxiaoMasterStartOddMid_shuiPing", "daxiaoGuestStartOddMid_shuiPing", "daxiaoPankouStartMid", "typeMidStart"),
    ("daxiaoMasterOddMid_shuiPing", "daxiaoGuestOddMid_shuiPing", "daxiaoPankouMid", "typeMidEnd"),
    ("daxiaoMasterStartOddZao_shuiPing", "daxiaoGuestStartOddZao_shuiPing", "daxiaoPankouStartZao", "typeZaoStart"),
    ("daxiaoMasterOddZao_shuiPing", "daxiaoGuestOddZao_shuiPing", "daxiaoPankouZao", "typeZaoEnd"),
    # zhongbifeng + a start key + an end key
    ("zhongbifeng", "typeLinStart", "typeLinEnd", "TypeLin"),
    ("zhongbifeng", "typeMidStart", "typeMidEnd", "TypeMid"),
    ("zhongbifeng", "typeZaoStart", "typeZaoEnd", "TypeZao"),
    ("zhongbifeng", "typeLinStart", "typeMidEnd", "TypeLinMid"),
    ("zhongbifeng", "typeZaoStart", "typeLinEnd", "TypeZaoLin"),
    # progressively wider keys
    ("TypeLin", "typeMidStart", "typeMidEnd", "TypeAll"),
    ("TypeAll", "typeZaoStart", "typeZaoEnd", "TypeAllZAO"),
]

for first_col, second_col, third_col, key_name in shuiping_key_specs:
    encode_CB3(first_col, second_col, third_col, train, test, key_name)

In [17]:
# Two-column combined keys (encode_CB), applied to train and test in this order.
pair_specs = [
    ("daxiaoPankou", "daxiaoPankouMid"),
    ("daxiaoPankouStart", "daxiaoPankouMid"),
    ("daxiaoPankouStart", "daxiaoPankou"),
    ("daxiaoPankouStartMid", "daxiaoPankouMid"),
    ("daxiaoPankouStartZao", "daxiaoPankouZao"),
    ("daxiaoPankouStartZao", "daxiaoPankou"),
    ("daxiaoPankouStartZao", "daxiaoPankouMid"),
    ("daxiaoPankou_real", "daxiaoPankouMid_real"),
    ("daxiaoPankouStart_real", "daxiaoPankou_real"),
    ("daxiaoPankouStartMid_real", "daxiaoPankouMid_real"),
    ("daxiaoType", "daxiaoTypeMid"),
    ("daxiaoTypeStart", "daxiaoType"),
    ("daxiaoTypeStartMid", "daxiaoTypeMid"),
    ("daxiaoTypeReal", "daxiaoTypeMidReal"),
    ("daxiaoTypeStartReal", "daxiaoTypeReal"),
    ("daxiaoTypeStartMidReal", "daxiaoTypeMidReal"),
    ("zhongbifeng", "daxiaoPankouShengJiang"),
    ("zhongbifeng", "daxiaoPankouMidShengJiang"),
]
for left_col, right_col in pair_specs:
    encode_CB(left_col, right_col, train, test)

# Three-column combined keys (encode_CB3). An empty name means the default
# "<col1>_<col2>_<col3>" column name. Order matters: "oldTypeLin" and
# "oldTypeLinMid" are consumed by the entries that follow them.
triple_specs = [
    ("zhongbifeng", "daxiaoPankou", "daxiaoPankouMid", ""),
    ("zhongbifeng", "daxiaoPankouStart", "daxiaoPankou", ""),
    ("zhongbifeng", "daxiaoPankouStartMid", "daxiaoPankouMid", ""),
    ("zhongbifeng", "daxiaoType", "daxiaoTypeMid", ""),
    ("zhongbifeng", "daxiaoTypeStart", "daxiaoType", "oldTypeLin"),
    ("zhongbifeng", "daxiaoTypeStartMid", "daxiaoTypeMid", "oldTypeMid"),
    ("oldTypeLin", "daxiaoTypeStartMid", "daxiaoTypeMid", "oldTypeLinMid"),
    ("oldTypeLinMid", "daxiaoTypeStartZao", "daxiaoTypeZao", "oldTypeLinMidZao"),
    ("zhongbifeng", "daxiaoPankou_real", "daxiaoPankouMid_real", ""),
    ("zhongbifeng", "daxiaoPankouStart_real", "daxiaoPankou_real", ""),
    ("zhongbifeng", "daxiaoPankouStartMid_real", "daxiaoPankouMid_real", ""),
    ("zhongbifeng", "daxiaoTypeReal", "daxiaoTypeMidReal", ""),
    ("zhongbifeng", "daxiaoTypeStartReal", "daxiaoTypeReal", ""),
    ("zhongbifeng", "daxiaoTypeStartMidReal", "daxiaoTypeMidReal", ""),
]
for first_col, second_col, third_col, key_name in triple_specs:
    encode_CB3(first_col, second_col, third_col, train, test, key_name)

In [18]:
def encode_Count(df1, df2, col):
    """Add per-category label statistics computed on ``df1`` to both frames, in place.

    For each value of ``col`` in ``df1``:
      * ``<col>_COUNT`` – sum of ``result`` in that group,
      * ``<col>_ALL``   – number of non-null ``result`` values in that group,
      * ``<col>_CT``    – ``<col>_COUNT / <col>_ALL``.
    The statistics are mapped onto ``df1`` (``_COUNT`` cast to int) and onto
    ``df2``; values of ``col`` in ``df2`` that never occur in ``df1`` yield NaN.
    Returns None.
    """
    by_category = df1[[col, 'result']].groupby(col)['result']
    count_name = col + '_COUNT'
    all_name = col + '_ALL'
    ratio_name = col + '_CT'

    label_sums = by_category.sum().to_dict()
    df1[count_name] = df1[col].map(label_sums).astype(int)
    df2[count_name] = df2[col].map(label_sums)

    group_sizes = by_category.count().to_dict()
    df1[all_name] = df1[col].map(group_sizes)
    df2[all_name] = df2[col].map(group_sizes)

    for frame in (df1, df2):
        frame[ratio_name] = frame[count_name] / frame[all_name]


# Keep only the "<col>_CT" ratio for each key; the intermediate "_ALL" and
# "_COUNT" columns are dropped again right after they are computed.
for col in ["TypeMid", "TypeLinMid", "TypeZao", "TypeZaoLin","oldTypeLinMidZao"]:
    encode_Count(train, test, col)
    helper_cols = [col + '_ALL', col + '_COUNT']
    test = test.drop(columns=helper_cols)
    train = train.drop(columns=helper_cols)

In [19]:
# Summarise the training frame after the combined keys and "_CT" ratios were added.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244819 entries, 1 to 307211
Columns: 110 entries, masterMidGoal to oldTypeLinMidZao_CT
dtypes: bool(6), float32(8), float64(9), int64(13), int8(3), object(71)
memory usage: 185.1+ MB


In [20]:
# Summarise the evaluation frame after the combined keys and "_CT" ratios were added.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18526 entries, 0 to 22247
Columns: 110 entries, masterMidGoal to oldTypeLinMidZao_CT
dtypes: bool(6), float32(8), float64(9), int64(13), int8(3), object(71)
memory usage: 14.0+ MB


In [None]:
# Separate the 'result' label from the feature columns for both frames.
train_y = train['result']
train_x = train.drop(columns=['result'])

test_y = test['result']
test_x = test.drop(columns=['result'])


In [None]:
import category_encoders as ce

# Target-encode every string-valued (object dtype) feature. The encoder is
# fitted on the training features and labels only, then applied to both frames.
cat_features = train_x.select_dtypes(include='object').columns

target_enc = ce.TargetEncoder(cols=cat_features)
target_enc.fit(train_x[cat_features], train_y)

train_x[cat_features] = target_enc.transform(train_x[cat_features])
test_x[cat_features] = target_enc.transform(test_x[cat_features])

# This cell used to repeat the whole block for `category`-dtype columns. No
# column is ever converted to `category` in this notebook, so that second
# pass fitted an encoder on an empty column selection and has been removed.

In [None]:
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

lgb_train = lgb.Dataset(train_x, train_y)

# Parameter settings
params = {'num_leaves': 240,  # large effect on the final result; larger fits better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularisation
          'lambda_l2': 5.985747612243422e-07,  # L2 regularisation
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 means all threads, more threads run faster
          'metric': {'binary_logloss', 'auc'},  # evaluation metrics
          "random_state": 2019,  # random seed, keeps results consistent between runs
          # 'device': 'gpu'  # can speed things up if the GPU build of lightgbm is installed
          # NOTE(review): LightGBM lists min_child_samples as an alias of
          # min_data_in_leaf, which is also set above (30 vs 67) - confirm
          # which value is intended and drop the other.
          'min_child_samples': 67,
          }


print('Starting training...')
# Train the model
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)


# Predict on the evaluation set
y_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# The score is ROC AUC (not accuracy), so it is named and printed as such.
test_auc = roc_auc_score(test_y, y_pred)
print("AUC: %.2f%%" % (test_auc*100.0))

cv_results = lgb.cv(params, lgb_train, num_boost_round=500, nfold=5, 
                    verbose_eval=20, early_stopping_rounds=40)

# NOTE(review): this averages the CV AUC over every boosting round rather
# than reading the value at the last (best) round - confirm this is intended.
np.array(cv_results["auc-mean"]).mean()

In [None]:
# Plot the importance of up to 60 features of the `gbm` booster trained above.
fig, ax = plt.subplots(figsize=(10,10))
lgb.plot_importance(gbm, max_num_features=60, height=0.5, ax=ax)
plt.show()

In [None]:
# Stratified 5-fold cross-validation of the LightGBM model on the training data.
# Collects out-of-fold predictions, per-fold feature importances and the mean AUC.
import gc
X=train_x
y=train_y
# del train_x,train_y
# gc.collect()

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
params = {'num_leaves': 240,  # large effect on the final result; larger fits better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularisation
          'lambda_l2': 5.985747612243422e-07,  # L2 regularisation
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 means all threads, more threads run faster
          'metric': {'binary_logloss', 'auc'},  # evaluation metrics
          "random_state": 2019,  # random seed, keeps results consistent between runs
          # 'device': 'gpu'  # can speed things up if the GPU build of lightgbm is installed
          'min_child_samples': 67,
          }

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
# Out-of-fold predictions: each row is filled by the fold in which it was held out.
y_oof = np.zeros(X.shape[0])
score = 0

# One row per feature; a "fold_<k>" column is added for every fold below.
feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 1000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=100)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    # Accumulate the mean of the per-fold AUCs.
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    
    # Release the fold's slices before the next iteration.
    del X_train, X_valid, y_train, y_valid
    gc.collect()

print(f"\nMean AUC = {score}")

In [None]:
import seaborn as sns

# Number of features shown in the bar plot (also used in the title).
top_n = 50

feature_importance_gain=pd.DataFrame()
feature_importance_gain['feature']=columns
# `clf` is the booster left over from the LAST fold of the CV loop, so the
# gain column is labelled accordingly (it was previously called 'fold_1').
feature_importance_gain['last_fold_gain']=clf.feature_importance(importance_type='gain')
# Average the per-fold importances over ALL folds; previously only fold_1
# was used although the title claims an average over every fold.
fold_columns = [f'fold_{fold_idx + 1}' for fold_idx in range(folds.n_splits)]
feature_importance_gain['average'] = feature_importances[fold_columns].mean(axis=1)
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importance_gain.sort_values(by='average', ascending=False).head(top_n), x='average', y='feature');
plt.title('{} TOP feature importance over {} folds average'.format(top_n, folds.n_splits));

In [None]:
import sweetviz as sv
# A target feature can be specified for the report.
# The encoded features and the label are joined under a NEW name so the
# `train` frame built by the feature-engineering cells is not overwritten;
# overwriting it made those earlier cells impossible to re-run.
train_encoded = pd.concat([train_x,train_y],axis=1)
my_report = sv.analyze(train_encoded, target_feat ='result')
my_report.show_html()

In [None]:
import sweetviz as sv
# A target feature can be specified for the report.
# Note: this analyses `test`, i.e. the frame before target encoding, not `test_x`.
my_report = sv.analyze(test, target_feat ='result')
my_report.show_html()

In [None]:
from catboost import CatBoostClassifier

# NOTE(review): early_stopping_rounds (15000) exceeds iterations (7000) and
# fit() below receives no eval_set, so early stopping presumably never
# triggers - confirm against the CatBoost documentation.
clf = CatBoostClassifier(loss_function="Logloss",
                           eval_metric="AUC",
                           learning_rate=0.01,
                           iterations=7000,
                           l2_leaf_reg=50,
                           random_seed=432013,
                           od_type="Iter",
                           depth=5,
                           early_stopping_rounds=15000,
                           border_count=64
                           #has_time= True 
                          )

clf.fit(train_x, train_y)

# Probability of the positive class (second column of predict_proba).
prediction = clf.predict_proba(test_x)[:,1]

# The score is ROC AUC (not accuracy), so it is named and printed as such.
# `roc_auc_score` is imported in the LightGBM training cell above.
catboost_auc = roc_auc_score(test_y, prediction)
print("AUC: %.2f%%" % (catboost_auc*100.0))

In [None]:
from lightgbm import LGBMClassifier
from combo.models.classifier_stacking import Stacking

params = {'num_leaves': 240,  # large effect on the final result; larger fits better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularisation
          'lambda_l2': 5.985747612243422e-07,  # L2 regularisation
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 means all threads, more threads run faster
          'metric': {'binary_logloss', 'auc'},  # evaluation metrics
          "random_state": 2019,  # random seed, keeps results consistent between runs
          # 'device': 'gpu'  # can speed things up if the GPU build of lightgbm is installed
          'min_child_samples': 67,
          }

# The unused `xgb = XGBClassifier()` line was removed: XGBClassifier is never
# imported in this notebook, so it raised NameError on a fresh kernel.

gbm = LGBMClassifier(**params)

# Base learners: the CatBoost model from the previous cell (`clf`) and a
# fresh LightGBM classifier.
classifiers = [clf,gbm]

# The stacker gets its own name so `clf` keeps pointing at the CatBoost
# model and this cell can be re-run without stacking a stacker.
stacking_clf = Stacking(base_estimators=classifiers, n_folds=5, shuffle_data=False,
             keep_original=True, use_proba=True, random_state=2021)

stacking_clf.fit(train_x, train_y)
# roc_auc_score needs a 1-D score for a binary target: keep only the
# positive-class column, as the CatBoost cell does.
y_test_predict = stacking_clf.predict_proba(test_x)[:, 1]

# The score is ROC AUC (not accuracy), so it is named and printed as such.
stacking_auc = roc_auc_score(test_y, y_test_predict)
print("AUC: %.2f%%" % (stacking_auc*100.0))