In [9]:
import json
import pandas as pd
import pymongo
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

In [10]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df

In [11]:
def getGoalMid(masterGoal, guestGoal, masterMidGoal,guestMidGoal):
    if masterGoal + guestGoal - masterMidGoal - guestMidGoal >= 4:
        return 4
    else:
        return masterGoal + guestGoal - masterMidGoal - guestMidGoal
        
def removeSub(pankou):
    pankou = pankou.replace("升","")
    pankou = pankou.replace("降","")
    return pankou.strip()

def getResult(masterGoal, guestGoal, masterMidGoal,guestMidGoal,pankou):
    pankouList = removeSub(pankou).split("/")
    row = 0
    for p in pankouList:
        row += float(p)
    row = row / len(pankouList)
    
    if masterGoal + guestGoal - row >= 0:
        return 1
    if masterGoal + guestGoal - row < 0:
        return 0

def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    linTypeStart = get18(yapanMasterStartOdd, yapanGuestStartOdd)
    return str(linTypeStart) + "_" + yapanPankouStart

def get18(master,guest):
    if master > guest:
        return 18
    if master < guest:
        return 81
    if master == guest:
        return 99

def daxiao_num(x):
    x_list = x.split("/")
    num = 0
    for i in x_list:
        num += float(i)
    return str(float(num)/len(x_list))

def realDaxiao(x,master,guest):
    return str(float(x) - master - guest)

def shengjiang(start,end):
    return str(float(end) - float(start))

In [12]:
def fillNa(x,value):
    if pd.isnull(x):
        return value
    else:
        return x

def preF(test):
    
    cols_with_missing = (col for col in test.columns 
                                 if test[col].isnull().any())
    for col in cols_with_missing:
        test[col + '_was_missing'] = test[col].isnull()
        
    for col in ['daxiaoPankouZao','daxiaoPankouStartZao']:
        test[col] = test.apply(lambda x: fillNa(x[col], "-9999"), axis=1)

    for col in ['daxiaoMasterStartOddZao', 'daxiaoGuestStartOddZao','daxiaoMasterOddZao', 'daxiaoGuestOddZao']:
        test[col] = test.apply(lambda x: fillNa(x[col], -9999), axis=1)
        
    test['result'] = test.apply(lambda x: getResult(
        x['masterGoal'], x['guestGoal'], x['masterMidGoal'], x['guestMidGoal'], x['daxiaoPankouMid']), axis=1)
    
    test['goalMid'] = test.apply(lambda x: getGoalMid(
        x['masterGoal'], x['guestGoal'], x['masterMidGoal'], x['guestMidGoal']), axis=1)    
    
    test['zhongbifeng'] = test['masterMidGoal'] + test['guestMidGoal']

    test['daxiaoPankouStart'] = test.apply(
        lambda x: removeSub(x['daxiaoPankouStart']), axis=1)
    test['daxiaoPankou'] = test.apply(
        lambda x: removeSub(x['daxiaoPankou']), axis=1)
    test['daxiaoPankouStartMid'] = test.apply(
        lambda x: removeSub(x['daxiaoPankouStartMid']), axis=1)
    test['daxiaoPankouMid'] = test.apply(
        lambda x: removeSub(x['daxiaoPankouMid']), axis=1)
    test['daxiaoPankouStartZao'] = test.apply(
        lambda x: removeSub(x['daxiaoPankouStartMid']), axis=1)
    test['daxiaoPankouZao'] = test.apply(
        lambda x: removeSub(x['daxiaoPankouMid']), axis=1)
        
    pankou = ["daxiaoPankouStart","daxiaoPankou","daxiaoPankouStartMid","daxiaoPankouMid","daxiaoPankouStartZao","daxiaoPankouZao"]

    for col in pankou:
        test[col] = test[col].map(daxiao_num)
        nm = col+"_"+"real"
        test[nm] = test.apply(lambda x: realDaxiao(x[col],x['masterMidGoal'],x['guestMidGoal']), axis=1)
    
    
    test = test[test['goalMid'] >= 0]
    test = test[(test['daxiaoMasterStartOdd'] >= 0.75) & (test['daxiaoMasterStartOdd'] < 1.20)]
    test = test[(test['daxiaoMasterOdd'] >= 0.75) & (test['daxiaoMasterOdd'] < 1.20)]
    test = test[(test['daxiaoGuestOddMid'] >= 0.75) & (test['daxiaoGuestOddMid'] < 1.20)]
    test = test[test['zhongbifeng'].astype(float) <= test['daxiaoPankou'].astype(float)]
    test = test.drop(columns=['masterGoal', 'guestGoal', "goalMid"])
    return test

In [13]:
fes = ['daxiaoMasterStartOdd','daxiaoGuestStartOdd',
       'daxiaoMasterOdd','daxiaoGuestOdd',
       'daxiaoMasterStartOddMid','daxiaoGuestStartOddMid',
       'daxiaoMasterOddMid','daxiaoGuestOddMid',
       "daxiaoMasterStartOddZao","daxiaoGuestStartOddZao",
       "daxiaoMasterOddZao","daxiaoGuestOddZao"]


test = pd.read_csv('test.csv')
test = reduce_mem_usage(test)
test = preF(test)

Memory usage of dataframe is 3.73 MB
Memory usage after optimization is: 2.12 MB
Decreased by 43.2%


In [14]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18526 entries, 0 to 22247
Data columns (total 34 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   masterMidGoal                        18526 non-null  int8   
 1   guestMidGoal                         18526 non-null  int8   
 2   daxiaoMasterStartOddZao              18526 non-null  float64
 3   daxiaoGuestStartOddZao               18526 non-null  float64
 4   daxiaoPankouStartZao                 18526 non-null  object 
 5   daxiaoMasterOddZao                   18526 non-null  float64
 6   daxiaoGuestOddZao                    18526 non-null  float64
 7   daxiaoPankouZao                      18526 non-null  object 
 8   daxiaoMasterStartOdd                 18526 non-null  float32
 9   daxiaoGuestStartOdd                  18526 non-null  float32
 10  daxiaoPankouStart                    18526 non-null  object 
 11  daxiaoMasterOdd             

In [17]:
train = pd.read_csv('train.csv')
train = reduce_mem_usage(train)
train = preF(train)

Memory usage of dataframe is 51.56 MB
Memory usage after optimization is: 29.30 MB
Decreased by 43.2%


In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244819 entries, 1 to 307211
Data columns (total 34 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   masterMidGoal                        244819 non-null  int8   
 1   guestMidGoal                         244819 non-null  int8   
 2   daxiaoMasterStartOdd                 244819 non-null  float32
 3   daxiaoGuestStartOdd                  244819 non-null  float32
 4   daxiaoPankouStart                    244819 non-null  object 
 5   daxiaoMasterOdd                      244819 non-null  float32
 6   daxiaoGuestOdd                       244819 non-null  float32
 7   daxiaoPankou                         244819 non-null  object 
 8   daxiaoMasterStartOddMid              244819 non-null  float32
 9   daxiaoGuestStartOddMid               244819 non-null  float32
 10  daxiaoPankouStartMid                 244819 non-null  object 
 11  daxiaoMasterO

In [35]:
import gc

def encode_LE(col,train,test):
    df_comb = pd.concat([train[col],test[col]],axis=0)
    df_comb,_ = df_comb.factorize(sort=True)
    nm = col
    if df_comb.max()>32000: 
        train[nm] = df_comb[:len(train)].astype('int32')
        test[nm] = df_comb[len(train):].astype('int32')
    else:
        train[nm] = df_comb[:len(train)].astype('int16')
        test[nm] = df_comb[len(train):].astype('int16')
    del df_comb; x=gc.collect()
    
def encode_CB(col1,col2,df1,df2):
    nm = col1+'_'+col2
    df1[nm] = df1[col1].astype(str)+'_'+df1[col2].astype(str)
    df2[nm] = df2[col1].astype(str)+'_'+df2[col2].astype(str) 
#     encode_LE(nm,df1,df2)
    
def encode_CB3(col1,col2,col3,df1,df2,name=""):
    if name == "":
        nm = col1+'_'+col2+'_'+col3
    else:
        nm = name
    df1[name] = df1[col1].astype(str)+'_'+df1[col2].astype(str)+'_'+df1[col3].astype(str)
    df2[name] = df2[col1].astype(str)+'_'+df2[col2].astype(str)+'_'+df2[col3].astype(str)
#     encode_LE(nm,df1,df2)


In [36]:
encode_CB3("zhongbifeng","daxiaoMasterStartOdd","daxiaoPankouStart",train,test,"typeLinStart")
encode_CB3("zhongbifeng","daxiaoMasterOdd","daxiaoPankou",train,test,"typeLinEnd")

encode_CB3("zhongbifeng","daxiaoMasterStartOddMid","daxiaoPankouStartMid",train,test,"typeMidStart")
encode_CB3("zhongbifeng","daxiaoMasterOddMid","daxiaoPankouMid",train,test,"typeMidEnd")

encode_CB3("zhongbifeng","daxiaoMasterStartOddZao","daxiaoPankouStartZao",train,test,"typeZaoStart")
encode_CB3("zhongbifeng","daxiaoMasterOddZao","daxiaoPankouZao",train,test,"typeZaoEnd")

In [48]:
def fillResultNew(x, value):
    if pd.isnull(x):
        return value
    else:
        return x

    
def get_wight(col,df1):
    nm = col +"_" +'3'
    dicts = df1.groupby(col)['result'].sum().to_dict()
    df1[nm] = df1[col].map(dicts).astype(int)

    nm = col +"_" +'ALL'
    dicts = df1.groupby(col)['result'].count().to_dict()
    df1[nm] = df1[col].map(dicts).astype(int)
    
    nm = col + "_wight"
    df1[nm] =  df1[col+'_3'] / df1[col+'_ALL']
    
def three_mood(df1, df2, col, start, end):
    wight = df1[col + "_wight"]
    nm = col + str(start) + "_" + str(end) + '_3'
    dicts = df1[start:end].groupby(col)['result'].sum().to_dict()
    df1[nm] = df1[col].map(dicts).astype(int)

    df2[nm] = df2[col].map(dicts)
    df2[nm] = df2.apply(lambda x: fillResultNew(x[nm], 0), axis=1)
    df2[nm] = df2[nm].astype(int)

    nm = col + str(start) + "_" + str(end) + '_ALL'
    dicts = df1[start:end].groupby(col)['result'].count().to_dict()
    df1[nm] = df1[col].map(dicts).astype(int)
    
    df2[nm] = df2[col].map(dicts)
    df2[nm] = df2.apply(lambda x: fillResultNew(x[nm], 0), axis=1)
    df2[nm] = df2[nm].astype(int)

    nm = col + str(start) + "_" + str(end) + '_0'
    df1[nm] = df1[col+'_ALL'] - df1[col+'_3']
    df2[nm] = df2[col+'_ALL'] - df2[col+'_3']

    nm_RES = col + str(start) + "_" + str(end) + '_RES'

    df1[nm_RES] = 1
    if df1[col + "_3"] - 3 * df1[col + "_0"] > 0:
        df1[nm] = 3
    if df1[col + "_3"] - 3 * df1[col + "_0"] > 0:
        df1[nm] = 0

    df2[nm_RES] = 1
    if df2[col + "_3"] - 3 * df2[col + "_0"] > 0:
        df2[nm] = 3
    if df2[col + "_3"] - 3 * df2[col + "_0"] > 0:
        df2[nm] = 0
    nm = col + str(start) + "_" + str(end) + '_CON'

    df1[nm] = 0

    if df1[nm_RES] == 3:
        df1[nm] = (df1[col+'_3'] - 3 * df1[col+'_0']) * wight

    if df1[nm_RES] == 0:
        df1[nm] = (df1[col+'_0'] - 3 * df1[col+'_3']) * (1-wight)

    df2[nm] = 0

    if df2[nm_RES] == 3:
        df2[nm] = (df2[col+'_3'] - 3 * df2[col+'_0']) * wight

    if df1[nm_RES] == 0:
        df2[nm] = (df1[col+'_0'] - 3 * df2[col+'_3']) * (1-wight)

In [49]:
from itertools import combinations
features = ["typeLinStart","typeLinEnd","typeMidStart","typeMidEnd","typeZaoStart","typeZaoEnd"]

cross_features=list(combinations(features, 3))

for item in cross_features:
    nm = item[0]+'_'+item[1]+"_"+item[2]
    train[nm]=train[item[0]].astype(str)+train[item[1]].astype(str)+train[item[2]].astype(str)
    test[nm]=test[item[0]].astype(str)+test[item[1]].astype(str)+test[item[2]].astype(str)

In [50]:
for item in cross_features:
    nm = item[0]+'_'+item[1]+"_"+item[2]
    get_wight(nm,train)
    three_mood(train,test,nm,0,50000)
    three_mood(train,test,nm,50000,100000)
    three_mood(train,test,nm,100000,150000)
    three_mood(train,test,nm,150000,200000)
    three_mood(train,test,nm,200000,train.shape[0]-1)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [43]:
train.head

Unnamed: 0,masterMidGoal,guestMidGoal,daxiaoMasterStartOdd,daxiaoGuestStartOdd,daxiaoPankouStart,daxiaoMasterOdd,daxiaoGuestOdd,daxiaoPankou,daxiaoMasterStartOddMid,daxiaoGuestStartOddMid,...,typeLinEnd_typeMidStart_typeMidEnd,typeLinEnd_typeMidStart_typeZaoStart,typeLinEnd_typeMidStart_typeZaoEnd,typeLinEnd_typeMidEnd_typeZaoStart,typeLinEnd_typeMidEnd_typeZaoEnd,typeLinEnd_typeZaoStart_typeZaoEnd,typeMidStart_typeMidEnd_typeZaoStart,typeMidStart_typeMidEnd_typeZaoEnd,typeMidStart_typeZaoStart_typeZaoEnd,typeMidEnd_typeZaoStart_typeZaoEnd
1,0,0,0.95,0.85,2.75,0.85,0.95,2.5,0.91,0.89,...,0_0.85_2.50_0.91_1.250_0.86_1.25,0_0.85_2.50_0.91_1.250_-9999.0_1.25,0_0.85_2.50_0.91_1.250_-9999.0_1.25,0_0.85_2.50_0.86_1.250_-9999.0_1.25,0_0.85_2.50_0.86_1.250_-9999.0_1.25,0_0.85_2.50_-9999.0_1.250_-9999.0_1.25,0_0.91_1.250_0.86_1.250_-9999.0_1.25,0_0.91_1.250_0.86_1.250_-9999.0_1.25,0_0.91_1.250_-9999.0_1.250_-9999.0_1.25,0_0.86_1.250_-9999.0_1.250_-9999.0_1.25
2,1,0,0.95,0.85,3.25,0.95,0.85,3.25,0.96,0.80,...,1_0.95_3.251_0.96_2.751_1.01_2.75,1_0.95_3.251_0.96_2.751_-9999.0_2.75,1_0.95_3.251_0.96_2.751_-9999.0_2.75,1_0.95_3.251_1.01_2.751_-9999.0_2.75,1_0.95_3.251_1.01_2.751_-9999.0_2.75,1_0.95_3.251_-9999.0_2.751_-9999.0_2.75,1_0.96_2.751_1.01_2.751_-9999.0_2.75,1_0.96_2.751_1.01_2.751_-9999.0_2.75,1_0.96_2.751_-9999.0_2.751_-9999.0_2.75,1_1.01_2.751_-9999.0_2.751_-9999.0_2.75
3,0,2,0.80,0.96,3.0,0.90,0.90,3.0,0.94,0.86,...,2_0.9_3.02_0.94_3.752_1.0_3.75,2_0.9_3.02_0.94_3.752_-9999.0_3.75,2_0.9_3.02_0.94_3.752_-9999.0_3.75,2_0.9_3.02_1.0_3.752_-9999.0_3.75,2_0.9_3.02_1.0_3.752_-9999.0_3.75,2_0.9_3.02_-9999.0_3.752_-9999.0_3.75,2_0.94_3.752_1.0_3.752_-9999.0_3.75,2_0.94_3.752_1.0_3.752_-9999.0_3.75,2_0.94_3.752_-9999.0_3.752_-9999.0_3.75,2_1.0_3.752_-9999.0_3.752_-9999.0_3.75
4,0,0,0.88,0.88,3.25,0.80,1.00,3.25,0.93,0.87,...,0_0.8_3.250_0.93_1.750_0.85_1.75,0_0.8_3.250_0.93_1.750_-9999.0_1.75,0_0.8_3.250_0.93_1.750_-9999.0_1.75,0_0.8_3.250_0.85_1.750_-9999.0_1.75,0_0.8_3.250_0.85_1.750_-9999.0_1.75,0_0.8_3.250_-9999.0_1.750_-9999.0_1.75,0_0.93_1.750_0.85_1.750_-9999.0_1.75,0_0.93_1.750_0.85_1.750_-9999.0_1.75,0_0.93_1.750_-9999.0_1.750_-9999.0_1.75,0_0.85_1.750_-9999.0_1.750_-9999.0_1.75
5,0,0,0.97,0.89,2.75,0.96,0.90,2.75,1.02,0.84,...,0_0.96_2.750_1.02_1.50_1.12_1.5,0_0.96_2.750_1.02_1.50_1.0099999904632568_1.5,0_0.96_2.750_1.02_1.50_1.0099999904632568_1.5,0_0.96_2.750_1.12_1.50_1.0099999904632568_1.5,0_0.96_2.750_1.12_1.50_1.0099999904632568_1.5,0_0.96_2.750_1.0099999904632568_1.50_1.0099999...,0_1.02_1.50_1.12_1.50_1.0099999904632568_1.5,0_1.02_1.50_1.12_1.50_1.0099999904632568_1.5,0_1.02_1.50_1.0099999904632568_1.50_1.00999999...,0_1.12_1.50_1.0099999904632568_1.50_1.00999999...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307206,2,0,0.93,0.87,3.0,0.87,0.93,2.5,1.05,0.81,...,2_0.87_2.52_1.05_3.752_0.93_3.75,2_0.87_2.52_1.05_3.752_0.8100000023841858_3.75,2_0.87_2.52_1.05_3.752_0.9399999976158142_3.75,2_0.87_2.52_0.93_3.752_0.8100000023841858_3.75,2_0.87_2.52_0.93_3.752_0.9399999976158142_3.75,2_0.87_2.52_0.8100000023841858_3.752_0.9399999...,2_1.05_3.752_0.93_3.752_0.8100000023841858_3.75,2_1.05_3.752_0.93_3.752_0.9399999976158142_3.75,2_1.05_3.752_0.8100000023841858_3.752_0.939999...,2_0.93_3.752_0.8100000023841858_3.752_0.939999...
307207,2,0,0.93,0.87,3.0,0.87,0.93,2.5,1.05,0.81,...,2_0.87_2.52_1.05_3.752_0.93_3.75,2_0.87_2.52_1.05_3.752_-9999.0_3.75,2_0.87_2.52_1.05_3.752_-9999.0_3.75,2_0.87_2.52_0.93_3.752_-9999.0_3.75,2_0.87_2.52_0.93_3.752_-9999.0_3.75,2_0.87_2.52_-9999.0_3.752_-9999.0_3.75,2_1.05_3.752_0.93_3.752_-9999.0_3.75,2_1.05_3.752_0.93_3.752_-9999.0_3.75,2_1.05_3.752_-9999.0_3.752_-9999.0_3.75,2_0.93_3.752_-9999.0_3.752_-9999.0_3.75
307208,1,0,0.98,0.82,2.75,1.00,0.80,2.5,0.77,1.09,...,1_1.0_2.51_0.77_2.251_0.82_2.25,1_1.0_2.51_0.77_2.251_0.8600000143051147_2.25,1_1.0_2.51_0.77_2.251_0.949999988079071_2.25,1_1.0_2.51_0.82_2.251_0.8600000143051147_2.25,1_1.0_2.51_0.82_2.251_0.949999988079071_2.25,1_1.0_2.51_0.8600000143051147_2.251_0.94999998...,1_0.77_2.251_0.82_2.251_0.8600000143051147_2.25,1_0.77_2.251_0.82_2.251_0.949999988079071_2.25,1_0.77_2.251_0.8600000143051147_2.251_0.949999...,1_0.82_2.251_0.8600000143051147_2.251_0.949999...
307209,1,0,0.98,0.82,2.75,1.00,0.80,2.5,0.77,1.09,...,1_1.0_2.51_0.77_2.251_0.82_2.25,1_1.0_2.51_0.77_2.251_-9999.0_2.25,1_1.0_2.51_0.77_2.251_-9999.0_2.25,1_1.0_2.51_0.82_2.251_-9999.0_2.25,1_1.0_2.51_0.82_2.251_-9999.0_2.25,1_1.0_2.51_-9999.0_2.251_-9999.0_2.25,1_0.77_2.251_0.82_2.251_-9999.0_2.25,1_0.77_2.251_0.82_2.251_-9999.0_2.25,1_0.77_2.251_-9999.0_2.251_-9999.0_2.25,1_0.82_2.251_-9999.0_2.251_-9999.0_2.25


In [None]:
test.info()

In [None]:
train_x =  train.drop(columns=['result'])
train_y =  train['result']

test_x =  test.drop(columns=['result'])
test_y =  test['result']


In [None]:
import category_encoders as ce

cat_features = train_x.select_dtypes(include='object').columns
# test_x[cat_features] = test_x[cat_features].astype('category')
# train_x[cat_features] = train_x[cat_features].astype('category')

def encode_LE(col,train,test):
    df_comb = pd.concat([train[col],test[col]],axis=0)
    df_comb,_ = df_comb.factorize(sort=True)
    nm = col
    if df_comb.max()>32000: 
        train[nm] = df_comb[:len(train)].astype('int32')
        test[nm] = df_comb[len(train):].astype('int32')
    else:
        train[nm] = df_comb[:len(train)].astype('int16')
        test[nm] = df_comb[len(train):].astype('int16')
    del df_comb; x=gc.collect()

for col in cat_features:
    encode_LE(col,train_x,test_x)
    
# target_enc = ce.TargetEncoder(cols=cat_features)
# target_enc.fit(train_x[cat_features], train_y)

# train_x[cat_features] = target_enc.transform(train_x[cat_features])

# test_x[cat_features] = target_enc.transform(test_x[cat_features])

# import category_encoders as ce

# cat_features = train_x.select_dtypes(include='category').columns

# target_enc = ce.TargetEncoder(cols=cat_features)
# target_enc.fit(train_x[cat_features], train_y)

# train_x[cat_features] = target_enc.transform(train_x[cat_features])

# test_x[cat_features] = target_enc.transform(test_x[cat_features])

In [None]:
from itertools import combinations, permutations
categories=features.select_dtypes(include='category')
categories=categories.astype(str)
cross_features=list(combinations(categories.columns.values.tolist(), 2))
for item in cross_features:
    categories[item[0]+'_'+item[1]]=categories[item[0]].astype(str)+categories[item[1]].astype(str)
categories.shape

In [None]:
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

lgb_train = lgb.Dataset(train_x, train_y)

# 参数设置
params = {'num_leaves': 240, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,	#提取的特征比率
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,				#l1正则
          'lambda_l2': 5.985747612243422e-07,		#l2正则
          "verbosity": -1,
          "nthread": -1,		#线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},	##评价函数选择
          "random_state": 2019,	#随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          'min_child_samples': 67,
          }


print('Starting training...')
# 模型训练
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)


# 模型预测
y_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)

accuracy = roc_auc_score(test_y, y_pred)
print("accuarcy: %.2f%%" % (accuracy*100.0))

cv_results = lgb.cv(params, lgb_train, num_boost_round=500, nfold=5, 
                    verbose_eval=20, early_stopping_rounds=40)

np.array(cv_results["auc-mean"]).mean()

In [None]:
fig, ax = plt.subplots(figsize=(10,10))
lgb.plot_importance(gbm, max_num_features=60, height=0.5, ax=ax)
plt.show()

In [None]:
import gc
X=train_x
y=train_y
# del train_x,train_y
# gc.collect()

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
params = {'num_leaves': 240, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,	#提取的特征比率
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,				#l1正则
          'lambda_l2': 5.985747612243422e-07,		#l2正则
          "verbosity": -1,
          "nthread": -1,				#线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},	##评价函数选择
          "random_state": 2019,	#随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          'min_child_samples': 67,
          }

NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_oof = np.zeros(X.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(params, dtrain, 1000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=100)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(y_pred_valid)
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()

print(f"\nMean AUC = {score}")

In [None]:
import seaborn as sns
feature_importance_gain=pd.DataFrame()
feature_importance_gain['feature']=columns
feature_importance_gain['fold_1']=clf.feature_importance(importance_type='gain')
feature_importance_gain['average'] = feature_importances[[f'fold_{fold_n + 1}' for fold_n in range(1)]].mean(axis=1)
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importance_gain.sort_values(by='average', ascending=False).head(50), x='average', y='feature');
plt.title('100 TOP feature importance over {} folds average'.format(folds.n_splits));

In [None]:
import sweetviz as sv
# 可以选择目标特征
my_report = sv.analyze(train, target_feat ='result')
my_report.show_html()

In [None]:
import sweetviz as sv
# 可以选择目标特征
my_report = sv.analyze(test, target_feat ='result')
my_report.show_html()

In [None]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(loss_function="Logloss",
                           eval_metric="AUC",
                           learning_rate=0.01,
                           iterations=7000,
                           l2_leaf_reg=50,
                           random_seed=432013,
                           od_type="Iter",
                           depth=5,
                           early_stopping_rounds=15000,
                           border_count=64
                           #has_time= True 
                          )

clf.fit(train_x, train_y)

prediction = clf.predict_proba(test_x)[:,1]

accuracy = roc_auc_score(test_y, prediction)
print("accuarcy: %.2f%%" % (accuracy*100.0))

In [None]:
from lightgbm import LGBMClassifier

params = {'num_leaves': 240, #结果对最终效果影响较大，越大值越好，太大会出现过拟合
          'min_data_in_leaf': 30,
          'objective': 'binary', #定义的目标函数
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,	#提取的特征比率
          "bagging_freq": 4,
          "bagging_fraction":0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,				#l1正则
          'lambda_l2': 5.985747612243422e-07,		#l2正则
          "verbosity": -1,
          "nthread": -1,		#线程数量，-1表示全部线程，线程越多，运行的速度越快
          'metric': {'binary_logloss', 'auc'},	##评价函数选择
          "random_state": 2019,	#随机数种子，可以防止每次运行的结果不一致
          # 'device': 'gpu' ##如果安装的事gpu版本的lightgbm,可以加快运算
          'min_child_samples': 67,
          }


xgb = XGBClassifier()

gbm = LGBMClassifier(**params)

classifiers = [clf,gbm]

from combo.models.classifier_stacking import Stacking

clf = Stacking(base_estimators=classifiers, n_folds=5, shuffle_data=False,
             keep_original=True, use_proba=True, random_state=2021)

clf.fit(train_x, train_y)
y_test_predict = clf.predict_proba(test_x)

accuracy = roc_auc_score(test_y, y_test_predict)
print("accuarcy: %.2f%%" % (accuracy*100.0))