In [1]:
import json
import pandas as pd
import pymongo
import matplotlib as mpl
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Integer columns (plain NumPy signed/unsigned dtypes) are converted to the
    smallest of int8/int16/int32 that can hold every value, but only when that
    type is actually narrower than the current one. Float columns wider than
    32 bits are converted to float32. Every other column (object, bool,
    datetime, pandas extension dtypes, ...) is left untouched.

    Memory usage before and after is printed, together with the relative
    reduction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate signed integer targets, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32)

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (nullable ints, categoricals, strings, ...) are not
        # NumPy dtypes; leave them alone instead of forcing them to float32.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                # Never "downcast" to a type that is not strictly narrower.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not divide by zero.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

In [3]:
def getGoalMid(masterGoal, guestGoal, masterMidGoal,guestMidGoal):
    """Return ``masterGoal + guestGoal - masterMidGoal - guestMidGoal``, capped at 4.

    Values of 4 or more are collapsed to the integer 4; smaller values
    (including negative ones) are returned unchanged.
    """
    remaining = masterGoal + guestGoal - masterMidGoal - guestMidGoal
    return 4 if remaining >= 4 else remaining
        
def removeSub(pankou):
    """Strip the "升" / "降" markers and surrounding whitespace from ``pankou``.

    ``pankou`` must be a string; the cleaned string is returned.
    """
    cleaned = pankou
    for marker in ("升", "降"):
        cleaned = cleaned.replace(marker, "")
    return cleaned.strip()

def getResult(masterGoal, guestGoal, masterMidGoal,guestMidGoal,pankou):
    """Compare the total goals with the line encoded in ``pankou``.

    ``pankou`` is cleaned with ``removeSub`` and split on "/"; the line is the
    mean of the resulting numbers. Returns 1 when ``masterGoal + guestGoal``
    is greater than or equal to the line, 0 when it is below it, and ``None``
    when neither comparison holds (e.g. NaN inputs).

    ``masterMidGoal`` and ``guestMidGoal`` are accepted but not used.
    """
    values = [float(part) for part in removeSub(pankou).split("/")]
    line_sum = 0
    for value in values:
        line_sum = line_sum + value
    margin = masterGoal + guestGoal - line_sum / len(values)

    if margin >= 0:
        return 1
    if margin < 0:
        return 0
    return None

def getType(yapanMasterStartOdd, yapanGuestStartOdd, yapanPankouStart):
    """Build a "<code>_<pankou>" label.

    ``<code>`` is the value returned by ``get18`` for the two odds, converted
    to a string; ``yapanPankouStart`` must already be a string.
    """
    odds_code = str(get18(yapanMasterStartOdd, yapanGuestStartOdd))
    return odds_code + "_" + yapanPankouStart

def get18(master,guest):
    """Encode how ``master`` compares with ``guest``.

    Returns 18 when ``master`` is larger, 81 when it is smaller and 99 when
    both are equal. If none of the comparisons is true (e.g. a NaN operand)
    the result is ``None``.
    """
    if master == guest:
        return 99
    if master < guest:
        return 81
    if master > guest:
        return 18
    return None

def daxiao_num(x):
    """Return the mean of the "/"-separated numbers in ``x`` as a string.

    For example "2.5/3" becomes "2.75" and "2" becomes "2.0".
    """
    parts = [float(piece) for piece in x.split("/")]
    total = 0
    for value in parts:
        total = total + value
    mean = float(total) / len(parts)
    return str(mean)

def realDaxiao(x,master,guest):
    """Return ``float(x) - master - guest`` formatted as a string."""
    line = float(x)
    remaining = line - master - guest
    return str(remaining)

def shengjiang(start,end):
    """Return the change from ``start`` to ``end`` (``end - start``) as a string.

    Both arguments are converted with ``float`` before subtracting.
    """
    final_value = float(end)
    initial_value = float(start)
    return str(final_value - initial_value)

In [4]:
def fillNa(x,value):
    """Return ``value`` when ``x`` is null according to ``pd.isnull``, else ``x``."""
    return value if pd.isnull(x) else x
    
def preF(test):
    """Build the label and derived features on ``test`` and filter its rows.

    Steps, in order:

    1. For every column containing nulls, add a boolean ``<col>_was_missing``
       column.
    2. Fill nulls in the ``...Zao`` pankou columns with the string "-9999" and
       in the ``...Zao`` odds columns with the number -9999.
    3. Add ``result`` (via ``getResult``), ``goalMid`` (via ``getGoalMid``)
       and ``zhongbifeng`` (``masterMidGoal + guestMidGoal``).
    4. Clean each of the six pankou columns with ``removeSub``, convert it
       with ``daxiao_num`` and add a ``<col>_real`` column via ``realDaxiao``.
    5. Keep only rows that pass the ``goalMid`` / odds-range / ``zhongbifeng``
       filters, then drop ``masterGoal``, ``guestGoal`` and ``goalMid``.

    The new columns are added to the frame that was passed in; the returned
    frame is the filtered result.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame containing the goal, pankou and odds columns referenced above.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the derived columns.
    """
    # Record missingness before any value is filled in.
    cols_with_missing = [col for col in test.columns
                         if test[col].isnull().any()]
    for col in cols_with_missing:
        test[col + '_was_missing'] = test[col].isnull()

    # Single-column transforms use Series.map: it applies the same helper to
    # each value without building a row object for every record.
    for col in ['daxiaoPankouZao','daxiaoPankouStartZao']:
        test[col] = test[col].map(lambda v: fillNa(v, "-9999"))

    for col in ['daxiaoMasterStartOddZao', 'daxiaoGuestStartOddZao','daxiaoMasterOddZao', 'daxiaoGuestOddZao']:
        test[col] = test[col].map(lambda v: fillNa(v, -9999))

    test['result'] = test.apply(lambda x: getResult(
        x['masterGoal'], x['guestGoal'], x['masterMidGoal'], x['guestMidGoal'], x['daxiaoPankouMid']), axis=1)

    test['goalMid'] = test.apply(lambda x: getGoalMid(
        x['masterGoal'], x['guestGoal'], x['masterMidGoal'], x['guestMidGoal']), axis=1)

    test['zhongbifeng'] = test['masterMidGoal'] + test['guestMidGoal']

    pankou = ["daxiaoPankouStart","daxiaoPankou","daxiaoPankouStartMid","daxiaoPankouMid","daxiaoPankouStartZao","daxiaoPankouZao"]

    for col in pankou:
        # Every pankou column is cleaned from its own values. The two Zao
        # columns were previously overwritten with the cleaned Mid columns,
        # which discarded their data and the "-9999" fill applied above.
        test[col] = test[col].map(removeSub).map(daxiao_num)
        nm = col+"_"+"real"
        test[nm] = test.apply(lambda x: realDaxiao(x[col],x['masterMidGoal'],x['guestMidGoal']), axis=1)

    # Row filters.
    test = test[test['goalMid'] >= 0]
    test = test[(test['daxiaoMasterStartOdd'] >= 0.75) & (test['daxiaoMasterStartOdd'] < 1.20)]
    test = test[(test['daxiaoMasterOdd'] >= 0.75) & (test['daxiaoMasterOdd'] < 1.20)]
    test = test[(test['daxiaoGuestOddMid'] >= 0.75) & (test['daxiaoGuestOddMid'] < 1.20)]
    test = test[test['zhongbifeng'].astype(float) <= test['daxiaoPankou'].astype(float)]
    # Final-score columns (and goalMid, derived from them) are removed.
    test = test.drop(columns=['masterGoal', 'guestGoal', "goalMid"])

    return test

In [11]:
def getShuiPing(x):
    """Map the numeric value ``x`` to an ordinal bucket code.

    Buckets (upper bound inclusive):

        x <= 0.75 -> 0      (0.75, 0.80] -> 1    (0.80, 0.85] -> 2
        (0.85, 0.90] -> 3   (0.90, 0.95] -> 4    (0.95, 1.00] -> 5
        (1.00, 1.08] -> 6   (1.08, 1.10] -> 7    (1.10, 1.15] -> 8
        (1.15, 1.20] -> 9   x > 1.20 -> 10

    Values that satisfy no comparison (NaN) get the fallback code 11.

    The buckets are contiguous and the codes increase with ``x``. The
    (0.90, 0.95] interval used to match no bucket and fell into the fallback
    code 11, and code 2 was never produced.
    """
    upper_bounds = (0.75, 0.80, 0.85, 0.90, 0.95, 1.00, 1.08, 1.10, 1.15, 1.20)
    for code, upper in enumerate(upper_bounds):
        if x <= upper:
            return code
    if x > upper_bounds[-1]:
        return len(upper_bounds)
    # Only reached when every comparison is false, i.e. x is NaN.
    return 11
        
def num_fea_dis(df,features):
    """Add a ``<feature>_shuiPing`` column for every name in ``features``.

    Each new column holds ``getShuiPing`` applied to the values of the source
    column. ``df`` is modified in place and also returned.
    """
    for feature in features:
        bucketed = df[feature].map(getShuiPing)
        df[feature + '_shuiPing'] = bucketed
    return df

In [None]:
import gc

def encode_CB(col1,col2,df1,df2):
    """Add a combined ``<col1>_<col2>`` string column to both frames.

    The new column is the string form of ``col1`` and ``col2`` joined by "_".
    ``df1`` and ``df2`` are modified in place; nothing is returned.
    """
    combined = col1 + '_' + col2
    for frame in (df1, df2):
        left = frame[col1].astype(str)
        right = frame[col2].astype(str)
        frame[combined] = left + '_' + right

def encode_CB3(col1,col2,col3,df1,df2,name=""):
    """Add a string column combining three columns to both frames.

    The column is named ``name`` or, when ``name`` is the empty string,
    ``<col1>_<col2>_<col3>``. Its values are the string forms of the three
    source columns joined by "_". ``df1`` and ``df2`` are modified in place;
    nothing is returned.
    """
    target = col1+'_'+col2+'_'+col3 if name == "" else name
    for frame in (df1, df2):
        first = frame[col1].astype(str)
        second = frame[col2].astype(str)
        third = frame[col3].astype(str)
        frame[target] = first+'_'+second+'_'+third

def encode_Count(df1, df2, col):
    """Add per-category statistics of ``result`` for ``col`` to both frames.

    The rows of ``df1`` and ``df2`` are pooled and grouped by ``col``; three
    columns are then added to each frame in place:

    * ``<col>_COUNT`` - sum of ``result`` within the row's category
    * ``<col>_ALL``   - number of ``result`` values within the category
    * ``<col>_CT``    - ``<col>_COUNT / <col>_ALL``

    NOTE(review): the statistics include ``df2``'s own ``result`` values, so
    if ``df2`` is a held-out set its labels leak into its features - confirm
    this is intended.
    """
    pooled = pd.concat([df1[[col, 'result']], df2[[col, 'result']]])
    per_category = pooled.groupby(col)['result']

    stats = (('_COUNT', per_category.sum()), ('_ALL', per_category.count()))
    for suffix, stat in stats:
        lookup = stat.to_dict()
        for frame in (df1, df2):
            frame[col + suffix] = frame[col].map(lookup).astype(int)

    for frame in (df1, df2):
        frame[col + '_CT'] = frame[col + '_COUNT'] / frame[col + '_ALL']

In [None]:
# Odds columns that get a "_shuiPing" bucket feature. The names follow the
# pattern daxiao<Master|Guest>[Start]Odd[Mid|Zao]; for each suffix the order is
# Master/Guest "Start" odds first, then the Master/Guest odds without "Start".
fes = [
    'daxiao' + side + phase + 'Odd' + suffix
    for suffix in ('', 'Mid', 'Zao')
    for phase in ('Start', '')
    for side in ('Master', 'Guest')
]

In [None]:
# Load the test set, shrink its dtypes, build the features/label and add the
# bucketed odds columns.
test = (
    pd.read_csv('test.csv')
    .pipe(reduce_mem_usage)
    .pipe(preF)
    .pipe(num_fea_dis, fes)
)

In [None]:
# Load the training set and run it through the same preparation steps as the
# test set.
train = (
    pd.read_csv('train.csv')
    .pipe(reduce_mem_usage)
    .pipe(preF)
    .pipe(num_fea_dis, fes)
)

In [None]:
# Combined categorical features, as (col1, col2, col3, new column name).
# Order matters: later entries are built from columns created by earlier ones.
type_specs = [
    ("daxiaoMasterStartOdd_shuiPing", "daxiaoGuestStartOdd_shuiPing", "daxiaoPankouStart", "typeLinStart"),
    ("daxiaoMasterOdd_shuiPing", "daxiaoGuestOdd_shuiPing", "daxiaoPankou", "typeLinEnd"),

    ("daxiaoMasterStartOddMid_shuiPing", "daxiaoGuestStartOddMid_shuiPing", "daxiaoPankouStartMid", "typeMidStart"),
    ("daxiaoMasterOddMid_shuiPing", "daxiaoGuestOddMid_shuiPing", "daxiaoPankouMid", "typeMidEnd"),

    ("daxiaoMasterStartOddZao_shuiPing", "daxiaoGuestStartOddZao_shuiPing", "daxiaoPankouStartZao", "typeZaoStart"),
    ("daxiaoMasterOddZao_shuiPing", "daxiaoGuestOddZao_shuiPing", "daxiaoPankouZao", "typeZaoEnd"),

    ("zhongbifeng", "typeLinStart", "typeLinEnd", "TypeLin"),
    ("zhongbifeng", "typeMidStart", "typeMidEnd", "TypeMid"),
    ("zhongbifeng", "typeZaoStart", "typeZaoEnd", "TypeZao"),
    ("zhongbifeng", "typeLinStart", "typeMidEnd", "TypeLinMid"),
    ("zhongbifeng", "typeZaoStart", "typeLinEnd", "TypeZaoLin"),
    ("TypeLin", "typeMidStart", "typeMidEnd", "TypeAll"),
    ("TypeAll", "typeZaoStart", "typeZaoEnd", "TypeAllZAO"),
]

for first_col, second_col, third_col, feature_name in type_specs:
    encode_CB3(first_col, second_col, third_col, train, test, feature_name)

In [None]:
# Count-encode the combined type features, keeping only the "<col>_CT" ratio:
# the intermediate "_ALL" and "_COUNT" columns are dropped again.
count_encoded_cols = ["TypeMid", "TypeLinMid", "TypeZao", "TypeZaoLin", "TypeAll", "TypeAllZAO"]
for col in count_encoded_cols:
    encode_Count(train, test, col)
    helper_cols = [col+'_ALL', col+'_COUNT']
    test = test.drop(columns=helper_cols)
    train = train.drop(columns=helper_cols)

In [None]:
# Separate the 'result' label from the feature columns for both sets.
train_x, train_y = train.drop(columns=['result']), train['result']
test_x, test_y = test.drop(columns=['result']), test['result']

In [None]:
# Object-dtype columns of the training features are converted to pandas
# 'category' dtype in both feature frames (test first, then train).
cat_features = train_x.select_dtypes(include='object').columns
for feature_frame in (test_x, train_x):
    feature_frame[cat_features] = feature_frame[cat_features].astype('category')

In [None]:
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

lgb_train = lgb.Dataset(train_x, train_y)

# Parameter settings
params = {'num_leaves': 240,  # strong effect on the final result; larger tends to fit better, too large overfits
          'min_data_in_leaf': 30,
          'objective': 'binary',  # objective function
          'max_depth': -1,
          'learning_rate': 0.03,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction":  0.5992677823884304,  # fraction of features sampled
          "bagging_freq": 4,
          "bagging_fraction": 0.7100471696361973,
          "bagging_seed": 11,
          "lambda_l1": 8.545500456265467e-05,  # L1 regularisation
          'lambda_l2': 5.985747612243422e-07,  # L2 regularisation
          "verbosity": -1,
          "nthread": -1,  # number of threads; -1 means all threads (more threads run faster)
          # Evaluation metrics; a list (not a set) keeps their order deterministic.
          'metric': ['binary_logloss', 'auc'],
          "random_state": 2019,  # random seed, keeps results consistent between runs
          # 'device': 'gpu'  # can speed things up if the GPU build of lightgbm is installed
          # NOTE(review): the LightGBM docs list min_child_samples as an alias of
          # min_data_in_leaf, which is also set above (30) - confirm which value
          # is intended and remove the other.
          'min_child_samples': 67,
          }


print('Starting training...')
# Model training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20)


# Model prediction
y_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# The reported score is ROC AUC (not accuracy), so it is named and labelled as such.
auc = roc_auc_score(test_y, y_pred)
print("AUC: %.2f%%" % (auc*100.0))

In [None]:
# cv_results = lgb.cv(params, lgb_train, num_boost_round=500, nfold=5, 
#                     verbose_eval=20, early_stopping_rounds=40)

# np.array(cv_results["auc-mean"]).mean()

In [None]:
# Bar chart of the feature importances of the trained model (at most 60 features).
fig, ax = plt.subplots(figsize=(10, 10))
lgb.plot_importance(gbm, ax=ax, height=0.5, max_num_features=60)
plt.show()