In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder, minmax_scale, scale
from sklearn import tree
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors
from sklearn import ensemble
from sklearn.feature_selection import SelectFromModel, VarianceThreshold

In [2]:
def evalerror(y, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like supporting element-wise subtraction
        Ground-truth values.
    y_pred : array-like broadcastable against ``y``
        Predicted values.

    Returns
    -------
    The sum of squared residuals divided by ``len(y)``.
    """
    squared_residuals = np.square(y - y_pred)
    return np.sum(squared_residuals) / len(y)

In [3]:
# Load the training features, the test features and the training targets.
train, test, y = (
    pd.read_csv('train/train.csv'),
    pd.read_csv('train/test.csv'),
    pd.read_csv('train/y.csv'),
)

In [7]:
# Candidate feature set chosen by GBDT feature importance
clf_gt = ensemble.GradientBoostingRegressor(max_depth=6, n_estimators=400, random_state=1)
# The recorded run emitted scikit-learn's `column_or_1d` warning, i.e. `y` was
# passed as a column vector. Flatten it to 1-D so the regressor receives the
# shape it expects and the warning disappears.
clf_gt.fit(train, np.ravel(y))
model = SelectFromModel(clf_gt, prefit=True)
# NOTE: `train` / `test` are overwritten with the reduced feature matrices
# (columns are renumbered 0..k-1). Re-running this cell without reloading the
# CSV files will refit on the already-reduced frame.
train = pd.DataFrame(model.transform(train))
test = pd.DataFrame(model.transform(test))

  y = column_or_1d(y, warn=True)


In [8]:
# Initialise the population
def Init_Individual(feature):
    """Create a random population of 10 binary chromosomes.

    Each chromosome has one gene per entry of ``feature``; every gene is
    drawn independently from {0, 1} with ``np.random.randint``.

    Returns
    -------
    numpy.ndarray
        Array built from 10 rows of ``len(feature)`` genes each.
    """
    gene_count = len(feature)
    population = [
        [np.random.randint(0, 2) for _ in range(gene_count)]
        for _ in range(10)
    ]
    return np.array(population)


# Fitness function
def fitness(Individual, y, dataSet):
    """Score every chromosome by cross-validated linear regression.

    For each chromosome the columns of ``dataSet`` whose gene equals 1 are
    selected, a ``LinearRegression`` is evaluated with 10-fold
    ``cross_val_score`` (``neg_mean_squared_error``), and the fitness is
    ``0.1 - mean(|scores|)``.

    Parameters
    ----------
    Individual : indexable collection of chromosomes (``Individual[i]``)
    y : regression target passed through to ``cross_val_score``
    dataSet : pandas.DataFrame holding the candidate features

    Returns
    -------
    pandas.DataFrame
        Column ``'fintness'`` with the fitness values and column ``'Gene'``
        with the corresponding chromosomes, in input order.
    """
    regressor = linear_model.LinearRegression()
    scores = []
    chromosomes = []
    for position in range(len(Individual)):
        chromosome = Individual[position]

        # Pair every feature name with its gene and keep the switched-on ones.
        gene_table = pd.DataFrame(dataSet.columns, columns=['feature'])
        gene_table['gene'] = chromosome
        is_selected = gene_table['gene'] == 1
        selected_features = list(gene_table[is_selected]['feature'])

        cv_scores = cross_val_score(
            regressor,
            dataSet[selected_features],
            y,
            cv=10,
            scoring='neg_mean_squared_error',
        )
        scores.append(0.1 - np.mean(np.abs(cv_scores)))
        chromosomes.append(chromosome)

    Ind_fitness = pd.DataFrame(scores, columns=['fintness'])
    Ind_fitness['Gene'] = chromosomes
    return Ind_fitness


# Roulette-wheel (fitness-proportionate) selection of one individual
def Roulette_wheel(Fitness):
    """Pick one row of ``Fitness`` with probability proportional to its fitness.

    Parameters
    ----------
    Fitness : pandas.DataFrame
        Must contain a ``'fintness'`` column. The wheel assumes the values
        are non-negative; negative fitness makes the running total
        non-monotonic and skews the selection.

    Returns
    -------
    numpy.ndarray
        The values of the selected row (same column order as ``Fitness``).

    Raises
    ------
    ValueError
        If ``Fitness`` has no rows.
    """
    weights = Fitness['fintness'].values
    if len(weights) == 0:
        raise ValueError('Roulette_wheel needs at least one individual')

    sumFits = np.sum(Fitness['fintness'])
    rndPoint = np.random.uniform(0, sumFits)

    # Fall back to the last individual when rounding keeps the running total
    # just below the drawn point; previously the function returned None.
    chosen = len(weights) - 1
    accumulator = 0.0
    for ind, val in enumerate(weights):
        accumulator += val
        if accumulator >= rndPoint:
            chosen = ind
            break

    # Select by position: looking the row up by fitness value always returned
    # the first individual among those sharing the same fitness.
    return np.array(Fitness.iloc[chosen].values)
        
# Crossover operator
def Crossover_operator(Individual):
    """Two-point crossover between two distinct, randomly chosen parents.

    Parameters
    ----------
    Individual : pandas.DataFrame
        Needs an ``'Indi_index'`` column holding the values
        ``0 .. len(Individual) - 1`` and an ``'Indi_Gene'`` column holding
        one gene sequence per row.

    Returns
    -------
    tuple of numpy.ndarray
        ``(Father_gene, Mother_gene)``; each is a length-1 object array whose
        single element is the child's gene array, the same layout the
        original function returned.

    Raises
    ------
    ValueError
        If fewer than two individuals are supplied (two distinct parents
        cannot be drawn).
    """
    def _boxed(gene):
        # Keep the historical return layout: object array of shape (1,).
        box = np.empty(1, dtype=object)
        box[0] = gene
        return box

    n_individuals = len(Individual)
    if n_individuals < 2:
        raise ValueError('Crossover_operator needs at least two individuals')

    idx1 = np.random.randint(0, n_individuals)
    idx2 = np.random.randint(0, n_individuals)
    while idx2 == idx1:
        idx2 = np.random.randint(0, n_individuals)

    # Work on copies so the parents stored in `Individual` (which may be
    # shared between several selected rows) are never modified in place.
    father = np.array(
        Individual[Individual['Indi_index'] == idx1]['Indi_Gene'].values[0], copy=True)
    mother = np.array(
        Individual[Individual['Indi_index'] == idx2]['Indi_Gene'].values[0], copy=True)

    gene_len = len(father)
    # With fewer than two genes no pair of distinct cut points exists; the
    # children are then plain copies of the parents.
    if gene_len >= 2:
        crossPos_A = np.random.randint(0, gene_len)
        crossPos_B = np.random.randint(0, gene_len)
        while crossPos_A == crossPos_B:
            crossPos_B = np.random.randint(0, gene_len)

        if crossPos_A > crossPos_B:
            crossPos_A, crossPos_B = crossPos_B, crossPos_A

        # Exchange the whole segment [crossPos_A, crossPos_B). The original
        # used `if` where a loop was intended, so only one gene was swapped.
        segment = father[crossPos_A:crossPos_B].copy()
        father[crossPos_A:crossPos_B] = mother[crossPos_A:crossPos_B]
        mother[crossPos_A:crossPos_B] = segment

    return _boxed(father), _boxed(mother)

# Mutation operator
def Mutation_operator(Individual, mutation_rate=0.165):
    """Flip at most one randomly chosen gene per individual.

    For every row a gene position and a uniform random number are drawn;
    when the number is below ``mutation_rate`` the gene at that position is
    set to 1 if it was 0 and to 0 otherwise.

    Parameters
    ----------
    Individual : pandas.DataFrame
        Must contain an ``'Indi_Gene'`` column of mutable gene sequences.
        The sequences are modified in place.
    mutation_rate : float, optional
        Per-individual mutation probability (default 0.165, the value that
        used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        The same ``Individual`` object that was passed in.
    """
    genes = Individual['Indi_Gene']
    for position in range(len(Individual)):
        # Positional access: the former label lookup `[i]` failed whenever
        # the frame did not carry a default 0..n-1 index.
        gene = genes.iloc[position]
        mutatePos = np.random.randint(0, len(gene))
        theta = np.random.random()
        if theta < mutation_rate:
            gene[mutatePos] = 1 if gene[mutatePos] == 0 else 0
    return Individual

In [9]:
# Genetic algorithm
def Genetic_algorithm(Individual, train, y, iterm):
    """Evolve a population of feature masks for ``iterm`` generations.

    Each generation: evaluate fitness, draw parents with roulette-wheel
    selection, recombine them with ``Crossover_operator`` and apply
    ``Mutation_operator``. The offspring replace the whole population.

    Parameters
    ----------
    Individual : initial population, indexable as ``Individual[i]``
    train : pandas.DataFrame of candidate features
    y : regression target
    iterm : int
        Number of generations to run.

    Returns
    -------
    pandas.DataFrame
        Output of ``fitness`` for the final population, with the
        ``'fintness'`` column converted back via ``0.1 - fintness``.
    """
    population = Individual
    for generation in range(iterm):
        print('第 %d 代' % generation)
        fit = fitness(population, y, train)

        pop_size = len(population)
        index = list(range(pop_size))
        Roulette_gene = [Roulette_wheel(fit) for _ in index]

        Choice_gene = pd.DataFrame(Roulette_gene, columns=['fintness', 'Indi_Gene'])
        Choice_gene['Indi_index'] = index
        Choice_gene['fintness'] = 0.1 - Choice_gene['fintness']
        Choice_gene = Choice_gene.sort_values(['fintness'])

        # Produce exactly `pop_size` children (the original hard-coded five
        # crossover pairs, i.e. a population of 10).
        Cro_gene = []
        while len(Cro_gene) < pop_size:
            gene1, gene2 = Crossover_operator(Choice_gene)
            # Each result is a length-1 object array wrapping the gene; copy
            # the gene so later mutation cannot touch arrays shared with the
            # parents or with other children.
            Cro_gene.append(np.array(gene1[0], copy=True))
            Cro_gene.append(np.array(gene2[0], copy=True))
        Cro_gene = Cro_gene[:pop_size]

        Crossover_gene = pd.DataFrame({'Indi_Gene': Cro_gene})
        Crossover_gene['Indi_index'] = index

        New_gene = Mutation_operator(Crossover_gene)
        population = New_gene['Indi_Gene']

    # Score the population that actually came out of the last generation.
    # Previously the fitness of the population *before* the final
    # crossover/mutation was returned, and `iterm == 0` raised NameError.
    fit = fitness(population, y, train)
    fit['fintness'] = 0.1 - fit['fintness']
    return fit
        

In [18]:
# Initialise the population
Individual = Init_Individual(train.columns)

# Compute the fitness of every individual, convert it back with
# `0.1 - fintness` and rank ascending (best individual first).
fit = fitness(Individual, y, train)
fit['fintness'] = 0.1 - fit['fintness']
# `drop` is a boolean flag; the original passed the list ['index'], which only
# worked because a non-empty list is truthy.
fit = fit.sort_values(['fintness']).reset_index(drop=True)
fit

Unnamed: 0,fintness,Gene
0,0.040204,"[0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, ..."
1,0.040662,"[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, ..."
2,0.040862,"[0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."
3,0.041487,"[1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, ..."
4,0.042401,"[0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, ..."
5,0.042978,"[1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ..."
6,0.043455,"[1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, ..."
7,0.043758,"[0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, ..."
8,0.044766,"[1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, ..."
9,0.045582,"[1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, ..."


In [15]:
# Feature names switched on (gene == 1) in the individual stored at label 0 of `fit`.
Gene_sequence = (
    pd.DataFrame(train.columns, columns=['feature'])
    .assign(gene=fit['Gene'][0])
    .loc[lambda table: table['gene'] == 1, 'feature']
    .tolist()
)

In [14]:
def wmaeEval(preds, dtrain):
    """Custom evaluation callback: mean squared error under the name 'error'.

    Parameters
    ----------
    preds : array of predictions
    dtrain : object exposing ``get_label()`` that returns the true labels

    Returns
    -------
    tuple
        ``('error', sum((preds - label) ** 2) / len(label))``.
    """
    targets = dtrain.get_label()
    squared_residuals = np.square(preds - targets)
    return 'error', np.sum(squared_residuals) / len(targets)
# XGBoost hyper-parameters for the cross-validated evaluation
param = {
    'eta': 0.01,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.3,
}
num_round = 3300

# 5-fold CV on the features selected by the genetic algorithm
xgbTrain = xgb.DMatrix(train[Gene_sequence], label=y)
modle = xgb.cv(param, xgbTrain, num_round, feval=wmaeEval, nfold=5)
# Read the final-round held-out score by column name. `iloc[-1, 0]` depended
# on the column order of the CV result, which is not the same across xgboost
# releases (it can be a train column or the default metric instead).
print(modle['test-error-mean'].iloc[-1])

0.0300686
