In [1]:
import pandas as pd
import numpy as np
import random

def undersampling(train, desired_apriori):
    
    target =  train.columns[len(train.columns)-1]
    # Get the indices per target value
    idx_0 = train[train[target] == 0].index
    idx_1 = train[train[target] != 0].index
    # Get original number of records per target value
    nb_0 = len(train.loc[idx_0])
    nb_1 = len(train.loc[idx_1])
    # Calculate the undersampling rate and resulting number of records with target=0
    undersampling_rate = ((1-desired_apriori)*nb_1)/(nb_0*desired_apriori)
    undersampled_nb_0 = int(undersampling_rate*nb_0)
    print('Rate to undersample records with target=0: {}'.format(undersampling_rate))
    print('Number of records with target=0 after undersampling: {}'.format(undersampled_nb_0))
    # Randomly select records with target=0 to get at the desired a priori
    undersampled_idx = random.sample(idx_0, undersampled_nb_0)
    # Construct list with remaining indices
    sampled_idx1 = random.sample(idx_1, len(idx_1))
    idx_list = undersampled_idx + sampled_idx1
    idx_list = random.sample(idx_list, len(idx_list))
    # Return undersample data frame
    train = train.loc[idx_list].reset_index(drop=True)
    return train

def easyensemble(df, desired_apriori, n_subsets=10):
    
    train_resample = undersampling(df, desired_apriori)
    
    for _ in range(n_subsets-1):
        sel_train = undersampling(df, desired_apriori)
        train_resample.append(sel_train)
#         print sel_train.iloc[0:4,:5]
#         print train_resample.iloc[0:4,:5]
    return train_resample


data_set = pd.read_csv("tap_fun_train.csv")
train = easyensemble(data_set, 0.5)
# print train
train.to_csv("under_sample_data.csv",index=None)

Rate to undersample records with target=0: 0.0205118689895
Number of records with target=0 after undersampling: 45988
Rate to undersample records with target=0: 0.0205118689895
Number of records with target=0 after undersampling: 45988
Rate to undersample records with target=0: 0.0205118689895
Number of records with target=0 after undersampling: 45988
Rate to undersample records with target=0: 0.0205118689895
Number of records with target=0 after undersampling: 45988
Rate to undersample records with target=0: 0.0205118689895
Number of records with target=0 after undersampling: 45988
Rate to undersample records with target=0: 0.0205118689895
Number of records with target=0 after undersampling: 45988
Rate to undersample records with target=0: 0.0205118689895
Number of records with target=0 after undersampling: 45988
Rate to undersample records with target=0: 0.0205118689895
Number of records with target=0 after undersampling: 45988
Rate to undersample records with target=0: 0.02051186898

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
    
all_under_sample = pd.read_csv('under_sample_data.csv')
dt1=pd.to_datetime(all_under_sample["register_time"])
all_under_sample["register_time"]=dt1.dt.dayofyear

all_data = all_under_sample.iloc[0:,1:len(all_under_sample.columns)-1]
#print all_df.head(5)
#标准化，返回值为标准化后的数据
all_under_sample_Standard = StandardScaler().fit_transform(all_data)
all_under_sample_data = pd.DataFrame(all_under_sample_Standard) 
all_under_sample_data.columns = all_under_sample.columns[1:len(all_under_sample.columns)-1]
all_under_sample_data['user_id'] = all_under_sample['user_id']
all_under_sample_data['prediction_pay_price'] = all_under_sample['prediction_pay_price']
all_under_sample_data.to_csv("under_sample_Standard_data.csv",index=None,columns=all_under_sample.columns)


In [3]:
pair = [[0,1],[1,3],[3,5],[5,8],[8,15],[15,25],[25,31],[31,42],
        [42,53],[53,60],[60,67],[67,74],[74,80],[80,82],[82,85]]
# for i in range(len(pair)):
#     print range(pair[i][0],pair[i][1])
#     print x_prime_train.iloc[0:,range(pair[i][0],pair[i][1])].columns

In [20]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from math import sqrt

def mse_mae_rmse(target, prediction):
    error = []
    for i in range(len(target)):
        error.append(target[i] - prediction[i])

    squaredError = []
    absError = []
    for val in error:
        squaredError.append(val * val)#target-prediction之差平方 
        absError.append(abs(val))#误差绝对值

    MSE = sum(squaredError) / len(squaredError)
    RMSE = sqrt(sum(squaredError) / len(squaredError))
    MAE =  sum(absError) / len(absError)
    return MSE, MAE, RMSE

all=pd.read_csv('under_sample_Standard_data.csv')
data = all.iloc[0:,1:]
x_prime = data.iloc[0:,0:len(data.columns)-1] #去掉y列，所以len-1
y = data.iloc[0:,len(data.columns)-1:]
x_prime_train, x_prime_test, y_train, y_test = train_test_split(x_prime, y, train_size=0.7, random_state=0)

#print pd.DataFrame(y_train.dtypes).apply(pd.value_counts) 
# print x_prime_train.head(50)
# print x_prime_test.head(50)
# print y_train.values
# print y_test

#for p in range(len(data.columns)-1):
for p in range(len(pair)):
    # 准备数据
#     x_train = x_prime_train.iloc[0:,range(p)+range(p+1,len(data.columns)-2)]#去掉y列之外再去掉一列特征，所以len-2
#     x_test = x_prime_test.iloc[0:,range(p)+range(p+1,len(data.columns)-2)]
    x_train = x_prime_train.iloc[0:,range(pair[p][0],pair[p][1])]
    x_test = x_prime_test.iloc[0:,range(pair[p][0],pair[p][1])]

    # 决策树学习
    model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
    model.fit(x_train, y_train.astype('int'))

    # 训练集上的预测结果
    y_train_pred = model.predict(x_train)
    acc_train = accuracy_score(y_train.astype('int'), y_train_pred)
    y_test_pred = model.predict(x_test)
    acc_test = accuracy_score(y_test.astype('int'), y_test_pred)
    
    MSE1, MAE1, RMSE1 = mse_mae_rmse(y_train['prediction_pay_price'].values, 
                                     y_train_pred)
    print '训练集 RMSE = ', RMSE1
    MSE2, MAE2, RMSE2 = mse_mae_rmse(y_test['prediction_pay_price'].values, 
                                     y_test_pred)
    print '测试集 RMSE =  ', RMSE2
    #if acc_test < 0.84:
#     print '特征：', x_prime_train.iloc[0:,range(pair[p][0],pair[p][1])].columns
#     print '\t训练集准确率: %.4f%%' % (100*acc_train)
#     print '\t测试集准确率: %.4f%%\n' % (100*acc_test)
#     print y_test_pred

训练集 RMSE =  328.226057525
测试集 RMSE =  390.461048413
训练集 RMSE =  271.035240927
测试集 RMSE =  340.645100777
训练集 RMSE =  319.055703298
测试集 RMSE =  387.895378499
训练集 RMSE =  327.487696075
测试集 RMSE =  390.143358934
训练集 RMSE =  281.606357919
测试集 RMSE =  353.368402992
训练集 RMSE =  285.398421098
测试集 RMSE =  353.21364106
训练集 RMSE =  315.917273826
测试集 RMSE =  376.756440339
训练集 RMSE =  301.289211998
测试集 RMSE =  371.366825909
训练集 RMSE =  288.322092036
测试集 RMSE =  355.526660556
训练集 RMSE =  325.122783305
测试集 RMSE =  389.089413079
训练集 RMSE =  325.584375614
测试集 RMSE =  389.579699207
训练集 RMSE =  318.048343289
测试集 RMSE =  384.152902411
训练集 RMSE =  320.018749485
测试集 RMSE =  381.017570098
训练集 RMSE =  326.435050207
测试集 RMSE =  390.023056125
训练集 RMSE =  270.838261295
测试集 RMSE =  333.835095592


In [24]:
for p in range(len(pair)):
    # 准备数据
#     x_train = x_prime_train.iloc[0:,range(p)+range(p+1,len(data.columns)-2)]#去掉y列之外再去掉一列特征，所以len-2
#     x_test = x_prime_test.iloc[0:,range(p)+range(p+1,len(data.columns)-2)]
    x_train = x_prime_train.iloc[0:,range(0,pair[p][0])+range(pair[p][1],85)]#去掉y列之外再去掉一列特征，所以len-2
    x_test = x_prime_test.iloc[0:,range(0,pair[p][0])+range(pair[p][1],85)]

    # 决策树学习
    model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2)
    model.fit(x_train, y_train.astype('int'))

    # 训练集上的预测结果
    y_train_pred = model.predict(x_train)
    acc_train = accuracy_score(y_train.astype('int'), y_train_pred)
    y_test_pred = model.predict(x_test)
    acc_test = accuracy_score(y_test.astype('int'), y_test_pred)
    #if acc_test < 0.84:
#     print '不要特征：', x_prime_train.iloc[0:,range(pair[p][0],pair[p][1])].columns
#     print '\t训练集准确率: %.4f%%' % (100*acc_train)
#     print '\t测试集准确率: %.4f%%\n' % (100*acc_test)
#     print y_test_pred
    MSE1, MAE1, RMSE1 = mse_mae_rmse(y_train['prediction_pay_price'].values, 
                                     y_train_pred)
    print '训练集 RMSE = ', RMSE1
    MSE2, MAE2, RMSE2 = mse_mae_rmse(y_test['prediction_pay_price'].values, 
                                     y_test_pred)
    print '测试集 RMSE =   ', RMSE2

训练集 RMSE =  249.729062264
测试集 RMSE =    339.647304512
训练集 RMSE =  244.466772871
测试集 RMSE =    336.285844427
训练集 RMSE =  250.824546007
测试集 RMSE =    334.871245359
训练集 RMSE =  259.477868383
测试集 RMSE =    307.893749499
训练集 RMSE =  256.677502895
测试集 RMSE =    336.010799484
训练集 RMSE =  252.688366717
测试集 RMSE =    317.385210292
训练集 RMSE =  254.703211478
测试集 RMSE =    335.829019009
训练集 RMSE =  247.632351522
测试集 RMSE =    333.461381402
训练集 RMSE =  262.361012584
测试集 RMSE =    340.492859187
训练集 RMSE =  250.886491328
测试集 RMSE =    317.475146309
训练集 RMSE =  246.34211996
测试集 RMSE =    322.301213446
训练集 RMSE =  246.101747274
测试集 RMSE =    332.097501292
训练集 RMSE =  240.982927707
测试集 RMSE =    326.551623769
训练集 RMSE =  254.284144672
测试集 RMSE =    335.602812235
训练集 RMSE =  262.291086362
测试集 RMSE =    343.202716002


In [25]:
    x_train = x_prime_train#.iloc[0:,range(0,pair[p][0])+range(pair[p][1],107)]#去掉y列之外再去掉一列特征，所以len-2
    x_test = x_prime_test#.iloc[0:,range(0,pair[p][0])+range(pair[p][1],107)]

    # 决策树学习
    model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
    model.fit(x_train, y_train.astype('int'))

    # 训练集上的预测结果
    y_train_pred = model.predict(x_train)
    acc_train = accuracy_score(y_train.astype('int'), y_train_pred)
    y_test_pred = model.predict(x_test)
    acc_test = accuracy_score(y_test.astype('int'), y_test_pred)
    #if acc_test < 0.84:
#     print '不要特征：', x_prime_train.iloc[0:,range(pair[p][0],pair[p][1])].columns
#     print '\t训练集准确率: %.4f%%' % (100*acc_train)
#     print '\t测试集准确率: %.4f%%\n' % (100*acc_test)
#     print y_test_pred
    MSE1, MAE1, RMSE1 = mse_mae_rmse(y_train['prediction_pay_price'].values, 
                                     y_train_pred)
    print '训练集 RMSE = ', RMSE1
    MSE2, MAE2, RMSE2 = mse_mae_rmse(y_test['prediction_pay_price'].values, 
                                     y_test_pred)
    print '测试集 RMSE =   ', RMSE2

训练集 RMSE =  266.373758367
测试集 RMSE =    335.249481122


In [26]:
print x_train.values

[[-0.79576042 -0.17682214 -0.16079714 ... -0.45977738 -0.09281969
  -0.33096724]
 [-1.17006939 -0.03401706 -0.16079714 ... -0.33765098 -0.08167265
   0.04617915]
 [ 1.73082508  1.33304537  2.19386698 ...  2.24445012  0.09803126
   1.55476473]
 ...
 [-1.07649215 -0.1769601  -0.16079714 ... -0.46612161 -0.09281969
  -0.33096724]
 [ 1.16936164 -0.17653309 -0.16079714 ... -0.46612161 -0.09281969
  -0.33096724]
 [-1.07649215 -0.17404646 -0.16079714 ... -0.44867498 -0.09281969
  -0.33096724]]
