In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn import manifold,datasets, linear_model,gaussian_process,svm,metrics,manifold,preprocessing
from sklearn.model_selection import train_test_split,GridSearchCV, KFold, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error
from scipy.stats import pearsonr
import warnings 
warnings.filterwarnings('ignore')
import cv2
import math 
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old 'seaborn' name, so prefer the new name
# and only fall back to the legacy one on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import scipy.io as sio

import gc

pd.options.display.max_rows = 1000
pd.set_option('display.max_columns', None)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# 1. 通过ridge回归效果来决定各种特征的处理方式

In [2]:
def R_value(y_test, y_pred):
    """Pearson correlation between true and predicted targets.

    Delegates to ``scipy.stats.pearsonr`` and hands its result back
    untouched, i.e. the (correlation coefficient, p-value) pair rather
    than a bare float.

    Args:
        y_test: 1-D sequence of ground-truth values.
        y_pred: 1-D sequence of predictions, same length as ``y_test``.

    Returns:
        Whatever ``pearsonr`` returns for the two sequences.
    """
    correlation = pearsonr(y_test, y_pred)
    return correlation

def mean_squared_error(y_test, y_pred):
    """Mean squared error between true and predicted targets.

    This definition shadows the ``mean_squared_error`` imported from
    ``sklearn.metrics`` at the top of the notebook, so it must compute the
    value itself: delegating to the same name would simply call this
    function again and recurse until ``RecursionError``.

    Args:
        y_test: array-like of ground-truth values.
        y_pred: array-like of predictions with the same shape as ``y_test``.

    Returns:
        float: the mean of the squared element-wise differences.

    Raises:
        ValueError: if the inputs have different shapes or are empty.
    """
    y_true_arr = np.asarray(y_test, dtype=float)
    y_pred_arr = np.asarray(y_pred, dtype=float)
    if y_true_arr.shape != y_pred_arr.shape:
        raise ValueError(
            "y_test and y_pred must have the same shape, got "
            + str(y_true_arr.shape) + " and " + str(y_pred_arr.shape)
        )
    if y_true_arr.size == 0:
        # np.mean of an empty array would silently return nan.
        raise ValueError("y_test and y_pred must not be empty")
    return float(np.mean((y_true_arr - y_pred_arr) ** 2))


def ridge_evaluator(X, y, random_state=0):
    """Score a feature matrix with nested cross-validated ridge regression.

    The outer loop is a shuffled 5-fold ``KFold`` passed to
    ``cross_validate``; inside every outer fold a ``GridSearchCV`` (cv=5)
    picks the ridge ``alpha`` from 10 log-spaced candidates between 1e-3
    and 1e3.

    Args:
        X: feature matrix handed to ``cross_validate``.
        y: target vector handed to ``cross_validate``.
        random_state: seed for the outer fold shuffle. A fixed default
            keeps scores comparable across calls and across kernel
            restarts; pass ``None`` for a different shuffle on every call.

    Returns:
        The dict produced by ``cross_validate``. Callers read
        ``'test_mse'`` and ``'test_R_value'`` from it.
    """
    # The keys are part of the return contract ('test_mse', 'test_R_value').
    # NOTE: despite the names, 'mse' is scored with the negated *root* mean
    # squared error and 'R_value' with R^2 (coefficient of determination).
    scoring_new = {'mse': 'neg_root_mean_squared_error', 'R_value': 'r2'}

    alpha_can = np.logspace(-3, 3, 10)
    # Seed the shuffle: without it every call draws different folds, so the
    # small score differences between feature sets are not reproducible.
    kfolds = KFold(n_splits=5, shuffle=True, random_state=random_state)
    ridge = linear_model.Ridge()
    ridge_model1 = GridSearchCV(ridge, param_grid={'alpha': alpha_can}, cv=5)
    score1 = cross_validate(ridge_model1, X, y, scoring=scoring_new, cv=kfolds)
    return score1

In [3]:
# Load the candidate feature blocks and rescale each one with its own
# MinMaxScaler (default settings), overwriting the entry in `data`.
# NOTE(review): each scaler is fitted on all rows before any cross-validation
# split is made, so held-out folds influence the scaling - confirm this is
# acceptable.
data = sio.loadmat('./datasets/candidate_data.mat')
scaled_keys = ('X_baseline', 'X_aug_median', 'X_aug_mean', 'X_onehot', 'X_cat_dim1', 'X_cat_dim2')
for key in scaled_keys:
    min_max_scaler = MinMaxScaler().fit(data[key])
    data[key] = min_max_scaler.transform(data[key])

In [7]:
# Evaluate the baseline features on their own.
# The label says "mse", but the value printed is whatever ridge_evaluator
# stores under 'test_mse', negated and averaged over folds.
score = ridge_evaluator(data['X_baseline'], data['Y'][0, :])
baseline_err = -score['test_mse'].mean()
baseline_r = score['test_R_value'].mean()
print("Baseline performance: mse:", baseline_err, ", R_value:", baseline_r, sep="")

Baseline performance: mse:126.41598241775655, R_value:0.2861158599634314


In [6]:
# Choose between the mean-imputed and median-imputed feature blocks:
# append each one to the baseline features and evaluate it.
for prefix, aug_key in (
    ("Mean performance: mse:", 'X_aug_mean'),        # mean imputation
    ("Median performance: mse:", 'X_aug_median'),    # median imputation
):
    candidate = np.concatenate([data['X_baseline'], data[aug_key]], axis=1)
    score = ridge_evaluator(candidate, data['Y'][0, :])
    print(prefix, -score['test_mse'].mean(), ", R_value:", score['test_R_value'].mean(), sep="")

Mean performance: mse:123.5037721739481, R_value:0.3247394008566504
Median performance: mse:123.6676953954631, R_value:0.3138993196141975


In [7]:
# Choose between the one-hot features and their dimension-reduced versions:
# each one is appended to baseline + median features and evaluated.
for prefix, cat_key in (
    ("Onehot performance: mse:", 'X_onehot'),      # raw one-hot encoding
    ("30 dim performance: mse:", 'X_cat_dim1'),    # reduced to 30 dimensions
    ("50 dim performance: mse:", 'X_cat_dim2'),    # reduced to 50 dimensions
):
    candidate = np.concatenate([data['X_baseline'], data['X_aug_median'], data[cat_key]], axis=1)
    score = ridge_evaluator(candidate, data['Y'][0, :])
    print(prefix, -score['test_mse'].mean(), ", R_value:", score['test_R_value'].mean(), sep="")

Onehot performance: mse:78.59966266981655, R_value:0.7177073841882633
30 dim performance: mse:108.864413457866, R_value:0.47324294313253185
50 dim performance: mse:100.75573842384455, R_value:0.5483374514459746


In [10]:
# Save the final feature set.
# The .mat file is reloaded first, so the blocks written out are the values
# stored on disk rather than any rescaled copies held in `data`.
data = sio.loadmat('./datasets/candidate_data.mat')
selected_blocks = [data[name] for name in ('X_baseline', 'X_aug_median', 'X_onehot')]
X = np.concatenate(selected_blocks, axis=1)
Y = data['Y'][0, :]

sio.savemat('ori_data.mat', {'X': X, 'Y': Y})