In [4]:
import sys
import time
import pandas as pd
from sklearn import model_selection
from sklearn import ensemble, linear_model, svm, metrics, cluster, tree
from xgboost import XGBRegressor
from matplotlib import pyplot as plt
import scipy
import os
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import preprocessing
from sklearn.feature_selection import *
from sklearn.model_selection import GridSearchCV, KFold

start_time = time.time()

n_cv = 5
n_jobs = 6
random_state = 3111696
d_set = "data/Erlotinib.csv"

print("Loading dataset: %s" % d_set)
df = pd.read_csv("depmap/" + d_set, sep=",")
columns = list(df.columns.values)
label = df[columns[-1]].copy()
data = df.drop(columns[-1], axis=1)
print(data.shape)
print(label.shape)
print("Dataset loaded")

Loading dataset: data/Erlotinib.csv
(370, 17738)
(370,)
Dataset loaded


In [3]:
pipe_regressor = Pipeline([
  ('rfe', RFE(XGBRegressor(random_state=random_state, n_estimators=50), n_features_to_select=100)),
  ('regression', XGBRegressor(random_state=random_state, n_estimators=100))
])

parameters = [
    {
        'regression__max_depth': [3, 4, 5],
        'regression__subsample': [0.7],
        'regression__colsample_bytree': [0.5]
    }
]

optimized_regressor = GridSearchCV(pipe_regressor, parameters, \
                                       cv=KFold(n_splits=n_cv, shuffle=True, random_state=random_state), \
                                       error_score=0, scoring='r2', verbose=True, n_jobs=n_jobs, \
                                       pre_dispatch="1*n_jobs")

print("Fitting transformed dataset...")
optimized_regressor.fit(data, label)

best_regressor = optimized_regressor.best_estimator_
best_result = optimized_regressor.cv_results_

print(optimized_regressor.best_params_)
best_score = optimized_regressor.best_score_
print(" ")
print("R2 score: %.2f" % round(best_score, 2))

end_time = time.time()
print('Total time taken: %d seconds' % int(end_time - start_time))

Fitting transformed dataset...
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


KeyboardInterrupt: 

In [55]:
optimized_regressor.best_estimator_.named_steps["regression"].feature_importances_


array([0.03376623, 0.03290043, 0.03896104, 0.02683983, 0.02251082,
       0.02164502, 0.01904762, 0.01904762, 0.01818182, 0.01212121,
       0.01731602, 0.00519481, 0.01471861, 0.00606061, 0.01731602,
       0.00606061, 0.01298701, 0.01558442, 0.00952381, 0.00692641,
       0.01125541, 0.0025974 , 0.01385281, 0.00606061, 0.01212121,
       0.004329  , 0.01731602, 0.00606061, 0.01212121, 0.01298701,
       0.01038961, 0.01125541, 0.01125541, 0.00865801, 0.00865801,
       0.00692641, 0.01212121, 0.00692641, 0.004329  , 0.01038961,
       0.00779221, 0.00692641, 0.01038961, 0.00779221, 0.01125541,
       0.00606061, 0.00606061, 0.00692641, 0.00779221, 0.00692641,
       0.01038961, 0.00779221, 0.00952381, 0.01298701, 0.00692641,
       0.00692641, 0.01038961, 0.00779221, 0.0034632 , 0.00779221,
       0.00519481, 0.00952381, 0.0034632 , 0.0034632 , 0.00606061,
       0.01125541, 0.01038961, 0.01385281, 0.004329  , 0.0034632 ,
       0.00519481, 0.00606061, 0.01038961, 0.00865801, 0.01038

In [51]:
dir(pipe_regressor.named_steps["sfm"])

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_get_param_names',
 '_get_support_mask',
 'estimator',
 'estimator_',
 'fit',
 'fit_transform',
 'get_params',
 'get_support',
 'inverse_transform',
 'max_features',
 'norm_order',
 'partial_fit',
 'prefit',
 'set_params',
 'threshold',
 'threshold_',
 'transform']