In [None]:
import sys
import time
import pandas as pd
from sklearn import model_selection
from sklearn import ensemble, linear_model, svm, metrics, cluster, tree
from xgboost import XGBRegressor
from matplotlib import pyplot as plt
import scipy
import os
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.feature_selection import *
from sklearn.decomposition import PCA, NMF
from sklearn.model_selection import GridSearchCV, KFold

start_time = time.time()

n_cv = 5
n_jobs = 4
random_state = 3111696
d_set = "Galaxy215-[rTRAIL.csv].tabular"

print("Processing dataset: %s" % d_set)
df = pd.read_csv("depmap/" + d_set, sep="\t")
columns = list(df.columns.values)
label = df[columns[-1]].copy()
data = df.drop(columns[-1], axis=1)
print(data.shape)
print(label.shape)

print('Feature extraction started...')

clf = ensemble.RandomForestRegressor(n_estimators=200)
model = SelectFromModel(clf)
model.fit(data, label)

extracted_features_data = model.transform(data)
print("Size low dimensional data: ", extracted_features_data.shape)

print('Feature finished')

Processing dataset: Galaxy215-[rTRAIL.csv].tabular
(604, 48117)
(604,)
Feature extraction started...


In [19]:

pipe_regressor_ld = Pipeline([
  ('regression_ld', XGBRegressor())
])

parameters_ld = [
    {   
        'regression_ld__max_depth': [4],
        'regression_ld__booster': ['gbtree'],
        'regression_ld__learning_rate': [0.1],
        'regression_ld__subsample': [0.7, 0.6, 0.5, 0.4],
        'regression_ld__colsample_bytree': [0.7, 0.6, 0.5, 0.4],
        'regression_ld__reg_lambda': [0],
        'regression_ld__reg_alpha': [0],
        'regression_ld__n_estimators': [50, 100, 200],
        'regression_ld__random_state': [random_state]
    }
]

optimized_regressor_ld = GridSearchCV(pipe_regressor_ld, parameters_ld, \
                                       cv=KFold(n_splits=n_cv, shuffle=True, random_state=random_state), \
                                       error_score=0, scoring='r2', verbose=True, n_jobs=n_jobs, \
                                       pre_dispatch="1*n_jobs")

optimized_regressor_ld.fit(extracted_features_data, label)
best_regressor_ld = optimized_regressor_ld.best_estimator_
best_result_ld = optimized_regressor_ld.cv_results_

print(optimized_regressor_ld.best_params_)
best_score_ld = optimized_regressor_ld.best_score_
print(best_score_ld)

end_time = time.time()
print('Total time taken: %d seconds' % int(end_time - start_time))

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  46 tasks      | elapsed:   29.7s
[Parallel(n_jobs=4)]: Done 196 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:  2.7min finished


{'regression_ld__booster': 'gbtree', 'regression_ld__colsample_bytree': 0.5, 'regression_ld__learning_rate': 0.1, 'regression_ld__max_depth': 4, 'regression_ld__n_estimators': 200, 'regression_ld__random_state': 3111696, 'regression_ld__reg_alpha': 0, 'regression_ld__reg_lambda': 0, 'regression_ld__subsample': 0.6}
0.032424726565594135
Total time taken: 432 seconds
