In [115]:
import os
import sys
import time
import scipy
import pandas as pd
import numpy as np
from sklearn import ensemble, linear_model, svm, metrics, model_selection, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold

# Wall-clock reference captured before any data is read.
start_time = time.time()

# The file carries a .csv extension but is parsed as tab-separated.
file_path = "alldata.csv"
alldata = pd.read_csv(file_path, sep="\t")

# Every column from position 8 onward is cast to float and used as the
# feature matrix.
# NOTE(review): presumably the first 8 columns are sample metadata - confirm.
data = alldata.iloc[:, 8:].astype(float)

# Boolean row masks, one per value of the `condition` column.
normals = alldata["condition"] == "normal"
progeria = alldata["condition"] == "hgps"

# Feature rows for each group.
data_normal = data.loc[normals]
data_prog = data.loc[progeria]

# Matching `age` values for each group.
age_normal = alldata.loc[normals, "age"]
age_prog = alldata.loc[progeria, "age"]

# Report the shape of each features/age pair, one per line.
print(
    data_normal.shape,
    age_normal.shape,
    data_prog.shape,
    age_prog.shape,
    sep="\n",
)

(133, 27142)
(133,)
(10, 27142)
(10,)


In [121]:
print('Training started...')

# Grid-search settings.
n_jobs = 6               # number of parallel workers passed to GridSearchCV
n_cv = 5                 # number of K-fold splits
random_state = 3111696   # seed for the shuffled K-fold split

# Min-max scale the features, then fit an ordinary least-squares regressor.
pipe_regressor = Pipeline([
    ('preprocessing', preprocessing.MinMaxScaler()),
    ('regression', linear_model.LinearRegression()),
])

# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2; confirm the pinned scikit-learn version before re-running this grid.
parameters = [
    {
        'regression__normalize': [True, False],
    }
]

# Alternative pipeline/grid configurations, disabled (they were previously
# held in a bare string literal, which is never executed):
#
# pipe_regressor = Pipeline([
#   ('preprocessing', preprocessing.MinMaxScaler()),
#   ('regression', linear_model.ElasticNet())
# ])
#
# parameters = [
#     {
#         'regression__normalize': [True, False],
#         'regression__alpha': [0, 0.001, 0.1],
#         'regression__l1_ratio': [0.5, 0.75, 1.0],
#     }
# ]
#
# pipe_regressor = Pipeline([
#   ('regression', svm.SVR())
# ])
#
# parameters = [
#     {
#         'regression__C':[0.001, 1],
#         'regression__kernel':['poly'],
#         'regression__degree':[1],
#         'regression__epsilon':[3.5, 3.25, 3.0],
#         'regression__gamma':['auto', 'scale']
#     }
# ]

# error_score is NaN rather than 0: R2 can be negative, so scoring a crashed
# fit as 0 could rank a failing candidate above one that actually fitted.
# With NaN a failed fit can never be ranked above a fitted candidate.
optimized_regressor = GridSearchCV(
    pipe_regressor,
    parameters,
    cv=KFold(n_splits=n_cv, shuffle=True, random_state=random_state),
    error_score=np.nan,
    scoring='r2',
    verbose=True,
    n_jobs=n_jobs,
    pre_dispatch="1*n_jobs",
)

# Only the 'normal' samples are used for the search.
optimized_regressor.fit(data_normal, age_normal)
best_regressor = optimized_regressor.best_estimator_
best_result = optimized_regressor.cv_results_
print(optimized_regressor.best_params_)

# best_score_ is the mean cross-validated R2 of the best candidate, i.e. a
# held-out estimate across the K folds - not a score on the training data,
# despite the label printed below.
best_score = optimized_regressor.best_score_
print("R2 score for training: %.2f" % best_score)

print('Training finished')

Training started...
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  10 out of  10 | elapsed:    3.0s finished


{'regression__normalize': False}
R2 score for training: 0.70
Training finished
