In [1]:
import os
import sys
import time
import scipy
import pandas as pd
import numpy as np
from sklearn import ensemble, linear_model, tree, svm, metrics, model_selection, preprocessing, neighbors
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold, RepeatedKFold

file_path = "alldata.csv"

alldata = pd.read_csv(file_path, sep="\t")

data = alldata.iloc[:,8:].astype(float)

normals = (alldata.condition == 'normal') 
progeria = (alldata.condition == 'hgps')

data_normal = data[normals]
data_prog = data[progeria]

age_normal = alldata[normals].age 
age_prog = alldata[progeria].age

print(data_normal.shape)
print(age_normal.shape)

print(data_prog.shape)
print(age_prog.shape)

(133, 27142)
(133,)
(10, 27142)
(10,)


In [6]:
training_data_normal = data_normal
training_data_normal["age"] = age_normal

training_data_normal.to_csv('training_data_normal.tsv', sep='\t', index=False)
print(training_data_normal.shape)

print(data_prog.shape)
data_prog.to_csv('test_data_progeria.tsv', sep='\t', index=False, header=None)

tst = pd.read_csv("test_data_progeria.tsv", sep='\t', header=None)
print(tst)

test_data_progeria_labels = data_prog.copy()
test_data_progeria_labels['age'] = age_prog
test_data_progeria_labels.to_csv('test_data_progeria_labels.tsv', sep='\t', index=False, header=None)

#print(test_data_progeria_labels.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(133, 27143)
(10, 27142)
   0       1      2      3      4      5      6       7      8      9      \
0  0.277  18.963  0.333    0.0  0.814  0.000    0.0  34.686  5.738  1.487   
1  0.068  15.965  0.402    0.0  1.590  0.398    0.0  19.358  6.761  2.464   
2  0.164  19.677  0.386    0.0  0.989  0.132    0.0  35.587  5.944  1.603   
3  0.204  18.016  0.510    0.0  0.597  0.202    0.0  33.716  5.934  2.148   
4  0.107  18.419  0.418    0.0  0.993  0.151    0.0  35.167  5.971  1.726   
5  0.147  17.866  0.362    0.0  0.591  0.056    0.0  32.428  5.792  1.862   
6  0.180  18.088  0.435    0.0  0.665  0.054    0.0  37.091  6.769  1.994   
7  0.068  18.901  0.480    0.0  0.754  0.050    0.0  38.376  5.372  2.510   
8  0.080  21.639  0.291    0.0  0.643  0.058    0.0  39.586  6.059  1.784   
9  0.067  17.284  0.331    0.0  0.511  0.040    0.0  30.493  5.216  1.856   

   ...    27132  27133  27134  27135  27136   27137  27138   27139  27140  \
0  ...      0.0  0.071    0.0    0.0  5.528  23.81

In [50]:
print('Training started...')
start_time = time.time()

from sklearn.feature_selection import *
from sklearn.decomposition import *
from xgboost import XGBRegressor

n_jobs = 2
n_cv = 5
n_repeats = 5
random_state = 3111696

pipe_regressor = Pipeline([
  ('fs', SelectKBest(f_regression)),
  ('regression', linear_model.LinearRegression())
])

parameters = [
    {   
        'fs__k': [5890, 5895, 5900],
        'regression__normalize': [True]
    }
]

'''pipe_regressor = Pipeline([
  ('fs', SelectKBest(f_regression)),
  ('regression', linear_model.ElasticNet())
])

parameters = [
    {   
        'fs__k': [5890],
        'regression__normalize': [True],
        'regression__alpha': [0.0001],
    }
]'''

'''pipe_regressor = Pipeline([
  ('fs', SelectKBest(f_regression)),
  ('regression', svm.SVR())
])

parameters = [
    {   
        'fs__k': [5890, 5895, 5900],
        #'regression__C':[1],
        'regression__kernel':['poly'],
        'regression__degree':[1],
        'regression__epsilon':[6.5, 6.75, 7.0, 7.25, 7.5],
        #'regression__gamma':['auto']
    }
]'''


'''pipe_regressor = Pipeline([
  ('preprocessing', preprocessing.MinMaxScaler()),
  ('fs', SelectKBest(f_regression)),
  ('regression', neighbors.KNeighborsRegressor())
])

parameters = [
    {   
        'fs__k': [1000]
    }
]'''

'''pipe_regressor = Pipeline([
  ('preprocessing', preprocessing.MinMaxScaler()),
  ('fs', SelectKBest(f_regression)),
  ('regression', svm.LinearSVR())
])

parameters = [
    {   
        'fs__k': [4000, 5000, 6000, 7000],
        #'fs__k': [40, 50, 60, 70, 80, 90, 100],
        #'regression__C': [1.0, 2.0, 3.0],
        #'regression__loss': ['epsilon-insensitive ', 'squared_epsilon_insensitive'],
        #'regression__intercept_scaling': [0.0, 0.1, 0.25, 0.75, 1.0],
        #'regression__max_features': [None, 'auto', 'sqrt', 'log2'],
        #'regression__algorithm': ['auto'],
        #'regression__max_depth': [3, 4, 5],
        #'regression__alpha': [0.00001, 0.0001],
        #'regression__tol': [0.00001, 0.0001, 0.001],
        #'regression__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
        #'regression__max_subpopulation': [1],
        #'regression__copy_X': [True],
        #'regression__dual': [True, False]
    }
]'''

optimized_regressor = GridSearchCV(pipe_regressor, parameters,
                                       cv=KFold(n_splits=n_cv, shuffle=True, random_state=random_state), \
                                       error_score=0, scoring='r2', verbose=True, n_jobs=n_jobs,
                                       pre_dispatch="1*n_jobs")

optimized_regressor.fit(data_normal, age_normal)
best_regressor = optimized_regressor.best_estimator_
best_result = optimized_regressor.cv_results_
print(optimized_regressor.best_params_)

best_score = optimized_regressor.best_score_
print("R2 score for training: %.2f" % best_score)

print('Training finished')

print("Test on Progeria data...")
prediction = best_regressor.predict(data_prog)
test_r2 = metrics.r2_score(age_prog, prediction)
print("R2 score for test: %.2f" % test_r2)

mean_abs_error = np.mean([abs(a - b) for a, b in zip(age_prog, prediction)])
print("Mean absolute error for test: %.2f" % mean_abs_error)

median_abs_error = np.median([abs(a - b) for a, b in zip(age_prog, prediction)])
print("Median absolute error for test: %.2f" % median_abs_error)

end_time = time.time()
print("Time taken: %d" % int(end_time - start_time))

## Reproduce paper
## https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1599-6#Sec9

Training started...
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.


{'fs__k': 5895, 'regression__normalize': True}
R2 score for training: 0.75
Training finished
Test on Progeria data...
R2 score for test: -57.33
Mean absolute error for test: 14.05
Median absolute error for test: 12.36
Time taken: 1


[Parallel(n_jobs=2)]: Done  15 out of  15 | elapsed:    1.0s finished
  corr /= X_norms
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
