In [8]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Location of the tab-separated FOLFOX expression file. The default is the
# original author's local path; set the FOLFOX_TAB_DATA environment variable
# to point at the file on another machine without editing the notebook.
tab_data = os.environ.get(
    'FOLFOX_TAB_DATA',
    '/Users/cbun/Dropbox/Cancer-Test-Problems/FOLFOX-Study/folfox-rma-all-label.tab',
)


## Helper Methods
`generate_datasets`: Randomly split and filter training/test data

`top_important_features`: Return a sorted list of the top features

In [5]:

def generate_datasets(X, y, training_pct=80, feature_list=None):
    """Randomly split samples into a training set and a test set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix with one row per sample.
    y : array-like
        Targets aligned with the rows of ``X``. Must support boolean-mask
        indexing (for example a numpy array).
    training_pct : number, optional
        Approximate percentage of rows placed in the training set. Each row
        is assigned independently at random, so the actual split size varies
        from call to call.
    feature_list : sequence, optional
        Column labels to keep. ``None`` or an empty sequence keeps all
        columns.

    Returns
    -------
    dict
        Keys ``'training'``, ``'training_truth'``, ``'test'`` and
        ``'test_truth'``.
    """
    # Explicit None/length test so that numpy arrays and pandas Index objects
    # (whose truth value is ambiguous) are accepted as well as plain lists.
    if feature_list is not None and len(feature_list) > 0:  # Filter
        X = X[feature_list]

    # Size the mask from the data actually passed in, not from a notebook
    # global, so the helper works for any X.
    mask = np.random.rand(len(X)) < training_pct / 100.0
    return {
        'training': X[mask],
        'training_truth': y[mask],
        'test': X[~mask],
        'test_truth': y[~mask]
    }

def top_important_features(X_train, num_features, rf_estimator):
    """Return the most important features of a fitted estimator.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose columns are paired, in order, with the estimator's
        ``feature_importances_``.
    num_features : int
        Maximum number of features to return.
    rf_estimator : object
        Fitted estimator exposing ``feature_importances_``.

    Returns
    -------
    list of dict
        ``{'importance': ..., 'name': ...}`` records in descending order of
        importance, at most ``num_features`` long.
    """
    importances = rf_estimator.feature_importances_
    names = X_train.columns
    features = [{'importance': f, 'name': n} for f, n in zip(importances, names)]
    # Sort on an explicit key: dicts are not orderable on Python 3, and on
    # Python 2 the ordering only worked through implicit dict comparison.
    # Ties on importance are broken by name, matching that old behaviour.
    features.sort(key=lambda feat: (feat['importance'], feat['name']),
                  reverse=True)
    return features[:num_features]


## Import Data
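The file stores the data transposed (samples as columns), and pandas did not like the resulting mixed-type columns, so we read the response row separately and transpose the rest so that each row is a sample and each column is a probe.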

In [26]:
# Read the expression table. skiprows=[1] leaves out the second line of the
# file; that same line is read on its own further down (nrows=1) and supplies
# the response labels.
folfox_df = pd.read_csv(tab_data, sep='\t', skiprows=[1]).drop('identifier', axis=1)

# Transpose and relabel
folfox_data = folfox_df.transpose()[1:]
folfox_data.columns = folfox_data.iloc[0]
# Positional slice; .iloc replaces the .ix indexer, which newer pandas
# releases no longer provide.
folfox_data = folfox_data.iloc[1:, :]

# Get targets and encode as integers
from sklearn import preprocessing
response_row = pd.read_csv(tab_data, sep='\t', nrows=1)
response_col = response_row.transpose()[3:].iloc[:, 0]
le = preprocessing.LabelEncoder()
le.fit(response_col)
folfox_data_y = le.transform(response_col)

# The feature rows and the encoded targets are sliced independently above,
# so make sure they still describe the same number of samples.
assert len(folfox_data) == len(folfox_data_y), \
    'feature matrix and response vector have different lengths'


#### Here's what the data looks like


In [30]:
# Show the dimensions and the first rows of the expression matrix.
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the call style used by the cross-validation cell.
print('Full dataset: {} x {}'.format(*folfox_data.shape))
print(folfox_data.head())

Full dataset: 83 x 54675
probe       1007_s_at   1053_at    117_at    121_at 1255_g_at   1294_at  \
gsm710801_R  11.19218  6.774634  8.343318  8.031042  3.327017  7.830977   
gsm710802_R  10.49652  7.804514  5.595647  8.247604  3.103164   6.50772   
gsm710803_R  10.59073  7.161205  5.526806  7.672077  3.365848  7.236786   
gsm710804_R  9.866778  7.137549  5.941534  7.515458  3.084327  8.178657   
gsm710805_R  9.834763  6.863142  5.754606  7.203127  3.174936  7.027632   

probe         1316_at   1320_at 1405_i_at   1431_at       ...        \
gsm710801_R  4.880257  4.675249    6.4275  8.615556       ...         
gsm710802_R   5.29335  4.727968  5.068135  3.759907       ...         
gsm710803_R  5.066042  4.735936  5.995342  3.760872       ...         
gsm710804_R  4.621971  4.551727  8.448129  6.275264       ...         
gsm710805_R  4.704785  4.258746  5.669158  3.750633       ...         

probe       AFFX-r2-Ec-bioD-3_at AFFX-r2-Ec-bioD-5_at AFFX-r2-P1-cre-3_at  \
gsm710801_R         

### Random forests with recursive feature elimination
This function runs iteratively, trimming a percentage of the least important features on each pass. It continues until the number of features to keep would fall below the `min_features` limit.

In [20]:
def rf_eliminate(X, y, elim_pct=20,
                 start_n_estimators=10000, n_estimators=2000,
                 min_features=15, iter_num=0):
    """Recursively fit random forests, discarding the least important features.

    Every pass draws a new random training/test split with
    ``generate_datasets``, fits a ``RandomForestClassifier``, prints the
    score on the test split, and keeps the top ``100 - elim_pct`` percent of
    features ranked by ``top_important_features``. When the number of
    features to keep would drop below ``min_features``, the ranking of the
    current features is printed and the recursion stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample.
    y : array-like
        Targets aligned with the rows of ``X``.
    elim_pct : number, optional
        Percentage of features removed per pass; must be in ``(0, 100]``.
    start_n_estimators : int, optional
        Forest size for the first pass (``iter_num == 0``).
    n_estimators : int, optional
        Forest size for every later pass.
    min_features : int, optional
        Stop once fewer than this many features would be kept.
    iter_num : int, optional
        Zero-based pass counter; used by the recursion.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If ``elim_pct`` is not in ``(0, 100]``.
    """
    # A non-positive percentage would never shrink the feature set and the
    # recursion would not terminate.
    if not 0 < elim_pct <= 100:
        raise ValueError('elim_pct must be in (0, 100], got {}'.format(elim_pct))

    # Pick the forest size for this pass without overwriting `n_estimators`,
    # so the recursive call forwards the caller's value for later passes
    # instead of the (larger) first-pass size.
    pass_n_estimators = start_n_estimators if iter_num == 0 else n_estimators

    # Get training / test sets
    # NOTE(review): a new random split is drawn on every pass, so samples in
    # this pass's test set may have been used to rank features in earlier
    # passes; the printed scores may therefore be optimistic.
    data = generate_datasets(X, y)
    X_train = data['training']
    y_train = data['training_truth']
    X_test = data['test']
    y_test = data['test_truth']

    # Run RF
    rf = RandomForestClassifier(n_estimators=pass_n_estimators)
    rf.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    print('{0}\nPass #{1}\n{0}'.format('='*20, iter_num+1))
    print('Training set: {}x{}'.format(*X_train.shape))
    print('Score: {}'.format(score))

    # Number of features that survive this pass.
    num_features = int(len(X_train.columns) * (1 - (elim_pct / 100.0)))
    if num_features < min_features:
        # Stop: report the full ranking of the features still in play.
        top_probes = top_important_features(X_train, len(X_train.columns), rf)
        print('Minimum Features reached')
        print('Top Probes:')
        for i, probe in enumerate(top_probes):
            print('{}\t{}\t{}'.format(i+1, probe['name'], probe['importance']))
        return

    # Get top features
    top_info = top_important_features(X_train, num_features, rf)
    top_fnames = [probe['name'] for probe in top_info]

    # Trim feature set and re-run
    trimmed_X = X[top_fnames]
    rf_eliminate(trimmed_X, y, elim_pct=elim_pct,
                 start_n_estimators=start_n_estimators,
                 n_estimators=n_estimators,
                 min_features=min_features, iter_num=iter_num+1)
    

### Experiment 1
* Initial number of estimators: 2000
* number of estimators thereafter: 500
* Percentage of features to eliminate each iteration: 20%

In [31]:
# Experiment 1: start_n_estimators is the forest size requested for the first
# pass, n_estimators the size requested for the passes after it; elim_pct is
# left at its default.
rf_eliminate(
    folfox_data,
    folfox_data_y,
    start_n_estimators=2000,
    n_estimators=500,
)

Pass #1
Training set: 65x54675
Score: 0.777777777778
Pass #2
Training set: 70x43740
Score: 0.692307692308
Pass #3
Training set: 59x34992
Score: 0.708333333333
Pass #4
Training set: 65x27993
Score: 0.666666666667
Pass #5
Training set: 66x22394
Score: 0.823529411765
Pass #6
Training set: 67x17915
Score: 0.5625
Pass #7
Training set: 63x14332
Score: 0.75
Pass #8
Training set: 72x11465
Score: 0.727272727273
Pass #9
Training set: 71x9172
Score: 0.75
Pass #10
Training set: 72x7337
Score: 0.636363636364
Pass #11
Training set: 69x5869
Score: 0.642857142857
Pass #12
Training set: 72x4695
Score: 0.909090909091
Pass #13
Training set: 72x3756
Score: 0.636363636364
Pass #14
Training set: 62x3004
Score: 1.0
Pass #15
Training set: 68x2403
Score: 0.8
Pass #16
Training set: 71x1922
Score: 0.75
Pass #17
Training set: 66x1537
Score: 0.764705882353
Pass #18
Training set: 60x1229
Score: 1.0
Pass #19
Training set: 71x983
Score: 0.833333333333
Pass #20
Training set: 61x786
Score: 0.772727272727
Pass #21
Train

### Experiment 2
* Initial number of estimators: 10000
* number of estimators thereafter: 200
* Percentage of features to eliminate each iteration: 50%

In [28]:
# Experiment 2: larger first-pass forest, half of the features removed per pass.
# NOTE(review): the bullet list above this cell says 200 estimators after the
# first pass, but n_estimators=2000 is what is passed here; confirm which
# value is intended.
rf_eliminate(
    folfox_data,
    folfox_data_y,
    start_n_estimators=10000,
    n_estimators=2000,
    elim_pct=50,
)

Pass #1
Training set: 59x54675
Score: 0.625
Pass #2
Training set: 66x27337
Score: 0.705882352941
Pass #3
Training set: 65x13668
Score: 0.777777777778
Pass #4
Training set: 69x6834
Score: 0.785714285714
Pass #5
Training set: 64x3417
Score: 0.736842105263
Pass #6
Training set: 64x1708
Score: 0.842105263158
Pass #7
Training set: 68x854
Score: 0.866666666667
Pass #8
Training set: 61x427
Score: 0.954545454545
Pass #9
Training set: 63x213
Score: 0.9
Pass #10
Training set: 61x106
Score: 0.909090909091
Pass #11
Training set: 69x53
Score: 0.785714285714
Pass #12
Training set: 68x26
Score: 1.0
Minimum Features reached
Top Probes:
1	201455_s_at	0.0862959506777
2	218079_s_at	0.0846730966588
3	208771_s_at	0.0791105057464
4	244527_at	0.0430365568772
5	224886_at	0.040049900509
6	217913_at	0.0395160611498
7	237749_at	0.0393502551354
8	231850_x_at	0.0383067342518
9	208975_s_at	0.0380180602502
10	201454_s_at	0.0373384750416
11	1563801_at	0.0358980540044
12	226797_at	0.0358756828041
13	237054_at	0.035817

### //TODO: Cross Validation Stuff
Use a 10-fold stratified cross-validation and grid parameter search


In [None]:

# NOTE(review): sklearn.cross_validation and sklearn.grid_search are the
# module names of older scikit-learn releases; newer releases provide these
# tools in sklearn.model_selection with a different StratifiedKFold
# signature. Left unchanged to match the environment this notebook targets.
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold

# The training split used below is not created by any other cell in this
# notebook, so build it here to keep the cell runnable on a fresh kernel.
# NOTE(review): confirm that a default 80/20 random split of the full data
# matches how these variables were originally produced.
cv_split = generate_datasets(folfox_data, folfox_data_y)
folfox_training = cv_split['training']
folfox_training_truth = cv_split['training_truth']

rf = RandomForestClassifier()
# 10-fold stratified CV over the training labels, shuffled before splitting.
cross_validation = StratifiedKFold(folfox_training_truth, n_folds=10, shuffle=True)
parameter_grid = {'n_estimators': [200, 500, 1000, 1500, 2000],
                  'min_samples_split': [2, 3, 4]}

# Exhaustive search over the grid above, scored with the CV folds.
grid_search = GridSearchCV(rf, param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(folfox_training, folfox_training_truth)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

# Keep the best estimator found by the search for later use.
rf_best = grid_search.best_estimator_


Best score: 0.671875
Best parameters: {'min_samples_split': 2, 'n_estimators': 2000}
