In [111]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
tab_data = '/Users/cbun/Dropbox/Cancer-Test-Problems/FOLFOX-Study/folfox-rma-all-label.tab'

## Helper Methods
`generate_datasets`: Randomly split and filter training/test data

`top_important_features`: Return a sorted list of the top features

In [112]:

def generate_datasets(X, y, training_pct=80, feature_list=None):
    # Generate training / test sets
    if feature_list: # Filter
        X = X[feature_list]
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=training_pct/100.0)
    train1 = len([y for y in y_train if y == 1])
    train0 = len([y for y in y_train if y == 0])
    test1 = len([y for y in y_test if y == 1])
    test0 = len([y for y in y_test if y == 0])
    print "Training set: {} samples. Y: 0=>{}, 1=>{}".format(len(X_train), train0, train1)
    print "Test set: {} samples. Y: 0=>{}, 1=>{}".format(len(X_test), test0, test1)
    return X_train, X_test, y_train, y_test

def top_important_features(X_train, num_features, rf_estimator):
    fi = rf_estimator.feature_importances_
    cols = X_train.columns
    features = [{'importance':f, 'name':n} for f,n in zip(fi,cols)]
    top = sorted(features, reverse=True)[:num_features]
    return top


## Import Data
Data is transposed and pandas did not like the mixed-types, so move some data around.

In [113]:
folfox_df = pd.read_csv(tab_data, sep='\t', skiprows=[1]).drop('identifier', 1)

# Transpose and relabel
folfox_data = folfox_df.transpose()[1:]
folfox_data.columns = folfox_data.iloc[0]
folfox_data = folfox_data.ix[1:, :]

# Get targets and encode as integers
from sklearn import preprocessing
response_row = pd.read_csv(tab_data, sep='\t', nrows=1)
response_col = response_row.transpose()[3:].ix[:,0]
le = preprocessing.LabelEncoder()
le.fit(response_col)
folfox_data_y = le.transform(response_col)

# Split training/test set 
#data = generate_datasets()


#### Hear's what the data looks like


In [30]:
print 'Full dataset: {} x {}'.format(*folfox_data.shape)
print folfox_data.head()

Full dataset: 83 x 54675
probe       1007_s_at   1053_at    117_at    121_at 1255_g_at   1294_at  \
gsm710801_R  11.19218  6.774634  8.343318  8.031042  3.327017  7.830977   
gsm710802_R  10.49652  7.804514  5.595647  8.247604  3.103164   6.50772   
gsm710803_R  10.59073  7.161205  5.526806  7.672077  3.365848  7.236786   
gsm710804_R  9.866778  7.137549  5.941534  7.515458  3.084327  8.178657   
gsm710805_R  9.834763  6.863142  5.754606  7.203127  3.174936  7.027632   

probe         1316_at   1320_at 1405_i_at   1431_at       ...        \
gsm710801_R  4.880257  4.675249    6.4275  8.615556       ...         
gsm710802_R   5.29335  4.727968  5.068135  3.759907       ...         
gsm710803_R  5.066042  4.735936  5.995342  3.760872       ...         
gsm710804_R  4.621971  4.551727  8.448129  6.275264       ...         
gsm710805_R  4.704785  4.258746  5.669158  3.750633       ...         

probe       AFFX-r2-Ec-bioD-3_at AFFX-r2-Ec-bioD-5_at AFFX-r2-P1-cre-3_at  \
gsm710801_R         

### Random forests with recursive feature elimination
This function will run iteratively, each time, trimming a percentage of the unimportant features.  It will continue to run until it reaches a `min_features` limit

In [114]:
def rf_eliminate(X_train, y_train, X_test, y_test, elim_pct=20,
                 start_n_estimators=10000, n_estimators=2000,
                 min_features=15, iter_num=0):
    if iter_num == 0:
        n_estimators = start_n_estimators

    # Get training / test sets
    # X_train, X_var, y_train, y_test = generate_datasets(X, y)

    # Run RF
    rf = RandomForestClassifier(n_estimators=n_estimators, oob_score=True)
    rf.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    print '{0}\nPass #{1}\n{0}'.format('='*20, iter_num+1)
    print 'Training set: {}x{}'.format(*X_train.shape)
    print 'OOB Score: {}'.format(rf.oob_score_)
    print 'Accuracy: {}'.format(score)
    print 

    # Get top features
    num_features = int(len(X_train.columns) * (1 - (elim_pct / 100.0)))
    top_info = top_important_features(X_train, num_features, rf)
    if num_features < min_features:
        top_probes = top_important_features(X_train, len(X_train.columns), rf)
        print 'Minimum Features reached'
        print 'Top Probes:'
        for i,probe in enumerate(top_probes):
            print '{}\t{}\t{}'.format(i+1, probe['name'], probe['importance'])
        return 
    top_info = top_important_features(X_train, num_features, rf)
    top_fnames =  [probe['name'] for probe in top_info]
    # Trim feature set and re-run
    
    trimmed_X_train = X_train[top_fnames]
    trimmed_X_test = X_test[top_fnames]
    rf_eliminate(trimmed_X_train, y_train, trimmed_X_test, y_test, elim_pct=elim_pct, n_estimators=n_estimators,
                 min_features=min_features, iter_num=iter_num+1)
    

In [115]:
X_train, X_test, y_train, y_test = generate_datasets(folfox_data, folfox_data_y)

Training set: 66 samples. Y: 0=>36, 1=>30
Test set: 17 samples. Y: 0=>5, 1=>12


### Experiment 1
* Initial number of estimators: 2000
* number of estimators thereafter: 500
* Percentage of features to eliminate each iteration: 20%

In [116]:
rf_eliminate(X_train, y_train, X_test, y_test, start_n_estimators=2000, n_estimators=500)

Pass #1
Training set: 66x54675
OOB Score: 0.636363636364
Accuracy: 0.647058823529

Pass #2
Training set: 66x43740
OOB Score: 0.606060606061
Accuracy: 0.647058823529

Pass #3
Training set: 66x34992
OOB Score: 0.651515151515
Accuracy: 0.647058823529

Pass #4
Training set: 66x27993
OOB Score: 0.681818181818
Accuracy: 0.647058823529

Pass #5
Training set: 66x22394
OOB Score: 0.69696969697
Accuracy: 0.647058823529

Pass #6
Training set: 66x17915
OOB Score: 0.757575757576
Accuracy: 0.647058823529

Pass #7
Training set: 66x14332
OOB Score: 0.742424242424
Accuracy: 0.647058823529

Pass #8
Training set: 66x11465
OOB Score: 0.772727272727
Accuracy: 0.588235294118

Pass #9
Training set: 66x9172
OOB Score: 0.80303030303
Accuracy: 0.647058823529

Pass #10
Training set: 66x7337
OOB Score: 0.787878787879
Accuracy: 0.647058823529

Pass #11
Training set: 66x5869
OOB Score: 0.787878787879
Accuracy: 0.647058823529

Pass #12
Training set: 66x4695
OOB Score: 0.787878787879
Accuracy: 0.647058823529

Pass #1

### Experiment 2
* Initial number of estimators: 10000
* number of estimators thereafter: 2000
* Percentage of features to eliminate each iteration: 50%

In [118]:
rf_eliminate(X_train, y_train, X_test, y_test, start_n_estimators=10000, n_estimators=2000, elim_pct=50)

Pass #1
Training set: 66x54675
OOB Score: 0.590909090909
Accuracy: 0.647058823529

Pass #2
Training set: 66x27337
OOB Score: 0.712121212121
Accuracy: 0.647058823529

Pass #3
Training set: 66x13668
OOB Score: 0.757575757576
Accuracy: 0.647058823529

Pass #4
Training set: 66x6834
OOB Score: 0.787878787879
Accuracy: 0.647058823529

Pass #5
Training set: 66x3417
OOB Score: 0.80303030303
Accuracy: 0.705882352941

Pass #6
Training set: 66x1708
OOB Score: 0.848484848485
Accuracy: 0.705882352941

Pass #7
Training set: 66x854
OOB Score: 0.878787878788
Accuracy: 0.647058823529

Pass #8
Training set: 66x427
OOB Score: 0.893939393939
Accuracy: 0.647058823529

Pass #9
Training set: 66x213
OOB Score: 0.924242424242
Accuracy: 0.647058823529

Pass #10
Training set: 66x106
OOB Score: 0.939393939394
Accuracy: 0.823529411765

Pass #11
Training set: 66x53
OOB Score: 0.924242424242
Accuracy: 0.588235294118

Pass #12
Training set: 66x26
OOB Score: 0.924242424242
Accuracy: 0.470588235294

Minimum Features re

In [117]:
rf_eliminate(X_train, y_train, X_test, y_test, start_n_estimators=10000, n_estimators=2000, elim_pct=75)

Pass #1
Training set: 66x54675
OOB Score: 0.606060606061
Accuracy: 0.647058823529

Pass #2
Training set: 66x13668
OOB Score: 0.742424242424
Accuracy: 0.705882352941

Pass #3
Training set: 66x3417
OOB Score: 0.833333333333
Accuracy: 0.705882352941

Pass #4
Training set: 66x854
OOB Score: 0.893939393939
Accuracy: 0.705882352941

Pass #5
Training set: 66x213
OOB Score: 0.924242424242
Accuracy: 0.764705882353

Pass #6
Training set: 66x53
OOB Score: 0.924242424242
Accuracy: 0.588235294118

Minimum Features reached
Top Probes:
1	226797_at	0.0346285073731
2	239897_at	0.0320172007759
3	226583_at	0.0317596476386
4	218079_s_at	0.0317246198582
5	208771_s_at	0.0276584730259
6	1556195_a_at	0.0253082247988
7	240187_at	0.0250950067663
8	218343_s_at	0.0236146270237
9	215704_at	0.0233113903379
10	219536_s_at	0.0221151129455
11	238568_s_at	0.0213012710018
12	219453_at	0.0208923243755
13	231336_at	0.0207808879756
14	243785_at	0.0195694291073
15	1563801_at	0.0194078021501
16	34031_i_at	0.0188908417961
17	

### //TODO: Cross Validation Stuff
Use a 10-fold stratified cross-validation and grid parameter search


In [None]:

from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold

rf = RandomForestClassifier()
cross_validation = StratifiedKFold(folfox_training_truth, n_folds=10, shuffle=True)
parameter_grid = {'n_estimators': [200, 500, 1000, 1500, 2000],
                  'min_samples_split': [2, 3, 4]}

grid_search = GridSearchCV(rf, param_grid=parameter_grid, cv=cross_validation)
grid_search.fit(folfox_training, folfox_training_truth)

print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

rf_best = grid_search.best_estimator_


Best score: 0.671875
Best parameters: {'min_samples_split': 2, 'n_estimators': 2000}
