In [1]:
from sklearn import cross_validation
from scikits.statsmodels.tools import categorical
from sklearn import linear_model
import time

In [2]:
# Run the preprocessing script inside this notebook's namespace. The cells
# below use names they never define themselves (presumably `X`, `y` and `np`
# come from this script -- confirm against ML_Challenge_data_preprocessing.py).
execfile('ML_Challenge_data_preprocessing.py')

('Generated X with shape ', (347698, 1000))
('Generated y with shape ', (347698,))


What is the baseline for evaluating a classifier? Always guessing the most common class: that scores an accuracy equal to that class's share of the data, so we start by measuring the promoter / enhancer split.

In [3]:
# y : numpy array of binary labels (promoter = 0, enhancer = 1)
# Count the promoters with a single vectorized comparison instead of calling
# a Python lambda once per label.
num_promoters = int((y == 0).sum())
print('Number of promoters: ' + str(num_promoters))
# NOTE: despite the label, the value printed below is the fraction (not a
# percentage) of promoters among all labels in y; no test split exists yet.
print('Percentage of test data: ' + str(float(num_promoters)/y.shape[0]))
# So we are looking to achieve accuracy above 0.79

Number of promoters: 274299
Percentage of test data: 0.788900137476


We have categorical independent variables, so we convert them to binary dummy variables. The result is a feature vector of length 5000 (1000 sequence positions × 5 symbols), where each variable indicates the presence of one of the base pairs 'A', 'T', 'C', 'G' or 'N' at one position. To read the 5000-element array: for a feature index `i`, the sequence position is `i / 5` (integer division) and `i % 5` is the index into the array of base pairs.

In [73]:
# One-hot encode the first `sample_size` sequences: every sequence position
# is expanded into `num_pair_types` indicator columns.
num_pair_types = 5 # ATCGN
sample_size = 10000
seq_length = X.shape[1]
X_cat = np.zeros((sample_size, seq_length*num_pair_types))
print(X_cat.shape)
for row in range(sample_size):
    # hack because some sequences have 'N' and some do not: an 'N' is appended
    # so that it is always among the encoded categories, and the extra trailing
    # row this produces is sliced off again before flattening.
    padded = np.append(X[row], 'N')
    X_cat[row] = categorical(padded, drop = True)[0:seq_length].flatten()

print(len(X_cat[0]))
# Labels belonging to the same leading rows that were encoded above.
y_sample = y[0:sample_size]

(10000, 5000)
5000


In [74]:
# Hold out `test_size` of the encoded sample for evaluation; random_state is
# pinned so the same split is produced on every run.
test_size = 0.1
split_parts = cross_validation.train_test_split(
    X_cat, y_sample, test_size=test_size, random_state=0)
X_train, X_test, y_train, y_test = split_parts
for part in (X_train, y_train):
    print(part.shape)

(9000, 5000)
(9000,)


# LOGISTIC REGRESSION

In [76]:
# Fit a logistic regression on the training split and report how long it took.
# `time` is already imported in the first cell of the notebook.
start_time = time.clock()
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
end_time = time.clock()

train_fraction = 1 - test_size
elapsed = end_time - start_time
print('time spent to train ' + str(train_fraction) + ': ' + str(elapsed) + ' seconds.')

time spent to train 0.9: 137.447075 seconds.


In [78]:
# Score on the whole held-out split. For a quicker check, slice both arrays
# identically (e.g. [0:1000]) when binding the two names below.
smaller_X_test, smaller_y_test = X_test, y_test

logreg.score(smaller_X_test, smaller_y_test)

0.83733103249928098

# SUPPORT VECTOR MACHINE

In [30]:
from sklearn.svm import SVC

# Fit an RBF-kernel SVM on the training split; the timed region covers
# construction, fitting and printing the fitted estimator.
# rbf seems best so far achieving up to 0.88
start_time = time.clock()
clf = SVC(kernel = 'rbf')
print(clf.fit(X_train, y_train))
end_time = time.clock()

train_fraction = 1 - test_size
elapsed = end_time - start_time
print('time spent to train ' + str(train_fraction) + ': ' + str(elapsed) + ' seconds.')

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
time spent to train 0.9: 148.402174 seconds.


In [32]:
# Mean accuracy on the held-out split for whatever estimator `clf` currently
# holds. `clf` is rebound by the KNN and gradient-boosting cells further down,
# so run this directly after the SVC fit to score the SVC.
clf.score(X_test, y_test)

0.88

In [None]:
# K-NEAREST NEIGHBORS

In [39]:
from sklearn import neighbors

# Sweep the neighbor count for a distance-weighted KNN classifier, recording
# the held-out accuracy for each k.
max_neighbors=42
inc_by = 3
# starting from 2 - 20 neighbors went approximately linearly from 0.83 to 0.877
num_neighbors = range(20,max_neighbors,inc_by)
accuracies = []
for k in num_neighbors:
    clf = neighbors.KNeighborsClassifier(k, weights='distance')
    clf.fit(X_train, y_train)
    # Only the scoring call is timed; fitting happens before the clock starts.
    score_start = time.clock()
    score = clf.score(X_test, y_test)
    score_elapsed = time.clock() - score_start
    accuracies.append(score)
    print('Accuracy for ' + str(k) + ' neighbors: ' + str(score))
    print('Time spent calculating neighbors: ' + str(score_elapsed))
    print('')

Accuracy for 20 neighbors: 0.877
Time spent calculating neighbors: 73.462508

Accuracy for 23 neighbors: 0.881
Time spent calculating neighbors: 74.32444

Accuracy for 26 neighbors: 0.876
Time spent calculating neighbors: 72.982404

Accuracy for 29 neighbors: 0.877
Time spent calculating neighbors: 73.476836

Accuracy for 32 neighbors: 0.879
Time spent calculating neighbors: 73.740198

Accuracy for 35 neighbors: 0.877
Time spent calculating neighbors: 67.984419

Accuracy for 38 neighbors: 0.881
Time spent calculating neighbors: 71.111214

Accuracy for 41 neighbors: 0.881
Time spent calculating neighbors: 73.089661



KNN plateaus at ~23 neighbors (0.88 for sample size of 5000, trained on 90% of it)

# GRADIENT BOOSTING CLASSIFIER

In [80]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit a gradient-boosted tree ensemble on the training split; the timed
# region covers construction, fitting and printing the fitted estimator.
start_time = time.clock()
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=10,
    random_state=0)
print(clf.fit(X_train, y_train))
end_time = time.clock()

train_fraction = 1 - test_size
elapsed = end_time - start_time
print('time spent to train ' + str(train_fraction) + ': ' + str(elapsed) + ' seconds.')

GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',
              max_depth=10, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=0, subsample=1.0, verbose=0, warm_start=False)
time spent to train 0.9: 1751.570423 seconds.


In [82]:
# Mean accuracy on the held-out split for whatever estimator `clf` currently
# holds. `clf` is also assigned by the SVC and KNN cells above, so run this
# directly after the gradient-boosting fit to score that model.
clf.score(X_test, y_test)

0.86299999999999999

## TODO: 

* Try different values for C in SVC