In [1]:
from sklearn import cross_validation
from scikits.statsmodels.tools import categorical
from sklearn import linear_model
import time

In [2]:
# Run the preprocessing script; per its printed output below it produces the
# feature matrix X and the label vector y used by the rest of the notebook.
# NOTE(review): later cells use `np` without importing it in this notebook --
# presumably the script imports numpy as np; confirm.
execfile('ML_Challenge_data_preprocessing.py')

('Generated X with shape ', (347698, 1000))
('Generated y with shape ', (347698,))


Remove observations with unknown nucleobases.

In [34]:
X2 = []
y2 = []
for i, x in enumerate(X):
    if 'N' not in X[i,:]:
        X2.append(x)
        y2.append(y[i])
        
X2 = np.array(X2)
y2 = np.array(y2)
print X2.shape
print y2.shape

(347683, 1000)
(347683,)


**Categorical independent variables are converted to binary variables.** The result is a feature vector of length 4000 (1000 sequence positions × 4 bases), where each variable indicates the presence of one of the nucleobases 'A', 'T', 'C', 'G' at one position. The 4000-length array can be interpreted as follows: the sequence position is the index divided by 4 (integer division), and the index modulo 4 identifies the base, as an index into the array of bases.

In [35]:
num_pair_types = 4 # ATCG
sample_size = 10000
random_sample_idcs = np.random.choice(X2.shape[0], sample_size)
X_sample = np.zeros((sample_size, X2.shape[1]*num_pair_types))
y_sample = np.zeros((sample_size))

for i in range(sample_size):
    rand_idx = random_sample_idcs[i]
    one_hot = categorical(X2[i,:], drop = True)
    one_hot = one_hot.flatten()
    X_sample[i,:] = one_hot
    y_sample[i] = y2[i]

print X_sample.shape
print y_sample.shape

(10000, 4000)
(10000,)


In [36]:
test_size = 0.1
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_sample, y_sample, test_size=test_size, random_state=0)
print X_train.shape
print y_train.shape

(9000, 4000)
(9000,)


# LOGISTIC REGRESSION

In [37]:
import time

start_time = time.clock()
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

time spent to train 0.9: 5.461551 seconds.


In [38]:
# Score the fitted logistic-regression model on the held-out test split.
logreg.score(X_test, y_test)

0.84099999999999997

# SUPPORT VECTOR MACHINE - RBF

In [39]:
from sklearn.svm import SVC

start_time = time.clock()
clf = SVC(kernel = 'rbf')
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
time spent to train 0.9: 110.086111 seconds.


In [40]:
# Score the classifier fitted in the previous cell on the held-out test split.
clf.score(X_test, y_test)

0.88100000000000001

# SUPPORT VECTOR MACHINE - LINEAR

In [41]:
from sklearn.svm import SVC

start_time = time.clock()
clf = SVC(kernel = 'linear')
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
time spent to train 0.9: 97.833125 seconds.


In [42]:
# Score the classifier fitted in the previous cell on the held-out test split.
clf.score(X_test, y_test)

0.83499999999999996

# K-NEAREST NEIGHBORS

In [50]:
from sklearn import neighbors

max_neighbors=42
inc_by = 3
# starting from 2 - 20 neighbors went approximately linearly from 0.83 to 0.877
num_neighbors = range(3,max_neighbors,inc_by)

accuracies = []
for k in num_neighbors:
    clf = neighbors.KNeighborsClassifier(n_neighbors=k)#, weights='distance')
    clf.fit(X_train, y_train)
    start_fit = time.clock()
    score = clf.score(X_test, y_test)
    finish_fit = time.clock()
    accuracies.append(score)    
    print 'Accuracy for ' + str(k) + ' neighbors: ' + str(score)
    print 'Time spent calculating neighbors: ' + str(finish_fit - start_fit)
    print ''

Accuracy for 3 neighbors: 0.867
Time spent calculating neighbors: 55.09233

Accuracy for 6 neighbors: 0.863
Time spent calculating neighbors: 54.528943

Accuracy for 9 neighbors: 0.87
Time spent calculating neighbors: 54.589169

Accuracy for 12 neighbors: 0.872
Time spent calculating neighbors: 54.710428

Accuracy for 15 neighbors: 0.872
Time spent calculating neighbors: 54.57681

Accuracy for 18 neighbors: 0.87
Time spent calculating neighbors: 60.997336

Accuracy for 21 neighbors: 0.87
Time spent calculating neighbors: 65.221036

Accuracy for 24 neighbors: 0.865
Time spent calculating neighbors: 64.83859

Accuracy for 27 neighbors: 0.873
Time spent calculating neighbors: 65.23259

Accuracy for 30 neighbors: 0.872
Time spent calculating neighbors: 65.061446

Accuracy for 33 neighbors: 0.875
Time spent calculating neighbors: 65.0561

Accuracy for 36 neighbors: 0.87
Time spent calculating neighbors: 65.190548

Accuracy for 39 neighbors: 0.873
Time spent calculating neighbors: 65.218825


KNN plateaus at ~23 neighbors (accuracy of 0.88 for a sample size of 5000, trained on 90% of the sample).

# GRADIENT BOOSTING CLASSIFIER

In [54]:
from sklearn.ensemble import GradientBoostingClassifier 

start_time = time.clock()
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',
              max_depth=2, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=0, subsample=1.0, verbose=0, warm_start=False)
time spent to train 0.9: 249.693189 seconds.


In [55]:
# Score the classifier fitted in the previous cell on the held-out test split.
clf.score(X_test, y_test)

0.86499999999999999

# ADABOOST

In [56]:
from sklearn.ensemble import AdaBoostClassifier 

start_time = time.clock()
clf = AdaBoostClassifier()
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
time spent to train 0.9: 43.809017 seconds.


In [57]:
# Score the classifier fitted in the previous cell on the held-out test split.
clf.score(X_test, y_test)

0.85299999999999998

# RANDOM FOREST

In [62]:
from sklearn.ensemble import RandomForestClassifier 

start_time = time.clock()
clf = RandomForestClassifier(n_estimators=10)
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
time spent to train 0.9: 0.997818 seconds.


In [63]:
# Score the classifier fitted in the previous cell on the held-out test split.
clf.score(X_test, y_test)

0.85899999999999999