In [1]:
from sklearn import cross_validation
from scikits.statsmodels.tools import categorical
from sklearn import linear_model
import time

In [73]:
execfile('ML_Challenge_data_preprocessing.py')

('Generated X with shape ', (347698, 1000))
('Generated y with shape ', (347698,))


Remove observations with unknown nucleobases.

In [74]:
X2 = []
y2 = []
for i, x in enumerate(X):
    if 'N' not in X[i,:]:
        X2.append(x)
        y2.append(y[i])
        
X2 = np.array(X2)
y2 = np.array(y2)
print X2.shape
print y2.shape

(347683, 1000)
(347683,)


**Categorical independent variables are converted to binary variables.** The result is a feature vector of length 4000, where each variable represents the presence of the base nucleobases 'A', 'T', 'C', 'G'. The 4000 length array can be interpreted as the sequence item is the index / 5 and the modulo of the index with 5 represents the base pair as the index into the array of the base pairs.

In [75]:
num_pair_types = 4 # ATCG
sample_size = 10000
random_sample_idcs = np.random.choice(X2.shape[0], sample_size)
X_sample = np.zeros((sample_size, X2.shape[1]*num_pair_types))
y_sample = np.zeros((sample_size))

for i in range(sample_size):
    rand_idx = random_sample_idcs[i]
    one_hot = categorical(X2[rand_idx,:], drop = True)
    one_hot = one_hot.flatten()
    X_sample[i,:] = one_hot
    y_sample[i] = y2[rand_idx]

print X_sample.shape
print y_sample.shape

(10000, 4000)
(10000,)


In [76]:
test_size = 0.1
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_sample, y_sample, test_size=test_size, random_state=0)
print X_train.shape
print y_train.shape

(9000, 4000)
(9000,)


# LOGISTIC REGRESSION

In [77]:
import time

start_time = time.clock()
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

time spent to train 0.9: 13.840786 seconds.


In [78]:
logreg.score(X_test, y_test)

0.77700000000000002

# SUPPORT VECTOR MACHINE - RBF

In [79]:
from sklearn.svm import SVC

start_time = time.clock()
clf = SVC(kernel = 'rbf')
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
time spent to train 0.9: 168.742586 seconds.


In [80]:
clf.score(X_test, y_test)

0.82499999999999996

# SUPPORT VECTOR MACHINE - LINEAR

In [81]:
from sklearn.svm import SVC

start_time = time.clock()
clf = SVC(kernel = 'linear')
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
time spent to train 0.9: 170.224178 seconds.


In [82]:
clf.score(X_test, y_test)

0.78000000000000003

# K-NEAREST NEIGHBORS

In [85]:
from sklearn import neighbors

max_neighbors=42
inc_by = 3
num_neighbors = range(3,max_neighbors,inc_by)

accuracies = []
for k in num_neighbors:
    clf = neighbors.KNeighborsClassifier(k)
    clf.fit(X_train, y_train)
    start_fit = time.clock()
    score = clf.score(X_test, y_test)
    finish_fit = time.clock()
    accuracies.append(score)    
    print 'Accuracy for ' + str(k) + ' neighbors: ' + str(score)
    print 'Time spent calculating neighbors: ' + str(finish_fit - start_fit)
    print ''

Accuracy for 3 neighbors: 0.822
Time spent calculating neighbors: 62.200299

Accuracy for 6 neighbors: 0.843
Time spent calculating neighbors: 60.113138

Accuracy for 9 neighbors: 0.84
Time spent calculating neighbors: 60.016146

Accuracy for 12 neighbors: 0.829
Time spent calculating neighbors: 60.20239

Accuracy for 15 neighbors: 0.833
Time spent calculating neighbors: 60.547729

Accuracy for 18 neighbors: 0.827
Time spent calculating neighbors: 60.268797

Accuracy for 21 neighbors: 0.837
Time spent calculating neighbors: 60.953661

Accuracy for 24 neighbors: 0.832
Time spent calculating neighbors: 63.05509

Accuracy for 27 neighbors: 0.839
Time spent calculating neighbors: 61.980521

Accuracy for 30 neighbors: 0.832
Time spent calculating neighbors: 62.111088

Accuracy for 33 neighbors: 0.832
Time spent calculating neighbors: 61.995101

Accuracy for 36 neighbors: 0.834
Time spent calculating neighbors: 61.833016

Accuracy for 39 neighbors: 0.832
Time spent calculating neighbors: 61.

# GRADIENT BOOSTING CLASSIFIER

In [86]:
from sklearn.ensemble import GradientBoostingClassifier 

start_time = time.clock()
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2)
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',
              max_depth=2, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=0, subsample=1.0, verbose=0, warm_start=False)
time spent to train 0.9: 239.749799 seconds.


In [87]:
clf.score(X_test, y_test)

0.78700000000000003

# DECISION TREE CLASSIFIER

In [88]:
from sklearn.tree import DecisionTreeClassifier

start_time = time.clock()
clf = DecisionTreeClassifier(max_depth=5)
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')
time spent to train 0.9: 4.589201 seconds.


In [89]:
clf.score(X_test, y_test)

0.78300000000000003

# ADABOOST

In [90]:
from sklearn.ensemble import AdaBoostClassifier 

start_time = time.clock()
clf = AdaBoostClassifier()
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
time spent to train 0.9: 45.955662 seconds.


In [91]:
clf.score(X_test, y_test)

0.79600000000000004

# RANDOM FOREST

In [92]:
from sklearn.ensemble import RandomForestClassifier 

start_time = time.clock()
clf = RandomForestClassifier(n_estimators=10)
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
time spent to train 0.9: 1.019835 seconds.


In [93]:
clf.score(X_test, y_test)

0.80000000000000004

# NAIVE BAYES

In [94]:
from sklearn.naive_bayes import GaussianNB

start_time = time.clock()
clf = GaussianNB()
print clf.fit(X_train, y_train)
end_time = time.clock()

print 'time spent to train ' + str(1 - test_size) + ': ' + str(end_time - start_time) + ' seconds.'

GaussianNB()
time spent to train 0.9: 0.588723 seconds.


In [95]:
clf.score(X_test, y_test)

0.81200000000000006