In [1]:
from sklearn import cross_validation
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from time import time
import csv

execfile('report.py')

In [2]:
execfile('ML_Challenge_data_preprocessing.py')

('Generated X with shape ', (347698, 1000))
('Generated y with shape ', (347698,))


In [3]:
bigrams = []
with open('possible_bigrams.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        bigrams = row

bigrams.sort()
bigrams_dict = {key: i for i, key in enumerate(bigrams)}
bigrams_dict

{'AA': 0,
 'AC': 1,
 'AG': 2,
 'AT': 3,
 'CA': 4,
 'CC': 5,
 'CG': 6,
 'CT': 7,
 'GA': 8,
 'GC': 9,
 'GG': 10,
 'GT': 11,
 'TA': 12,
 'TC': 13,
 'TG': 14,
 'TT': 15}

In [4]:
X2 = []
y2 = []
for i, x in enumerate(X):
    if 'N' not in X[i,:]:
        X2.append(x)
        y2.append(y[i])
        
X2 = np.array(X2)
y2 = np.array(y2)
print X2.shape
print y2.shape

(347683, 1000)
(347683,)


In [29]:
execfile('ngram_features.py')
# for every observation, create a new feature vector or length len(bigrams_dict.keys())

sample_size = 20000#X.shape[0])

random_sample_idcs = np.random.choice(X2.shape[0], sample_size)
X_sample = np.zeros((sample_size, len(bigrams_dict)))
y_sample = np.zeros((sample_size))

start = time()

for i in range(sample_size):
    rand_idx = random_sample_idcs[i]    
    X_sample[i,:] = ngram_features(X2[rand_idx,:], 2, bigrams_dict)
    y_sample[i] = y2[rand_idx]

finish = time()
print 'time spent creating new features for {0} observations: {1}'.format(sample_size, finish - start)
# will take about 11 minutes for the whole data set
print X_sample.shape
print y_sample.shape

time spent creating new features for 20000 observations: 37.8225741386
(20000, 16)
(20000,)


# Logistic Regression

In [30]:
param_dist = {"penalty": ['l1','l2'],
              "C": [0.1,0.5,1.0,100,1000],
              "solver": ['newton-cg', 'lbfgs', 'liblinear']}

clf = linear_model.LogisticRegression()
random_search = GridSearchCV(clf, param_grid=param_dist, cv=10)
start = time()
random_search.fit(X_sample, y_sample)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 67.07 seconds for 30 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.851 (std: 0.008)
Parameters: {'penalty': 'l1', 'C': 100, 'solver': 'lbfgs'}

Model with rank: 2
Mean validation score: 0.851 (std: 0.008)
Parameters: {'penalty': 'l2', 'C': 100, 'solver': 'lbfgs'}

Model with rank: 3
Mean validation score: 0.851 (std: 0.008)
Parameters: {'penalty': 'l1', 'C': 1000, 'solver': 'lbfgs'}



# SUPPORT VECTOR MACHINE - Linear

In [31]:
from sklearn.svm import LinearSVC

param_dist = {"penalty": ['l1','l2'], # 'l1' penalty not available with hinge loss
              "loss": ["squared_hinge"],
              "dual": [False],
              "C": [0.1,0.5,1.0,100,1000]}

clf = LinearSVC()
random_search = GridSearchCV(clf, param_grid=param_dist, cv=10)
start = time()
random_search.fit(X_sample, y_sample)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 119.50 seconds for 30 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.850 (std: 0.006)
Parameters: {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 1.0, 'dual': False}

Model with rank: 2
Mean validation score: 0.850 (std: 0.006)
Parameters: {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 0.1, 'dual': False}

Model with rank: 3
Mean validation score: 0.850 (std: 0.006)
Parameters: {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 1000, 'dual': False}



# K Nearest Neighbors

In [33]:
from sklearn import neighbors

param_dist = {"n_neighbors": range(3,9,3),
              "weights": ["uniform", "distance"],
              "algorithm": ["ball_tree", "kd_tree"],
              "leaf_size": [2,5,30],
              "p": [1,2]}

clf = neighbors.KNeighborsClassifier()
random_search = GridSearchCV(clf, param_grid=param_dist, cv=10)
start = time()
random_search.fit(X_sample, y_sample)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 394.45 seconds for 30 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.846 (std: 0.008)
Parameters: {'n_neighbors': 6, 'weights': 'distance', 'leaf_size': 2, 'algorithm': 'kd_tree', 'p': 1}

Model with rank: 2
Mean validation score: 0.846 (std: 0.008)
Parameters: {'n_neighbors': 6, 'weights': 'distance', 'leaf_size': 2, 'algorithm': 'ball_tree', 'p': 1}

Model with rank: 3
Mean validation score: 0.846 (std: 0.008)
Parameters: {'n_neighbors': 6, 'weights': 'distance', 'leaf_size': 30, 'algorithm': 'ball_tree', 'p': 1}



# Adaboost

In [34]:
from sklearn.ensemble import AdaBoostClassifier

param_dist = {"n_estimators": range(2,102,102),
              "learning_rate": np.arange(0.1,1.0,0.1)}

clf = AdaBoostClassifier()
random_search = GridSearchCV(clf, param_grid=param_dist, cv=10)
start = time()
random_search.fit(X_sample, y_sample)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 3.25 seconds for 30 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.835 (std: 0.008)
Parameters: {'n_estimators': 2, 'learning_rate': 0.10000000000000001}

Model with rank: 2
Mean validation score: 0.835 (std: 0.008)
Parameters: {'n_estimators': 2, 'learning_rate': 0.20000000000000001}

Model with rank: 3
Mean validation score: 0.835 (std: 0.008)
Parameters: {'n_estimators': 2, 'learning_rate': 0.30000000000000004}



# Final Classifier

In [64]:
X_new = np.zeros((X2.shape[0], len(bigrams_dict)))
start = time()

for i in range(X2.shape[0]):
    X_new[i,:] = ngram_features(X2[i,:], 2, bigrams_dict)

finish = time()
print('time spent creating new features for {0} observations: {1}'.format(X_new.shape[0], finish - start))
# will take about 11 minutes for the whole data set
print(X_new.shape)
print(y2.shape)

time spent creating new features for 347683 observations: 667.224198103
(347683, 16)
(347683,)


In [65]:
start = time()
clf = linear_model.LogisticRegression(penalty = 'l2', C = 100, solver = 'lbfgs')
scores = cross_validation.cross_val_score(clf, X_new, y2, cv=10)
finish = time()

print("Total time: " + str(finish-start))

Total time: 22.7128388882


In [66]:
np.mean(scores)

0.8515719147846571

In [84]:
clf.fit(X_new,y2)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0)

In [100]:
from tabulate import tabulate

coeffs = list(clf.coef_[0])
coeffs_zipped = zip(list(bigrams_dict.keys()), map(lambda x: round(x, 3), coeffs))
print(tabulate(coeffs_zipped, headers=['Bigram', 'Coefficient'], tablefmt="pipe"))

| Bigram   |   Coefficient |
|:---------|--------------:|
| AA       |        -0.002 |
| AC       |        -0.021 |
| GT       |        -0.02  |
| AG       |        -0.004 |
| CC       |         0.03  |
| CA       |         0.003 |
| CG       |        -0.068 |
| TT       |        -0.015 |
| GG       |         0.004 |
| GC       |         0.006 |
| AT       |         0.004 |
| GA       |        -0.019 |
| TG       |         0.023 |
| TA       |        -0.002 |
| TC       |         0.026 |
| CT       |        -0.002 |
