In [1]:
from sklearn import cross_validation
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from time import time
import csv

execfile('report.py')

In [2]:
execfile('ML_Challenge_data_preprocessing.py')

('Generated X with shape ', (347698, 1000))
('Generated y with shape ', (347698,))


In [3]:
bigrams = []
with open('possible_bigrams.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        bigrams = row

bigrams.sort()
bigrams_dict = {key: i for i, key in enumerate(bigrams)}
bigrams_dict

{'AA': 0,
 'AC': 1,
 'AG': 2,
 'AT': 3,
 'CA': 4,
 'CC': 5,
 'CG': 6,
 'CT': 7,
 'GA': 8,
 'GC': 9,
 'GG': 10,
 'GT': 11,
 'TA': 12,
 'TC': 13,
 'TG': 14,
 'TT': 15}

In [4]:
X2 = []
y2 = []
for i, x in enumerate(X):
    if 'N' not in X[i,:]:
        X2.append(x)
        y2.append(y[i])
        
X2 = np.array(X2)
y2 = np.array(y2)
print X2.shape
print y2.shape

(347683, 1000)
(347683,)


In [29]:
execfile('ngram_features.py')
# for every observation, create a new feature vector or length len(bigrams_dict.keys())

sample_size = 20000#X.shape[0])

random_sample_idcs = np.random.choice(X2.shape[0], sample_size)
X_sample = np.zeros((sample_size, len(bigrams_dict)))
y_sample = np.zeros((sample_size))

start = time()

for i in range(sample_size):
    rand_idx = random_sample_idcs[i]    
    X_sample[i,:] = ngram_features(X2[rand_idx,:], 2, bigrams_dict)
    y_sample[i] = y2[rand_idx]

finish = time()
print 'time spent creating new features for {0} observations: {1}'.format(sample_size, finish - start)
# will take about 11 minutes for the whole data set
print X_sample.shape
print y_sample.shape

time spent creating new features for 20000 observations: 37.8225741386
(20000, 16)
(20000,)


# Logistic Regression

In [30]:
param_dist = {"penalty": ['l1','l2'],
              "C": [0.1,0.5,1.0,100,1000],
              "solver": ['newton-cg', 'lbfgs', 'liblinear']}

clf = linear_model.LogisticRegression()
random_search = GridSearchCV(clf, param_grid=param_dist, cv=10)
start = time()
random_search.fit(X_sample, y_sample)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 67.07 seconds for 30 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.851 (std: 0.008)
Parameters: {'penalty': 'l1', 'C': 100, 'solver': 'lbfgs'}

Model with rank: 2
Mean validation score: 0.851 (std: 0.008)
Parameters: {'penalty': 'l2', 'C': 100, 'solver': 'lbfgs'}

Model with rank: 3
Mean validation score: 0.851 (std: 0.008)
Parameters: {'penalty': 'l1', 'C': 1000, 'solver': 'lbfgs'}



# SUPPORT VECTOR MACHINE - Linear

In [31]:
from sklearn.svm import LinearSVC

param_dist = {"penalty": ['l1','l2'], # 'l1' penalty not available with hinge loss
              "loss": ["squared_hinge"],
              "dual": [False],
              "C": [0.1,0.5,1.0,100,1000]}

clf = LinearSVC()
random_search = GridSearchCV(clf, param_grid=param_dist, cv=10)
start = time()
random_search.fit(X_sample, y_sample)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 119.50 seconds for 30 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.850 (std: 0.006)
Parameters: {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 1.0, 'dual': False}

Model with rank: 2
Mean validation score: 0.850 (std: 0.006)
Parameters: {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 0.1, 'dual': False}

Model with rank: 3
Mean validation score: 0.850 (std: 0.006)
Parameters: {'penalty': 'l2', 'loss': 'squared_hinge', 'C': 1000, 'dual': False}



# K Nearest Neighbors

In [None]:
from sklearn import neighbors

param_dist = {"n_neighbors": range(3,21,3), # 'l1' penalty not available with hinge loss
              "weights": ["uniform", "distance"],
              "algorithm": ["ball_tree", "kd_tree"],
              "leaf_size": [2,5,30,100],
              "p": [1,2,3]}

clf = neighbors.KNeighborsClassifier()
random_search = GridSearchCV(clf, param_grid=param_dist, cv=10)
start = time()
random_search.fit(X_sample, y_sample)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

# Adaboost

In [25]:
from sklearn.ensemble import AdaBoostClassifier

param_dist = {"n_estimators": range(2,102,102),
              "learning_rate": np.arange(0.1,1.0,0.1)}

clf = AdaBoostClassifier()
random_search = GridSearchCV(clf, param_grid=param_dist, cv=10)
start = time()
random_search.fit(X_sample, y_sample)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

RandomizedSearchCV took 1.59 seconds for 30 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.848 (std: 0.000)
Parameters: {'n_estimators': 2, 'learning_rate': 0.10000000000000001}

Model with rank: 2
Mean validation score: 0.848 (std: 0.000)
Parameters: {'n_estimators': 2, 'learning_rate': 0.20000000000000001}

Model with rank: 3
Mean validation score: 0.848 (std: 0.000)
Parameters: {'n_estimators': 2, 'learning_rate': 0.30000000000000004}

