In [1]:
%pylab

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
import json
import pandas
import sklearn as sk
from sklearn import *
from pandas.tools.plotting import *
import pydot
import io



### Helper functions

In [3]:
def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def rules(clf, features, labels, node_index=0):
    """Structure of rules in a fit decision tree classifier

    Parameters
    ----------
    clf : DecisionTreeClassifier
        A tree that has already been fit.

    features, labels : lists of str
        The names of the features and labels, respectively.

    """
    node = {}
    if clf.tree_.children_left[node_index] == -1:  # indicates leaf
        count_labels = zip(clf.tree_.value[node_index, 0], labels)
        node['name'] = ', '.join(('{} of {}'.format(int(count), label)
                                  for count, label in count_labels))
    else:
        feature = features[clf.tree_.feature[node_index]]
        threshold = clf.tree_.threshold[node_index]
        node['name'] = '{} > {}'.format(feature, threshold)
        left_index = clf.tree_.children_left[node_index]
        right_index = clf.tree_.children_right[node_index]
        node['children'] = [rules(clf, features, labels, right_index),
                            rules(clf, features, labels, left_index)]
    return node

def log_progress(sequence, every=None, size=None):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = size / 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')
        
def cvAdvanced(clf,X,Y,**kwargs):
    fscores_min = []
    precision_min = []
    recall_min = []
    roc_min = []
    
    classification = not kwargs.get("regression")

    mistake_counts = np.zeros(len(Y))

    for size in log_progress( np.linspace(2000,len(Y),50) ):
        size = int(size)
        ids = np.arange(len(Y))
        np.random.shuffle(ids)
        ids = ids[:size]
        Y_base = Y[ids]
        X_base = X[ids]

        fold = sk.cross_validation.StratifiedKFold(Y_base,n_folds=3,shuffle=True,random_state=42)
        fscores = []
        precisions = []
        recalls = []
        rocs = []
        for train_ids, test_ids in fold:
            X_train = X_base[train_ids]
            Y_train = Y_base[train_ids]
            X_test = X_base[test_ids]
            Y_test = Y_base[test_ids]

            clf.fit(X_train,Y_train)

            Y_pred = clf.predict(X_test)

            if (classification):
                fscore = sk.metrics.f1_score(Y_test,Y_pred)
                precision = sk.metrics.precision_score(Y_test,Y_pred)
                recall = sk.metrics.recall_score(Y_test,Y_pred)
            roc_auc = sk.metrics.roc_auc_score(Y_test,Y_pred) #clf.score(X_test,Y_test)

            mistake_counts[ ids[ test_ids[Y_test != Y_pred] ] ] += 1 
            if (classification):
                fscores.append(fscore)
                precisions.append(precision)
                recalls.append(recall)
            rocs.append(roc_auc)

        if (classification):
            fscores_min.append( min(fscores) )
            precision_min.append( min(precisions) )
            recall_min.append( min(recalls) )
        roc_min.append(min(rocs))

    if (classification):
        fscores_min = np.array(fscores_min)
        fscores_min = np.mean(rolling_window(fscores_min, 2), -1)

        precision_min = np.array(precision_min)
        precision_min = np.mean(rolling_window(precision_min, 2), -1)

        recall_min = np.array(recall_min)
        recall_min = np.mean(rolling_window(recall_min, 2), -1)

    roc_min = np.array(roc_min)
    roc_min = np.mean(rolling_window(roc_min,2),-1)

    if (classification):
        print("f score:",fscores[-1])
        print("precision:",precision_min[-1])
        print("recall:",recall_min[-1])
        print("roc :",roc_min[-1])
        return fscores_min, precision_min, recall_min, roc_min, mistake_counts
    else:
        print("score: ",roc_min[-1])
        return None, None, None, roc_min, mistake_counts

# Connections

### Load data

In [8]:
datafiles_number = 1
cons = pandas.DataFrame()
for i in range(datafiles_number):
    new_cons = pandas.read_csv("../data/connections_dataset_"+str(i)+".csv")
    cons = cons.append( new_cons, ignore_index=True )

print( cons.columns.values )
#hist(cons['angle1'],bins=50)

['Y' 'x1' 'y1' 'x2' 'y2' 'angle1' 'angle2' 'distance' 'score'
 'cons_score_max' 'cons_score_mean' 'cons_curMax' 'inters_score_max'
 'inters_score_mean']


In [9]:
cons.describe()

Unnamed: 0,Y,x1,y1,x2,y2,angle1,angle2,distance,score,cons_score_max,cons_score_mean,cons_curMax,inters_score_max,inters_score_mean
count,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0,6373.0
mean,0.069983,1768.536361,1028.387907,1770.88758,1030.098229,-0.633503,-0.688845,65.908688,-1.173142,-3.193156,-1.167114,0.074376,-278.679215,-0.69144
std,0.255138,950.658092,1162.402927,950.505805,1162.421872,1.159902,1.181231,53.017305,1.594379,67.405287,0.593001,0.262403,449.081614,0.803545
min,0.0,70.0,-1573.8,70.1,-1573.8,-3.0,-3.0,0.0,-5.626827,-1000.0,-5.037965,0.0,-1000.0,-5.456238
25%,0.0,838.376983,22.2,830.7,25.157834,-1.513748,-1.644391,26.352538,-2.040231,1.220146,-1.533506,0.0,-1000.0,-1.164366
50%,0.0,1853.728061,1090.398374,1871.904998,1102.035125,-0.463744,-0.5358,49.546711,-1.332919,1.655857,-1.139492,0.0,0.708066,-0.540388
75%,0.0,2542.384784,1961.184131,2548.865973,1962.697074,0.357455,0.349809,91.511841,-0.014756,1.902973,-0.762315,0.0,1.446857,0.0
max,1.0,3547.0,2936.6,3547.0,2936.6,1.0,1.0,288.773146,1.9665,1.9665,1.169223,1.0,1.953001,1.407033


In [None]:
#Scatter plot all data
scatter_matrix(cons, alpha=0.2, figsize=(6, 6), diagonal='kde')
show()

In [None]:
#Group by Y and scatter
scatter_matrix(cons.ix[ cons.ix[:,0]==1, 1 : ], alpha=0.2, figsize=(8,8), diagonal='kde')
show()
scatter_matrix(cons.ix[ cons.ix[:,0]==0, 1 : ], alpha=0.2, figsize=(8,8), diagonal='kde')
show()


### Format data

In [33]:
cons.columns.values

array(['Y', 'x1', 'y1', 'x2', 'y2', 'angle1', 'angle2', 'distance',
       'score', 'cons_score_max', 'cons_score_mean', 'cons_curMax',
       'inters_score_max', 'inters_score_mean'], dtype=object)

In [87]:
# 1 angle1 0.141016591082
# 2 angle2 0.140071976028
# 3 distance -0.00104193567356
# 4 DTS 0.000173396848109
# 5 I_COUNT -0.00334494362288
# 6 IsWithingEdge -0.0969711886275
# 7 parallel_score 0.122432937273
# 8 score 0.0859794673862

#             features[cons_score_max] =  -1000;
#             features[cons_score_mean] = 0;
#             features[cons_curMax] = 0;
#             features[inters_score_max] = -1000;
#             features[inters_score_mean] = 0;
            
feature_names = ['angle1', 'angle2', 'distance', 'score', 'cons_score_max', 'cons_score_mean',
       'cons_curMax', 'inters_score_max',
       'inters_score_mean',]

#feature_names = ['angle1', 'angle2', 'distance']

X = np.array(cons[feature_names],dtype=float)
Y = np.array(cons['Y'],dtype=float)

print(len(feature_names))
print(X.shape)
print(Y.shape)

9
(6065, 9)
(6065,)


In [59]:
X

array([[  6.23955986e-01,   8.96401724e-01,   3.62254593e+01, ...,
          1.00000000e+00,  -1.00000000e+03,   0.00000000e+00],
       [  6.30477759e-01,   4.54822462e-01,   9.77668382e+01, ...,
          0.00000000e+00,   1.16284365e+00,   9.90169049e-01],
       [  7.94227837e-01,   9.28247894e-01,   1.91501471e+02, ...,
          0.00000000e+00,   1.19716101e+00,   1.05916637e+00],
       ..., 
       [  4.71662721e-01,  -9.98582767e-02,   1.02089313e+02, ...,
          0.00000000e+00,   1.34625584e+00,   6.83540050e-01],
       [  7.08453513e-01,  -1.71435777e-01,   1.03778790e+02, ...,
          0.00000000e+00,   1.12826982e+00,   4.91295216e-01],
       [  1.00000000e+00,   1.00000000e+00,   1.29349554e+01, ...,
          1.00000000e+00,  -1.00000000e+03,   0.00000000e+00]])

### Generate Draft score

In [203]:
linear_feature_names = ['angle1', 'angle2', 'distance']

X_linear = np.array(cons[linear_feature_names],dtype=float)
Y_linear = np.array(cons['Y'],dtype=float)


es = []
coefs = []
scores = []

for train_ids, test_ids in sk.cross_validation.StratifiedKFold(Y,n_folds=5):

    X_train = X_linear[train_ids]
    Y_train = Y_linear[train_ids]
    X_test = X_linear[test_ids]
    Y_test = Y_linear[test_ids]

    clf = sk.linear_model.LogisticRegression()
    clf.fit(X_train,Y_train)

    #print( 'score: ',clf.score(X_test,Y_test) )

    scores.append(clf.score(X_test,Y_test))
    es.append( clf.intercept_[0] )
    coefs.append(clf.coef_[0])
    
    #print('e:',clf.intercept_[0])
    #for name,weight in zip(linear_feature_names,clf.coef_[0]):
        #print(name,': ',weight,sep='')

#     prop1 = clf.predict_proba([x])[0][0]
#     log_odds = -math.exp((x*clf.coef_).sum()+clf.intercept_[0])
#     odds = exp(log_odds)
#     prob = odds
#     deviation.append(prop1-prob)
    
es = np.array(es)
coefs = np.array(coefs)

print('scores: ',scores)
print('e:',es.mean())
for name,weight in zip(linear_feature_names,coefs.mean(axis=0)):
    print(name,': ',weight,sep='')


scores:  [0.96537510305028851, 0.96537510305028851, 0.95960428689200328, 0.96042868920032975, 0.96372629843363566]
e: -0.550459829065
angle1: 1.62793116524
angle2: 1.88947971323
distance: -0.0630682678325


### Test goodness-of-fit

In [83]:
# fscores_min = []
# precision_min = []
# recall_min = []
# roc_min = []

# mistake_counts = np.zeros(len(Y))

#clf = sk.tree.DecisionTreeClassifier(random_state=42,min_samples_leaf=3,max_depth=2,class_weight='balanced')
clf = sk.ensemble.AdaBoostClassifier(random_state=42,n_estimators=100)
#clf = sk.linear_model.LinearRegression()

fscores_min, precision_min, recall_min, roc_min, mistake_counts = cvAdvanced(clf,X,Y,regression=False)

#print("f score:",fscores_min[-1])
#print("precision:",precision_min[-1])
#print("recall:",recall_min[-1])
print("roc :",roc_min[-1])

f score: 0.832786885246
precision: 0.846773759673
recall: 0.799782733606
roc : 0.883039493823
roc : 0.883039493823


In [84]:
plot(fscores_min,'k')
plot(precision_min,'r')
plot(recall_min,'b')
plot(roc_min,'y')
grid(True)

In [17]:
for name, val in zip( feature_names,clf.coef_):
    print(name,val)

AttributeError: 'RandomForestClassifier' object has no attribute 'coef_'

In [260]:
feature_values = []
for name, val in zip( feature_names,clf.feature_importances_):
    feature_values.append( (name,val) )

feature_values.sort(key=lambda x: x[1])

feature_values

[('angle2', 0.0),
 ('cons_score_mean', 0.0),
 ('cons_curMax', 0.0),
 ('angle1', 0.011237259532150091),
 ('score', 0.077520830298469443),
 ('inters_score_mean', 0.084101313942202896),
 ('cons_score_max', 0.12213004879433871),
 ('inters_score_max', 0.27400836521308325),
 ('distance', 0.43100218221975561)]

In [569]:
plot(roc_min)
grid(True)

In [54]:
fail_threshold = np.percentile(mistake_counts,95)
ok_threshold = np.percentile(mistake_counts,20)

print("fail_threshold: ",fail_threshold)
print("ok_threshold: ",ok_threshold)

path_to_map = "C:/Users/Artyom.Fomenko/maps3d/connections_dataset_0_map.json"

with open(path_to_map,'r') as f:
    MAP = json.load(f)

for isoline in MAP:
    line = np.array( [i for i in zip(isoline['lineString']['xs'],isoline['lineString']['ys'])] )
    plot(*line.T)
    
Y_pred = clf_final.predict(cons[feature_names])
print("pred count: ",Y_pred.sum())

for x1,y1,x2,y2,mistakes,answer,pred in zip(cons['x1'],cons['y1'],cons['x2'],cons['y2'],mistake_counts,Y,Y_pred):
    
    
    if answer == 1:
        plot([x1,x2],[y1,y2],'g',linewidth=2)
        
    if pred == 1:
        plot([x1,x2],[y1,y2],'r--',linewidth=3.0)
        
        

show()

fail_threshold:  50.0
ok_threshold:  50.0
pred count:  0.0


### Final fit and it's score

In [23]:
clf_final = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=1000,\
                                         min_samples_leaf=250,max_features=4,max_depth=4,max_leaf_nodes=14)
clf_final.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features=4, max_leaf_nodes=14,
            min_samples_leaf=250, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [24]:
scores = sk.cross_validation.cross_val_score(clf_final, X, Y, cv=5, scoring='f1_weighted')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [25]:
scores

array([ 0.81573021,  0.81478415,  0.81478415,  0.81478415,  0.82140249])

In [26]:
scores

array([ 0.81573021,  0.81478415,  0.81478415,  0.81478415,  0.82140249])

In [27]:
clf_final.score(X,Y)

0.87473684210526315

In [47]:
tree = clf_final.estimators_[0]
#tree.predict(np.array(X[0:1],dtype=np.float32))
tree.tree_.predict(np.array(X[0:10],dtype=np.float32))
# sk.tree.export.export_graphviz(tree,
#                                feature_names=feature_names,
#                                class_names=["False","True"],
#                                label='all',
#                                out_file='C:/Users/Artyom.Fomenko/Desktop/tree.dot')

array([[ 0.95442359],
       [ 0.95442359],
       [ 0.95442359],
       [ 0.95442359],
       [ 0.43561208],
       [ 0.95442359],
       [ 0.95442359],
       [ 0.84339623],
       [ 0.95442359],
       [ 0.95442359]])

In [158]:
clf_final.predict(X[0:10])

array([ 0.92999098,  0.92999098,  0.9438858 ,  0.94391358,  0.60597564,
        0.9438858 ,  0.9438858 ,  0.72059584,  0.9438858 ,  0.92999098])

In [157]:
for x in X[0:10]:
    print("System.out.println( clf.predict(new double[]{",end='')
    print(*x,sep=", ",end='')
    print("}));")

System.out.println( clf.predict(new double[]{1.0, 1.0, 0.51452525198, 1.96774775394, -1000.0, 0.0, 0.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{1.0, 1.0, 0.432822733482, 1.96792913353, -1000.0, 0.0, 0.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{0.999999999991, 0.99999999999, 0.251325986557, 1.96833205629, 0.565146632863, 0.565146632863, 1.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{1.0, 0.999999999998, 0.341717755557, 1.96813138658, 0.926953898949, 0.926953898949, 1.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{1.0, -1.02946293777, 0.25, 0.182407614765, 0.804550757751, 0.804550757751, 0.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{1.0, 1.0, 0.398084848481, 1.96800625164, 0.814962413376, 0.814962413376, 1.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{0.999999999998, 0.999999999998, 0.26170108356, 1.96830902359, 0.563662760697, 0.563662760697, 1.0, -1000.0, 0.0}));
Sy

In [106]:
def predictOne(tree,sample):
    left = tree.children_left
    right = tree.children_right
    threshold = tree.threshold
    feature = tree.feature
    value = tree.value

    node_id = 0

    while (feature[node_id] != -2):
        if (sample[feature[node_id]] > threshold[node_id]):
            node_id = right[node_id]
        else:
            node_id = left[node_id]

    return tree.value[node_id][0][0]

def predict(tree, samples):
    ret = []
    for sample in samples:
        ret.append(predictOne(tree,sample))
    return np.array(ret)

#predict(tree.tree_,X)
#tree.predict(X)

In [138]:
def writeArrayToFile(f, arr):
    
    for i,val in enumerate(arr):
        f.write( "{}".format(val) )
        if (i != len(arr)-1):
            f.write(" ")
        else:
            f.write("\n")

def writeTreeToFile(f, tree):

    f.write("{}\n".format(tree.node_count))
    writeArrayToFile(f,tree.children_left)
    writeArrayToFile(f,tree.children_right)
    writeArrayToFile(f,tree.threshold)
    writeArrayToFile(f,tree.feature)
    writeArrayToFile(f,tree.value[:,:,0].T[0])
    
def writeForestToFile(f,forest):
    
    f.write("{}\n".format(forest[0].n_features_))
    f.write("{}\n".format(len(forest)))
    for tree in forest:
        writeTreeToFile(f,tree.tree_)
        
with open("C:/Users/Artyom.Fomenko/Desktop/forest.txt",'w') as f:
    writeForestToFile(f,clf_final.estimators_)

False

In [1]:
np.array( X[0],dtype=np.float32)

NameError: name 'np' is not defined

In [177]:
clf = sk.linear_model.LinearRegression()
clf.fit(X,Y)

print(clf.intercept_)
print(feature_names)
print(clf.coef_)

Y_pred = clf_final.predict(X)
true_positive = []
true_negative = []
f1 = []
#false_negative = []
values_x = []

total_positives = (Y==1).sum()
total_negatives = (Y==0).sum()

precision = []
recall = []

for threshold in np.linspace(0.01,0.95,200):

    values_x.append(threshold)
    
    Y_bin = Y_pred > threshold
    
        
    false_positive_count = ( np.logical_and( (Y_bin == 1), (Y == 0) ) ) .sum()
    false_negative_count = ( np.logical_and( (Y_bin == 0), (Y == 1) ) ) .sum()
    true_positive_count = ( np.logical_and( (Y_bin == 1), (Y == 1) ) ) .sum()
    true_negative_count = ( np.logical_and( (Y_bin == 0), (Y == 0) ) ) .sum()
    
    #false_negative.append(false_negative_count/total_positives)
    true_positive.append(true_positive_count/total_positives)
    true_negative.append(true_negative_count/total_negatives)
    
    precision_score = true_positive_count/(true_positive_count+false_positive_count)
    recall_score = true_positive_count/(true_positive_count+false_negative_count)
    f1_score = precision_score*recall_score/(precision_score+recall_score)*2
    precision.append( precision_score )
    recall.append( recall_score )
    f1.append( f1_score )
    
    
plot(values_x,f1,'ko-')
plot(values_x,precision,'ro-')
plot(values_x,recall,'bo-')
show()
print(values_x)
# scores = clf.predict(X)
# fpr, tpr, thresholds = metrics.roc_curve(Y, scores)
# len(thresholds)

0.224018251621
['angle1', 'angle2', 'distance', 'score', 'cons_score_max', 'cons_score_mean', 'cons_curMax', 'inters_score_max', 'inters_score_mean']
[  5.48845582e-02   4.51298321e-02  -6.76192408e-03   8.79667531e-02
  -3.21897595e-04  -1.28781349e-01   4.42307948e-01  -1.33020572e-04
  -9.23697360e-02]
[0.01, 0.014723618090452261, 0.01944723618090452, 0.024170854271356783, 0.028894472361809045, 0.033618090452261308, 0.038341708542713564, 0.043065326633165826, 0.047788944723618089, 0.052512562814070352, 0.057236180904522614, 0.06195979899497487, 0.066683417085427132, 0.071407035175879388, 0.076130653266331644, 0.080854271356783913, 0.085577889447236169, 0.090301507537688425, 0.095025125628140694, 0.09974874371859295, 0.10447236180904522, 0.10919597989949748, 0.11391959798994973, 0.118643216080402, 0.12336683417085426, 0.12809045226130653, 0.1328140703517588, 0.13753768844221106, 0.14226130653266331, 0.14698492462311558, 0.15170854271356785, 0.15643216080402009, 0.16115577889447236, 0

In [36]:
clf = sk.tree.DecisionTreeClassifier(random_state=42,min_samples_leaf=5,max_depth=2)
#clf = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=10)
#clf = sk.linear_model.RidgeClassifier(random_state=42)
clf.fit(X,Y)
sk.tree.export.export_graphviz(clf,
                               feature_names=feature_names,
                               class_names=["False","True"],
                               label='all',
                               out_file='C:/Users/Artyom.Fomenko/Desktop/tree.dot')

# Nearby relationships

In [100]:

fscores_min = []
precision_min = []
recall_min = []

for i in log_progress( range(250) ):

    ids = np.arange(len(Y_nonan))
    np.random.shuffle(ids)
    
    size = (i*4)+50
    Y_base = Y_nonan[ids[:size]]
    X_base = X_nonan[ids[:size]]
    
    fold = sk.cross_validation.StratifiedKFold(Y_base,n_folds=5,shuffle=True,random_state=42)
    fscores = []
    precisions = []
    recalls = []
    for train_ids, test_ids in fold:
        X_train = X_base[train_ids]
        Y_train = Y_base[train_ids]
        X_test = X_base[test_ids]
        Y_test = Y_base[test_ids]
        

        #clf = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=20)
        clf = sk.linear_model.RidgeClassifier(class_weight='balanced')
        clf.fit(X_train,Y_train)

        Y_pred = clf.predict(X_test)
        
        fscore = sk.metrics.f1_score(Y_test,Y_pred)
        precision = sk.metrics.precision_score(Y_test,Y_pred)
        recall = sk.metrics.recall_score(Y_test,Y_pred)
        
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)
        
    fscores_min.append( min(fscores) )
    precision_min.append( min(precisions) )
    recall_min.append( min(recalls) )
    
fscores_min = np.array(fscores_min)
fscores_min = np.mean(rolling_window(fscores_min, 5), -1)

precision_min = np.array(precision_min)
precision_min = np.mean(rolling_window(precision_min, 5), -1)

recall_min = np.array(recall_min)
recall_min = np.mean(rolling_window(recall_min, 5), -1)



In [103]:
fold = sk.cross_validation.StratifiedKFold(Y_nonan,n_folds=5,shuffle=True,random_state=42)
fscores = []
precisions = []
recalls = []
coefs = []
for train_ids, test_ids in fold:
    X_train = X_nonan[train_ids]
    Y_train = Y_nonan[train_ids]
    X_test = X_nonan[test_ids]
    Y_test = Y_nonan[test_ids]


    #clf = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=20)
    clf = sk.linear_model.RidgeClassifier(class_weight='balanced')
    clf.fit(X_train,Y_train)
    
    coefs.append(clf.coef_)

    Y_pred = clf.predict(X_test)

    fscores.append(sk.metrics.f1_score(Y_test,Y_pred))
    precisions.append(sk.metrics.precision_score(Y_test,Y_pred))
    recalls.append(sk.metrics.recall_score(Y_test,Y_pred))

print("fscore: ",min(fscores))
print("precision: ",min(precisions))
print("recall: ",min(recalls))
np.array(coefs).mean(axis=0)

fscore:  0.836567485985
precision:  0.940476190476
recall:  0.752229546336


array([[-0.00121034, -0.00508861,  0.40264552,  0.03940161,  0.03502284,
        -0.02276605]])

In [106]:
plot(fscores_min,'k')
plot(precision_min,'r')
plot(recall_min,'b')
grid(True)

In [289]:
clf = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=10)
clf.fit(X_nonan,Y_nonan)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [193]:
y_pred = clf.predict(X_nonan)

In [194]:
y = Y_nonan
scores = y_pred
fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)



In [204]:
sk.metrics.precision_score(Y_nonan,y_pred)

1.0

In [205]:
sk.metrics.recall_score(Y_nonan,y_pred)

0.99955752212389382