In [1]:
%pylab

Using matplotlib backend: Qt4Agg
Populating the interactive namespace from numpy and matplotlib


In [2]:
import json
import pandas
import sklearn as sk
from sklearn import *
from pandas.tools.plotting import *
import pydot
import io
import seaborn as sns
from itertools import *



### Helper functions

In [54]:
def foo(x,v_from,v_to):
    return 1/(1+math.exp(-((x-(v_to+v_from)*0.5)/(v_to-v_from)*12)))

In [55]:
x = np.linspace(-6,6,100)
plot(x,[ foo(i,0,4) for i in x])

[<matplotlib.lines.Line2D at 0x21c0482d6d8>]

In [3]:
sns.set()
def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def rules(clf, features, labels, node_index=0):
    """Structure of rules in a fit decision tree classifier

    Parameters
    ----------
    clf : DecisionTreeClassifier
        A tree that has already been fit.

    features, labels : lists of str
        The names of the features and labels, respectively.

    """
    node = {}
    if clf.tree_.children_left[node_index] == -1:  # indicates leaf
        count_labels = zip(clf.tree_.value[node_index, 0], labels)
        node['name'] = ', '.join(('{} of {}'.format(int(count), label)
                                  for count, label in count_labels))
    else:
        feature = features[clf.tree_.feature[node_index]]
        threshold = clf.tree_.threshold[node_index]
        node['name'] = '{} > {}'.format(feature, threshold)
        left_index = clf.tree_.children_left[node_index]
        right_index = clf.tree_.children_right[node_index]
        node['children'] = [rules(clf, features, labels, right_index),
                            rules(clf, features, labels, left_index)]
    return node

def log_progress(sequence, every=None, size=None):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = size / 200     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')
        
def cvAdvanced(clf,X,Y,**kwargs):
    fscores_min = []
    precision_min = []
    recall_min = []
    roc_min = []
    
    classification = not kwargs.get("regression")

    mistake_counts = np.zeros(len(Y))

    for size in log_progress( np.linspace(2000,len(Y),50) ):
        size = int(size)
        ids = np.arange(len(Y))
        np.random.shuffle(ids)
        ids = ids[:size]
        Y_base = Y[ids]
        X_base = X[ids]

        fold = sk.cross_validation.StratifiedKFold(Y_base,n_folds=3,shuffle=True,random_state=42)
        fscores = []
        precisions = []
        recalls = []
        rocs = []
        for train_ids, test_ids in fold:
            X_train = X_base[train_ids]
            Y_train = Y_base[train_ids]
            X_test = X_base[test_ids]
            Y_test = Y_base[test_ids]

            clf.fit(X_train,Y_train)

            Y_pred = clf.predict(X_test)

            if (classification):
                fscore = sk.metrics.f1_score(Y_test,Y_pred)
                precision = sk.metrics.precision_score(Y_test,Y_pred)
                recall = sk.metrics.recall_score(Y_test,Y_pred)
            roc_auc = sk.metrics.roc_auc_score(Y_test,Y_pred) #clf.score(X_test,Y_test)

            mistake_counts[ ids[ test_ids[Y_test != Y_pred] ] ] += 1 
            if (classification):
                fscores.append(fscore)
                precisions.append(precision)
                recalls.append(recall)
            rocs.append(roc_auc)

        if (classification):
            fscores_min.append( min(fscores) )
            precision_min.append( min(precisions) )
            recall_min.append( min(recalls) )
        roc_min.append(min(rocs))

    if (classification):
        fscores_min = np.array(fscores_min)
        fscores_min = np.mean(rolling_window(fscores_min, 2), -1)

        precision_min = np.array(precision_min)
        precision_min = np.mean(rolling_window(precision_min, 2), -1)

        recall_min = np.array(recall_min)
        recall_min = np.mean(rolling_window(recall_min, 2), -1)

    roc_min = np.array(roc_min)
    roc_min = np.mean(rolling_window(roc_min,2),-1)

    if (classification):
        print("f score:",fscores[-1])
        print("precision:",precision_min[-1])
        print("recall:",recall_min[-1])
        print("roc :",roc_min[-1])
        return fscores_min, precision_min, recall_min, roc_min, mistake_counts
    else:
        print("score: ",roc_min[-1])
        return None, None, None, roc_min, mistake_counts

# Connections

### Load data

In [4]:
datafiles_number = 3
cons = pandas.DataFrame()
for i in range(datafiles_number):
    new_cons = pandas.read_csv("../data/connections_dataset_"+str(i)+".csv")
    cons = cons.append( new_cons, ignore_index=True )

print( cons.columns.values )
#hist(cons['angle1'],bins=50)
before = len(cons)
cons = cons.drop_duplicates()
after = len(cons)
print(before-after,'duplicates dropped')
cons.describe()

['Y' 'x1' 'y1' 'x2' 'y2' 'signed_angle_sum' 'angle_sum' 'angle_sum_equals'
 'signed_angle_more_than_PI' 'distance' 'distance_closest_line' 'score'
 'cons_score_max' 'cons_score_mean' 'cons_curMax' 'inters_score_max'
 'inters_score_mean' 'strong' 'GroupID']
16392 duplicates dropped


Unnamed: 0,Y,x1,y1,x2,y2,signed_angle_sum,angle_sum,angle_sum_equals,signed_angle_more_than_PI,distance,distance_closest_line,score,cons_score_max,cons_score_mean,cons_curMax,inters_score_max,inters_score_mean,strong,GroupID
count,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0,135824.0
mean,0.050234,2238.391176,-1625.320838,2238.09026,-1625.122274,2.485204,2.226474,0.431743,0.431743,51.131191,0.0,0.05004195,-4.616268,0.0,0.0,-197.350984,0.0,0.670029,0.568257
std,0.218429,4297.113297,4186.021636,4297.330842,4185.880085,1.934537,1.178969,0.495321,0.495321,38.375287,0.0,0.132016,67.78637,0.0,0.0,398.000929,0.0,0.470204,0.495321
min,0.0,-2223.6,-10809.194757,-2223.6,-10794.030906,5.26832e-08,0.001621,0.0,0.0,4.0,0.0,2.619712e-09,-1000.0,0.0,0.0,-1000.0,0.0,0.0,0.0
25%,0.0,-921.782943,-1510.668203,-922.214339,-1509.931984,0.6200404,1.299609,0.0,0.0,22.827946,0.0,0.001136672,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,145.990332,456.458104,147.748306,454.811033,2.014698,2.249285,0.0,0.0,40.304366,0.0,0.006092083,0.0,0.0,0.0,0.0,0.0,1.0,1.0
75%,0.0,3546.9,994.276147,3546.9,993.915597,4.287078,3.019123,1.0,1.0,68.550586,0.0,0.02862586,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,1.0,11206.3,2936.6,11206.3,2936.6,6.26843,6.26595,1.0,1.0,199.959803,0.0,0.9097619,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [9]:
#Scatter plot all data
# scatter_matrix(cons, alpha=0.2, figsize=(6, 6), diagonal='kde')
# show()

sns.pairplot(cons, hue="Y", vars=['signed_angle_sum','angle_sum','distance','distance_closest_line'])

<seaborn.axisgrid.PairGrid at 0x261246c4ef0>

In [6]:
#Extract meanfull groups

conditions = []

#Split cases, when 
conditions.append( (cons['angle_sum_equals'] == 0) & (cons['signed_angle_more_than_PI'] == 0) ) #0
conditions.append( (cons['angle_sum_equals'] == 0) & (cons['signed_angle_more_than_PI'] == 1) ) #1
conditions.append( (cons['angle_sum_equals'] == 1) & (cons['signed_angle_more_than_PI'] == 0) ) #2
conditions.append( (cons['angle_sum_equals'] == 1) & (cons['signed_angle_more_than_PI'] == 1) ) #3

#Separate strong and weak lines
#conditions.append( ( cons['strong'] == 1 ) ) #4
#conditions.append( ( cons['strong'] == 0 ) ) #5

#Filter out mid extreme-values
conditions.append( ( abs( cons['signed_angle_sum'] - math.pi ) < 0.001 ) ) #4

#Filter out low extreme-values
conditions.append( abs( cons['angle_sum'] ) < 0.001 ) #5
conditions.append( abs( cons['signed_angle_sum'] < 0.01 ) ) #6

#Filter out high extreme-values
conditions.append( ( cons['angle_sum'] > (math.pi*2)-0.01) ) #7
conditions.append( ( cons['signed_angle_sum'] > (math.pi*2)-0.01)  ) #8

#Print group info

print('Base conditions:')
print('{:<8}{:<8}{:<8}'.format('ID','Count','True/All'))
for i, condition in enumerate(conditions):
    match = condition
    percent = cons['Y'].ix[match].sum()/(match.sum())*100
    print('{:<8}{:<8}{:<8}'.format(i,condition.sum(),percent))
    
total = len(conditions[0])

conditions_match = []

conditions = np.array(conditions).T

def unique_rows(a):
    b = np.ascontiguousarray(a).view(np.dtype((np.void, a.dtype.itemsize * a.shape[1])))
    _, idx = np.unique(b, return_index=True)
    return idx

unique_conditions = conditions[unique_rows(conditions)]

def matchConditionGroup(condition_group,conditions):
    return (conditions == condition_group).all(axis=1)


true_groups = []
false_groups = []
meanfull_conditions = []

print('\nWhole dataset falls into {} groups'.format(len(unique_conditions)))

print('{:>8} {:>8} {:>9} {:>9}'.format('Group_id','size','True/All','Condition id\'s'))

g_id = 0
for i,condition in enumerate(unique_conditions):
    match = matchConditionGroup(condition,conditions)
    percent = cons['Y'].ix[match].sum()/(match.sum())*100
    
    #if percent > 0.001:
    meanfull_conditions.append(condition)
    ids = np.where(condition)[0]
    print('{:>8} {:>8} {:>8.4f}% '.format(g_id,match.sum(),percent,ids),end='')
    print(ids)
    g_id += 1
    #elif percent > 0.5:
        #true_groups.append(condition)
    #else:
        #false_groups.append(condition)
print('\n Total meanfull groups:',len(meanfull_conditions))
meanfull_conditions = np.array(meanfull_conditions)

# form groups
groups = []

for condition in meanfull_conditions:
    match = matchConditionGroup(condition,conditions)
    groups.append(cons.ix[match])

Base conditions:
ID      Count   True/All
0       77183   7.6998820983895415
1       0       nan     
2       0       nan     
3       58641   1.5006565372350404
4       0       nan     
5       0       nan     
6       760     25.263157894736842
7       0       nan     
8       0       nan     

Whole dataset falls into 3 groups
Group_id     size  True/All Condition id's
       0    58641   1.5007% [3]
       1    76423   7.5252% [0]
       2      760  25.2632% [0 6]

 Total meanfull groups: 3


In [None]:
#Manually name_groups



In [10]:
g = groups[0]

print(len(g))
print(g['Y'].sum())
sns.pairplot(g, hue="Y", vars=['signed_angle_sum','angle_sum','distance','distance_closest_line'])

58641
880


<seaborn.axisgrid.PairGrid at 0x2612f616fd0>

In [501]:
cons['Y'].sum()/len(cons)*100

4.743390357698289

In [7]:
plot( *np.array([[0.0, 6.49365234375],[0.01, 7.911751314484358],[0.02, 8.46074893031668],[0.03, 8.661321038991751],[0.04, 8.841076767823335],[0.05, 9.020832496654917],[0.060000000000000005, 9.200588225486499],[0.07, 9.380343954318082],[0.08, 9.532112720576091],[0.09, 9.640920221459899],[0.09999999999999999, 9.749291650675575],[0.10999999999999999, 9.85766307989125],[0.11999999999999998, 9.966034509106924],[0.12999999999999998, 10.074405938322599],[0.13999999999999999, 10.182777367538273],[0.15, 10.29114879675395],[0.16, 10.399520225969622],[0.17, 10.50789842899093],[0.18000000000000002, 10.616300763342092],[0.19000000000000003, 10.72470446743784],[0.20000000000000004, 10.833108171533587],[0.21000000000000005, 10.941511875629335],[0.22000000000000006, 11.04991557972508],[0.23000000000000007, 11.15831928382083],[0.24000000000000007, 11.266722987916577],[0.25000000000000006, 11.375126692012323],[0.26000000000000006, 11.48784653793859],[0.2700000000000001, 11.659360049137014],[0.2800000000000001, 11.838613878335316],[0.2900000000000001, 12.017867707533616],[0.3000000000000001, 12.197121536731919],[0.3100000000000001, 12.376375365930219],[0.3200000000000001, 12.692567016634225],[0.3300000000000001, 13.2336721777448],[0.34000000000000014, 408.29909244112577],[0.35000000000000014, 612.0957517724576],[0.36000000000000015, 612.7208188206004],[0.37000000000000016, 613.2627997334923],[0.38000000000000017, 613.7719677482778],[0.3900000000000002, 614.2750686959772],[0.4000000000000002, 614.7761336527743],[0.4100000000000002, 615.2794871309707],[0.4200000000000002, 615.7835834920618],[0.4300000000000002, 616.290237523866],[0.4400000000000002, 616.7976930986247],[0.45000000000000023, 617.3079366500452],[0.46000000000000024, 617.8191032412706],[0.47000000000000025, 618.3332321080939],[0.48000000000000026, 618.8484421391827],[0.49000000000000027, 619.3668023592027],[0.5000000000000002, 619.8863980561309],[0.5100000000000002, 620.4093486123775],[0.5200000000000002, 620.9336651034414],[0.5300000000000002, 621.4615476519594],[0.5400000000000003, 621.9908715743702],[0.5500000000000003, 622.523965171828],[0.5600000000000003, 623.0587479823901],[0.5700000000000003, 623.5969940304639],[0.5800000000000003, 624.1406299938687],[0.5900000000000003, 624.6907122755581],[0.6000000000000003, 625.3070475800007],[0.6100000000000003, 626.052364568021],[0.6200000000000003, 767.545941832097],[0.6300000000000003, 996.8086369012408],[0.6400000000000003, 997.3636706189104],[0.6500000000000004, 997.7273769147007],[0.6600000000000004, 998.0090485219137],[0.6700000000000004, 998.2907201291266],[0.6800000000000004, 998.5605872409743],[0.6900000000000004, 998.7736541714464],[0.7000000000000004, 998.9867211019186],[0.7100000000000004, 999.1997880323909],[0.7200000000000004, 999.412854962863],[0.7300000000000004, 999.6194230234096],[0.7400000000000004, 999.8190427527464],[0.7500000000000004, 1000.0186624820834],[0.7600000000000005, 1000.2182822114205],[0.7700000000000005, 1000.4179019407575],[0.7800000000000005, 1000.6165918220662],[0.7900000000000005, 1000.8142689252654],[0.8000000000000005, 1001.0119460284648],[0.8100000000000005, 1001.2096231316639],[0.8200000000000005, 1001.407300234863],[0.8300000000000005, 1001.604733407132],[0.8400000000000005, 1001.8018363397516],[0.8500000000000005, 1001.9989392723717],[0.8600000000000005, 1002.1960422049913],[0.8700000000000006, 1002.3931451376111],[0.8800000000000006, 1002.5925059036181],[0.8900000000000006, 1002.795945655833],[0.9000000000000006, 1002.9993854080479],[0.9100000000000006, 1003.2028251602629],[0.9200000000000006, 1003.4062649124779],[0.9300000000000006, 1003.6292587801763],[0.9400000000000006, 1003.8765768689628],[0.9500000000000006, 1004.1238949577491],[0.9600000000000006, 1004.3712130465356],[0.9700000000000006, 1004.687139348294],[0.9800000000000006, 1005.1033530511329],[0.9900000000000007, 1005.5239246780657],[1.0000000000000007, 1022.50048828125],]).T)

[<matplotlib.lines.Line2D at 0x1fd03b6e710>]

In [10]:
plot( *np.array([[0.0, 0.0],[0.01, 0.0],[0.02, 0.0],[0.03, 0.0],[0.04, 0.0],[0.05, 0.0],[0.060000000000000005, 0.0],[0.07, 0.0],[0.08, 0.0],[0.09, 0.0],[0.09999999999999999, 0.0],[0.10999999999999999, 0.0],[0.11999999999999998, 0.0],[0.12999999999999998, 0.0],[0.13999999999999999, 0.0],[0.15, 0.0],[0.16, 0.0],[0.17, 0.0],[0.18000000000000002, 0.0],[0.19000000000000003, 0.0],[0.20000000000000004, 0.0],[0.21000000000000005, 0.0],[0.22000000000000006, 0.0],[0.23000000000000007, 0.0],[0.24000000000000007, 0.0],[0.25000000000000006, 0.0],[0.26000000000000006, 0.0],[0.2700000000000001, 0.0],[0.2800000000000001, 0.0],[0.2900000000000001, 0.0],[0.3000000000000001, 0.0],[0.3100000000000001, 0.0],[0.3200000000000001, 0.0],[0.3300000000000001, 0.0],[0.34000000000000014, 0.0],[0.35000000000000014, 0.0],[0.36000000000000015, 0.0],[0.37000000000000016, 0.0],[0.38000000000000017, 0.0],[0.3900000000000002, 0.0],[0.4000000000000002, 0.0],[0.4100000000000002, 0.0],[0.4200000000000002, 0.0],[0.4300000000000002, 0.0],[0.4400000000000002, 0.0],[0.45000000000000023, 0.0],[0.46000000000000024, 0.0],[0.47000000000000025, 0.0],[0.48000000000000026, 0.0],[0.49000000000000027, 0.0],[0.5000000000000002, 0.0],[0.5100000000000002, 0.0],[0.5200000000000002, 0.0],[0.5300000000000002, 0.0],[0.5400000000000003, 0.0],[0.5500000000000003, 0.0],[0.5600000000000003, 0.0],[0.5700000000000003, 0.0],[0.5800000000000003, 0.0],[0.5900000000000003, 0.0],[0.6000000000000003, 0.0],[0.6100000000000003, 0.0],[0.6200000000000003, 0.0],[0.6300000000000003, 0.0],[0.6400000000000003, 0.0],[0.6500000000000004, 0.0],[0.6600000000000004, 0.0],[0.6700000000000004, 0.0],[0.6800000000000004, 0.0],[0.6900000000000004, 0.0],[0.7000000000000004, 0.0],[0.7100000000000004, 0.0],[0.7200000000000004, 0.0],[0.7300000000000004, 0.0],[0.7400000000000004, 0.0],[0.7500000000000004, 0.0],[0.7600000000000005, 0.0],[0.7700000000000005, 0.0],[0.7800000000000005, 0.0],[0.7900000000000005, 0.0],[0.8000000000000005, 0.0],[0.8100000000000005, 0.0],[0.8200000000000005, 0.0],[0.8300000000000005, 0.0],[0.8400000000000005, 0.0],[0.8500000000000005, 0.0],[0.8600000000000005, 0.0],[0.8700000000000006, 0.0],[0.8800000000000006, 0.0],[0.8900000000000006, 0.0],[0.9000000000000006, 0.0],[0.9100000000000006, 0.0],[0.9200000000000006, 0.0],[0.9300000000000006, 0.0],[0.9400000000000006, 0.0],[0.9500000000000006, 0.0],[0.9600000000000006, 0.0],[0.9700000000000006, 0.0],[0.9800000000000006, 0.0],[0.9900000000000007, 0.0],[1.0000000000000007, 0.0],]).T)


[<matplotlib.lines.Line2D at 0x1fd03cbdb70>]

In [39]:
plot(arr)
arr = np.ones(100)
plot(arr)

[<matplotlib.lines.Line2D at 0x26147011f60>]

In [42]:
arr = np.array(arr)
plot(arr.cumsum())

[<matplotlib.lines.Line2D at 0x2614ae518d0>]

### Format data

In [33]:
cons.columns.values

array(['Y', 'x1', 'y1', 'x2', 'y2', 'angle1', 'angle2', 'distance',
       'score', 'cons_score_max', 'cons_score_mean', 'cons_curMax',
       'inters_score_max', 'inters_score_mean'], dtype=object)

In [87]:
# 1 angle1 0.141016591082
# 2 angle2 0.140071976028
# 3 distance -0.00104193567356
# 4 DTS 0.000173396848109
# 5 I_COUNT -0.00334494362288
# 6 IsWithingEdge -0.0969711886275
# 7 parallel_score 0.122432937273
# 8 score 0.0859794673862

#             features[cons_score_max] =  -1000;
#             features[cons_score_mean] = 0;
#             features[cons_curMax] = 0;
#             features[inters_score_max] = -1000;
#             features[inters_score_mean] = 0;
            
feature_names = ['angle1', 'angle2', 'distance', 'score', 'cons_score_max', 'cons_score_mean',
       'cons_curMax', 'inters_score_max',
       'inters_score_mean',]

#feature_names = ['angle1', 'angle2', 'distance']

X = np.array(cons[feature_names],dtype=float)
Y = np.array(cons['Y'],dtype=float)

print(len(feature_names))
print(X.shape)
print(Y.shape)

9
(6065, 9)
(6065,)


In [59]:
X

array([[  6.23955986e-01,   8.96401724e-01,   3.62254593e+01, ...,
          1.00000000e+00,  -1.00000000e+03,   0.00000000e+00],
       [  6.30477759e-01,   4.54822462e-01,   9.77668382e+01, ...,
          0.00000000e+00,   1.16284365e+00,   9.90169049e-01],
       [  7.94227837e-01,   9.28247894e-01,   1.91501471e+02, ...,
          0.00000000e+00,   1.19716101e+00,   1.05916637e+00],
       ..., 
       [  4.71662721e-01,  -9.98582767e-02,   1.02089313e+02, ...,
          0.00000000e+00,   1.34625584e+00,   6.83540050e-01],
       [  7.08453513e-01,  -1.71435777e-01,   1.03778790e+02, ...,
          0.00000000e+00,   1.12826982e+00,   4.91295216e-01],
       [  1.00000000e+00,   1.00000000e+00,   1.29349554e+01, ...,
          1.00000000e+00,  -1.00000000e+03,   0.00000000e+00]])

### Generate Draft score

In [63]:
cons.columns.values

array(['Y', 'x1', 'y1', 'x2', 'y2', 'signed_angle_sum', 'angle_sum',
       'angle_sum_equals', 'signed_angle_more_than_PI', 'distance',
       'score', 'cons_score_max', 'cons_score_mean', 'cons_curMax',
       'inters_score_max', 'inters_score_mean'], dtype=object)

In [510]:
#linear_feature_names = ['signed_angle_sum', 'angle_sum', 'distance']

def getLinearModel(X,Y,linear_feature_names):

    es = []
    coefs = []
    scores = []

    for train_ids, test_ids in sk.cross_validation.StratifiedKFold(Y,n_folds=5):


        X_train = X[train_ids]
        Y_train = Y[train_ids]
        X_test = X[test_ids]
        Y_test = Y[test_ids]
        
        clf = sk.linear_model.LogisticRegression()
        clf.fit(X_train,Y_train)

        #print( 'score: ',clf.score(X_test,Y_test) )

        scores.append(clf.score(X_test,Y_test))
        es.append( clf.intercept_[0] )
        coefs.append(clf.coef_[0])

        #print('e:',clf.intercept_[0])
        #for name,weight in zip(linear_feature_names,clf.coef_[0]):
            #print(name,': ',weight,sep='')

    #     prop1 = clf.predict_proba([x])[0][0]
    #     log_odds = -math.exp((x*clf.coef_).sum()+clf.intercept_[0])
    #     odds = exp(log_odds)
    #     prob = odds
    #     deviation.append(prop1-prob)

    es = np.array(es)
    coefs = np.array(coefs)

#     print('scores: ',scores)
#     print('e:',es.mean())
#     for name,weight in zip(linear_feature_names,coefs.mean(axis=0)):
#         print(name,': ',weight,sep='')

    clf = sk.linear_model.LogisticRegression()
    clf.fit(X,Y)
        
    return clf,clf.score(X,Y),np.array(scores).min()



In [536]:
#Generate group 0 coefficients

linear_feature_names = ['angle_sum', 'distance']
X_linear = np.array( groups[0][linear_feature_names] )
Y_linear = np.array( groups[0]['Y'] )
clf,score,cv_score = getLinearModel(X_linear,Y_linear,linear_feature_names)

print(clf)
print(score,cv_score)

print('e:',clf.intercept_[0])
for name,weight in zip(linear_feature_names,clf.coef_[0]):
    print(name,': ',weight,sep='')

def predict_proba_group0(x):
    return 1/(1+exp(-(clf.intercept_[0]+(x*clf.coef_[0]).sum())))

p0s = []
p1s = []
means = []
diffs = []
for x in X_linear:
    p0 = clf.predict_proba([x])[0][1]
    p1 = predict_proba_group0(x)
    p0s.append(p0)
    p1s.append(p1)
    diffs.append(p0-p1)
    means.append((p0+p1)*0.5)
    
# plot(p0s,'r-')
# plot(p1s,'b--')
#print( clf.predict_proba([X_linear[0]])[0][1] )
#print( clf.predict([X_linear[0]]) )


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.986378535075 0.986329572593
e: -1.21071517609
angle_sum: -1.56471653345
distance: -0.0247778080897


In [544]:
#Group 1 regression

linear_feature_names = ['angle_sum','signed_angle_sum', 'distance']
X_linear = np.array( groups[1][linear_feature_names] )
Y_linear = np.array( groups[1]['Y'] )
clf,score,cv_score = getLinearModel(X_linear,Y_linear,linear_feature_names)

print(clf)
print(score,cv_score)

print('e:',clf.intercept_[0])
for name,weight in zip(linear_feature_names,clf.coef_[0]):
    print(name,': ',weight,sep='')

def predict_proba_group0(x):
    return 1/(1+exp(-(clf.intercept_[0]+(x*clf.coef_[0]).sum())))

p0s = []
p1s = []
means = []
diffs = []
for x in X_linear:
    p0 = clf.predict_proba([x])[0][1]
    p1 = predict_proba_group0(x)
    p0s.append(p0)
    p1s.append(p1)
    diffs.append(p0-p1)
    means.append((p0+p1)*0.5)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.942203263415 0.929468725465
e: 2.56578216542
angle_sum: -1.44898066454
signed_angle_sum: -0.947030816533
distance: -0.0648483976278


In [542]:
#Group 2 regression

linear_feature_names = ['angle_sum', 'distance']
X_linear = np.array( groups[2][linear_feature_names] )
Y_linear = np.array( groups[2]['Y'] )
clf,score,cv_score = getLinearModel(X_linear,Y_linear,linear_feature_names)

print(clf)
print(score,cv_score)

print('e:',clf.intercept_[0])
for name,weight in zip(linear_feature_names,clf.coef_[0]):
    print(name,': ',weight,sep='')

def predict_proba_group0(x):
    return 1/(1+exp(-(clf.intercept_[0]+(x*clf.coef_[0]).sum())))

p0s = []
p1s = []
means = []
diffs = []
for x in X_linear:
    p0 = clf.predict_proba([x])[0][1]
    p1 = predict_proba_group0(x)
    p0s.append(p0)
    p1s.append(p1)
    diffs.append(p0-p1)
    means.append((p0+p1)*0.5)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.836842105263 0.754966887417
e: 1.95433308346
angle_sum: -1.39372625195
distance: -0.0466163855841


In [547]:
#Calculate recall thresholds


In [396]:
# Calc Logistic regression for each group:
cval_scores = []
classifiers = []

for g in groups:
    X = np.array(g[linear_feature_names],dtype=float)
    Y = np.array(g['Y'],dtype=float)
    clf,score = getLinearModel(X,Y)
    cval_scores.append(score)
    
print(cval_scores)
    



ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

### Test goodness-of-fit

In [83]:
# fscores_min = []
# precision_min = []
# recall_min = []
# roc_min = []

# mistake_counts = np.zeros(len(Y))

#clf = sk.tree.DecisionTreeClassifier(random_state=42,min_samples_leaf=3,max_depth=2,class_weight='balanced')
clf = sk.ensemble.AdaBoostClassifier(random_state=42,n_estimators=100)
#clf = sk.linear_model.LinearRegression()

fscores_min, precision_min, recall_min, roc_min, mistake_counts = cvAdvanced(clf,X,Y,regression=False)

#print("f score:",fscores_min[-1])
#print("precision:",precision_min[-1])
#print("recall:",recall_min[-1])
print("roc :",roc_min[-1])

f score: 0.832786885246
precision: 0.846773759673
recall: 0.799782733606
roc : 0.883039493823
roc : 0.883039493823


In [84]:
plot(fscores_min,'k')
plot(precision_min,'r')
plot(recall_min,'b')
plot(roc_min,'y')
grid(True)

In [17]:
for name, val in zip( feature_names,clf.coef_):
    print(name,val)

AttributeError: 'RandomForestClassifier' object has no attribute 'coef_'

In [260]:
feature_values = []
for name, val in zip( feature_names,clf.feature_importances_):
    feature_values.append( (name,val) )

feature_values.sort(key=lambda x: x[1])

feature_values

[('angle2', 0.0),
 ('cons_score_mean', 0.0),
 ('cons_curMax', 0.0),
 ('angle1', 0.011237259532150091),
 ('score', 0.077520830298469443),
 ('inters_score_mean', 0.084101313942202896),
 ('cons_score_max', 0.12213004879433871),
 ('inters_score_max', 0.27400836521308325),
 ('distance', 0.43100218221975561)]

In [569]:
plot(roc_min)
grid(True)

In [54]:
fail_threshold = np.percentile(mistake_counts,95)
ok_threshold = np.percentile(mistake_counts,20)

print("fail_threshold: ",fail_threshold)
print("ok_threshold: ",ok_threshold)

path_to_map = "C:/Users/Artyom.Fomenko/maps3d/connections_dataset_0_map.json"

with open(path_to_map,'r') as f:
    MAP = json.load(f)

for isoline in MAP:
    line = np.array( [i for i in zip(isoline['lineString']['xs'],isoline['lineString']['ys'])] )
    plot(*line.T)
    
Y_pred = clf_final.predict(cons[feature_names])
print("pred count: ",Y_pred.sum())

for x1,y1,x2,y2,mistakes,answer,pred in zip(cons['x1'],cons['y1'],cons['x2'],cons['y2'],mistake_counts,Y,Y_pred):
    
    
    if answer == 1:
        plot([x1,x2],[y1,y2],'g',linewidth=2)
        
    if pred == 1:
        plot([x1,x2],[y1,y2],'r--',linewidth=3.0)
        
        

show()

fail_threshold:  50.0
ok_threshold:  50.0
pred count:  0.0


### Final fit and it's score

In [23]:
clf_final = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=1000,\
                                         min_samples_leaf=250,max_features=4,max_depth=4,max_leaf_nodes=14)
clf_final.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features=4, max_leaf_nodes=14,
            min_samples_leaf=250, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [24]:
scores = sk.cross_validation.cross_val_score(clf_final, X, Y, cv=5, scoring='f1_weighted')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [25]:
scores

array([ 0.81573021,  0.81478415,  0.81478415,  0.81478415,  0.82140249])

In [26]:
scores

array([ 0.81573021,  0.81478415,  0.81478415,  0.81478415,  0.82140249])

In [27]:
clf_final.score(X,Y)

0.87473684210526315

In [47]:
tree = clf_final.estimators_[0]
#tree.predict(np.array(X[0:1],dtype=np.float32))
tree.tree_.predict(np.array(X[0:10],dtype=np.float32))
# sk.tree.export.export_graphviz(tree,
#                                feature_names=feature_names,
#                                class_names=["False","True"],
#                                label='all',
#                                out_file='C:/Users/Artyom.Fomenko/Desktop/tree.dot')

array([[ 0.95442359],
       [ 0.95442359],
       [ 0.95442359],
       [ 0.95442359],
       [ 0.43561208],
       [ 0.95442359],
       [ 0.95442359],
       [ 0.84339623],
       [ 0.95442359],
       [ 0.95442359]])

In [158]:
clf_final.predict(X[0:10])

array([ 0.92999098,  0.92999098,  0.9438858 ,  0.94391358,  0.60597564,
        0.9438858 ,  0.9438858 ,  0.72059584,  0.9438858 ,  0.92999098])

In [157]:
for x in X[0:10]:
    print("System.out.println( clf.predict(new double[]{",end='')
    print(*x,sep=", ",end='')
    print("}));")

System.out.println( clf.predict(new double[]{1.0, 1.0, 0.51452525198, 1.96774775394, -1000.0, 0.0, 0.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{1.0, 1.0, 0.432822733482, 1.96792913353, -1000.0, 0.0, 0.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{0.999999999991, 0.99999999999, 0.251325986557, 1.96833205629, 0.565146632863, 0.565146632863, 1.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{1.0, 0.999999999998, 0.341717755557, 1.96813138658, 0.926953898949, 0.926953898949, 1.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{1.0, -1.02946293777, 0.25, 0.182407614765, 0.804550757751, 0.804550757751, 0.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{1.0, 1.0, 0.398084848481, 1.96800625164, 0.814962413376, 0.814962413376, 1.0, -1000.0, 0.0}));
System.out.println( clf.predict(new double[]{0.999999999998, 0.999999999998, 0.26170108356, 1.96830902359, 0.563662760697, 0.563662760697, 1.0, -1000.0, 0.0}));
Sy

In [106]:
def predictOne(tree,sample):
    left = tree.children_left
    right = tree.children_right
    threshold = tree.threshold
    feature = tree.feature
    value = tree.value

    node_id = 0

    while (feature[node_id] != -2):
        if (sample[feature[node_id]] > threshold[node_id]):
            node_id = right[node_id]
        else:
            node_id = left[node_id]

    return tree.value[node_id][0][0]

def predict(tree, samples):
    ret = []
    for sample in samples:
        ret.append(predictOne(tree,sample))
    return np.array(ret)

#predict(tree.tree_,X)
#tree.predict(X)

In [138]:
def writeArrayToFile(f, arr):
    
    for i,val in enumerate(arr):
        f.write( "{}".format(val) )
        if (i != len(arr)-1):
            f.write(" ")
        else:
            f.write("\n")

def writeTreeToFile(f, tree):

    f.write("{}\n".format(tree.node_count))
    writeArrayToFile(f,tree.children_left)
    writeArrayToFile(f,tree.children_right)
    writeArrayToFile(f,tree.threshold)
    writeArrayToFile(f,tree.feature)
    writeArrayToFile(f,tree.value[:,:,0].T[0])
    
def writeForestToFile(f,forest):
    
    f.write("{}\n".format(forest[0].n_features_))
    f.write("{}\n".format(len(forest)))
    for tree in forest:
        writeTreeToFile(f,tree.tree_)
        
with open("C:/Users/Artyom.Fomenko/Desktop/forest.txt",'w') as f:
    writeForestToFile(f,clf_final.estimators_)

False

In [1]:
np.array( X[0],dtype=np.float32)

NameError: name 'np' is not defined

In [177]:
clf = sk.linear_model.LinearRegression()
clf.fit(X,Y)

print(clf.intercept_)
print(feature_names)
print(clf.coef_)

Y_pred = clf_final.predict(X)
true_positive = []
true_negative = []
f1 = []
#false_negative = []
values_x = []

total_positives = (Y==1).sum()
total_negatives = (Y==0).sum()

precision = []
recall = []

for threshold in np.linspace(0.01,0.95,200):

    values_x.append(threshold)
    
    Y_bin = Y_pred > threshold
    
        
    false_positive_count = ( np.logical_and( (Y_bin == 1), (Y == 0) ) ) .sum()
    false_negative_count = ( np.logical_and( (Y_bin == 0), (Y == 1) ) ) .sum()
    true_positive_count = ( np.logical_and( (Y_bin == 1), (Y == 1) ) ) .sum()
    true_negative_count = ( np.logical_and( (Y_bin == 0), (Y == 0) ) ) .sum()
    
    #false_negative.append(false_negative_count/total_positives)
    true_positive.append(true_positive_count/total_positives)
    true_negative.append(true_negative_count/total_negatives)
    
    precision_score = true_positive_count/(true_positive_count+false_positive_count)
    recall_score = true_positive_count/(true_positive_count+false_negative_count)
    f1_score = precision_score*recall_score/(precision_score+recall_score)*2
    precision.append( precision_score )
    recall.append( recall_score )
    f1.append( f1_score )
    
    
plot(values_x,f1,'ko-')
plot(values_x,precision,'ro-')
plot(values_x,recall,'bo-')
show()
print(values_x)
# scores = clf.predict(X)
# fpr, tpr, thresholds = metrics.roc_curve(Y, scores)
# len(thresholds)

0.224018251621
['angle1', 'angle2', 'distance', 'score', 'cons_score_max', 'cons_score_mean', 'cons_curMax', 'inters_score_max', 'inters_score_mean']
[  5.48845582e-02   4.51298321e-02  -6.76192408e-03   8.79667531e-02
  -3.21897595e-04  -1.28781349e-01   4.42307948e-01  -1.33020572e-04
  -9.23697360e-02]
[0.01, 0.014723618090452261, 0.01944723618090452, 0.024170854271356783, 0.028894472361809045, 0.033618090452261308, 0.038341708542713564, 0.043065326633165826, 0.047788944723618089, 0.052512562814070352, 0.057236180904522614, 0.06195979899497487, 0.066683417085427132, 0.071407035175879388, 0.076130653266331644, 0.080854271356783913, 0.085577889447236169, 0.090301507537688425, 0.095025125628140694, 0.09974874371859295, 0.10447236180904522, 0.10919597989949748, 0.11391959798994973, 0.118643216080402, 0.12336683417085426, 0.12809045226130653, 0.1328140703517588, 0.13753768844221106, 0.14226130653266331, 0.14698492462311558, 0.15170854271356785, 0.15643216080402009, 0.16115577889447236, 0

In [36]:
clf = sk.tree.DecisionTreeClassifier(random_state=42,min_samples_leaf=5,max_depth=2)
#clf = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=10)
#clf = sk.linear_model.RidgeClassifier(random_state=42)
clf.fit(X,Y)
sk.tree.export.export_graphviz(clf,
                               feature_names=feature_names,
                               class_names=["False","True"],
                               label='all',
                               out_file='C:/Users/Artyom.Fomenko/Desktop/tree.dot')

# Nearby relationships

In [100]:

fscores_min = []
precision_min = []
recall_min = []

for i in log_progress( range(250) ):

    ids = np.arange(len(Y_nonan))
    np.random.shuffle(ids)
    
    size = (i*4)+50
    Y_base = Y_nonan[ids[:size]]
    X_base = X_nonan[ids[:size]]
    
    fold = sk.cross_validation.StratifiedKFold(Y_base,n_folds=5,shuffle=True,random_state=42)
    fscores = []
    precisions = []
    recalls = []
    for train_ids, test_ids in fold:
        X_train = X_base[train_ids]
        Y_train = Y_base[train_ids]
        X_test = X_base[test_ids]
        Y_test = Y_base[test_ids]
        

        #clf = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=20)
        clf = sk.linear_model.RidgeClassifier(class_weight='balanced')
        clf.fit(X_train,Y_train)

        Y_pred = clf.predict(X_test)
        
        fscore = sk.metrics.f1_score(Y_test,Y_pred)
        precision = sk.metrics.precision_score(Y_test,Y_pred)
        recall = sk.metrics.recall_score(Y_test,Y_pred)
        
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)
        
    fscores_min.append( min(fscores) )
    precision_min.append( min(precisions) )
    recall_min.append( min(recalls) )
    
fscores_min = np.array(fscores_min)
fscores_min = np.mean(rolling_window(fscores_min, 5), -1)

precision_min = np.array(precision_min)
precision_min = np.mean(rolling_window(precision_min, 5), -1)

recall_min = np.array(recall_min)
recall_min = np.mean(rolling_window(recall_min, 5), -1)



In [103]:
fold = sk.cross_validation.StratifiedKFold(Y_nonan,n_folds=5,shuffle=True,random_state=42)
fscores = []
precisions = []
recalls = []
coefs = []
for train_ids, test_ids in fold:
    X_train = X_nonan[train_ids]
    Y_train = Y_nonan[train_ids]
    X_test = X_nonan[test_ids]
    Y_test = Y_nonan[test_ids]


    #clf = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=20)
    clf = sk.linear_model.RidgeClassifier(class_weight='balanced')
    clf.fit(X_train,Y_train)
    
    coefs.append(clf.coef_)

    Y_pred = clf.predict(X_test)

    fscores.append(sk.metrics.f1_score(Y_test,Y_pred))
    precisions.append(sk.metrics.precision_score(Y_test,Y_pred))
    recalls.append(sk.metrics.recall_score(Y_test,Y_pred))

print("fscore: ",min(fscores))
print("precision: ",min(precisions))
print("recall: ",min(recalls))
np.array(coefs).mean(axis=0)

fscore:  0.836567485985
precision:  0.940476190476
recall:  0.752229546336


array([[-0.00121034, -0.00508861,  0.40264552,  0.03940161,  0.03502284,
        -0.02276605]])

In [106]:
plot(fscores_min,'k')
plot(precision_min,'r')
plot(recall_min,'b')
grid(True)

In [289]:
clf = sk.ensemble.RandomForestClassifier(random_state=42,n_estimators=10)
clf.fit(X_nonan,Y_nonan)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [193]:
y_pred = clf.predict(X_nonan)

In [194]:
y = Y_nonan
scores = y_pred
fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)



In [204]:
sk.metrics.precision_score(Y_nonan,y_pred)

1.0

In [205]:
sk.metrics.recall_score(Y_nonan,y_pred)

0.99955752212389382