In [50]:
import os
import sys
import glob

import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk
from sklearn.cross_validation import train_test_split
from sklearn import tree
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import ggplot

from IPython.display import HTML

%matplotlib inline

In [18]:
DATA_PATH = "/Users/beingzy/Documents/Projects/learning_dist_metrics_exp"
RANDOM_SEED = 123

df = pd.read_csv(DATA_PATH + "/udiff_df.csv", header = 0)
print df.head(5)

feat_columns = [col for col in df.columns if not col in ["class", "uid_a", "uid_b"]]
print feat_columns

   class  uid_a  uid_b  birthday  education_classes_id  \
0      1    236    186       NaN                   NaN   
1      1    122    285       NaN                   NaN   
2      1     24    346       NaN                   NaN   
3      1    271    304       NaN                   NaN   
4      1    176      9       NaN                   NaN   

   education_concentration_id  education_degree_id  education_school_id  \
0                         NaN                  NaN                  NaN   
1                         NaN                  NaN                    0   
2                           5                  NaN                    0   
3                         NaN                  NaN                  NaN   
4                         NaN                  NaN                  NaN   

   education_type  education_with_id      ...       political  religion  \
0             NaN                NaN      ...             NaN       NaN   
1               2                NaN      ...     

In [4]:
def summarize_null_ptg(ds):
    """Return the percentage of obs. with missing value
       per column
       
       Parameters:
       ----------
       ds: {matrix, pd.DataFrame}
    """
    na_ptg = dict()
    for col in ds.columns:
        na_ptg[col] = ds[col].isnull().sum() * 1.0 / df.shape[0]
    return na_ptg


print summarize_null_ptg(df)
print "#obs: {0}, #feats: {1}".format(df.shape[0], df.shape[1])

{'last_name': 0.82320836157385935, 'locale': 0.29243659258730781, 'education_type': 0.33015107195091498, 'hometown_id': 0.85051546391752575, 'education_concentration_id': 0.8773543680662077, 'location_id': 0.77205436449898335, 'first_name': 0.99290122355794952, 'work_employer_id': 0.96290532586594368, 'middle_name': 0.99885848821032353, 'education_classes_id': 0.99754307423393862, 'political': 1.0, 'work_with_id': 0.99997324581742941, 'work_position_id': 0.98398316270110231, 'religion': 1.0, 'education_with_id': 0.99963435950486923, 'work_location_id': 0.97319676809474542, 'work_end_date': 0.94297345985088998, 'uid_b': 0.0, 'uid_a': 0.0, 'work_start_date': 0.90594121214283163, 'education_school_id': 0.38457353832982555, 'birthday': 0.75150715228480724, 'work_projects_id': 0.99908143973174468, 'class': 0.0, 'name': 1.0, 'gender': 0.29489351835336924, 'education_year_id': 0.56773713123818359, 'work_from_id': 0.99998216387828631, 'education_degree_id': 0.98419719616166657, 'languages_id':

In [20]:
## Missing value handling:
is_feat_na = df[feat_columns].isnull().sum(axis = 1) == len(feat_columns)
df_reduced = df[ ~is_feat_na ] #
print "#obs: {0}, #feats: {1}".format(df_reduced.shape[0], df_reduced.shape[1])
## count the distribution of {0, 1}
print df_reduced["class"].value_counts()

#obs: 162054, #feats: 30
1    110366
0     51688
dtype: int64


In [21]:
df_x = df_reduced[feat_columns].fillna(0)
df_y = df_reduced["class"]

train_x, test_x, train_y, test_y = \
    train_test_split(df_x, df_y, test_size = .2, random_state = RANDOM_SEED)

print "#obs (training): {0}, #obs (testing): {1}".format(train_x.shape[0], test_x.shape[0])

#obs (training): 129643, #obs (testing): 32411


In [7]:
clf = tree.DecisionTreeClassifier()
clf.fit(train_x, train_y)

DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')

In [10]:
from sklearn.externals.six import StringIO  
import pydot 
dot_data = StringIO() 
tree.export_graphviz(clf, out_file=dot_data) 
#graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
#graph.write_pdf("iris.pdf") 

In [11]:
from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred=clf.predict(X)   
    if show_accuracy:
        print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred)),"\n"

    if show_classification_report:
        print "Classification report"
        print metrics.classification_report(y,y_pred),"\n"
        
    if show_confusion_matrix:
        print "Confusion matrix"
        print metrics.confusion_matrix(y,y_pred),"\n"
        
measure_performance(train_x, train_y, clf, show_classification_report=True, show_confusion_matrix=True)

Accuracy:0.951 

Classification report
             precision    recall  f1-score   support

          0       0.97      0.88      0.92     41385
          1       0.94      0.99      0.96     88258

avg / total       0.95      0.95      0.95    129643


Confusion matrix
[[36280  5105]
 [ 1218 87040]] 



In [12]:
measure_performance(test_x, test_y, clf, show_classification_report=True, show_confusion_matrix=True)

Accuracy:0.875 

Classification report
             precision    recall  f1-score   support

          0       0.87      0.71      0.78     10303
          1       0.87      0.95      0.91     22108

avg / total       0.87      0.87      0.87     32411


Confusion matrix
[[ 7289  3014]
 [ 1048 21060]] 



In [16]:
from sklearn import pipeline
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import auc

#clf_pipeline = pipeline([( 'clf_tree', tree.DecisionTreeClassifier() )])
clf_tree = tree.DecisionTreeClassifier()
param_grids = [
    {"clf_tree__criterion": ["gini"]},
    #{"clf_tree__criterion": ["gini"], "clf_tree__splitter": ["best"], "clf_tree__min_samples_leaf": [10, 20, 50, 100]},
    #{"clf_tree__criterion": ["entropy"], "clf_tree__splitter": ["best"], "clf_tree__min_samples_leaf": [10, 20, 50, 100]}
    ]
score_func =  [
    ('precision', precision_score),
    ('recall', recall_score),
]

grid_ridge = GridSearchCV(clf_tree, 
                          param_grid = param_grids, 
                          refit = True,
                          n_jobs = -1, 
                          scoring = score_func,
                          cv = StratifiedKFold(train_y, n_folds=5))
perf_report = grid_ridge.fit(train_x, train_y)

An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line statement', (2, 0))
An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line statement', (2, 0))
An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line statement', (2, 0))





JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
    ...........................................................................
/Applications/anaconda/lib/python2.7/runpy.py in _run_module_as_main(mod_name='IPython.kernel.__main__', alter_argv=1)
    157     pkg_name = mod_name.rpartition('.')[0]
    158     main_globals = sys.modules["__main__"].__dict__
    159     if alter_argv:
    160         sys.argv[0] = fname
    161     return _run_code(code, main_globals, None,
--> 162                      "__main__", fname, loader, pkg_name)
        fname = '/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'IPython.kernel'
    163 
    164 def run_module(mod_name, init_globals=None,
    165                run_name=None, alter_sys=False):
    166     """Execute a module's code without importing it

...........................................................................
/Applications/anaconda/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x1032d1d30, file "/App...ite-packages/IPython/kernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'IPython.kernel', 'app': <module 'IPython.kernel.zmq.kernelapp' from '/Ap.../site-packages/IPython/kernel/zmq/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='IPython.kernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x1032d1d30, file "/App...ite-packages/IPython/kernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'IPython.kernel', 'app': <module 'IPython.kernel.zmq.kernelapp' from '/Ap.../site-packages/IPython/kernel/zmq/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from IPython.kernel.zmq import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/config/application.py in launch_instance(cls=<class 'IPython.kernel.zmq.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    569         
    570         If a global instance already exists, this reinitializes and starts it
    571         """
    572         app = cls.instance(**kwargs)
    573         app.initialize(argv)
--> 574         app.start()
        app.start = <bound method IPKernelApp.start of <IPython.kernel.zmq.kernelapp.IPKernelApp object>>
    575 
    576 #-----------------------------------------------------------------------------
    577 # utility functions, for convenience
    578 #-----------------------------------------------------------------------------

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/zmq/kernelapp.py in start(self=<IPython.kernel.zmq.kernelapp.IPKernelApp object>)
    369     def start(self):
    370         if self.poller is not None:
    371             self.poller.start()
    372         self.kernel.start()
    373         try:
--> 374             ioloop.IOLoop.instance().start()
    375         except KeyboardInterrupt:
    376             pass
    377 
    378 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    146             PollIOLoop.configure(ZMQIOLoop)
    147         return PollIOLoop.instance()
    148     
    149     def start(self):
    150         try:
--> 151             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    152         except ZMQError as e:
    153             if e.errno == ETERM:
    154                 # quietly return on ETERM
    155                 pass

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    835                 self._events.update(event_pairs)
    836                 while self._events:
    837                     fd, events = self._events.popitem()
    838                     try:
    839                         fd_obj, handler_func = self._handlers[fd]
--> 840                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    841                     except (OSError, IOError) as e:
    842                         if errno_from_exception(e) == errno.EPIPE:
    843                             # Happens when the client closes the connection
    844                             pass

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    428             # dispatch events:
    429             if events & IOLoop.ERROR:
    430                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    431                 return
    432             if events & IOLoop.READ:
--> 433                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    434                 if not self.socket:
    435                     return
    436             if events & IOLoop.WRITE:
    437                 self._handle_send()

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    460                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    461         else:
    462             if self._recv_callback:
    463                 callback = self._recv_callback
    464                 # self._recv_callback = None
--> 465                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    466                 
    467         # self.update_state()
    468         
    469 

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    402         close our socket."""
    403         try:
    404             # Use a NullContext to ensure that all StackContexts are run
    405             # inside our blanket exception handler rather than outside.
    406             with stack_context.NullContext():
--> 407                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    408         except:
    409             gen_log.error("Uncaught exception, closing connection.",
    410                           exc_info=True)
    411             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/zmq/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    247         if self.control_stream:
    248             self.control_stream.on_recv(self.dispatch_control, copy=False)
    249 
    250         def make_dispatcher(stream):
    251             def dispatcher(msg):
--> 252                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    253             return dispatcher
    254 
    255         for s in self.shell_streams:
    256             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/zmq/kernelbase.py in dispatch_shell(self=<IPython.kernel.zmq.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn import pipeline\nfrom sklearn.cross_...))\nperf_report = grid_ridge.fit(train_x, train_y)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'msg_id': '9E14933726D24A35AEC819FBBDB5070C', 'msg_type': 'execute_request', 'session': '50D74FEAC60649579A854D265B5DC92C', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '9E14933726D24A35AEC819FBBDB5070C', 'msg_type': 'execute_request', 'parent_header': {}})
    208         else:
    209             # ensure default_int_handler during handler call
    210             sig = signal(SIGINT, default_int_handler)
    211             self.log.debug("%s: %s", msg_type, msg)
    212             try:
--> 213                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <IPython.kernel.zmq.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['50D74FEAC60649579A854D265B5DC92C']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn import pipeline\nfrom sklearn.cross_...))\nperf_report = grid_ridge.fit(train_x, train_y)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'msg_id': '9E14933726D24A35AEC819FBBDB5070C', 'msg_type': 'execute_request', 'session': '50D74FEAC60649579A854D265B5DC92C', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '9E14933726D24A35AEC819FBBDB5070C', 'msg_type': 'execute_request', 'parent_header': {}}
    214             except Exception:
    215                 self.log.error("Exception in message handler:", exc_info=True)
    216             finally:
    217                 signal(SIGINT, sig)

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/zmq/kernelbase.py in execute_request(self=<IPython.kernel.zmq.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['50D74FEAC60649579A854D265B5DC92C'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'from sklearn import pipeline\nfrom sklearn.cross_...))\nperf_report = grid_ridge.fit(train_x, train_y)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'msg_id': '9E14933726D24A35AEC819FBBDB5070C', 'msg_type': 'execute_request', 'session': '50D74FEAC60649579A854D265B5DC92C', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': '9E14933726D24A35AEC819FBBDB5070C', 'msg_type': 'execute_request', 'parent_header': {}})
    357         if not silent:
    358             self.execution_count += 1
    359             self._publish_execute_input(code, parent, self.execution_count)
    360         
    361         reply_content = self.do_execute(code, silent, store_history,
--> 362                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    363 
    364         # Flush output before sending the reply.
    365         sys.stdout.flush()
    366         sys.stderr.flush()

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/kernel/zmq/ipkernel.py in do_execute(self=<IPython.kernel.zmq.ipkernel.IPythonKernel object>, code=u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    176 
    177         reply_content = {}
    178         # FIXME: the shell calls the exception handler itself.
    179         shell._reply_content = None
    180         try:
--> 181             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <I....kernel.zmq.zmqshell.ZMQInteractiveShell object>>
        code = u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)'
        store_history = True
        silent = False
    182         except:
    183             status = u'error'
    184             # FIXME: this code right now isn't being used yet by default,
    185             # because the run_cell() call above directly fires off exception

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<IPython.kernel.zmq.zmqshell.ZMQInteractiveShell object>, raw_cell=u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)', store_history=True, silent=False, shell_futures=True)
   2863                 self.displayhook.exec_result = result
   2864 
   2865                 # Execute the user code
   2866                 interactivity = "none" if silent else self.ast_node_interactivity
   2867                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2868                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2869 
   2870                 # Reset this so later displayed values do not modify the
   2871                 # ExecutionResult
   2872                 self.displayhook.exec_result = None

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<IPython.kernel.zmq.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>], cell_name='<ipython-input-16-7bea703e915f>', interactivity='none', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2967 
   2968         try:
   2969             for i, node in enumerate(to_run_exec):
   2970                 mod = ast.Module([node])
   2971                 code = compiler(mod, cell_name, "exec")
-> 2972                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <I....kernel.zmq.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x10c049d30, file "<ipython-input-16-7bea703e915f>", line 29>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   2973                     return True
   2974 
   2975             for i, node in enumerate(to_run_interactive):
   2976                 mod = ast.Interactive([node])

...........................................................................
/Applications/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<IPython.kernel.zmq.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x10c049d30, file "<ipython-input-16-7bea703e915f>", line 29>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3027         outflag = 1  # happens in more places, so it's easier as default
   3028         try:
   3029             try:
   3030                 self.hooks.pre_run_code_hook()
   3031                 #rprint('Running code', repr(code_obj)) # dbg
-> 3032                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x10c049d30, file "<ipython-input-16-7bea703e915f>", line 29>
        self.user_global_ns = {'DATA_PATH': '/Users/beingzy/Documents/Projects/learning_dist_metrics_exp', 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'HTML': <class 'IPython.core.display.HTML'>, 'In': ['', u"import os\nimport sys\nimport glob\n\nimport n...TML\n\nget_ipython().magic(u'matplotlib inline')", u'DATA_PATH = "/Users/beingzy/Documents/Projects...["class", "uid_a", "uid_b"]]\nprint feat_columns', u'DATA_PATH = "/Users/beingzy/Documents/Projects...["class", "uid_a", "uid_b"]]\nprint feat_columns', u'def summarize_null_ptg(ds):\n    """Return the...}, #feats: {1}".format(df.shape[0], df.shape[1])', u'## Missing value handling:\nis_feat_na = df[fe...{0, 1}\nprint df_reduced["class"].value_counts()', u'df_x = df_reduced[feat_columns].fillna(0)\ndf_...: {1}".format(train_x.shape[0], test_x.shape[0])', u'clf = tree.DecisionTreeClassifier()\nclf.fit(train_x, train_y)', u'from sklearn.externals.six import StringIO  \n...data.getvalue()) \n#graph.write_pdf("iris.pdf") ', u'from sklearn.externals.six import StringIO  \n...data.getvalue()) \n#graph.write_pdf("iris.pdf") ', u'from sklearn.externals.six import StringIO  \n...data.getvalue()) \n#graph.write_pdf("iris.pdf") ', u'from sklearn import metrics\ndef measure_perfo...ication_report=True, show_confusion_matrix=True)', u'measure_performance(test_x, test_y, clf, show_...ication_report=True, show_confusion_matrix=True)', u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)', u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)', u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)', u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)'], 'Out': {7: DecisionTreeClassifier(compute_importances=None,...,
            random_state=None, splitter='best')}, 'RANDOM_SEED': 123, 'StratifiedKFold': <class 'sklearn.cross_validation.StratifiedKFold'>, 'StringIO': <class StringIO.StringIO>, '_': DecisionTreeClassifier(compute_importances=None,...,
            random_state=None, splitter='best'), '_7': DecisionTreeClassifier(compute_importances=None,...,
            random_state=None, splitter='best'), ...}
        self.user_ns = {'DATA_PATH': '/Users/beingzy/Documents/Projects/learning_dist_metrics_exp', 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'HTML': <class 'IPython.core.display.HTML'>, 'In': ['', u"import os\nimport sys\nimport glob\n\nimport n...TML\n\nget_ipython().magic(u'matplotlib inline')", u'DATA_PATH = "/Users/beingzy/Documents/Projects...["class", "uid_a", "uid_b"]]\nprint feat_columns', u'DATA_PATH = "/Users/beingzy/Documents/Projects...["class", "uid_a", "uid_b"]]\nprint feat_columns', u'def summarize_null_ptg(ds):\n    """Return the...}, #feats: {1}".format(df.shape[0], df.shape[1])', u'## Missing value handling:\nis_feat_na = df[fe...{0, 1}\nprint df_reduced["class"].value_counts()', u'df_x = df_reduced[feat_columns].fillna(0)\ndf_...: {1}".format(train_x.shape[0], test_x.shape[0])', u'clf = tree.DecisionTreeClassifier()\nclf.fit(train_x, train_y)', u'from sklearn.externals.six import StringIO  \n...data.getvalue()) \n#graph.write_pdf("iris.pdf") ', u'from sklearn.externals.six import StringIO  \n...data.getvalue()) \n#graph.write_pdf("iris.pdf") ', u'from sklearn.externals.six import StringIO  \n...data.getvalue()) \n#graph.write_pdf("iris.pdf") ', u'from sklearn import metrics\ndef measure_perfo...ication_report=True, show_confusion_matrix=True)', u'measure_performance(test_x, test_y, clf, show_...ication_report=True, show_confusion_matrix=True)', u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)', u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)', u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)', u'from sklearn import pipeline\nfrom sklearn.cro...\nperf_report = grid_ridge.fit(train_x, train_y)'], 'Out': {7: DecisionTreeClassifier(compute_importances=None,...,
            random_state=None, splitter='best')}, 'RANDOM_SEED': 123, 'StratifiedKFold': <class 'sklearn.cross_validation.StratifiedKFold'>, 'StringIO': <class StringIO.StringIO>, '_': DecisionTreeClassifier(compute_importances=None,...,
            random_state=None, splitter='best'), '_7': DecisionTreeClassifier(compute_importances=None,...,
            random_state=None, splitter='best'), ...}
   3033             finally:
   3034                 # Reset our crash handler in place
   3035                 sys.excepthook = old_excepthook
   3036         except SystemExit as e:

...........................................................................
/Users/beingzy/Documents/Projects/learning_dist_metrics_exp/<ipython-input-16-7bea703e915f> in <module>()
     24                           param_grid = param_grids, 
     25                           refit = True,
     26                           n_jobs = -1, 
     27                           scoring = score_func,
     28                           cv = StratifiedKFold(train_y, n_folds=5))
---> 29 perf_report = grid_ridge.fit(train_x, train_y)
     30 
     31 
     32 
     33 

...........................................................................
/Users/beingzy/.local/lib/python2.7/site-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=sklearn.cross_validation.Stratif...recall_score at 0x1082cd6e0>)],
       verbose=0), X=array([[   0.,    0.,    0., ...,    0.,    0., ...[   4.,    0.,    0., ...,    0.,  105.,    0.]]), y=array([1, 0, 0, ..., 1, 1, 1]))
    591         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    592             Target relative to X for classification or regression;
    593             None for unsupervised learning.
    594 
    595         """
--> 596         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...ecall_score at 0x1082cd6e0>)],
       verbose=0)>
        X = array([[   0.,    0.,    0., ...,    0.,    0., ...[   4.,    0.,    0., ...,    0.,  105.,    0.]])
        y = array([1, 0, 0, ..., 1, 1, 1])
        self.param_grid = [{'clf_tree__criterion': ['gini']}]
    597 
    598 
    599 class RandomizedSearchCV(BaseSearchCV):
    600     """Randomized search on hyper parameters.

...........................................................................
/Users/beingzy/.local/lib/python2.7/site-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=sklearn.cross_validation.Stratif...recall_score at 0x1082cd6e0>)],
       verbose=0), X=array([[   0.,    0.,    0., ...,    0.,    0., ...[   4.,    0.,    0., ...,    0.,  105.,    0.]]), y=array([1, 0, 0, ..., 1, 1, 1]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    373             pre_dispatch=pre_dispatch
    374         )(
    375             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    376                                     train, test, self.verbose, parameters,
    377                                     self.fit_params, return_parameters=True)
--> 378             for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    379             for train, test in cv)
    380 
    381         # Out is a list of triplet: score, estimator, n_test_samples
    382         n_fits = len(out)

...........................................................................
/Users/beingzy/.local/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<itertools.islice object>)
    655             if pre_dispatch == "all" or n_jobs == 1:
    656                 # The iterable was consumed all at once by the above for loop.
    657                 # No need to wait for async callbacks to trigger to
    658                 # consumption.
    659                 self._iterating = False
--> 660             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    661             # Make sure that we get a last message telling us we are done
    662             elapsed_time = time.time() - self._start_time
    663             self._print('Done %3i out of %3i | elapsed: %s finished',
    664                         (len(self._output),

    ---------------------------------------------------------------------------
    Sub-process traceback:
    ---------------------------------------------------------------------------
    ValueError                                         Mon Mar  9 22:48:30 2015
PID: 4476                   Python 2.7.9: /Applications/anaconda/bin/python
...........................................................................
/Users/beingzy/.local/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator=DecisionTreeClassifier(compute_importances=None,...,
            random_state=None, splitter='best'), X=array([[   0.,    0.,    0., ...,    0.,    0., ...[   4.,    0.,    0., ...,    0.,  105.,    0.]]), y=array([1, 0, 0, ..., 1, 1, 1]), scorer=[('precision', <function precision_score>), ('recall', <function recall_score>)], train=array([ 25761,  25762,  25763, ..., 129640, 129641, 129642]), test=array([    0,     1,     2, ..., 26288, 26292, 26293]), verbose=0, parameters={'clf_tree__criterion': 'gini'}, fit_params={}, return_train_score=False, return_parameters=True)
   1225     fit_params = dict([(k, np.asarray(v)[train]
   1226                        if hasattr(v, '__len__') and len(v) == n_samples else v)
   1227                        for k, v in fit_params.items()])
   1228 
   1229     if parameters is not None:
-> 1230         estimator.set_params(**parameters)
   1231 
   1232     start_time = time.time()
   1233 
   1234     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/Users/beingzy/.local/lib/python2.7/site-packages/sklearn/base.pyc in set_params(self=DecisionTreeClassifier(compute_importances=None,...,
            random_state=None, splitter='best'), **params={'clf_tree__criterion': 'gini'})
    244             if len(split) > 1:
    245                 # nested objects case
    246                 name, sub_name = split
    247                 if not name in valid_params:
    248                     raise ValueError('Invalid parameter %s for estimator %s' %
--> 249                                      (name, self))
    250                 sub_object = valid_params[name]
    251                 sub_object.set_params(**{sub_name: value})
    252             else:
    253                 # simple objects case

ValueError: Invalid parameter clf_tree for estimator DecisionTreeClassifier(compute_importances=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            random_state=None, splitter='best')
___________________________________________________________________________

In [26]:
print feat_columns

['birthday', 'education_classes_id', 'education_concentration_id', 'education_degree_id', 'education_school_id', 'education_type', 'education_with_id', 'education_year_id', 'first_name', 'gender', 'hometown_id', 'languages_id', 'last_name', 'locale', 'location_id', 'middle_name', 'name', 'political', 'religion', 'work_employer_id', 'work_end_date', 'work_from_id', 'work_location_id', 'work_position_id', 'work_projects_id', 'work_start_date', 'work_with_id']


In [30]:
df.describe()

Unnamed: 0,class,uid_a,uid_b,birthday,education_classes_id,education_concentration_id,education_degree_id,education_school_id,education_type,education_with_id,...,political,religion,work_employer_id,work_end_date,work_from_id,work_location_id,work_position_id,work_projects_id,work_start_date,work_with_id
count,224264.0,224264.0,224264.0,55728.0,551.0,27505.0,3544.0,138018.0,150223.0,82.0,...,0.0,0.0,8319.0,12789.0,4,6011.0,3592.0,206.0,21094.0,6
mean,0.641414,1688.622414,1772.142894,87.816789,229.125227,185.561643,141.008183,161.6004,0.26255,9.280488,...,,,219.593341,74.976777,0,234.330727,286.505846,6.393204,150.509529,0
std,0.479586,831.244742,810.630781,191.001289,338.094119,211.882213,180.172678,229.809663,0.618928,62.462487,...,,,286.476229,174.076806,0,288.406133,327.466048,28.123868,224.860662,0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.0,0.0,0,0.0,0.0,0.0,0.0,0
25%,0.0,1118.0,1102.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,...,,,5.0,2.0,0,0.0,3.0,1.0,7.0,0
50%,1.0,1737.0,1931.0,1.0,8.0,78.0,5.0,31.0,0.0,0.0,...,,,85.0,9.0,0,51.0,11.5,4.0,36.0,0
75%,1.0,2340.0,2410.0,5.0,733.0,377.0,202.0,289.0,0.0,1.0,...,,,404.0,28.0,0,479.0,513.0,5.0,153.0,0
max,1.0,3213.0,3213.0,1006.0,747.0,1001.0,756.0,1153.0,2.0,536.0,...,,,1114.0,819.0,0,1060.0,986.0,289.0,1014.0,0


#### Variale Processing
* 

In [29]:
remove_feats = ['birthday', 'education_classes_id', 
                
def feature_processor(dataset):
    """
    """
ds = dataset.copy()
    #ds["birthday"] 
ds["education_classes_id"] = df["education_classes_id"].apply(lambda x: 1 if x > 0 else 0)
ds["education_concentration_id"] = df["education_concentration_id"].apply(lambda x: 1 if x > 0 else 0)
    #ds["education_degree_id"] 
ds["education_school_id"] = df["education_schoold_id"].apply(lambda x: 1 if x > 0 else 0)
ds["education_type_id"] = df["education_type_id"].apply(lambda x: 1 if x > 0 else 0)
ds["
                
        
#feat_columns.index("education_classes_id")

1

#### Clustering Analysis To Detect Underlying Pattern of The Data
* K-means Clustering

In [22]:
km_cluster = KMeans()
km_cluster.fit(train_x)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)

In [23]:
tran_class = km_cluster.predict(train_x)

In [24]:
km_cluster.inertia_

12801191981.687754

In [48]:
#df["education_classes_id"][:100].apply(lambda x: 1 if x > 0 else 0)
df["education_concentration_id"].unique

<bound method Series.unique of 0    NaN
1    NaN
2      5
3    NaN
4    NaN
5    NaN
6      0
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14     0
...
224249    NaN
224250    NaN
224251    NaN
224252    NaN
224253    NaN
224254    NaN
224255    185
224256    NaN
224257    198
224258    NaN
224259    NaN
224260    NaN
224261    403
224262    NaN
224263    NaN
Name: education_concentration_id, Length: 224264, dtype: float64>

In [44]:
df["education_classes_id"].unique()

array([  nan,    3.,    1.,    0.,    2.,    4.,  746.,    5.,    8.,
         14.,  736.,  739.,   10.,  733.,   13.,  745.,  747.,    7.,
          6.,   11.,    9.,  732.,  738.,   15.,  741.,  731.,   16.,
         12.,  734.,  735.,  325.,  744.,  737.,  414.])