In [222]:
import numpy as np
import pandas as pd           # for reading file
import pandas_profiling as pp # statistical visualise
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
%matplotlib inline

In [223]:
train_df = pd.read_csv('../data/train.csv', index_col='Unnamed: 0', sep='\t')
test_df = pd.read_csv('../data/test.csv', index_col='Unnamed: 0', sep='\t')

In [224]:
# train_df.head(1)
# test_df.head(1)

In [225]:
print ("rows \t\t: ", train_df.shape[0])
print ("columns \t: ", train_df.shape[1])

rows 		:  30500
columns 	:  346


---
## 1. Опишите препроцессинг данных, инжиниринг фич и валидацию

#### Разбиваем выборку

In [226]:
X_train = train_df.drop(['0'], axis=1)
y_train = train_df['0']

X_validation = test_df.drop(['0'], axis=1)
y_validation = test_df['0']

In [227]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 345)
	 (30500,)
Test : 
	 (4166, 345)
	 (4166,)


#### Удаляем признаки у которых только 2 и меньше уникальных значения, и у которых вариабельность меньше 0.5%

In [228]:
def    clean_data(train, test):
    columns_name = []
    
    for i in test:
        if (len(train[i].unique()) == 1):
            columns_name.append(i)
        elif (len(train[i].unique()) == 2) and (len(test[i].unique()) <= 2):
            columns_name.append(i)
        elif (len(train[i].unique()) == 2) and min(train[i].value_counts()) / sum(train[i].value_counts()) * 100 < .5:
            columns_name.append(i)
    return columns_name

In [229]:
clean_columns = clean_data(X_train, X_validation)
X_train.drop(clean_columns, axis=1, inplace=True)
X_validation.drop(clean_columns, axis=1, inplace=True)

In [230]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 106)
	 (30500,)
Test : 
	 (4166, 106)
	 (4166,)


#### Удаляем признаки у которых уровень корреляция больше 99

In [199]:
# Удаляем признаки с корреляцией 1
def    check_corr_data(train, test, type_corr):
    corr_matrix = train.corr(method=type_corr).abs()
    
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    to_drop = []
    for column in upper.columns:
        if any(upper[column] > 0.99):
            train.drop(column, axis=1, inplace=True)
            test.drop(column, axis=1, inplace=True)
            print (column)
            return True
    return False

def     clean_corr_data(train, test, type_corr):
    while check_corr_data(train, test, type_corr):
        ;

In [231]:
clean_corr_data(X_train, X_validation, 'pearson')

329
333
334


In [232]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 103)
	 (30500,)
Test : 
	 (4166, 103)
	 (4166,)


#### Кодируем категориальные признаки one_hot_encoding

In [233]:
def one_hot_encoding(train, test):
    for i in train:
        if len(train[i].unique()) < 15 and len(train[i].unique()) > 1:
            train2 = pd.concat([train, pd.get_dummies(train[i], prefix=i)], axis=1)
            train.drop([i], axis=1, inplace=True)
            train = train2
            
            test2 = pd.concat([test, pd.get_dummies(train[i], prefix=i)], axis=1)
            test.drop([i], axis=1, inplace=True)
            test = test2
            
    return train, test

In [234]:
X_train, X_validation = one_hot_encoding(X_train, X_validation);

In [235]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 221)
	 (30500,)
Test : 
	 (30500, 221)
	 (4166,)


#### Опять чистим коррелирующие данные

In [236]:
clean_corr_data(X_train, X_validation, 'pearson')

120_0.0
127_0.0
212_1.0
312_0.0
312_0.39424
312_0.4324508
312_0.47097
312_0.5197096
312_0.5722851
312_0.588824
312_0.6608119
312_0.7575137
312_0.8421118000000001
312_0.9310675
312_1.0
313_0.0
313_0.016691400000000002
313_0.052069500000000005
313_0.0654514
313_0.1504556
313_0.2396119
313_0.277947
313_0.3472929
313_0.461724
313_0.6188458
313_1.0
314_0.0
314_0.1178385
314_0.1699219
314_0.202474
314_0.2057292
314_0.4466146
314_0.4889323
314_0.5833333
314_0.6126302
314_1.0
315_0.015435399999999997
315_0.033313800000000005
315_0.07876389999999997
315_0.3415823
315_0.4127147
315_0.4488797000000001
315_0.5901081
315_0.6529206
315_0.8525990999999999
315_0.9929259
315_1.0


In [237]:
print ("Train : ")
print ("\t", X_train.shape)
print ("\t", y_train.shape)
print ("Test : ")
print ("\t", X_validation.shape)
print ("\t", y_validation.shape)

Train : 
	 (30500, 174)
	 (30500,)
Test : 
	 (30500, 174)
	 (4166,)


---

In [210]:
# pp.ProfileReport(X_train)

In [17]:
# pp.ProfileReport(X_test)

---
## 2. Постройте логистическую регрессию. Укажите значение на валидации и публичном лидерборде

In [238]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# X_train_1
# y_train_1

scaler = StandardScaler()
logit = LogisticRegression(random_state=17, class_weight='balanced')
logit_pipe_params = {'logit__C': np.logspace(-8, 8, 17)}


In [239]:
log_reg = GridSearchCV(logit, logit_pipe_params, scoring='roc_auc', n_jobs=-1, verbose=1)

log_reg.fit(X_train, y_train)
print (lr_g.best_params_)
print (lr_g.best_score_)

Fitting 3 folds for each of 17 candidates, totalling 51 fits


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x101190390, file "/Use...3.6/site-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/alatyshe/virt/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/alaty.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x101190390, file "/Use...3.6/site-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/alatyshe/virt/lib/python3.6/site-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/site-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/alaty.../python3.6/site-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    481         if self.poller is not None:
    482             self.poller.start()
    483         self.kernel.start()
    484         self.io_loop = ioloop.IOLoop.current()
    485         try:
--> 486             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    487         except KeyboardInterrupt:
    488             pass
    489 
    490 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/tornado/platform/asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    107         except RuntimeError:
    108             old_loop = None
    109         try:
    110             self._setup_logging()
    111             asyncio.set_event_loop(self.asyncio_loop)
--> 112             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Uni...EventLoop running=True closed=False debug=False>>
    113         finally:
    114             asyncio.set_event_loop(old_loop)
    115 
    116     def stop(self):

...........................................................................
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py in run_forever(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
    416             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    417                                    finalizer=self._asyncgen_finalizer_hook)
    418         try:
    419             events._set_running_loop(self)
    420             while True:
--> 421                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_UnixS...EventLoop running=True closed=False debug=False>>
    422                 if self._stopping:
    423                     break
    424         finally:
    425             self._stopping = False

...........................................................................
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py in _run_once(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
   1421                         logger.warning('Executing %s took %.3f seconds',
   1422                                        _format_handle(handle), dt)
   1423                 finally:
   1424                     self._current_handle = None
   1425             else:
-> 1426                 handle._run()
        handle._run = <bound method Handle._run of <Handle BaseAsyncIOLoop._handle_events(14, 1)>>
   1427         handle = None  # Needed to break cycles when an exception occurs.
   1428 
   1429     def _set_coroutine_wrapper(self, enabled):
   1430         try:

...........................................................................
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/events.py in _run(self=<Handle BaseAsyncIOLoop._handle_events(14, 1)>)
    122             self._callback = None
    123             self._args = None
    124 
    125     def _run(self):
    126         try:
--> 127             self._callback(*self._args)
        self._callback = <bound method BaseAsyncIOLoop._handle_events of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (14, 1)
    128         except Exception as exc:
    129             cb = _format_callback_source(self._callback, self._args)
    130             msg = 'Exception in callback {}'.format(cb)
    131             context = {

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/tornado/platform/asyncio.py in _handle_events(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, fd=14, events=1)
     97             self.writers.remove(fd)
     98         del self.handlers[fd]
     99 
    100     def _handle_events(self, fd, events):
    101         fileobj, handler_func = self.handlers[fd]
--> 102         handler_func(fileobj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fileobj = <zmq.sugar.socket.Socket object>
        events = 1
    103 
    104     def start(self):
    105         try:
    106             old_loop = asyncio.get_event_loop()

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'log_reg = GridSearchCV(logit, logit_pipe_params,...rint (lr_g.best_params_)\nprint (lr_g.best_score_)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 3, 25, 13, 23, 14, 937725, tzinfo=tzutc()), 'msg_id': '475843a8f6cf40428d1e7d02bc601b79', 'msg_type': 'execute_request', 'session': 'fe3dfffadecd4d8fa58b0b0378bc0f6f', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '475843a8f6cf40428d1e7d02bc601b79', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warn("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'fe3dfffadecd4d8fa58b0b0378bc0f6f']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'log_reg = GridSearchCV(logit, logit_pipe_params,...rint (lr_g.best_params_)\nprint (lr_g.best_score_)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 3, 25, 13, 23, 14, 937725, tzinfo=tzutc()), 'msg_id': '475843a8f6cf40428d1e7d02bc601b79', 'msg_type': 'execute_request', 'session': 'fe3dfffadecd4d8fa58b0b0378bc0f6f', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '475843a8f6cf40428d1e7d02bc601b79', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'fe3dfffadecd4d8fa58b0b0378bc0f6f'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'log_reg = GridSearchCV(logit, logit_pipe_params,...rint (lr_g.best_params_)\nprint (lr_g.best_score_)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 3, 25, 13, 23, 14, 937725, tzinfo=tzutc()), 'msg_id': '475843a8f6cf40428d1e7d02bc601b79', 'msg_type': 'execute_request', 'session': 'fe3dfffadecd4d8fa58b0b0378bc0f6f', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '475843a8f6cf40428d1e7d02bc601b79', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code='log_reg = GridSearchCV(logit, logit_pipe_params,...rint (lr_g.best_params_)\nprint (lr_g.best_score_)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = 'log_reg = GridSearchCV(logit, logit_pipe_params,...rint (lr_g.best_params_)\nprint (lr_g.best_score_)'
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=('log_reg = GridSearchCV(logit, logit_pipe_params,...rint (lr_g.best_params_)\nprint (lr_g.best_score_)',), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ('log_reg = GridSearchCV(logit, logit_pipe_params,...rint (lr_g.best_params_)\nprint (lr_g.best_score_)',)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell='log_reg = GridSearchCV(logit, logit_pipe_params,...rint (lr_g.best_params_)\nprint (lr_g.best_score_)', store_history=True, silent=False, shell_futures=True)
   2723                 self.displayhook.exec_result = result
   2724 
   2725                 # Execute the user code
   2726                 interactivity = "none" if silent else self.ast_node_interactivity
   2727                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2728                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2729                 
   2730                 self.last_execution_succeeded = not has_raised
   2731                 self.last_execution_result = result
   2732 

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Expr object>, <_ast.Expr object>, <_ast.Expr object>], cell_name='<ipython-input-239-6968281f8290>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 10db7a9e8, execution_..._before_exec=None error_in_exec=None result=None>)
   2845 
   2846         try:
   2847             for i, node in enumerate(to_run_exec):
   2848                 mod = ast.Module([node])
   2849                 code = compiler(mod, cell_name, "exec")
-> 2850                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x10b591f60, file "<ipython-input-239-6968281f8290>", line 2>
        result = <ExecutionResult object at 10db7a9e8, execution_..._before_exec=None error_in_exec=None result=None>
   2851                     return True
   2852 
   2853             for i, node in enumerate(to_run_interactive):
   2854                 mod = ast.Interactive([node])

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x10b591f60, file "<ipython-input-239-6968281f8290>", line 2>, result=<ExecutionResult object at 10db7a9e8, execution_..._before_exec=None error_in_exec=None result=None>)
   2905         outflag = True  # happens in more places, so it's easier as default
   2906         try:
   2907             try:
   2908                 self.hooks.pre_run_code_hook()
   2909                 #rprint('Running code', repr(code_obj)) # dbg
-> 2910                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x10b591f60, file "<ipython-input-239-6968281f8290>", line 2>
        self.user_global_ns = {'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "import numpy as np\nimport pandas as pd          ..._ipython().run_line_magic('matplotlib', 'inline')", r"train_df = pd.read_csv('../data/train.csv', inde...data/test.csv', index_col='Unnamed: 0', sep='\t')", '# train_df.head(1)\n# test_df.head(1)', 'print ("rows \\t\\t: ", train_df.shape[0])\nprint ("columns \\t: ", train_df.shape[1])', "X_train = train_df.drop(['0'], axis=1)\ny_train =...test_df.drop(['0'], axis=1)\ny_test = test_df['0']", 'print ("Train : ")\nprint ("\\t", X_train.shape)\np...t ("\\t", X_test.shape)\nprint ("\\t", y_test.shape)', 'def    clean_data(train, test):\n    columns_name...   columns_name.append(i)\n    return columns_name', 'clean_columns = clean_data(X_train, X_test)\nX_tr...\nX_test.drop(clean_columns, axis=1, inplace=True)', 'print ("Train : ")\nprint ("\\t", X_train.shape)\np...t ("\\t", X_test.shape)\nprint ("\\t", y_test.shape)', '# Удаляем признаки с корреляцией 1\ndef    clean_...ny(upper[column] > 0.99)]\n    \n    return to_drop', 'to_drop = clean_corr_data(X_train, X_test)\nX_tra...s=1)\nX_test.drop(X_test.columns[to_drop], axis=1)', 'to_drop = clean_corr_data(X_train, X_test)\nX_tra...=True)\nX_test.drop(to_drop, axis=1, inplace=True)', 'print ("Train : ")\nprint ("\\t", X_train.shape)\np...t ("\\t", X_test.shape)\nprint ("\\t", y_test.shape)', "import numpy as np\nimport pandas as pd          ..._ipython().run_line_magic('matplotlib', 'inline')", r"train_df = pd.read_csv('../data/train.csv', inde...data/test.csv', index_col='Unnamed: 0', sep='\t')", '# train_df.head(1)\n# test_df.head(1)', 'print ("rows \\t\\t: ", train_df.shape[0])\nprint ("columns \\t: ", train_df.shape[1])', "X_train = train_df.drop(['0'], axis=1)\ny_train =...test_df.drop(['0'], axis=1)\ny_test = test_df['0']", 'print ("Train : ")\nprint ("\\t", X_train.shape)\np...t ("\\t", X_test.shape)\nprint ("\\t", y_test.shape)', ...], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {50: <pandas_profiling.ProfileReport object>, 64: <pandas_profiling.ProfileReport object>, 68: <pandas_profiling.ProfileReport object>, 71:             7        13        66        67     ...3 -0.423753  1.000000  

[106 rows x 106 columns], 72:             7        13        66        67     ...3  0.423753  1.000000  

[106 rows x 106 columns], 73:             7        13        66        67     ...3 -0.423753  1.000000  

[106 rows x 106 columns], 74: <bound method NDFrame.abs of             7      ... -0.423753  1.000000  

[106 rows x 106 columns]>, 75:             7        13        66        67     ...3  0.423753  1.000000  

[106 rows x 106 columns], 76:       7        13        66        67        68 ...N       NaN       NaN  

[106 rows x 106 columns], 95: 0        0.783002
1        0.719187
2        0.6...0.463786
Name: 329, Length: 30500, dtype: float64, ...}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'StandardScaler': <class 'sklearn.preprocessing.data.StandardScaler'>, 'X_test':               7        13       66       67     ... 0  
30499        0  

[30500 rows x 174 columns], 'X_train':               7        13   66       67       68... 0  
30499        0  

[30500 rows x 174 columns], ...}
        self.user_ns = {'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "import numpy as np\nimport pandas as pd          ..._ipython().run_line_magic('matplotlib', 'inline')", r"train_df = pd.read_csv('../data/train.csv', inde...data/test.csv', index_col='Unnamed: 0', sep='\t')", '# train_df.head(1)\n# test_df.head(1)', 'print ("rows \\t\\t: ", train_df.shape[0])\nprint ("columns \\t: ", train_df.shape[1])', "X_train = train_df.drop(['0'], axis=1)\ny_train =...test_df.drop(['0'], axis=1)\ny_test = test_df['0']", 'print ("Train : ")\nprint ("\\t", X_train.shape)\np...t ("\\t", X_test.shape)\nprint ("\\t", y_test.shape)', 'def    clean_data(train, test):\n    columns_name...   columns_name.append(i)\n    return columns_name', 'clean_columns = clean_data(X_train, X_test)\nX_tr...\nX_test.drop(clean_columns, axis=1, inplace=True)', 'print ("Train : ")\nprint ("\\t", X_train.shape)\np...t ("\\t", X_test.shape)\nprint ("\\t", y_test.shape)', '# Удаляем признаки с корреляцией 1\ndef    clean_...ny(upper[column] > 0.99)]\n    \n    return to_drop', 'to_drop = clean_corr_data(X_train, X_test)\nX_tra...s=1)\nX_test.drop(X_test.columns[to_drop], axis=1)', 'to_drop = clean_corr_data(X_train, X_test)\nX_tra...=True)\nX_test.drop(to_drop, axis=1, inplace=True)', 'print ("Train : ")\nprint ("\\t", X_train.shape)\np...t ("\\t", X_test.shape)\nprint ("\\t", y_test.shape)', "import numpy as np\nimport pandas as pd          ..._ipython().run_line_magic('matplotlib', 'inline')", r"train_df = pd.read_csv('../data/train.csv', inde...data/test.csv', index_col='Unnamed: 0', sep='\t')", '# train_df.head(1)\n# test_df.head(1)', 'print ("rows \\t\\t: ", train_df.shape[0])\nprint ("columns \\t: ", train_df.shape[1])', "X_train = train_df.drop(['0'], axis=1)\ny_train =...test_df.drop(['0'], axis=1)\ny_test = test_df['0']", 'print ("Train : ")\nprint ("\\t", X_train.shape)\np...t ("\\t", X_test.shape)\nprint ("\\t", y_test.shape)', ...], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {50: <pandas_profiling.ProfileReport object>, 64: <pandas_profiling.ProfileReport object>, 68: <pandas_profiling.ProfileReport object>, 71:             7        13        66        67     ...3 -0.423753  1.000000  

[106 rows x 106 columns], 72:             7        13        66        67     ...3  0.423753  1.000000  

[106 rows x 106 columns], 73:             7        13        66        67     ...3 -0.423753  1.000000  

[106 rows x 106 columns], 74: <bound method NDFrame.abs of             7      ... -0.423753  1.000000  

[106 rows x 106 columns]>, 75:             7        13        66        67     ...3  0.423753  1.000000  

[106 rows x 106 columns], 76:       7        13        66        67        68 ...N       NaN       NaN  

[106 rows x 106 columns], 95: 0        0.783002
1        0.719187
2        0.6...0.463786
Name: 329, Length: 30500, dtype: float64, ...}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, 'StandardScaler': <class 'sklearn.preprocessing.data.StandardScaler'>, 'X_test':               7        13       66       67     ... 0  
30499        0  

[30500 rows x 174 columns], 'X_train':               7        13   66       67       68... 0  
30499        0  

[30500 rows x 174 columns], ...}
   2911             finally:
   2912                 # Reset our crash handler in place
   2913                 sys.excepthook = old_excepthook
   2914         except SystemExit as e:

...........................................................................
/Users/alatyshe/Google Drive/Projects/Data_Science/ds_course_belka/week_5/homework/<ipython-input-239-6968281f8290> in <module>()
      1 log_reg = GridSearchCV(logit, logit_pipe_params, scoring='roc_auc', n_jobs=-1, verbose=1)
----> 2 log_reg.fit(X_train, y_train)
      3 print (lr_g.best_params_)
      4 print (lr_g.best_score_)

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...core='warn',
       scoring='roc_auc', verbose=1), X=              7        13   66       67       68... 0  
30499        0  

[30500 rows x 174 columns], y=0        1
1        1
2        0
3        0
4   ...0
30499    0
Name: 0, Length: 30500, dtype: int64, groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method StratifiedKFold.split of Stratifie...ld(n_splits=3, random_state=None, shuffle=False)>
        X =               7        13   66       67       68... 0  
30499        0  

[30500 rows x 174 columns]
        y = 0        1
1        1
2        0
3        0
4   ...0
30499    0
Name: 0, Length: 30500, dtype: int64
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Sun Mar 25 16:23:15 2018
PID: 40272                   Python 3.6.2: /Users/alatyshe/virt/bin/python3
...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (LogisticRegression(C=1.0, class_weight='balanced...linear', tol=0.0001, verbose=0, warm_start=False),               7        13   66       67       68... 0  
30499        0  

[30500 rows x 174 columns], 0        1
1        1
2        0
3        0
4   ...0
30499    0
Name: 0, Length: 30500, dtype: int64, {'score': make_scorer(roc_auc_score, needs_threshold=True)}, array([10121, 10122, 10123, ..., 30497, 30498, 30499]), array([    0,     1,     2, ..., 10361, 10366, 10373]), 1, {'logit__C': 1e-08}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (LogisticRegression(C=1.0, class_weight='balanced...linear', tol=0.0001, verbose=0, warm_start=False),               7        13   66       67       68... 0  
30499        0  

[30500 rows x 174 columns], 0        1
1        1
2        0
3        0
4   ...0
30499    0
Name: 0, Length: 30500, dtype: int64, {'score': make_scorer(roc_auc_score, needs_threshold=True)}, array([10121, 10122, 10123, ..., 30497, 30498, 30499]), array([    0,     1,     2, ..., 10361, 10366, 10373]), 1, {'logit__C': 1e-08})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=LogisticRegression(C=1.0, class_weight='balanced...linear', tol=0.0001, verbose=0, warm_start=False), X=              7        13   66       67       68... 0  
30499        0  

[30500 rows x 174 columns], y=0        1
1        1
2        0
3        0
4   ...0
30499    0
Name: 0, Length: 30500, dtype: int64, scorer={'score': make_scorer(roc_auc_score, needs_threshold=True)}, train=array([10121, 10122, 10123, ..., 30497, 30498, 30499]), test=array([    0,     1,     2, ..., 10361, 10366, 10373]), verbose=1, parameters={'logit__C': 1e-08}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    439                       for k, v in fit_params.items()])
    440 
    441     test_scores = {}
    442     train_scores = {}
    443     if parameters is not None:
--> 444         estimator.set_params(**parameters)
        estimator.set_params = <bound method BaseEstimator.set_params of Logist...inear', tol=0.0001, verbose=0, warm_start=False)>
        parameters = {'logit__C': 1e-08}
    445 
    446     start_time = time.time()
    447 
    448     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/Users/alatyshe/virt/lib/python3.6/site-packages/sklearn/base.py in set_params(self=LogisticRegression(C=1.0, class_weight='balanced...linear', tol=0.0001, verbose=0, warm_start=False), **params={'logit__C': 1e-08})
    269             key, delim, sub_key = key.partition('__')
    270             if key not in valid_params:
    271                 raise ValueError('Invalid parameter %s for estimator %s. '
    272                                  'Check the list of available parameters '
    273                                  'with `estimator.get_params().keys()`.' %
--> 274                                  (key, self))
        key = 'logit'
        self = LogisticRegression(C=1.0, class_weight='balanced...linear', tol=0.0001, verbose=0, warm_start=False)
    275 
    276             if delim:
    277                 nested_params[key][sub_key] = value
    278             else:

ValueError: Invalid parameter logit for estimator LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=17,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False). Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________

---
## 3. Постройте бэггинг на логистических регрессиях. Укажите значение на валидации и публичном лидерборде

---
## 4. Постройте случайный лес. Укажите значение на валидации и публичном лидерборде

---
## 5. Подберите лучший вариант простого ансамбля. Опишите его и укажите значение на валидации и публичном лидерборде

---
## 6. Сделайте стекинг. Опишите его и укажите значение на валидации и публичном лидерборде

---
## 7. Укажите ваш ник, значение на lb и валидации, опишите коротко решение

In [67]:
import numpy as np
import pandas as p
# Create feature matrix with two highly correlated features
X = np.array([[1, 1, 1],
              [2, 2, 0],
              [3, 3, 1],
              [4, 4, 0],
              [5, 5, 1],
              [6, 6, 0],
              [7, 7, 1],
              [8, 7, 0],
              [9, 7, 1]])

# Convert feature matrix into DataFrame
df = pd.DataFrame(X)

# View the data frame
df

Unnamed: 0,0,1,2
0,1,1,1
1,2,2,0
2,3,3,1
3,4,4,0
4,5,5,1
5,6,6,0
6,7,7,1
7,8,7,0
8,9,7,1


In [79]:
# Create correlation matrix
df[1]

corr_matrix = df.corr()

# Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
# to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [80]:
corr_matrix

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0
